In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, StratifiedKFold

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier  
from sklearn.tree import DecisionTreeClassifier
#from lightgbm import LGBMClassifier 
#from catboost import CatBoostClassifier 

# Metrics
from sklearn.metrics import classification_report




## EXTRA
pd.set_option('display.max_columns', None)

# Warnings
import warnings
warnings.filterwarnings("ignore")

import NA_outliers as n
import play_song as song


In [2]:
# FUNCTION

import pandas as pd
import numpy as np
import time
import joblib
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, FunctionTransformer
from sklearn.metrics import classification_report

#from imblearn.over_sampling import SMOTE
#from imblearn.under_sampling import RandomUnderSampler
#from imblearn.pipeline import Pipeline as ImbPipeline

import NA_outliers as n
import play_song as song


In [3]:
df = pd.read_csv('./project_data/out_eda1.csv', 
                 index_col = 'Claim Identifier')

## 1. Train

<a href="#top">Top &#129033;</a>

In [4]:
def k_fold(df, features, target, model_name, k = 5, 
           model = LogisticRegression(), patience=2,
           randoms = False, param_grid = None):
    
    start_time = time.time()
    
    X = df[features]
    y = df[target]
    
    kf = StratifiedKFold(n_splits= k, shuffle=True, random_state=1)
    predictions = []
    
    # Initialize variables for early stopping
    best_macro_avg = 0  # Track the best macro average score
    no_improvement_count = 0  # Counter for early stopping

    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y), 1):
        
        print(f'----------FOLD {fold}----------')
        ### SPLIT
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        original_columns = X_train.columns
        
        ### PIPELINE
        
        pipeline = Pipeline([
        ('imputer', FunctionTransformer(n.custom_impute, validate=False)), 
        ('log_transform', FunctionTransformer(n.log_transform, validate=False)),  
        ('scaler', RobustScaler()),
        ])
        
        X_train = pipeline.fit_transform(X_train, y_train)
        X_val = pipeline.transform(X_val)
        X_train = pd.DataFrame(X_train, columns=original_columns)
        X_val = pd.DataFrame(X_val, columns=original_columns)

        if randoms:
            random_search = RandomizedSearchCV(model, param_grid, n_iter=10, 
                                               scoring='f1_macro', cv=3, random_state=1)
            random_search.fit(X_train, y_train)
            best_model = random_search.best_estimator_
            print(f"Best hyperparameters for fold {fold}: {random_search.best_params_}")
        else:
            best_model = model
            best_model.fit(X_train, y_train)


        # make predictions
        train_pred = best_model.predict(X_train)
        val_pred = best_model.predict(X_val)

        # Compute metrics
        train_report = classification_report(y_train, train_pred, output_dict=True)
        val_report = classification_report(y_val, val_pred, output_dict=True)
        
        print(f"Fold {fold} - Training Report:\n", classification_report(y_train, train_pred))
        print(f"Fold {fold} - Validation Report:\n", classification_report(y_val, val_pred))
        
        val_macro_avg = val_report['macro avg']['f1-score']
        
        if val_macro_avg > best_macro_avg:
            best_macro_avg = val_macro_avg
            no_improvement_count = 0
            
            # Save the best model 
            joblib.dump(best_model, f'./models/{model_name}.joblib')
        else:
            no_improvement_count += 1
            print(f"No improvement for {no_improvement_count} fold(s)")

        if no_improvement_count >= patience:
            print(f"Early stopping at fold {fold} due to no improvement in macro average for {patience} folds")
            break
        
        
        # save predictions and best model's parameters
        predictions.append({'Train Predictions': train_pred, 'Validation Predictions': val_pred})
    
    
    # Time
    end_time = time.time()
    elapsed_time = round((end_time - start_time) / 60, 2)
    print(f'This run took {elapsed_time} minutes')
    
    # Play Warning Song
    song.play_('audio.mp3')
        
    return predictions


## 1.1Model

**Selected Features to use for predictions**

In [5]:
features = df.drop('Claim Injury Type', axis = 1).columns

**Random Search Parameters**

In [6]:
param_grid = {
    'n_estimators': [50, 100, 200, 300, 500],  # Number of trees in the forest
    'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider at every split
    'max_depth': [None, 10, 20, 30, 40, 50],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4, 5, 10],  # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False],  # Whether bootstrap samples are used when building trees
    'class_weight': [None, 'balanced'],  # Weights associated with classes
    'criterion': ['gini', 'entropy'],  # Function to measure the quality of a split
}

**Model**

In [8]:
# XGB Only
# class_mapping = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7}
# df['Claim Injury Type'] = df['Claim Injury Type'].map(class_mapping)

# model_name_featureselection_log_SMOTE&UNDER_nfolds_randomsearch
target = 'Claim Injury Type'
model_name = 'RF_noFS_LOG_10F;RS'
model = RandomForestClassifier()


k_fold(df, features, target, model_name, k = 5,
           model = model, randoms = True, 
           param_grid = param_grid )

----------FOLD 1----------
Best hyperparameters for fold 1: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 40, 'criterion': 'entropy', 'class_weight': 'balanced', 'bootstrap': True}
Fold 1 - Training Report:
               precision    recall  f1-score   support

           1       0.78      1.00      0.88      9981
           2       0.98      0.96      0.97    232862
           3       0.95      0.85      0.90     55124
           4       0.93      0.95      0.94    118806
           5       0.86      0.98      0.92     38624
           6       0.99      1.00      1.00      3369
           7       1.00      1.00      1.00        78
           8       0.83      1.00      0.90       376

    accuracy                           0.95    459220
   macro avg       0.92      0.97      0.94    459220
weighted avg       0.95      0.95      0.95    459220

Fold 1 - Validation Report:
               precision    recall  f1-score   suppor

KeyboardInterrupt: 

**Load Model**

In [None]:
# Load the model from the file
model = joblib.load(f'./models/{model_name}.joblib')

# 2. Test

In [None]:
test = pd.read_csv('./project_data/test_treated.csv', 
                   index_col = 'Claim Identifier')

test = test[features]

**Predictions**

In [None]:
test['Claim Injury Type'] = model.predict(test_features)

**Map to Original**

In [None]:
# ## XGB ONLY
# label_mapping = {
#     0: "1. CANCELLED",
#     1: "2. NON-COMP",
#     2: "3. MED ONLY",
#     3: "4. TEMPORARY",
#     4: "5. PPD SCH LOSS",
#     5: "6. PPD NSL",
#     6: "7. PTD",
#     7: "8. DEATH"
# }

# test['Claim Injury Type'] = test['Claim Injury Type'].replace(label_mapping)

In [None]:
label_mapping = {
    1: "1. CANCELLED",
    2: "2. NON-COMP",
    3: "3. MED ONLY",
    4: "4. TEMPORARY",
    5: "5. PPD SCH LOSS",
    6: "6. PPD NSL",
    7: "7. PTD",
    8: "8. DEATH"
}

test['Claim Injury Type'] = test['Claim Injury Type'].replace(label_mapping)

Check each category inside target

In [None]:
test['Claim Injury Type'].value_counts() 

Select only the predictions column and the index

In [None]:
predictions = test['Claim Injury Type']

# 3. Export

In [None]:
predictions.to_csv(f'./predictions/{model_name}.csv')

**Results**

__*<center>Models K-Fold*__ 

| Model | Feature Selection | Log | Parameters | Kaggle Score | Fold |
| ----- | ------------------ | --- | ---------- | ------------ | ---- |
| LogReg | - | - | -  | 0.21122 | 5 |
| RF | 1 | X | - | 0.29078 | 5 |
| XGB | 1 | X | - | 0.20642 | 10 |
| RF | - | - | - | 0.26616 | 5 |
    
<br><br>
    
__*<center>Models w/ Stratified K-Fold*__   
    
| Model | Feature Selection | Log | Parameters | Kaggle Score | Fold | 
| ----- | ------------------ | --- | ---------- | ------------ | ---- |
| RF | - | - | - | 0.26912 | 10 |
| DT | - | - | - | 0.14236 | 10 |
| DT | ------------------ | X | ---------- | 0.15589 | 10 |
| ----- | ------------------ | --- | ---------- | ------------ | ---- |
| ----- | ------------------ | --- | ---------- | ------------ | ---- |
| ----- | ------------------ | --- | ---------- | ------------ | ---- |
| ----- | ------------------ | --- | ---------- | ------------ | ---- |
    
<br><br>
    
**Features for Feature Selection 1**

['C-2 Day', 'Accident Year', 'Birth Year', 'Assembly Month',
            'C-2 Month', 'Average Weekly Wage', 'Age at Injury', 
            'C-2 Year', 'Number of Dependents', 'Accident Day', 
            'Assembly Year', 'First Hearing Year', 'IME-4 Count', 
            'Assembly Day', 'Accident Month', 
            'WCIO Cause of Injury Code', 'Gender', 
            'COVID-19 Indicator', 'WCIO Part Of Body Code', 
            'County of Injury', 'Attorney/Representative', 
            'Carrier Type', 'District Name', 'Medical Fee Region', 
            'Zip Code', 'Carrier Name', 'C-3 Date Binary', 
            'Alternative Dispute Resolution', 
            'WCIO Nature of Injury Code', 'Industry Code']
