In [12]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, StratifiedKFold

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier  
from sklearn.tree import DecisionTreeClassifier
#from lightgbm import LGBMClassifier 
#from catboost import CatBoostClassifier 

# Metrics
from sklearn.metrics import classification_report




## EXTRA
pd.set_option('display.max_columns', None)

# Warnings
import warnings
warnings.filterwarnings("ignore")

import NA_outliers as n
import play_song as song

In [None]:
# FUNCTION

import pandas as pd
import numpy as np
import time
import joblib
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, FunctionTransformer
from sklearn.metrics import classification_report

import NA_outliers as n
import play_song as song


In [13]:
df = pd.read_csv('./project_data/out_eda1.csv', 
                 index_col = 'Claim Identifier')

## 1. Train

<a href="#top">Top &#129033;</a>

In [31]:
def k_fold(df, features, target, model_name, k = 5, 
           model = LogisticRegression(), patience=2):
    
    start_time = time.time()
    
    X = df[features]
    y = df[target]
    
    kf = StratifiedKFold(n_splits= k, shuffle=True, random_state=1)
    predictions = []
    
    # Initialize variables for early stopping
    best_macro_avg = 0  # Track the best macro average score
    no_improvement_count = 0  # Counter for early stopping

    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
        
        print(f'----------FOLD {fold}----------')
        ### SPLIT
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        original_columns = X_train.columns
        
        ### PIPELINE
        
        pipeline = Pipeline([
        ('imputer', FunctionTransformer(n.custom_impute, validate=False)), 
        #('log_transform', FunctionTransformer(n.log_transform, validate=False)),  
        ('scaler', RobustScaler()),
        ])
        
        X_train = pipeline.fit_transform(X_train, y_train)
        X_val = pipeline.transform(X_val)
        X_train = pd.DataFrame(X_train, columns=original_columns)
        X_val = pd.DataFrame(X_val, columns=original_columns)

        
        # fit model
        model = model
        model.fit(X_train, y_train)

        # make predictions
        train_pred = model.predict(X_train)
        val_pred = model.predict(X_val)

        # Compute metrics
        train_report = classification_report(y_train, train_pred, output_dict=True)
        val_report = classification_report(y_val, val_pred, output_dict=True)
        
        print(f"Fold {fold} - Training Report:\n", classification_report(y_train, train_pred))
        print(f"Fold {fold} - Validation Report:\n", classification_report(y_val, val_pred))
        
        val_macro_avg = val_report['macro avg']['f1-score']
        
        if val_macro_avg > best_macro_avg:
            best_macro_avg = val_macro_avg
            no_improvement_count = 0
            
            # Save the best model 
            joblib.dump(model, f'./models/{model_name}.joblib')
        else:
            no_improvement_count += 1
            print(f"No improvement for {no_improvement_count} fold(s)")

        if no_improvement_count >= patience:
            print(f"Early stopping at fold {fold} due to no improvement in macro average for {patience} folds")
            break
        
        
        # save predictions and best model's parameters
        predictions.append({'Train Predictions': train_pred, 'Validation Predictions': val_pred})
    
    
    # Time
    end_time = time.time()
    elapsed_time = round((end_time - start_time) / 60, 2)
    print(f'This run took {elapsed_time} minutes')
    
    # Play Warning Song
    song.play_('audio.mp3')
        
    return predictions


**Selected Features to use for predictions**

In [44]:
features = df.drop('Claim Injury Type', axis = 1).columns

**Model**

In [45]:
# XGB Only
# class_mapping = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7}
# df['Claim Injury Type'] = df['Claim Injury Type'].map(class_mapping)

model_name = 'RF_noFS_noLOG'
model = RandomForestClassifier()

k_fold(df, features = features, 
       target = 'Claim Injury Type', model_name = model_name,
       k = 5, model = model)

----------FOLD 1----------
Fold 1 - Training Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00      9962
           2       1.00      1.00      1.00    232856
           3       1.00      1.00      1.00     55027
           4       1.00      1.00      1.00    118775
           5       1.00      1.00      1.00     38774
           6       1.00      1.00      1.00      3370
           7       1.00      1.00      1.00        77
           8       1.00      1.00      1.00       379

    accuracy                           1.00    459220
   macro avg       1.00      1.00      1.00    459220
weighted avg       1.00      1.00      1.00    459220

Fold 1 - Validation Report:
               precision    recall  f1-score   support

           1       0.77      0.48      0.59      2514
           2       0.85      0.96      0.90     58222
           3       0.55      0.07      0.13     13879
           4       0.70      0.90      0.78     2

Input #0, wav, from '/var/folders/mm/fxsq_1490x9dd2w76tqvt3kr0000gn/T/tmp76kl_0d1.wav':
  Duration: 00:00:10.00, bitrate: 1536 kb/s
  Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 48000 Hz, 2 channels, s16, 1536 kb/s
   9.95 M-A:  0.000 fd=   0 aq=    0KB vq=    0KB sq=    0B 




[{'Train Predictions': array([2, 4, 4, ..., 2, 4, 2]),
  'Validation Predictions': array([4, 2, 4, ..., 4, 2, 2])},
 {'Train Predictions': array([2, 2, 3, ..., 2, 4, 2]),
  'Validation Predictions': array([4, 4, 2, ..., 2, 2, 2])},
 {'Train Predictions': array([4, 4, 2, ..., 4, 2, 2]),
  'Validation Predictions': array([2, 4, 2, ..., 2, 2, 2])},
 {'Train Predictions': array([2, 4, 4, ..., 4, 2, 2]),
  'Validation Predictions': array([2, 2, 4, ..., 4, 2, 4])}]

**Load Model**

In [48]:
# Load the model from the file
model = joblib.load('./models/RF_noFS_noLOG.joblib')

# 2. Test

In [49]:
test = pd.read_csv('./project_data/test_treated.csv', 
                   index_col = 'Claim Identifier')

test = test[features]

**Predictions**

In [50]:
test['Claim Injury Type'] = model.predict(test_features)

**Map to Original**

In [81]:
# ## XGB ONLY
# label_mapping = {
#     0: "1. CANCELLED",
#     1: "2. NON-COMP",
#     2: "3. MED ONLY",
#     3: "4. TEMPORARY",
#     4: "5. PPD SCH LOSS",
#     5: "6. PPD NSL",
#     6: "7. PTD",
#     7: "8. DEATH"
# }

# test['Claim Injury Type'] = test['Claim Injury Type'].replace(label_mapping)

In [51]:
label_mapping = {
    1: "1. CANCELLED",
    2: "2. NON-COMP",
    3: "3. MED ONLY",
    4: "4. TEMPORARY",
    5: "5. PPD SCH LOSS",
    6: "6. PPD NSL",
    7: "7. PTD",
    8: "8. DEATH"
}

test['Claim Injury Type'] = test['Claim Injury Type'].replace(label_mapping)

Check each category inside target

In [52]:
test['Claim Injury Type'].value_counts() 

Claim Injury Type
2. NON-COMP        323085
4. TEMPORARY        42302
5. PPD SCH LOSS     11389
3. MED ONLY         11199
Name: count, dtype: int64

Select only the predictions column and the index

In [54]:
predictions = test['Claim Injury Type']

# 3. Export

In [55]:
predictions.to_csv(f'./predictions/{model_name}.csv')

**Results**

__*<center>Models K-Fold*__ 

| Model | Feature Selection? | Log | Parameters | Kaggle Score |
| ----- | ------------------ | --- | ---------- | ------------ |
| LogReg | - | - | -  | 0.21122 |
| RF | 1 | X | - | 0.29078 |
| XGB | 1 | X | - | 0.20642 |
| RF | - | - | - | ------------ |
    
<br><br>
    
__*<center>Models w/ Stratified K-Fold*__   
    
| Model | Feature Selection? | Log | Parameters | Kaggle Score |    
| ----- | ------------------ | --- | ---------- | ------------ |
| ----- | ------------------ | --- | ---------- | ------------ |
| ----- | ------------------ | --- | ---------- | ------------ |
| ----- | ------------------ | --- | ---------- | ------------ |
| ----- | ------------------ | --- | ---------- | ------------ |
| ----- | ------------------ | --- | ---------- | ------------ |    
| ----- | ------------------ | --- | ---------- | ------------ | 
    
<br><br>
    
**Features for Feature Selection 1**

['C-2 Day', 'Accident Year', 'Birth Year', 'Assembly Month',
            'C-2 Month', 'Average Weekly Wage', 'Age at Injury', 
            'C-2 Year', 'Number of Dependents', 'Accident Day', 
            'Assembly Year', 'First Hearing Year', 'IME-4 Count', 
            'Assembly Day', 'Accident Month', 
            'WCIO Cause of Injury Code', 'Gender', 
            'COVID-19 Indicator', 'WCIO Part Of Body Code', 
            'County of Injury', 'Attorney/Representative', 
            'Carrier Type', 'District Name', 'Medical Fee Region', 
            'Zip Code', 'Carrier Name', 'C-3 Date Binary', 
            'Alternative Dispute Resolution', 
            'WCIO Nature of Injury Code', 'Industry Code']
