In [12]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier  
#from lightgbm import LGBMClassifier 
#from catboost import CatBoostClassifier 

# Metrics
from sklearn.metrics import classification_report


pd.set_option('display.max_columns', None)

# Warnings
import warnings
warnings.filterwarnings("ignore")

import NA_outliers as n
import play_song as song

In [13]:
df = pd.read_csv('./project_data/out_eda1.csv', 
                 index_col = 'Claim Identifier')

## 1. Train

<a href="#top">Top &#129033;</a>

In [87]:
from sklearn.metrics import classification_report
#from collections import Counter
import time
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
#from functools import partial
from sklearn.preprocessing import RobustScaler
import joblib


In [None]:
def k_fold(df, features, target, k = 5, model = LogisticRegression()):
    
    start_time = time.time()
    
    X = df[features]
    y = df[target]
    
    kf = KFold(n_splits= k, shuffle=True, random_state=1)
    predictions = []
    
    for train_idx, val_idx in kf.split(X):

        ### SPLIT
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        original_columns = X_train.columns
        
        ### PIPELINE
        
        pipeline = Pipeline([
        ('imputer', FunctionTransformer(n.custom_impute, validate=False)), 
        #('log_transform', FunctionTransformer(n.log_transform, validate=False)),  
        ('scaler', RobustScaler()),
        ])
        
        X_train = pipeline.fit_transform(X_train, y_train)
        X_val = pipeline.transform(X_val)
        X_train = pd.DataFrame(X_train, columns=original_columns)
        X_val = pd.DataFrame(X_val, columns=original_columns)

        
        # fit model
        model = model
        model.fit(X_train, y_train)

        # make predictions
        train_pred = model.predict(X_train)
        val_pred = model.predict(X_val)

        # compute metrics
        print(classification_report(y_train, train_pred))
        print(classification_report(y_val, val_pred))

        # save predictions and best model's parameters

        predictions.append({'Train Predictions': train_pred, 'Validation Predictions': val_pred})
    
    model_name = type(model).__name__
    print(model_name)
    joblib.dump(model, f'./models/{model_name}.joblib')  

    end_time = time.time()
    elapsed_time = (end_time - start_time) / 60
    print(elapsed_time)
    song.play_('audio.mp3')
        
    return predictions


In [None]:
train_report = classification_report(y_train, train_pred)
        val_report = classification_report(y_val, val_pred, 
                                           output_dict=True)
        
        print(train_report)
        print(val_report)


**Selected Features to use for predictions**

In [17]:
features = df.columns

**Model**

In [90]:
# XGB Only
# class_mapping = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7}
# df['Claim Injury Type'] = df['Claim Injury Type'].map(class_mapping)


k_fold(df, features = features, 
       target = 'Claim Injury Type', k = 10, 
       model = RandomForestClassifier())

              precision    recall  f1-score   support

           1       0.73      0.49      0.58     11164
           2       0.85      0.96      0.90    262023
           3       0.51      0.06      0.10     61970
           4       0.69      0.89      0.78    133587
           5       0.70      0.56      0.62     43569
           6       0.69      0.00      0.01      3800
           7       0.08      0.01      0.02        86
           8       0.58      0.15      0.24       423

    accuracy                           0.78    516622
   macro avg       0.60      0.39      0.41    516622
weighted avg       0.75      0.78      0.74    516622

              precision    recall  f1-score   support

           1       0.79      0.39      0.52      1312
           2       0.84      0.96      0.90     29055
           3       0.50      0.06      0.10      6936
           4       0.70      0.88      0.78     14920
           5       0.68      0.56      0.61      4711
           6       0.00 

              precision    recall  f1-score   support

           1       0.73      0.49      0.58     11303
           2       0.85      0.96      0.90    261954
           3       0.50      0.06      0.11     61950
           4       0.70      0.89      0.78    133604
           5       0.70      0.56      0.62     43505
           6       0.64      0.00      0.01      3806
           7       0.16      0.03      0.06        89
           8       0.61      0.12      0.21       412

    accuracy                           0.78    516623
   macro avg       0.61      0.39      0.41    516623
weighted avg       0.75      0.78      0.74    516623

              precision    recall  f1-score   support

           1       0.70      0.49      0.58      1173
           2       0.85      0.96      0.90     29124
           3       0.50      0.06      0.11      6956
           4       0.69      0.89      0.78     14903
           5       0.71      0.55      0.62      4775
           6       0.25 

[{'Train Predictions': array([2, 5, 4, ..., 2, 4, 2]),
  'Validation Predictions': array([2, 4, 5, ..., 2, 4, 2])},
 {'Train Predictions': array([2, 5, 4, ..., 4, 2, 2]),
  'Validation Predictions': array([2, 4, 5, ..., 2, 4, 2])},
 {'Train Predictions': array([2, 5, 4, ..., 4, 2, 2]),
  'Validation Predictions': array([4, 2, 2, ..., 2, 2, 2])},
 {'Train Predictions': array([2, 4, 2, ..., 2, 4, 2]),
  'Validation Predictions': array([5, 4, 2, ..., 4, 2, 2])},
 {'Train Predictions': array([5, 4, 4, ..., 4, 2, 2]),
  'Validation Predictions': array([2, 4, 4, ..., 2, 2, 2])},
 {'Train Predictions': array([2, 5, 4, ..., 4, 2, 2]),
  'Validation Predictions': array([2, 4, 4, ..., 2, 4, 4])},
 {'Train Predictions': array([2, 5, 4, ..., 4, 2, 2]),
  'Validation Predictions': array([2, 5, 2, ..., 2, 2, 4])},
 {'Train Predictions': array([2, 5, 4, ..., 4, 2, 2]),
  'Validation Predictions': array([2, 2, 5, ..., 4, 4, 4])},
 {'Train Predictions': array([2, 5, 4, ..., 4, 2, 2]),
  'Validation Pre

**Load Model**

In [None]:
# Load the model from the file
model = joblib.load('./models/GradientBoostingClassifier.joblib')

# 2. Test

In [14]:
test = pd.read_csv('./project_data/test_treated.csv', 
                   index_col = 'Claim Identifier')

test = test[df.drop('Claim Injury Type', axis = 1).columns]

**Predictions**

In [96]:
test['Claim Injury Type'] = model.predict(test)

**Map to Original**

In [81]:
# ## XGB ONLY
# label_mapping = {
#     0: "1. CANCELLED",
#     1: "2. NON-COMP",
#     2: "3. MED ONLY",
#     3: "4. TEMPORARY",
#     4: "5. PPD SCH LOSS",
#     5: "6. PPD NSL",
#     6: "7. PTD",
#     7: "8. DEATH"
# }

# test['Claim Injury Type'] = test['Claim Injury Type'].replace(label_mapping)

In [97]:
label_mapping = {
    1: "1. CANCELLED",
    2: "2. NON-COMP",
    3: "3. MED ONLY",
    4: "4. TEMPORARY",
    5: "5. PPD SCH LOSS",
    6: "6. PPD NSL",
    7: "7. PTD",
    8: "8. DEATH"
}

test['Claim Injury Type'] = test['Claim Injury Type'].replace(label_mapping)

Check each category inside target

In [98]:
test['Claim Injury Type'].value_counts() 

Claim Injury Type
8. DEATH           253631
3. MED ONLY         74303
2. NON-COMP         51639
6. PPD NSL           4500
4. TEMPORARY         2969
7. PTD                932
5. PPD SCH LOSS         1
Name: count, dtype: int64

Select only the predictions column and the index

In [99]:
predictions = test['Claim Injury Type']
predictions

Claim Identifier
6165911       8. DEATH
6166141       8. DEATH
6165907       8. DEATH
6166047       8. DEATH
6166102       8. DEATH
              ...     
6553137    2. NON-COMP
6553119    3. MED ONLY
6553542    3. MED ONLY
6553455    3. MED ONLY
6553594    3. MED ONLY
Name: Claim Injury Type, Length: 387975, dtype: object

# 3. Export

In [101]:
predictions.to_csv('./predictions/pred4_GB.csv')

**Results**

__*<center>Models*__ <br>

| Model | Feature Selection? | Log | Parameters | Kaggle Score |
| ----- | ------------------ | --- | ---------- | ------------ |
| LogReg | - | - | -  | 0.21122 |
| RF | 1 | X | - | 0.29078 |
| XGB | 1 | X | - | 0.20642 |
| ----- | ------------------ | --- | ---------- | ------------ |
| ----- | ------------------ | --- | ---------- | ------------ |

<br><br>
    
**Features for Feature Selection 1**

['C-2 Day', 'Accident Year', 'Birth Year', 'Assembly Month',
            'C-2 Month', 'Average Weekly Wage', 'Age at Injury', 
            'C-2 Year', 'Number of Dependents', 'Accident Day', 
            'Assembly Year', 'First Hearing Year', 'IME-4 Count', 
            'Assembly Day', 'Accident Month', 
            'WCIO Cause of Injury Code', 'Gender', 
            'COVID-19 Indicator', 'WCIO Part Of Body Code', 
            'County of Injury', 'Attorney/Representative', 
            'Carrier Type', 'District Name', 'Medical Fee Region', 
            'Zip Code', 'Carrier Name', 'C-3 Date Binary', 
            'Alternative Dispute Resolution', 
            'WCIO Nature of Injury Code', 'Industry Code']
