In [85]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier  
#from lightgbm import LGBMClassifier 
#from catboost import CatBoostClassifier 

# Metrics
from sklearn.metrics import classification_report


pd.set_option('display.max_columns', None)

# Warnings
import warnings
warnings.filterwarnings("ignore")

import NA_outliers as n
import play_song as song

In [86]:
df = pd.read_csv('./project_data/out_eda1.csv', 
                 index_col = 'Claim Identifier')

In [75]:
test = pd.read_csv('./project_data/test_treated.csv', index_col = 'Claim Identifier')

## 1. K Fold

<a href="#top">Top &#129033;</a>

In [87]:
from sklearn.metrics import classification_report
#from collections import Counter
import time
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
#from functools import partial
from sklearn.preprocessing import RobustScaler
import joblib


In [88]:
def k_fold(df, features, target, k = 5, model = LogisticRegression()):
    
    start_time = time.time()
    
    X = df[features]
    y = df[target]
    
    kf = KFold(n_splits= k, shuffle=True, random_state=1)
    predictions = []
    
    for train_idx, val_idx in kf.split(X):

        ### SPLIT
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        original_columns = X_train.columns
        
        ### PIPELINE
        
        pipeline = Pipeline([
        ('imputer', FunctionTransformer(n.custom_impute, validate=False)), 
        ('log_transform', FunctionTransformer(n.log_transform, validate=False)),  
        ('scaler', RobustScaler()),
        ])
        
        X_train = pipeline.fit_transform(X_train, y_train)
        X_val = pipeline.transform(X_val)
        X_train = pd.DataFrame(X_train, columns=original_columns)
        X_val = pd.DataFrame(X_val, columns=original_columns)

        
        # fit model
        model = model
        model.fit(X_train, y_train)

        # make predictions
        train_pred = model.predict(X_train)
        val_pred = model.predict(X_val)

        # compute metrics
        print(classification_report(y_train, train_pred))
        print(classification_report(y_val, val_pred))

        # save predictions and best model's parameters

        predictions.append({'Train Predictions': train_pred, 'Validation Predictions': val_pred})
    
    model_name = type(model).__name__
    print(model_name)
    joblib.dump(model, f'./models/{model_name}.joblib')  

    end_time = time.time()
    elapsed_time = (end_time - start_time) / 60
    print(elapsed_time)
        
    return predictions


In [89]:
features = ['C-2 Day', 'Accident Year', 'Birth Year', 'Assembly Month',
            'C-2 Month', 'Average Weekly Wage', 'Age at Injury', 
            'C-2 Year', 'Number of Dependents', 'Accident Day', 
            'Assembly Year', 'First Hearing Year', 'IME-4 Count', 
            'Assembly Day', 'Accident Month', 
            'WCIO Cause of Injury Code', 'Gender', 
            'COVID-19 Indicator', 'WCIO Part Of Body Code', 
            'County of Injury', 'Attorney/Representative', 
            'Carrier Type', 'District Name', 'Medical Fee Region', 
            'Zip Code', 'Carrier Name', 'C-3 Date Binary', 
            'Alternative Dispute Resolution', 
            'WCIO Nature of Injury Code', 'Industry Code']



In [None]:
# XGB Only
# class_mapping = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7}
# df['Claim Injury Type'] = df['Claim Injury Type'].map(class_mapping)


k_fold(df, features = features, 
       target = 'Claim Injury Type', k = 10, 
       model = GradientBoostingClassifier())

In [None]:
song.play_('audio.mp3')

## test

In [76]:
test = test[features]

In [71]:
# Load the model from the file
model = joblib.load('./models/XGBClassifier.joblib')

In [77]:
for col in test.columns:
    test = n.custom_impute(test)

ValueError: Found array with 0 sample(s) (shape=(0, 29)) while a minimum of 1 is required by NearestNeighbors.

In [78]:
test.isna().sum()

C-2 Day                           0
Accident Year                     0
Birth Year                        0
Assembly Month                    0
C-2 Month                         0
Average Weekly Wage               0
Age at Injury                     0
C-2 Year                          0
Number of Dependents              0
Accident Day                      0
Assembly Year                     0
First Hearing Year                0
IME-4 Count                       0
Assembly Day                      0
Accident Month                    0
WCIO Cause of Injury Code         0
Gender                            0
COVID-19 Indicator                0
WCIO Part Of Body Code            0
County of Injury                  0
Attorney/Representative           0
Carrier Type                      0
District Name                     0
Medical Fee Region                0
Zip Code                          0
Carrier Name                      0
C-3 Date Binary                   0
Alternative Dispute Resoluti

**Predictions**

In [80]:
test['Claim Injury Type'] = model.predict(test)

**Map to Original**

In [81]:
# ## XGB ONLY
# label_mapping = {
#     0: "1. CANCELLED",
#     1: "2. NON-COMP",
#     2: "3. MED ONLY",
#     3: "4. TEMPORARY",
#     4: "5. PPD SCH LOSS",
#     5: "6. PPD NSL",
#     6: "7. PTD",
#     7: "8. DEATH"
# }

# test['Claim Injury Type'] = test['Claim Injury Type'].replace(label_mapping)

In [67]:
label_mapping = {
    1: "1. CANCELLED",
    2: "2. NON-COMP",
    3: "3. MED ONLY",
    4: "4. TEMPORARY",
    5: "5. PPD SCH LOSS",
    6: "6. PPD NSL",
    7: "7. PTD",
    8: "8. DEATH"
}

test['Claim Injury Type'] = test['Claim Injury Type'].replace(label_mapping)

In [82]:
test['Claim Injury Type'].value_counts() 

Claim Injury Type
2. NON-COMP        326853
3. MED ONLY         53605
4. TEMPORARY         5764
6. PPD NSL            839
5. PPD SCH LOSS       709
8. DEATH              119
1. CANCELLED           86
Name: count, dtype: int64

In [83]:
predictions = test['Claim Injury Type']
predictions

Claim Identifier
6165911    2. NON-COMP
6166141    2. NON-COMP
6165907    2. NON-COMP
6166047    2. NON-COMP
6166102    2. NON-COMP
              ...     
6553137    2. NON-COMP
6553119    3. MED ONLY
6553542    2. NON-COMP
6553455    3. MED ONLY
6553594    3. MED ONLY
Name: Claim Injury Type, Length: 387975, dtype: object

In [84]:
predictions.to_csv('./predictions/pred3_XGB.csv')