In [17]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import sklearn.metrics

pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings("ignore")

In [18]:
df = pd.read_csv('./project_data/out_eda1.csv', 
                 index_col = 'Claim Identifier')

In [107]:
test = pd.read_csv('./project_data/test_treated', index_col = 'Claim Identifier')

## 1. K Fold

<a href="#top">Top &#129033;</a>

In [113]:
X = df.drop('Claim Injury Type', axis = 1)
y = df['Claim Injury Type']

In [115]:
kf = KFold(n_splits=5, shuffle=True, random_state=1)

In [116]:
features = ['C-2 Day', 'Accident Year', 'Birth Year', 'Assembly Month', 
            'C-2 Month', 'Average Weekly Wage', 'Age at Injury', 'C-2 Year', 
            'Number of Dependents', 'Accident Day', 'Assembly Year', 
            'First Hearing Year', 'IME-4 Count', 'Assembly Day', 
            'Accident Month', 'WCIO Cause of Injury Code', 'Gender', 
            'COVID-19 Indicator', 'WCIO Part Of Body Code', 'County of Injury', 'Attorney/Representative', 'Carrier Type', 'District Name', 'Medical Fee Region', 'Zip Code', 'Carrier Name', 'C-3 Date Binary', 'Alternative Dispute Resolution', 'WCIO Nature of Injury Code', 'Industry Code']
predictions = []


def k_fold(df, features):
    for train_index, val_index in kf.split(X):

        # Split data into training and validation folds
        X_train = X.iloc[train_index]
        X_val = X.iloc[val_index]
        y_train = y.iloc[train_index]
        y_val = y.iloc[val_index]

        # fit model
        model = LogisticRegression()
        model.fit(X_train, y_train)

        # make predictions
        train_pred = model.predict(X_train)
        val_pred = model.predict(X_val)

        # compute metrics
        print(classification_report(y_train, train_pred))
        print(classification_report(y_val, val_pred))

        # save predictions and best model's parameters

        predictions.append({'Train Predictions': train_pred, 'Validation Predictions': val_pred})
        
    return predictions


                 precision    recall  f1-score   support

   1. CANCELLED       0.00      0.00      0.00      9906
    2. NON-COMP       0.60      0.98      0.74    232910
    3. MED ONLY       0.00      0.00      0.00     55214
   4. TEMPORARY       0.56      0.32      0.41    118608
5. PPD SCH LOSS       0.16      0.05      0.07     38661
     6. PPD NSL       0.00      0.00      0.00      3336
         7. PTD       0.00      0.00      0.00        69
       8. DEATH       0.02      0.00      0.00       375

       accuracy                           0.58    459079
      macro avg       0.17      0.17      0.15    459079
   weighted avg       0.46      0.58      0.49    459079

                 precision    recall  f1-score   support

   1. CANCELLED       0.00      0.00      0.00      2512
    2. NON-COMP       0.60      0.98      0.74     58063
    3. MED ONLY       0.00      0.00      0.00     13687
   4. TEMPORARY       0.56      0.32      0.41     29891
5. PPD SCH LOSS       0.15 

## test

In [117]:
for col in test.columns:
    test = custom_impute(test, col)

In [118]:
test = test.fillna(0)
test.isna().sum()

Age at Injury                     0
Alternative Dispute Resolution    0
Attorney/Representative           0
Average Weekly Wage               0
Birth Year                        0
Carrier Name                      0
Carrier Type                      0
County of Injury                  0
COVID-19 Indicator                0
District Name                     0
Gender                            0
IME-4 Count                       0
Industry Code                     0
Medical Fee Region                0
WCIO Cause of Injury Code         0
WCIO Nature of Injury Code        0
WCIO Part Of Body Code            0
Zip Code                          0
Number of Dependents              0
Accident Year                     0
Accident Month                    0
Accident Day                      0
Assembly Year                     0
Assembly Month                    0
Assembly Day                      0
C-2 Year                          0
C-2 Month                         0
C-2 Day                     

In [122]:
test['Claim Injury Type'] = model.predict(test)


In [124]:
test['Claim Injury Type'].value_counts() 

Claim Injury Type
2. NON-COMP        345305
4. TEMPORARY        33482
5. PPD SCH LOSS      9132
1. CANCELLED           52
6. PPD NSL              4
Name: count, dtype: int64

In [125]:
predictions = test['Claim Injury Type']
predictions

Claim Identifier
6165911    2. NON-COMP
6166141    2. NON-COMP
6165907    2. NON-COMP
6166047    2. NON-COMP
6166102    2. NON-COMP
              ...     
6553137    2. NON-COMP
6553119    2. NON-COMP
6553542    2. NON-COMP
6553455    2. NON-COMP
6553594    2. NON-COMP
Name: Claim Injury Type, Length: 387975, dtype: object

In [128]:
predictions.to_csv('./predictions/pred1.csv')