In [92]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import sklearn.metrics

# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

import warnings
# Silences every warning issued through the `warnings` module for the rest of
# the session, regardless of category.
# NOTE(review): this also hides pandas/scikit-learn diagnostics raised by the
# cells below — consider filtering specific categories instead.
warnings.filterwarnings("ignore")

In [73]:
# Read the training frame; the 'Claim Identifier' column becomes the row index.
df = pd.read_csv('./project_data/out_eda1.csv', index_col='Claim Identifier')

In [93]:
# Read the frame used for final predictions, indexed by 'Claim Identifier'.
test = pd.read_csv(
    './project_data/test_treated',
    index_col='Claim Identifier',
)

In [74]:
#df[var_name].fillna(df[var_name].median())

In [75]:
def custom_impute(df, var_name):
    """Fill missing values in ``df`` using the rule attached to ``var_name``.

    Rules, selected by column name:

    * ``'Birth Year'`` -- where the value is NaN or 0 and both
      ``'Accident Year'`` and ``'Age at Injury'`` are present, set it to
      ``'Accident Year' - 'Age at Injury'``.
    * any other name containing ``'Year'``, ``'Month'`` or ``'Day'`` -- fill
      NaN with the column median.
    * ``'Zip Code'`` -- fill NaN with the sentinel ``99999``.
    * ``'Average Weekly Wage'`` -- fill NaN with ``0`` (temporary rule).

    Independently of ``var_name``, every column whose name ends in ``'Code'``
    -- except ``'Zip Code'``, which has its own sentinel -- gets NaN replaced
    by ``0`` on each call.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.
    var_name : str
        Name of the column whose rule should be applied.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after imputation.

    Raises
    ------
    KeyError
        If ``var_name`` selects a rule but a column that rule needs is not
        present in ``df``.
    """
    date_parts = ('Year', 'Month', 'Day')

    if var_name == 'Birth Year':
        # Only derive a value where both source columns are known and the
        # birth year itself is missing (NaN) or recorded as 0.
        sources_known = df['Accident Year'].notna() & df['Age at Injury'].notna()
        needs_value = df[var_name].isna() | (df[var_name] == 0)
        df.loc[sources_known & needs_value, var_name] = (
            df['Accident Year'] - df['Age at Injury']
        )
    elif any(part in var_name for part in date_parts):
        df[var_name] = df[var_name].fillna(df[var_name].median())
    elif var_name == 'Zip Code':
        df[var_name] = df[var_name].fillna(99999)
    elif var_name == 'Average Weekly Wage':
        # TEMP
        df[var_name] = df[var_name].fillna(0)

    # For all 'code' variables. 'Zip Code' also ends in 'Code' and is left
    # out on purpose: filling it with 0 here would run on the very first call
    # of a column loop and make the 99999 sentinel above unreachable.
    code_columns = [
        name
        for name in df.filter(regex='Code$', axis=1).columns
        if name != 'Zip Code'
    ]
    if code_columns:
        df[code_columns] = df[code_columns].fillna(0)

    return df

def log_transform(X):
    """Return ``log1p`` of the strictly positive entries of ``X``.

    Entries that are zero or negative are passed through unchanged. The
    result is whatever ``np.where`` produces for the given input.
    """
    is_positive = X > 0
    logged = np.log1p(X)
    return np.where(is_positive, logged, X)

In [78]:
# Drop every row that still contains at least one missing value.
# NOTE(review): the execution counts show this cell (In [78]) was run after
# the imputation loop (In [76]) although it sits above it in the notebook; on
# a top-to-bottom run the rows would be dropped before any imputation happens.
df = df.dropna()

In [79]:
# Count the missing values left in each column.
df.isna().sum()

Age at Injury                     0
Alternative Dispute Resolution    0
Attorney/Representative           0
Average Weekly Wage               0
Birth Year                        0
Carrier Name                      0
Carrier Type                      0
Claim Injury Type                 0
County of Injury                  0
COVID-19 Indicator                0
District Name                     0
Gender                            0
IME-4 Count                       0
Industry Code                     0
Medical Fee Region                0
WCIO Cause of Injury Code         0
WCIO Nature of Injury Code        0
WCIO Part Of Body Code            0
Zip Code                          0
Number of Dependents              0
Accident Year                     0
Accident Month                    0
Accident Day                      0
Assembly Year                     0
Assembly Month                    0
Assembly Day                      0
C-2 Year                          0
C-2 Month                   

In [24]:
# Record which index labels have a missing 'Birth Year' before and after
# applying the Birth Year imputation rule.
begin = df.loc[df['Birth Year'].isna()].index.tolist()
df = custom_impute(df, 'Birth Year')
after = df.loc[df['Birth Year'].isna()].index.tolist()

In [33]:
# Positions in `after` whose index label was not already in `begin`.
# A set makes each membership test constant-time instead of scanning the
# whole `begin` list for every element of `after`.
begin_lookup = set(begin)
[i for i, value in enumerate(after) if value not in begin_lookup]

[]

In [76]:
# Apply the per-column imputation rules to the whole frame.
# 'Birth Year' is derived from 'Accident Year' and 'Age at Injury', so it is
# imputed last: in plain frame order it would be processed before
# 'Accident Year' has had its own missing values filled, leaving those rows
# with a NaN 'Birth Year'.
impute_order = [name for name in df.columns if name != 'Birth Year']
if 'Birth Year' in df.columns:
    impute_order.append('Birth Year')

for col in impute_order:
    df = custom_impute(df, col)

In [14]:
# Rows where 'Birth Year' is still missing although both inputs of the
# Birth Year rule ('Accident Year' and 'Age at Injury') are present.
df[df['Accident Year'].notna() & df['Age at Injury'].notna() & df['Birth Year'].isna()]

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,Carrier Name,Carrier Type,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,Gender,IME-4 Count,Industry Code,Medical Fee Region,WCIO Cause of Injury Code,WCIO Nature of Injury Code,WCIO Part Of Body Code,Zip Code,Number of Dependents,Accident Year,Accident Month,Accident Day,Assembly Year,Assembly Month,Assembly Day,C-2 Year,C-2 Month,C-2 Day,C-3 Date Binary,First Hearing Year
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
5395789,0.0,571411.0,1.0,0.0,,942.0,52788.0,1,39618.0,0.0,270778.0,0.0,0.0,45.0,265981.0,98.0,52.0,22.0,10456,6.0,2021.0,7.0,15.0,2020,1,6,2020.0,3.0,9.0,1,2020
5400472,0.0,571411.0,1.0,0.0,,11038.0,285367.0,1,58333.0,0.0,270778.0,0.0,0.0,51.0,265981.0,52.0,72.0,13.0,11354,2.0,2021.0,7.0,15.0,2020,1,10,2020.0,2.0,19.0,1,2020
5401700,0.0,571411.0,1.0,0.0,,17540.0,285367.0,1,53207.0,0.0,270778.0,1.0,2.0,56.0,265981.0,99.0,59.0,42.0,11214,1.0,2021.0,7.0,15.0,2020,1,13,2020.0,2.0,10.0,1,2020
5401542,0.0,571411.0,1.0,0.0,,469.0,285367.0,1,60430.0,0.0,60536.0,0.0,0.0,23.0,265981.0,98.0,31.0,13.0,11730,2.0,2021.0,7.0,15.0,2020,1,13,2021.0,4.0,13.0,1,2021
5408856,0.0,571411.0,1.0,0.0,,8224.0,121920.0,1,53207.0,0.0,270778.0,1.0,0.0,48.0,265981.0,99.0,31.0,13.0,11236,0.0,2021.0,7.0,15.0,2020,1,22,2020.0,1.0,30.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6135328,0.0,571411.0,1.0,0.0,,136.0,121920.0,0,30874.0,0.0,45605.0,0.0,0.0,0.0,85033.0,0.0,0.0,0.0,14221,5.0,2021.0,7.0,15.0,2022,11,18,2021.0,7.0,16.0,1,0
6138100,0.0,571411.0,1.0,,,8224.0,121920.0,1,30014.0,0.0,270778.0,0.0,0.0,48.0,265981.0,99.0,31.0,13.0,10003,6.0,2021.0,7.0,15.0,2022,11,21,2022.0,11.0,30.0,1,0
6150079,0.0,571411.0,1.0,0.0,,1258.0,285367.0,0,7248.0,0.0,45605.0,0.0,0.0,44.0,135885.0,0.0,0.0,0.0,14092,1.0,2021.0,7.0,15.0,2022,12,9,2021.0,7.0,16.0,1,0
6150445,0.0,571411.0,1.0,0.0,,2862.0,285367.0,0,53207.0,0.0,270778.0,0.0,0.0,0.0,265981.0,0.0,0.0,0.0,11203,4.0,2021.0,7.0,15.0,2022,12,9,2021.0,7.0,16.0,1,0


In [66]:
# Number of rows currently in the working frame.
len(df)

574025

In [43]:
# All rows whose 'Birth Year' is missing.
df[df['Birth Year'].isna()]

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,Carrier Name,Carrier Type,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,Gender,IME-4 Count,Industry Code,Medical Fee Region,WCIO Cause of Injury Code,WCIO Nature of Injury Code,WCIO Part Of Body Code,Zip Code,Number of Dependents,Accident Year,Accident Month,Accident Day,Assembly Year,Assembly Month,Assembly Day,C-2 Year,C-2 Month,C-2 Day,C-3 Date Binary,First Hearing Year
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
5395789,0.0,571411.0,1.0,0.0,,942.0,52788.0,1,39618.0,0.0,270778.0,0.0,0.0,45.0,265981.0,98.0,52.0,22.0,10456,6.0,2021.0,7.0,15.0,2020,1,6,2020.0,3.0,9.0,1,2020
5400472,0.0,571411.0,1.0,0.0,,11038.0,285367.0,1,58333.0,0.0,270778.0,0.0,0.0,51.0,265981.0,52.0,72.0,13.0,11354,2.0,2021.0,7.0,15.0,2020,1,10,2020.0,2.0,19.0,1,2020
5401700,0.0,571411.0,1.0,0.0,,17540.0,285367.0,1,53207.0,0.0,270778.0,1.0,2.0,56.0,265981.0,99.0,59.0,42.0,11214,1.0,2021.0,7.0,15.0,2020,1,13,2020.0,2.0,10.0,1,2020
5401542,0.0,571411.0,1.0,0.0,,469.0,285367.0,1,60430.0,0.0,60536.0,0.0,0.0,23.0,265981.0,98.0,31.0,13.0,11730,2.0,2021.0,7.0,15.0,2020,1,13,2021.0,4.0,13.0,1,2021
5408856,0.0,571411.0,1.0,0.0,,8224.0,121920.0,1,53207.0,0.0,270778.0,1.0,0.0,48.0,265981.0,99.0,31.0,13.0,11236,0.0,2021.0,7.0,15.0,2020,1,22,2020.0,1.0,30.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6135328,0.0,571411.0,1.0,0.0,,136.0,121920.0,0,30874.0,0.0,45605.0,0.0,0.0,0.0,85033.0,0.0,0.0,0.0,14221,5.0,2021.0,7.0,15.0,2022,11,18,2021.0,7.0,16.0,1,0
6138100,0.0,571411.0,1.0,,,8224.0,121920.0,1,30014.0,0.0,270778.0,0.0,0.0,48.0,265981.0,99.0,31.0,13.0,10003,6.0,2021.0,7.0,15.0,2022,11,21,2022.0,11.0,30.0,1,0
6150079,0.0,571411.0,1.0,0.0,,1258.0,285367.0,0,7248.0,0.0,45605.0,0.0,0.0,44.0,135885.0,0.0,0.0,0.0,14092,1.0,2021.0,7.0,15.0,2022,12,9,2021.0,7.0,16.0,1,0
6150445,0.0,571411.0,1.0,0.0,,2862.0,285367.0,0,53207.0,0.0,270778.0,0.0,0.0,0.0,265981.0,0.0,0.0,0.0,11203,4.0,2021.0,7.0,15.0,2022,12,9,2021.0,7.0,16.0,1,0


## 1. K Fold

<a href="#top">Top &#129033;</a>

In [80]:
# Separate the target column from the feature columns.
y = df['Claim Injury Type']
X = df.drop(columns='Claim Injury Type')

In [81]:
# Missing values per feature column.
X.isna().sum()

Age at Injury                     0
Alternative Dispute Resolution    0
Attorney/Representative           0
Average Weekly Wage               0
Birth Year                        0
Carrier Name                      0
Carrier Type                      0
County of Injury                  0
COVID-19 Indicator                0
District Name                     0
Gender                            0
IME-4 Count                       0
Industry Code                     0
Medical Fee Region                0
WCIO Cause of Injury Code         0
WCIO Nature of Injury Code        0
WCIO Part Of Body Code            0
Zip Code                          0
Number of Dependents              0
Accident Year                     0
Accident Month                    0
Accident Day                      0
Assembly Year                     0
Assembly Month                    0
Assembly Day                      0
C-2 Year                          0
C-2 Month                         0
C-2 Day                     

In [82]:
# 5-fold splitter; rows are shuffled with a fixed seed so the folds are
# reproducible between runs.
# NOTE(review): plain KFold does not stratify on the target. The class
# supports printed by the reports below are very uneven, so StratifiedKFold
# may be the better choice — confirm (its split() also needs `y`).
kf = KFold(n_splits=5, shuffle=True, random_state=1)

In [83]:
predictions = []

# Columns that custom_impute fills with a median: names containing a date
# part, except the derived 'Birth Year' column.
median_columns = [
    name for name in X.columns
    if name != 'Birth Year'
    and any(part in name for part in ('Year', 'Month', 'Day'))
]

for train_index, val_index in kf.split(X):

    # Split data into training and validation folds. Explicit copies make the
    # fold frames independent of X, so the in-place imputation below cannot
    # write into (or trigger chained-assignment warnings about) X.
    X_train = X.iloc[train_index].copy()
    X_val = X.iloc[val_index].copy()
    y_train = y.iloc[train_index]
    y_val = y.iloc[val_index]

    # missing values
    # Medians are learned on the training fold only and reused for the
    # validation fold, so no validation statistics leak into preprocessing.
    # Doing this first also means 'Accident Year' is already filled when
    # custom_impute derives 'Birth Year' from it.
    if median_columns:
        train_medians = X_train[median_columns].median()
        X_train[median_columns] = X_train[median_columns].fillna(train_medians)
        X_val[median_columns] = X_val[median_columns].fillna(train_medians)

    # Remaining column rules (Birth Year, Zip Code, wage, code columns).
    for col in X_train.columns:
        X_train = custom_impute(X_train, col)

    for col in X_val.columns:
        X_val = custom_impute(X_val, col)


    # outliers (log) & scaler (robust)

#     preprocessor = Pipeline(steps=[
#         ('log_transform', FunctionTransformer(func=log_transform, validate=False)),
#         ('scaler', RobustScaler())
#     ])

#     X_train = preprocessor.fit_transform(X_train)
#     X_val = preprocessor.transform(X_val)


    # feature selection - select only the good columns beforehand (TODO)


    # fit model
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # make predictions
    train_pred = model.predict(X_train)
    val_pred = model.predict(X_val)

    # compute metrics (training fold first, then validation fold)
    print(classification_report(y_train, train_pred))
    print(classification_report(y_val, val_pred))

    # save predictions and best model's parameters

    predictions.append({'Train Predictions': train_pred, 'Validation Predictions': val_pred})


              precision    recall  f1-score   support

           0       0.00      0.00      0.00      9906
           1       0.60      0.98      0.74    232910
           2       0.00      0.00      0.00     55214
           3       0.56      0.32      0.41    118608
           4       0.16      0.05      0.07     38661
           5       0.00      0.00      0.00      3336
           6       0.00      0.00      0.00        69
           7       0.02      0.00      0.00       375

    accuracy                           0.58    459079
   macro avg       0.17      0.17      0.15    459079
weighted avg       0.46      0.58      0.49    459079

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2512
           1       0.60      0.98      0.74     58063
           2       0.00      0.00      0.00     13687
           3       0.56      0.32      0.41     29891
           4       0.15      0.04      0.07      9619
           5       0.00 

## test

In [95]:
# Apply the per-column imputation rules to the test frame.
# 'Birth Year' is derived from 'Accident Year' and 'Age at Injury', so it is
# imputed last: in plain frame order it would be processed before
# 'Accident Year' has had its own missing values filled, leaving those rows
# with a NaN 'Birth Year'.
test_impute_order = [name for name in test.columns if name != 'Birth Year']
if 'Birth Year' in test.columns:
    test_impute_order.append('Birth Year')

for col in test_impute_order:
    test = custom_impute(test, col)

In [96]:
# Replace whatever is still missing after the column rules with 0, then
# count the missing values per column to confirm none remain.
test = test.fillna(0)
test.isna().sum()

Age at Injury                     0
Alternative Dispute Resolution    0
Attorney/Representative           0
Average Weekly Wage               0
Birth Year                        0
Carrier Name                      0
Carrier Type                      0
County of Injury                  0
COVID-19 Indicator                0
District Name                     0
Gender                            0
IME-4 Count                       0
Industry Code                     0
Medical Fee Region                0
WCIO Cause of Injury Code         0
WCIO Nature of Injury Code        0
WCIO Part Of Body Code            0
Zip Code                          0
Number of Dependents              0
Accident Year                     0
Accident Month                    0
Accident Day                      0
Assembly Year                     0
Assembly Month                    0
Assembly Day                      0
C-2 Year                          0
C-2 Month                         0
C-2 Day                     

In [97]:
# `model` is the estimator left over from the last cross-validation fold.
# NOTE(review): this call fails with the ValueError shown below because `test`
# lacks 'C-3 Date Binary' and 'First Hearing Year', which were present when
# the model was fit. Either produce those columns in the test preprocessing or
# fit the model only on the columns both frames share.
model.predict(test)

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- C-3 Date Binary
- First Hearing Year
