# 1. Importing Libraries & Data
In this section, we set up the foundation for our project by importing the necessary Python libraries and loading the dataset. These libraries provide the tools for data manipulation, visualization, and machine learning modeling throughout the notebook. Additionally, we import the historical claims dataset, which forms the core of our analysis. 

In [1]:
# Standard library
import time
import warnings

# Data handling
import numpy as np
import pandas as pd

# Cross-validation splitter
from sklearn.model_selection import StratifiedKFold

# Models
from sklearn.ensemble import RandomForestClassifier

# Metrics
from sklearn.metrics import classification_report

# Project-local helper modules
import preproc as p
import metrics as m

# Do not truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Suppress Warnings (kept after every import, as in the original cell order)
warnings.filterwarnings("ignore")

**Import Data**

In [2]:
# Read the training and test CSVs, using the claim ID column as the row index
df = pd.read_csv(
    './project_data/train_data_EDA.csv',
    index_col='Claim Identifier',
)
test = pd.read_csv(
    './project_data/test_data_EDA.csv',
    index_col='Claim Identifier',
)

# Preview the first three training rows
df.head(3)

Unnamed: 0_level_0,Age at Injury,Average Weekly Wage,Birth Year,Claim Injury Type,IME-4 Count,Industry Code,WCIO Cause of Injury Code,WCIO Nature of Injury Code,WCIO Part Of Body Code,Number of Dependents,Alternative Dispute Resolution Bin,Attorney/Representative Bin,Carrier Name Enc,Carrier Type freq,Carrier Type_1A. PRIVATE,Carrier Type_2A. SIF,Carrier Type_3A. SELF PUBLIC,Carrier Type_4A. SELF PRIVATE,County of Injury freq,COVID-19 Indicator Enc,District Name freq,Gender Enc,Gender_F,Gender_M,Medical Fee Region freq,Accident Date Year,Accident Date Month,Accident Date Day,Accident Date Day of Week,Assembly Date Year,Assembly Date Month,Assembly Date Day,Assembly Date Day of Week,C-2 Date Year,C-2 Date Month,C-2 Date Day,C-2 Date Day of Week,WCIO Codes,Zip Code Valid,Industry Sector Count Enc,Age Group,C-3 Date Binary,First Hearing Date Binary
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1
5393875,31.0,0.0,1988.0,1,0.0,44.0,27,10,62,1.0,0,0,1015,273422,1,0,0,0,3124,0,43001,0,0,1,130727,2019,12,30,0.0,2020,1,1,2,2019,12,31,1.0,271062,0,99969,1,0,0
5393091,46.0,1745.93,1973.0,3,4.0,23.0,97,49,38,4.0,0,1,640,273422,1,0,0,0,691,0,39019,1,1,0,130727,2019,8,30,4.0,2020,1,1,2,2020,1,1,2.0,974938,0,66335,1,1,1
5393889,40.0,1434.8,1979.0,3,0.0,56.0,79,7,10,6.0,0,0,710,273422,1,0,0,0,16883,0,82516,0,0,1,84173,2019,12,6,4.0,2020,1,1,2,2020,1,1,2.0,79710,0,54838,1,0,0


In [40]:
# Optional debugging aid: uncomment to run the rest of the notebook on a random 500-row sample
#df = df.sample(500)

# 2. Stratified K-Fold

In [3]:
# Target (y) is the claim injury type; features (X) are every remaining column
y = df['Claim Injury Type']
X = df.drop(columns='Claim Injury Type')

In [4]:
def _prepare_fold(frame):
    """Apply the per-fold missing-value and outlier handling to one split, in place.

    The same steps are applied to the training and the validation split, so
    they live in one place instead of being duplicated line by line.

    Parameters
    ----------
    frame : pandas.DataFrame
        Feature matrix of a single split. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, for convenience.
    """
    # MISSING VALUES
    # Rows whose Birth Year is missing (NaN or 0) but can be derived from the
    # accident year and the age at injury
    derivable = (
        frame['Accident Date Year'].notna()
        & frame['Age at Injury'].notna()
        & (frame['Birth Year'].isna() | (frame['Birth Year'] == 0))
    )
    # The right-hand side is aligned on the index, so only masked rows are written
    frame.loc[derivable, 'Birth Year'] = frame['Accident Date Year'] - frame['Age at Injury']

    # NOTE(review): each split is imputed using only the rows passed in here
    # (the validation fold is imputed from itself) - confirm this is intended.
    frame['Average Weekly Wage'] = p.ball_tree_impute(frame, 'Average Weekly Wage', n_neighbors=100)

    # OUTLIERS
    # Replace the raw IME-4 count by log1p(log1p(count)); no intermediate
    # single-log column is kept
    frame['IME-4 Count Double Log'] = np.log1p(np.log1p(frame['IME-4 Count']))
    frame.drop('IME-4 Count', axis=1, inplace=True)

    return frame


# Initialize StratifiedKFold
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Initialize model (seeded so that fold results are reproducible between runs;
# fit() is called once per fold on the same estimator object)
model = RandomForestClassifier(random_state=42)

# Track scores
# NOTE(review): nothing is appended to this list in this cell; it is kept
# because later cells may reference it.
scores = []

# Perform stratified k-fold cross-validation
for train_index, val_index in skf.split(X, y):

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments
    start_time = time.perf_counter()

    # Work on copies so the preprocessing never touches the full feature matrix
    X_train, X_val = X.iloc[train_index].copy(), X.iloc[val_index].copy()
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Same preprocessing for both splits (training first, then validation)
    _prepare_fold(X_train)
    _prepare_fold(X_val)

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    pred_train = model.predict(X_train)
    pred_val = model.predict(X_val)

    # Report train and validation metrics for this fold
    m.metrics(y_train, pred_train, y_val, pred_val)

    # Time
    end_time = time.perf_counter()
    elapsed_time = round((end_time - start_time) / 60, 2)
    print(f'This Fold took {elapsed_time} minutes')

______________________________________________________________________
                                TRAIN                                 
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8302
           1       1.00      1.00      1.00    184592
           2       1.00      1.00      1.00     45884
           3       1.00      1.00      1.00     94998
           4       1.00      1.00      1.00     32162
           5       1.00      1.00      1.00      2800
           6       1.00      1.00      1.00        64
           7       1.00      1.00      1.00       304

    accuracy                           1.00    369106
   macro avg       1.00      1.00      1.00    369106
weighted avg       1.00      1.00      1.00    369106

______________________________________________________________________
                                VALIDATION                       