# Importing libraries

In [28]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from statistics import mean

import os
import warnings
import time
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold
from sklearn.metrics import classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBClassifier
from xgboost import XGBRegressor

from sklearn.preprocessing import LabelEncoder


### Importing the datasets

In [29]:
df = pd.read_csv("train_data_enriched.csv", index_col="Claim Identifier")
data = df.copy()
data_test = pd.read_csv("test_data_enriched.csv",index_col="Claim Identifier")

### Defining the types of our numerical features to input

In [30]:
num_features = ['Age at Injury', 'Average Weekly Wage', 'Birth Year', 'IME-4 Count', 'Number of Dependents',
                "Accident Year","Accident Month","Accident Day","Accident DayOfWeek","Assembly Date DSA",
                "C-2 Date DSA","C-3 Date DSA","First Hearing Date DSA"]

#### Dropping variables

In [31]:
# Dropping redundant variables that carry almost the same information (are extremely correlated (|0.8|))
# We believe it was better to keep Age at Injury than birth year since it should be more related to the injury claim type (it will be tested later)
# The same logic was applied to dropping the other two dates and two DSA variables since we believe Accident date to be more important
data.drop(['Birth Year','Assembly Date', "C-2 Date","Assembly Date DSA", "First Hearing Date DSA"], axis = 1, inplace = True)
data_test.drop(['Birth Year','Assembly Date', "C-2 Date","Assembly Date DSA", "First Hearing Date DSA"], axis = 1, inplace = True)

for col in ['Birth Year',"Assembly Date DSA", "First Hearing Date DSA"]:
    num_features.remove(col)


In [32]:
#Since the codes always seem to provide the same or more information than the descriptions (have more categories),
#and the codes are consistent (always only having 1 description for code, while descriptions may have multiple codes)
#we will drop the description columns.
data.drop(['Industry Code Description','WCIO Cause of Injury Description','WCIO Nature of Injury Description','WCIO Part Of Body Description'], axis = 1,inplace = True)
data_test.drop(['Industry Code Description','WCIO Cause of Injury Description','WCIO Nature of Injury Description','WCIO Part Of Body Description'], axis = 1,inplace = True)


In [35]:
# Removing Zip Code for reason meantion in pre-processement
data.drop(['Zip Code'], axis=1, inplace = True)
data_test.drop(['Zip Code'], axis=1 , inplace = True)

### Converting our target into labels for our model to predict

In [36]:
le = LabelEncoder()

In [37]:
#Label inconding our target variable 
data["Claim Injury Type"] = le.fit_transform(data["Claim Injury Type"])

### Missing Values

In [38]:
# Inputing missing values with a given algorithm
def impute_missing_values(data, target_column, algorithm):
    
    # Separating the missing values from the non missing values
    available_data = data[data[target_column].notna()]
    missing_data = data[data[target_column].isna()]

    # Making sure there is enough data to input 
    if len(available_data) == 0 or len(missing_data) == 0:
        return data

    # Separating the target column from the rest 
    X_available = available_data.drop(columns=[target_column])
    y_available = available_data[target_column]

    # Training the model with the available data
    model = algorithm
    model.fit(X_available, y_available)

    # Prediting the missing values
    X_missing = missing_data.drop(columns=[target_column])
    predicted_values = model.predict(X_missing)

    # Inputing the missing values with the predictions
    data.loc[data[target_column].isna(), target_column] = predicted_values

    return data



### Defining X and y 

In [39]:
X = data.drop(["Claim Injury Type"], axis = 1)

In [40]:
y = data["Claim Injury Type"]

# Feature Selection

### RFE (repeated feature elimination) 

In [None]:
def Rfe(algorithm, num_inputing_algorithm= DecisionTreeRegressor() , cat_inputing_algorithm = DecisionTreeClassifier()):

    X_train, X_val,y_train, y_val = train_test_split(X,y,
                                                train_size = 0.75, 
                                                shuffle = True, 
                                                stratify = y)

    #Filling num missing values
    for column in num_features:
        impute_missing_values(X_train, column, num_inputing_algorithm)
        impute_missing_values(X_val, column, num_inputing_algorithm)

    #Filling cat missing values
    impute_missing_values(X_train, "Alternative Dispute Resolution", cat_inputing_algorithm)
    impute_missing_values(X_val, "Alternative Dispute Resolution", cat_inputing_algorithm)

    # Removing inconsistencies on the train
    inconsistent = X_train[(X_train['Age at Injury'] > 80) | (X_train["Age at Injury"] < 16)].index
    X_train.drop(inconsistent, inplace=True)
    y_train.drop(inconsistent, inplace=True)
    
    #Generating the variables where we will store our results
    nof_list = np.arange(1, len(X_train.columns) + 1)            
    high_score = 0
    opt_n_features = 0
    train_score_list = []
    val_score_list = []

    #Variable where we will store the optimum amount of features
    best_rfe = None

    model = algorithm

    for n in nof_list:
        rfe = RFE(estimator=model, n_features_to_select=n)
    
    # Fitting the model to rfe
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_val_rfe = rfe.transform(X_val)
    
    # Training and predicting
        model.fit(X_train_rfe, y_train)
        pred_train = model.predict(X_train_rfe)
        pred_val = model.predict(X_val_rfe)
    
    # Evaluating using the macro f1_score
        train_score = f1_score(y_train, pred_train, average="macro")
        val_score = f1_score(y_val, pred_val, average="macro")
        train_score_list.append(train_score)
        val_score_list.append(val_score)
    
    # Checking if this is the best combination of features so far
        if val_score >= high_score:
            high_score = val_score
            opt_n_features = n
            best_rfe = rfe  # Armazenar o RFE com o melhor número de features

# Checking what amount of features and which features where the best for the model
    selected_features = X_train.columns[best_rfe.support_].tolist()

    print("Optimal number of features: %d" % opt_n_features)
    print("Score with %d features: %f" % (opt_n_features, high_score))
    print("Selected Features:\n", selected_features)


In [None]:
Rfe(DecisionTreeClassifier())

In [None]:
Rfe(RandomForestClassifier())

In [None]:
Rfe(XGBClassifier())

### These were the variables we obtained as a result of our RFE with a random forest:

In [52]:
selected_features = ['Accident Date', 'Age at Injury', 'Alternative Dispute Resolution', 'Attorney/Representative', 'Average Weekly Wage', 'C-3 Date', 'Carrier Name', 'Carrier Type', 'County of Injury',
                      'COVID-19 Indicator', 'District Name', 'First Hearing Date', 'Gender', 'IME-4 Count', 'Industry Code', 'Medical Fee Region', 'WCIO Cause of Injury Code', 'WCIO Nature of Injury Code', 
                      'WCIO Part Of Body Code', 'Number of Dependents', 'Accident Year', 'Accident Month', 'Accident Day', 'Accident DayOfWeek', 'C-2 Date DSA', 'C-3 Date DSA',
                        'Accident Date_missing', 'First Hearing Date_missing', 'C-3 Date_missing', 'Assembly Date_missing', 'C-2 Date_missing', 'Age at Injury Category',
                        'Carrier Claim Category', 'Body Section']

In [53]:
selected_num_features = ['Age at Injury', 'Average Weekly Wage','IME-4 Count','Number of Dependents',
                         'Accident Year','Accident Month', 'Accident Day', 'Accident DayOfWeek',
                         'C-2 Date DSA', 'C-3 Date DSA']

# Decision tree models and performance

In [54]:
# Computes the crossvalidation scores
def cv_scores(model, X, y, num_inputing_algorithm= XGBRegressor() , cat_inputing_algorithm = XGBClassifier()):
    # Takes as argument the model used, the predictors and the target. Splits the data using StratifiedKFold, and
    # trains model using X and y. Then it returns the results obtained from the stratified cross validation'''
    
    skf = KFold(n_splits=5)
    
    # Generating the lists to store our results
    precision_scores_train = []
    precision_scores_val = []   
    recall_scores_train = []  
    recall_scores_val = []
    f1_scores_train = []    
    f1_scores_val = []
    index = [f'Fold {i}' for i in range(1,6)]
    index.append("Average")
    
    for train_index, test_index in skf.split(X, y):
        # Dividing our data in validation and train
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]

        #Filling num missing values
        for column in selected_num_features:
            impute_missing_values(X_train, column, num_inputing_algorithm)
            impute_missing_values(X_val, column, num_inputing_algorithm)

        #Filling cat missing values
        impute_missing_values(X_train, "Alternative Dispute Resolution", cat_inputing_algorithm)
        impute_missing_values(X_val, "Alternative Dispute Resolution", cat_inputing_algorithm)
        
        # Removing inconsistencies on the train
        inconsistent = X_train[(X_train['Age at Injury'] > 80) | (X_train["Age at Injury"] < 16)].index
        X_train.drop(inconsistent, inplace=True)
        y_train.drop(inconsistent, inplace=True)


        # Training the classification model
        model.fit(X_train, y_train)
        
        # Making the predictions for the training and validation data
        pred_train = model.predict(X_train)
        pred_val = model.predict(X_val)
        
        # Calculating and storing the scores
        precision_scores_train.append(precision_score(y_train, pred_train, average='macro'))
        precision_scores_val.append(precision_score(y_val, pred_val, average='macro'))
        recall_scores_train.append(recall_score(y_train, pred_train, average='macro'))
        recall_scores_val.append(recall_score(y_val, pred_val, average='macro'))
        f1_scores_train.append(f1_score(y_train, pred_train, average='macro'))
        f1_scores_val.append(f1_score(y_val, pred_val, average='macro'))

    
    precision_scores_train.append(mean(precision_scores_train))
    precision_scores_val.append(mean(precision_scores_val))
    recall_scores_train.append(mean(recall_scores_train))
    recall_scores_val.append(mean(recall_scores_val))
    f1_scores_train.append(mean(f1_scores_train))
    f1_scores_val.append(mean(f1_scores_val))

    # Storing the results in a dataframe
    model_results = pd.DataFrame(data={
        'Train_precision': precision_scores_train,
        'Test_precision': precision_scores_val,
        'Train_recall': recall_scores_train,
        'Test_recall': recall_scores_val,
        'Train_f1_score': f1_scores_train,
        'Test_f1_score': f1_scores_val,
    }, index=index)
    
    return model_results



after testing, the model is better with all features, so we ran with all features

In [None]:
dt_results = cv_scores(DecisionTreeClassifier(), X[selected_features], y)
dt_results

In [None]:
dt_results = cv_scores(DecisionTreeClassifier(), X, y)
dt_results

In [None]:
rf_results = cv_scores(RandomForestClassifier(), X, y)
rf_results

In [55]:
xgb_results = cv_scores(XGBClassifier(), X[selected_features], y)
xgb_results

Unnamed: 0,Train_precision,Test_precision,Train_recall,Test_recall,Train_f1_score,Test_f1_score
Fold 1,0.796857,0.448212,0.583432,0.349683,0.612243,0.357272
Fold 2,0.796898,0.486657,0.583648,0.344037,0.614017,0.354671
Fold 3,0.796329,0.437943,0.583859,0.346069,0.614615,0.357854
Fold 4,0.792291,0.42451,0.574621,0.33198,0.605545,0.346707
Fold 5,0.785428,0.409623,0.572856,0.329006,0.602553,0.340913
Average,0.793561,0.441389,0.579683,0.340155,0.609795,0.351483


# Making predictions

In [56]:
def test_prediction(model, X, y , test, num_inputing_algorithm= XGBRegressor() , cat_inputing_algorithm = XGBClassifier()):

    X_train, X_val,y_train, y_val = train_test_split(X,y,
                                                train_size = 0.8, 
                                                shuffle = True, 
                                                stratify = y)

    # Missing value inputation
    #Filling num missing values
    for column in selected_num_features:
        impute_missing_values(X_train, column, num_inputing_algorithm)
        impute_missing_values(X_val, column, num_inputing_algorithm)
        impute_missing_values(test, column, num_inputing_algorithm)

    #Filling cat missing values
    impute_missing_values(X_train, "Alternative Dispute Resolution", cat_inputing_algorithm)
    impute_missing_values(X_val, "Alternative Dispute Resolution", cat_inputing_algorithm)
    impute_missing_values(test, "Alternative Dispute Resolution", cat_inputing_algorithm)

    # Removing inconsistencies on the train
    inconsistent = X_train[(X_train['Age at Injury'] > 80) | (X_train["Age at Injury"] < 16)].index
    X_train.drop(inconsistent, inplace=True)
    y_train.drop(inconsistent, inplace=True)
    
    # Fitting the model
    model.fit(X_train, y_train)

    # Veryfing if the model is performing as expected
    pred_val = model.predict(X_val)
    print(f1_score(y_val, pred_val, average='macro'))

    # Using the model to make prediction on the test dataset
    pred_test = model.predict(test)

    # Inversing the encoding of our target variable 
    pred_test = le.inverse_transform(pred_test)

    # Making a dataframe with the indexes of data_test and predictions converted back to strings
    submission_df = pd.DataFrame({
        "Claim Injury Type": pred_test
    }, index=data_test.index)
    
    return submission_df

In [57]:
submission = test_prediction(XGBClassifier(),X[selected_features],y,data_test)
submission

0.370954220930935


Unnamed: 0_level_0,Claim Injury Type
Claim Identifier,Unnamed: 1_level_1
6165911,2. NON-COMP
6166141,2. NON-COMP
6165907,2. NON-COMP
6166047,2. NON-COMP
6166102,2. NON-COMP
...,...
6553137,2. NON-COMP
6553119,1. CANCELLED
6553542,1. CANCELLED
6553455,2. NON-COMP


In [58]:
submission.to_csv("Submission.csv")

In [59]:
submission.nunique()

Claim Injury Type    6
dtype: int64