# Importing libraries

In [52]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from statistics import mean

import os
import warnings
import time
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold
from sklearn.metrics import classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

from sklearn.preprocessing import LabelEncoder


In [53]:
low_memory = False

### Importing the datasets

In [54]:
df = pd.read_csv("train_data.csv")
data = df.copy()
data_test = pd.read_csv("test_data.csv")

### Defining the types of our features

In [55]:
date_features = ['Accident Date', 'Assembly Date', 'C-2 Date', 'C-3 Date', 'First Hearing Date']

In [56]:
num_features = ['Age at Injury', 'Average Weekly Wage', 'Birth Year', 'IME-4 Count', 'Number of Dependents']

In [57]:
cat_features = ['Alternative Dispute Resolution', 'Attorney/Representative',
       'Carrier Name', 'Carrier Type','County of Injury',
       'COVID-19 Indicator', 'District Name', 'Gender','Industry Code',
       'Industry Code Description', 'Medical Fee Region','WCIO Cause of Injury Code',
       'WCIO Cause of Injury Description', 'WCIO Nature of Injury Code', 
       'WCIO Nature of Injury Description', 'WCIO Part Of Body Code', 'Agreement Reached',
       'WCIO Part Of Body Description', 'Zip Code', 'WCB Decision']

### Dropping variables

In [58]:
# Dropping variables that are definitely useless from our dataset
# The first variable has 100% missing values and the second only contains one type of value,
# meaning it doesn't provide any useful information + it isn't in the test dataset
data.drop(["OIICS Nature of Injury Description", "WCB Decision"], axis=1, inplace = True)
data_test.drop(["OIICS Nature of Injury Description"], axis=1, inplace = True)
cat_features.remove('WCB Decision')

In [59]:
# Dropping "Agreement Reached" because it is not on the validation dataset
#Later we can try to predict this column and then predict the target but for now lets drop ot
data = data.drop(['Agreement Reached'], axis=1)
cat_features.remove("Agreement Reached")

In [60]:
# We'll eliminate the Zip_Code variable since we already have its information in other variables plus no model will be able to interpret the location information in this code
data = data.drop(['Zip Code'], axis=1)
cat_features.remove("Zip Code")
data_test = data_test.drop(['Zip Code'], axis=1)

In [61]:
# Dropping redundant variables that carry almost the same information (are extremely correlated)
# We believe it was better to keep Age at Injury than birth year since it should be more related to the injury claim type (it will be tested later)
# The same logic was applied to dropping the other two dates since we believe Accident date to be more important
data.drop(['Birth Year','Assembly Date', "C-2 Date"], axis = 1, inplace = True)
data_test.drop(['Birth Year','Assembly Date', "C-2 Date"], axis = 1, inplace = True)
for col in ['Assembly Date', "C-2 Date"]:
    date_features.remove(col)
num_features.remove('Birth Year')

In [62]:
#Since the codes always seem to provide the same or more information than the descriptions (have more categories),
#and the codes are consistent (always only having 1 description for code, while descriptions may have multiple codes)
#we will drop the description columns.
data.drop(['Industry Code Description','WCIO Cause of Injury Description','WCIO Nature of Injury Description','WCIO Part Of Body Description'], axis = 1,inplace = True)
data_test.drop(['Industry Code Description','WCIO Cause of Injury Description','WCIO Nature of Injury Description','WCIO Part Of Body Description'], axis = 1,inplace = True)
for col in ['Industry Code Description','WCIO Cause of Injury Description','WCIO Nature of Injury Description','WCIO Part Of Body Description']:
    cat_features.remove(col)

### Removing Inconsistencies

In [63]:
#Remove rows where the target variable is NaN
#This also removes all the missing values in another 13 variables (which includes all our categorical variables)
#and an inconsistency where we have to rows with the same claim Identifier
data.dropna(axis = 0 , subset=["Claim Injury Type"],inplace = True)

In [64]:
#Removing duplicates
data.drop_duplicates(inplace = True)

### Feature Engeneering

In [65]:
#Adding a binary column for the dates since it might provide useful information that would be lost when the missing values are inputed
#Missing values in dates can mean that it wasn't held yet 
date_columns = ['Accident Date', 'First Hearing Date', 'C-3 Date']

for column in date_columns:
    data[column + '_missing'] = data[column].isnull().astype(int)
    data_test[column + '_missing'] = data[column].isnull().astype(int)

### Defining the indexes

In [66]:
#Defining Claim identifier as the index
data.set_index('Claim Identifier', inplace = True)
data_test.set_index('Claim Identifier', inplace = True)

### Changing impossible 0's and impossible dates to missing values

In [67]:
#Changing impossible 0's to missing values
data["Average Weekly Wage"].replace(0, np.nan, inplace=True)
data["Age at Injury"].replace(0, np.nan, inplace=True)
data_test["Average Weekly Wage"].replace(0, np.nan, inplace=True)
data_test["Age at Injury"].replace(0, np.nan, inplace=True)

##### Converting dates into datetime64 and than into integers in miliseconds after 1970

In [68]:
for col in date_features:
    data[col] = pd.to_datetime(data[col],format = "%Y-%m-%d", errors = "coerce")
    data_test[col] = pd.to_datetime(data_test[col],format = "%Y-%m-%d", errors = "coerce")

In [69]:
#This is done in the middle step because its when we can compare the dates and remove inconsistencies
inconsistent = data[data["Accident Date"] > data["C-3 Date"]].index
data["C-3 Date"].loc[inconsistent] = np.nan
inconsistent2 = data[data["Accident Date"] > data["First Hearing Date"]].index
data["First Hearing Date"].loc[inconsistent2] = np.nan

In [70]:
#Converting date columns to integers
data[date_features] = data[date_features].apply(lambda x: x.astype('int64') // 10**9 if x.dtype == 'datetime64[ns]' else x)
data_test[date_features] = data_test[date_features].apply(lambda x: x.astype('int64') // 10**9 if x.dtype == 'datetime64[ns]' else x)

### Converting to binary the binary variables

In [71]:
#Converting binary variables to binary
data["Attorney/Representative"] = [val if val != val else 1 if val == "Y" else 0 for val in data["Attorney/Representative"]]
data["COVID-19 Indicator"] = [val if val != val else 1 if val == "Y" else 0 for val in data["COVID-19 Indicator"]]
data_test["Attorney/Representative"] = [val if val != val else 1 if val == "Y" else 0 for val in data_test["Attorney/Representative"]]
data_test["COVID-19 Indicator"] = [val if val != val else 1 if val == "Y" else 0 for val in data_test["COVID-19 Indicator"]]

In [72]:
#Even though it is still categorical we have to remove it so it isnt proportionally encoded
cat_features.remove("Attorney/Representative")
cat_features.remove("COVID-19 Indicator")

### Converting our target into labels for our model to predict

In [73]:
le = LabelEncoder()

In [74]:
#Label inconding our target variable 
data["Claim Injury Type"] = le.fit_transform(data["Claim Injury Type"])

### Converting our categorical variables into numerical ones

In [76]:
#Using proportionate encoding to turn our categorical data into numerical because since we have a lot of categories, and
#this solution keeps the information of the most frequent categories while also not increasing the dimensionality of our dataset manyfold
for col in cat_features:
            proportion = data[col].value_counts(normalize = True)  # Get the porportion of each category
            data[col] = data[col].map(proportion)  # Map the porportions in the column
            data_test[col] = data_test[col].map(proportion) 

### Missing Values

###### We have no missing values for categorical features

Numeric Variables

In [78]:
# Inputing numeric missing values with a decision tree
def impute_with_decision_tree(data, target_column):
    
    # Separating the missing values from the non
    available_data = data[data[target_column].notna()]
    missing_data = data[data[target_column].isna()]

    # Making sure there is enough data to input 
    if len(available_data) == 0 or len(missing_data) == 0:
        return data

    # Separating the target column from the rest 
    X_available = available_data.drop(columns=[target_column])
    y_available = available_data[target_column]

    # Training the model with the available data
    model = DecisionTreeRegressor()
    model.fit(X_available, y_available)

    # Prediting the missing values
    X_missing = missing_data.drop(columns=[target_column])
    predicted_values = model.predict(X_missing)

    # Inputing the missing values with the predictions
    data.loc[data[target_column].isna(), target_column] = predicted_values

    return data



In [79]:
# Inputing numeric missing values with a decision tree
def impute_with_xgb(data, target_column):
    
    # Separating the missing values from the non
    available_data = data[data[target_column].notna()]
    missing_data = data[data[target_column].isna()]

    # Making sure there is enough data to input 
    if len(available_data) == 0 or len(missing_data) == 0:
        return data

    # Separating the target column from the rest 
    X_available = available_data.drop(columns=[target_column])
    y_available = available_data[target_column]

    # Training the model with the available data
    model = XGBRegressor()
    model.fit(X_available, y_available)

    # Prediting the missing values
    X_missing = missing_data.drop(columns=[target_column])
    predicted_values = model.predict(X_missing)

    # Inputing the missing values with the predictions
    data.loc[data[target_column].isna(), target_column] = predicted_values

    return data

In [80]:
# Inputing numeric missing values with a decision tree
def impute_with_random_forest(data, target_column):
    
    # Separating the missing values from the non
    available_data = data[data[target_column].notna()]
    missing_data = data[data[target_column].isna()]

    # Making sure there is enough data to input 
    if len(available_data) == 0 or len(missing_data) == 0:
        return data

    # Separating the target column from the rest 
    X_available = available_data.drop(columns=[target_column])
    y_available = available_data[target_column]

    # Training the model with the available data
    model = RandomForestRegressor()
    model.fit(X_available, y_available)

    # Prediting the missing values
    X_missing = missing_data.drop(columns=[target_column])
    predicted_values = model.predict(X_missing)

    # Inputing the missing values with the predictions
    data.loc[data[target_column].isna(), target_column] = predicted_values

    return data

### Defining X and y 

In [81]:
X = data.drop(["Claim Injury Type"], axis = 1)

In [82]:
y = data["Claim Injury Type"]

# Feature Selection

### RFE (repeated feature elimination) 

In [None]:
def Rfe(algorithm):

    X_train, X_val,y_train, y_val = train_test_split(X,y,
                                                train_size = 0.75, 
                                                shuffle = True, 
                                                stratify = y)

    #Filling missing values
    for column in num_features:
        impute_with_decision_tree(X_train, column)
        impute_with_decision_tree(X_val, column)
    
    #Generating the variables where we will store our results
    nof_list = np.arange(1, len(X_train.columns) + 1)            
    high_score = 0
    opt_n_features = 0
    train_score_list = []
    val_score_list = []

    #Variable where we will store the optimum amount of features
    best_rfe = None

    model = algorithm

    for n in nof_list:
        rfe = RFE(estimator=model, n_features_to_select=n)
    
    # Fitting the model to rfe
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_val_rfe = rfe.transform(X_val)
    
    # Training and predicting
        model.fit(X_train_rfe, y_train)
        pred_train = model.predict(X_train_rfe)
        pred_val = model.predict(X_val_rfe)
    
    # Evaluating using the macro f1_score
        train_score = f1_score(y_train, pred_train, average="macro")
        val_score = f1_score(y_val, pred_val, average="macro")
        train_score_list.append(train_score)
        val_score_list.append(val_score)
    
    # Checking if this is the best combination of features so far
        if val_score >= high_score:
            high_score = val_score
            opt_n_features = n
            best_rfe = rfe  # Armazenar o RFE com o melhor número de features

# Checking what amount of features and which features where the best for the model
    selected_features = X_train.columns[best_rfe.support_].tolist()

    print("Optimal number of features: %d" % opt_n_features)
    print("Score with %d features: %f" % (opt_n_features, high_score))
    print("Selected Features:\n", selected_features)


In [None]:
Rfe(DecisionTreeClassifier())

In [None]:
Rfe(RandomForestClassifier())

In [None]:
Rfe(XGBClassifier())

### These were the variables we obtain as a result of our RFE with a random forest:

In [None]:
selected_features = ['Accident Date', 'Age at Injury', 'Attorney/Representative', 'Average Weekly Wage', 'C-3 Date', 'Carrier Name', 'County of Injury', 'District Name', 'First Hearing Date', 'IME-4 Count', 'Industry Code', 'WCIO Cause of Injury Code', 'WCIO Nature of Injury Code', 'WCIO Part Of Body Code', 'Number of Dependents', 'First Hearing Date_missing']

In [None]:
selected_features = ['Accident Date', 'Age at Injury', 'Attorney/Representative', 'Average Weekly Wage', 'C-3 Date', 'Carrier Name', 'Carrier Type', 'County of Injury', 'COVID-19 Indicator', 'District Name', 'First Hearing Date', 'Gender', 'IME-4 Count', 'Industry Code', 'Medical Fee Region', 'WCIO Cause of Injury Code', 'WCIO Nature of Injury Code', 'WCIO Part Of Body Code', 'Number of Dependents']

In [None]:
selected_num_features = ['Accident Date', 'Age at Injury', 'Average Weekly Wage', 'C-3 Date','First Hearing Date','IME-4 Count','Number of Dependents']

# Decision tree models and performance

In [83]:
# Computes the crossvalidation scores
def cv_scores(model, X, y):
    # Takes as argument the model used, the predictors and the target. Splits the data using StratifiedKFold, and
    # trains model using X and y. Then it returns the results obtained from the stratified cross validation'''
    
    skf = KFold(n_splits=5)
    
    # Generating the lists to store our results
    precision_scores_train = []
    precision_scores_val = []   
    recall_scores_train = []  
    recall_scores_val = []
    f1_scores_train = []    
    f1_scores_val = []
    index = [f'Fold {i}' for i in range(1,6)]
    index.append("Average")
    
    for train_index, test_index in skf.split(X, y):
        # Dividing our data in validation and train
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]

        # Inputing the missing values
        for column in num_features:
            impute_with_xgb(X_train, column)
            impute_with_xgb(X_val, column)

        # Data pre-processment
        inconsistent = X_train[(X_train['Age at Injury'] > 80) | (X_train["Age at Injury"] < 16)].index
        X_train.drop(inconsistent, inplace=True)
        y_train.drop(inconsistent, inplace=True)


        # Training the classification model
        model.fit(X_train, y_train)
        
        # Making the predictions for the training and validation data
        pred_train = model.predict(X_train)
        pred_val = model.predict(X_val)
        
        # Calculating and storing the scores
        precision_scores_train.append(precision_score(y_train, pred_train, average='macro'))
        precision_scores_val.append(precision_score(y_val, pred_val, average='macro'))
        recall_scores_train.append(recall_score(y_train, pred_train, average='macro'))
        recall_scores_val.append(recall_score(y_val, pred_val, average='macro'))
        f1_scores_train.append(f1_score(y_train, pred_train, average='macro'))
        f1_scores_val.append(f1_score(y_val, pred_val, average='macro'))

    
    precision_scores_train.append(mean(precision_scores_train))
    precision_scores_val.append(mean(precision_scores_val))
    recall_scores_train.append(mean(recall_scores_train))
    recall_scores_val.append(mean(recall_scores_val))
    f1_scores_train.append(mean(f1_scores_train))
    f1_scores_val.append(mean(f1_scores_val))

    # Storing the results in a dataframe
    model_results = pd.DataFrame(data={
        'Train_precision': precision_scores_train,
        'Test_precision': precision_scores_val,
        'Train_recall': recall_scores_train,
        'Test_recall': recall_scores_val,
        'Train_f1_score': f1_scores_train,
        'Test_f1_score': f1_scores_val,
    }, index=index)
    
    return model_results



todos os f1 em baixo foi com input rf, que foram os melhores valores que obtive

In [None]:
dt_results = cv_scores(DecisionTreeClassifier(), X, y)
dt_results

In [None]:
rf_results = cv_scores(RandomForestClassifier(), X, y)
rf_results

In [None]:
xgb_results = cv_scores(XGBClassifier(), X, y)
xgb_results

# Making predictions

In [101]:
def test_prediction(model, X, y , test):

    X_train, X_val,y_train, y_val = train_test_split(X,y,
                                                train_size = 0.8, 
                                                shuffle = True, 
                                                stratify = y)

    
    # data pre-processment
    inconsistent = X_train[(X_train['Age at Injury'] > 80) | (X_train["Age at Injury"] < 14)].index
    X_train.drop(inconsistent, inplace=True)
    y_train.drop(inconsistent, inplace=True)

    # Missing value inputation
    for column in num_features:
        impute_with_xgb(X_train, column)
        impute_with_xgb(test, column)

    # Fitting the model
    model.fit(X_train, y_train)
    
    pred_val = model.predict(X_val)
    print(f1_score(y_val, pred_val, average='macro'))

    # Using the model to make prediction on the test dataset
    pred_test = model.predict(test)

    #Label enconding our target variable 
    pred_test = le.inverse_transform(pred_test)

    # Making a dataframe with the indexes of data_test and predictions converted back to strings
    submission_df = pd.DataFrame({
        "Claim Injury Type": pred_test
    }, index=data_test.index)
    
    return submission_df

In [99]:
submission = test_prediction(XGBClassifier(),X,y,data_test)
submission

0.3104945550519305
Claim Injury Type
1                    303682
3                     57001
2                     21674
0                      4549
4                      1061
7                         8
Name: count, dtype: int64
Claim Injury Type
2. NON-COMP          303682
4. TEMPORARY          57001
3. MED ONLY           21674
1. CANCELLED           4549
5. PPD SCH LOSS        1061
8. DEATH                  8
Name: count, dtype: int64


Unnamed: 0_level_0,Claim Injury Type
Claim Identifier,Unnamed: 1_level_1
6165911,2. NON-COMP
6166141,2. NON-COMP
6165907,2. NON-COMP
6166047,2. NON-COMP
6166102,2. NON-COMP
...,...
6553137,1. CANCELLED
6553119,1. CANCELLED
6553542,1. CANCELLED
6553455,2. NON-COMP


In [102]:
submission.to_csv("Submission.csv")

In [95]:
submission.nunique()

Claim Injury Type    6
dtype: int64