Models Assessment

In [2]:
import pandas as pd
import numpy as np

#model imports
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier,XGBRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

#metrics
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from statistics import mean

#useful functions
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE


In [3]:
#Define an instance of the models
#We are using the best 
DT = DecisionTreeClassifier()
RF = RandomForestClassifier(verbose = 1, n_jobs=-1)
XGB = XGBClassifier()
KNN = KNeighborsClassifier(n_neighbors = 50)
MLP = MLPClassifier(activation='relu',
    solver='sgd',learning_rate='invscaling',
    learning_rate_init=0.001,
    batch_size=100,verbose = True)

#Creating an instance of our encoder for the target
le = LabelEncoder()

In [4]:
# Load data
data= pd.read_csv("train_data_enriched.csv", index_col="Claim Identifier")
data_test = pd.read_csv("test_data_enriched.csv",index_col="Claim Identifier")

  data= pd.read_csv("train_data_enriched.csv", index_col="Claim Identifier")


In [5]:
def impute_missing_values(X_train, X_val, target_column, algorithm,validation = True):

    # Separating the missing values from the non missing values
    available_data = X_train[X_train[target_column].notna()]
    missing_X_train = X_train[X_train[target_column].isna()]
    missing_X_val = X_val[X_val[target_column].isna()]

    # Making sure if there is enough data for inputing, returning it if not
    if len(missing_X_train) == 0:
        print(f"no missing values to input on {target_column}")
        return X_train, X_val

    # Separating the target column from the rest
    X_available = available_data.drop(columns=[target_column])
    y_available = available_data[target_column]

    # Making sure our columns are consistent
    X_available = X_available.select_dtypes(include=["number"])
    missing_X_train = missing_X_train.select_dtypes(include=["number"])
    missing_X_val = missing_X_val.select_dtypes(include=["number"])

    common_columns = X_available.columns.intersection(missing_X_train.columns).intersection(missing_X_val.columns)
    X_available = X_available[common_columns]
    missing_X_train = missing_X_train[common_columns]
    missing_X_val = missing_X_val[common_columns]

    # Making sure there is any column after keeping the common columns
    if X_available.shape[1] == 0:
        print(f"Without any column to input in {target_column}")
        return X_train, X_val

    # Training the model with the available data
    model = algorithm
    model.fit(X_available, y_available)

    # Prediting the missing values
    predicted_train = model.predict(missing_X_train)
    if validation:
        predicted_val = model.predict(missing_X_val)

    # Filling the training and validation with predictions. The latter is only filled if the argument is true
    X_train = X_train.copy()
    X_train.loc[X_train[target_column].isna(), target_column] = predicted_train

    if validation:
        X_val = X_val.copy()
        X_val.loc[X_val[target_column].isna(), target_column] = predicted_val

    return X_train, X_val

In [6]:
#This helper fuction was used for finding problems with later functions
'''def check_missing_values(data, step_name):
    print(f"\n{step_name}: Valores ausentes restantes:")
    print(data.isnull().sum()[data.isnull().sum() > 0])'''

'def check_missing_values(data, step_name):\n    print(f"\n{step_name}: Valores ausentes restantes:")\n    print(data.isnull().sum()[data.isnull().sum() > 0])'

In [7]:
def handle_outliers(data, column):
    # Handles outliers in a numerical column by replacing values outside the interquartile range (IQR) with missing values

    # Makes sure we only treat outliers in columns that have any data
    if data[column].notnull().sum() > 0: 

        # Calculating inter quantile range limits
        Q1 = data[column].quantile(0.25)
        Q3 = data[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Changing the ouliers to missing values
        data[column] = np.where(data[column] < lower_bound, np.nan, data[column])
        data[column] = np.where(data[column] > upper_bound, np.nan, data[column])

    return data

In [8]:
def scale_numerical(column, X_train, X_val, scaler):
    
    # Make sure the column is numerical
    if not pd.api.types.is_numeric_dtype(X_train[column]):
        print(f"Columm '{column}' is not numerical and will be ignored")
        return

    # Scaling the data
    try:
        X_train[column] = scaler.fit_transform(X_train[[column]])
        X_val[column] = scaler.transform(X_val[[column]])
    except ValueError as e:
        print(f"Mistake scaling the column '{column}': {e}")

In [9]:
def  claim_carrier_categories(X_train, X_val):
    
    # Define a function to categorize each carrier based on its claim count
    count = X_train['Carrier Name'].value_counts()
    def categorize_claims(count):
        if count >= 40000:
            return 2
        elif 4000 <= count < 40000:
            return 1
        else:
            return 0

    # Apply the categorization to create a mapping dictionary
    carrier_category_map = count.apply(categorize_claims)

    # Map the `Carrier Name` to the new `Carrier Claim Category`
    X_train['Carrier Claim Category'] = X_train['Carrier Name'].map(carrier_category_map)
    X_val['Carrier Claim Category'] = X_val['Carrier Name'].map(carrier_category_map)

    return X_train.drop(["Carrier Name"], axis = 1, inplace = True) , X_val.drop(["Carrier Name"], axis = 1, inplace = True)


In [10]:
# Categorical encoder function
def categorical_prop_encode(X_train, X_val, feature):
    proportion = X_train[feature].value_counts(normalize = True)  # Get the porportion of each category
    X_train[feature] = X_train[feature].map(proportion)  # Map the porportions in the column
    X_val[feature] = X_val[feature].map(proportion) # Do the same for the validation subset
    X_val[feature] = X_val[feature].fillna(0)  # Handle categories in X_val not seen in X_train with 0


In [11]:
def Rfe(algorithm,X_train,y_train,X_val,y_val):

    #Generating the variables where we will store our results
    nof_list = np.arange(1, len(X_train.columns) + 1)            
    high_score = 0
    opt_n_features = 0
    train_score_list = []
    val_score_list = []

    #Variable where we will store the optimum amount of features
    best_rfe = None

    model = algorithm

    for n in nof_list:
        rfe = RFE(estimator=model, n_features_to_select=n)
    
    # Fitting the model to rfe
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_val_rfe = rfe.transform(X_val)
    
    # Training and predicting
        model.fit(X_train_rfe, y_train)
        pred_train = model.predict(X_train_rfe)
        pred_val = model.predict(X_val_rfe)
    
    # Evaluating using the macro f1_score
        train_score = f1_score(y_train, pred_train, average="macro")
        val_score = f1_score(y_val, pred_val, average="macro")
        train_score_list.append(train_score)
        val_score_list.append(val_score)
    
    # Checking if this is the best combination of features so far
        if val_score >= high_score:
            high_score = val_score
            opt_n_features = n
            best_rfe = rfe  # Storing the rfe with the best number of features

# Checking what amount of features and which features where the best for the model
    selected_features = X_train.columns[best_rfe.support_].tolist()

    print("Optimal number of features: %d" % opt_n_features)
    print("Score with %d features: %f" % (opt_n_features, high_score))
    print("Selected Features:\n", selected_features)

    return selected_features


In [12]:
def cv_scores(X, y, num_features, cat_features, num_imputing_algorithm=XGBRegressor(), cat_imputing_algorithm=XGBClassifier(), scaling_outlier= False, scaler=MinMaxScaler(), rfe = False):
    """
    Takes as argument the predictors and the target, the models used for imputing numerical and categorical 
    features, if any scaling and outlier removal should be performed,what scaling method should be used and if feature selection with rfe should be used.
    Then it returns the results obtained from the stratified cross-validation for the given models.
    """
    skf = StratifiedKFold(n_splits=5)

    # Generating the lists to store our results
    precision_scores_train = [[],[],[],[],[]]
    precision_scores_val = [[],[],[],[],[]]  
    recall_scores_train = [[],[],[],[],[]]
    recall_scores_val = [[],[],[],[],[]]
    f1_scores_train =  [[],[],[],[],[]]
    f1_scores_val =  [[],[],[],[],[]]

    precision_scores_train_mean = []
    precision_scores_val_mean = [] 
    recall_scores_train_mean = []
    recall_scores_val_mean = []
    f1_scores_train_mean =  []
    f1_scores_val_mean =  []
    
    for train_index, test_index in skf.split(X, y):
        # Dividing our data in validation and train
        X_train, X_val = X.iloc[train_index].copy(), X.iloc[test_index].copy()
        y_train, y_val = y.iloc[train_index].copy(), y.iloc[test_index].copy()

        # Filling missing values
        for column in num_features:
            X_train, X_val = impute_missing_values(X_train,X_val, column, num_imputing_algorithm)
        
        # Removing inconsistencies on the train
        inconsistent = X_train[(X_train['Age at Injury'] > 80) | (X_train["Age at Injury"] < 16)].index
        X_train = X_train.loc[~X_train.index.isin(inconsistent)]
        y_train = y_train.loc[~y_train.index.isin(inconsistent)]

        # Performing scaling and outlier treatment dependent on the boolean
        if scaling_outlier:
            for column in num_features:
                handle_outliers(X_train, column)
                X_train, X_val = impute_missing_values(X_train,X_val, column, num_imputing_algorithm,validation= False)

            for column in num_features:
                scale_numerical(column, X_train, X_val, scaler)
                
        # Creating an ordinal variable
        claim_carrier_categories(X_train, X_val)

        #Filling missing values in the ordinal variable that might appear on X_val
        X_val, X_train = impute_missing_values(X_val,X_train, "Carrier Claim Category", cat_imputing_algorithm, validation=False)

        # Categorical Prop Encoding
        for cat_feature in cat_features:
            categorical_prop_encode(X_train, X_val, cat_feature)
            if scaling_outlier:
                scale_numerical("Carrier Claim Category", X_train, X_val, scaler)

        # Selecting features with Rfe
        if rfe:
            Selected_features = Rfe(XGBClassifier(), X_train, y_train, X_val, y_val)
            X_train = X_train[Selected_features]
            X_val = X_val[Selected_features]

        # Training the classification models
        DT.fit(X_train, y_train)
        print("Done DT")
        RF.fit(X_train, y_train)
        print("Done RF")
        XGB.fit(X_train, y_train)
        print("Done XGB")
        KNN.fit(X_train, y_train)
        print("Done KNN")
        MLP.fit(X_train, y_train)
        print("Done MLP")

        # Making the predictions for the training and validation data
        pred_train_DT = DT.predict(X_train)
        pred_train_RF = RF.predict(X_train)
        pred_train_XGB = XGB.predict(X_train)
        pred_train_KNN = KNN.predict(X_train)
        pred_train_MLP = MLP.predict(X_train)
        print("Done training predictions")
        
        pred_val_DT = DT.predict(X_val)
        pred_val_RF = RF.predict(X_val)
        pred_val_XGB = XGB.predict(X_val)
        pred_val_KNN = KNN.predict(X_val)
        pred_val_MLP = MLP.predict(X_val)
        print("Done validation predictions")

        # Calculating and storing the scores
        i = 0
        for predictions in [pred_train_DT,pred_train_RF,pred_train_XGB,pred_train_KNN,pred_train_MLP]:
            precision_scores_train[i].append(precision_score(y_train, predictions, average='macro'))
            recall_scores_train[i].append(recall_score(y_train, predictions, average='macro'))
            f1_scores_train[i].append(f1_score(y_train, predictions, average='macro'))
            i+=1
        j=0
        for predictions in [pred_val_DT,pred_val_RF,pred_val_XGB,pred_val_KNN,pred_val_MLP]:
            precision_scores_val[j].append(precision_score(y_val, predictions, average='macro'))
            recall_scores_val[j].append(recall_score(y_val, predictions, average='macro'))
            f1_scores_val[j].append(f1_score(y_val, predictions, average='macro'))
            j+=1

        # Check the confusion matrixes of our predictions
        print(confusion_matrix(y_val, pred_val_DT))
        print(confusion_matrix(y_val, pred_val_RF))
        print(confusion_matrix(y_val, pred_val_XGB))
        print(confusion_matrix(y_val, pred_val_KNN))
        print(confusion_matrix(y_val, pred_val_MLP))

    # Aggregating the average results across the folds
    for l in range(0,5): 
        precision_scores_train_mean.append(mean(precision_scores_train[l]))
        precision_scores_val_mean.append(mean(precision_scores_val[l]))
        recall_scores_train_mean.append(mean(recall_scores_train[l]))
        recall_scores_val_mean.append(mean(recall_scores_val[l]))
        f1_scores_train_mean.append(mean(f1_scores_train[l]))
        f1_scores_val_mean.append(mean(f1_scores_val[l]))

    # Storing the results in a dataframe
    model_results = pd.DataFrame(data={
        'Train_precision': precision_scores_train_mean,
        'Test_precision': precision_scores_val_mean,
        'Train_recall': recall_scores_train_mean,
        'Test_recall': recall_scores_val_mean,
        'Train_f1_score': f1_scores_train_mean,
        'Test_f1_score': f1_scores_val_mean,
    }, index=["Decision Tree","Random Forest","XGBoost", "KNearestNeighbors","Multi Layer Perceptron"])

    return model_results

In [13]:
def test_prediction(model, X, y, num_features, cat_features, data_test, 
                    num_imputing_algorithm=XGBRegressor(), 
                    cat_imputing_algorithm=XGBClassifier(), scaling_outlier = False , 
                    scaler=MinMaxScaler()):

    # Impute missing values
    for column in num_features:
        X, data_test = impute_missing_values(X,data_test,column, num_imputing_algorithm)

    # Remove inconsistencies
    inconsistent = X[(X['Age at Injury'] > 80) | (X["Age at Injury"] < 16)].index
    X.drop(inconsistent, inplace=True)
    y.drop(inconsistent, inplace=True)

    # Scale and remove outliers if specified
    if scaling_outlier:
        for column in num_features:
            handle_outliers(X, column)
            X, data_test = impute_missing_values(X,data_test, column, num_imputing_algorithm,validation = False)

        for column in num_features:
            scale_numerical(column, X, data_test, scaler)
        
    # Creating an ordinal variable
    claim_carrier_categories(X, data_test)
    
    # Inputing missing values that might appear on data_test in the new variable
    data_test, X = impute_missing_values(data_test, X, "Carrier Claim Category", cat_imputing_algorithm,validation= False)

    # Categorical Prop Encoding
    for cat_feature in cat_features:
        categorical_prop_encode(X, data_test, cat_feature)
        if scaling_outlier:
            scale_numerical("Carrier Claim Category", X, data_test, scaler)

    # Fitting the model, making the predictions and reverting the claim injury types back to their string form
    model.fit(X, y)
    pred_test = model.predict(data_test)
    pred_test = le.inverse_transform(pred_test)

    # Saving the final submission dataframe with indexes of data_test
    submission_df = pd.DataFrame({
        "Claim Injury Type": pred_test
    }, index=data_test.index)
    
    return submission_df

In [14]:
#Label enconding our target variable 
data["Claim Injury Type"] = le.fit_transform(data["Claim Injury Type"])

In [15]:
'''Dropping redundant variables that carry almost the same information (are extremely correlated (|0.8|))
We believe it was better to keep Age at Injury than birth year since it should be more related to the injury claim type (it will be tested later)
The same logic was applied to dropping the other two dates and two DSA variables since we believe Accident date to be more important'''

data = data.loc[:, ~data.columns.isin(['Birth Year', 'Assembly Date', 'C-2 Date', 'Assembly Date DSA', 'First Hearing Date DSA'])]
data_test = data_test.loc[:, ~data_test.columns.isin(['Birth Year', 'Assembly Date', 'C-2 Date', 'Assembly Date DSA', 'First Hearing Date DSA'])]


In [16]:
'''Since the codes always seem to provide the same or more information than the descriptions (have more categories),
and the codes are consistent (always only having 1 description for code, while descriptions may have multiple codes)
And Crámer's V says they have a very high association
we will drop the description columns.'''
data.drop(['Industry Code Description','WCIO Cause of Injury Description','WCIO Nature of Injury Description','WCIO Part Of Body Description'], axis = 1,inplace = True)
data_test.drop(['Industry Code Description','WCIO Cause of Injury Description','WCIO Nature of Injury Description','WCIO Part Of Body Description'], axis = 1,inplace = True)


In [17]:
'''Dropping redundant variables that carry almost the same information (have an association above or equal to 0.8)
We chose to keep County of Injury above Zip Code and District Name (these 3 have a high association) because it is the easist to interpret and the one we looked more in detail in the exploratory analysis
We kept the new variable we made called body section because it keeps most of the same information of the body part code but with a much lower cardinality
Lastly we only remove the variable Carrier Name in the function where we create the new variable with lower cardinality because it is need to create that new variable'''
#"WCIO Part Of Body Code","District Name"
data.drop(['Zip Code',"WCIO Part Of Body Code","District Name"], axis=1, inplace = True)
data_test.drop(["Zip Code","WCIO Part Of Body Code","District Name"], axis=1 , inplace = True)

In [18]:
num_features = ['Age at Injury', 'Average Weekly Wage', 'IME-4 Count', 'Number of Dependents',
                "Accident Year","Accident Month","Accident Day","Accident DayOfWeek",
                "C-2 Date DSA","C-3 Date DSA" ,"Accident Year","Accident Month",
                "Accident Day","Accident DayOfWeek","Accident Date","C-3 Date","First Hearing Date"]

In [19]:
cat_features = [
    "Alternative Dispute Resolution",
    "Carrier Type",
    "County of Injury",
    "Gender",
    "Industry Code",
    "Medical Fee Region",
    "WCIO Cause of Injury Code",
    "WCIO Nature of Injury Code",
    "Age at Injury Category",
    "Carrier Claim Category",
    "Body Section",
]

In [20]:
#We remove the missing values in this variable since they are only 5 rows and would never make a big difference in the 600 000+ rows of the dataset
data.dropna(axis= 0, inplace = True, subset="Alternative Dispute Resolution")

In [21]:
X = data.drop(["Claim Injury Type","Agreement Reached"], axis = 1)

In [22]:
y = data["Claim Injury Type"]

In [23]:
y2 = data["Agreement Reached"]

In [24]:
results = cv_scores(X, y,num_features,cat_features,scaling_outlier = True)

no missing values to input on Number of Dependents
no missing values to input on Accident Year
no missing values to input on Accident Month
no missing values to input on Accident Day
no missing values to input on Accident DayOfWeek
no missing values to input on Age at Injury
no missing values to input on Number of Dependents
no missing values to input on Accident Month
no missing values to input on Accident DayOfWeek
no missing values to input on Accident Year
no missing values to input on Accident Month
no missing values to input on Accident Day
no missing values to input on Accident DayOfWeek
Done DT


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   48.1s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 1.05579236
Iteration 2, loss = 1.00605287
Iteration 3, loss = 1.00593230
Iteration 4, loss = 1.00586784
Iteration 5, loss = 1.00582081
Iteration 6, loss = 1.00578257
Iteration 7, loss = 1.00574961
Iteration 8, loss = 1.00572040
Iteration 9, loss = 1.00569374
Iteration 10, loss = 1.00566915
Iteration 11, loss = 1.00564611
Iteration 12, loss = 1.00562450
Iteration 13, loss = 1.00560400
Iteration 14, loss = 1.00558443
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    1.7s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    5.5s finished


Done training predictions


[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.4s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[  743  1112   155   286   136    41     4    18]
 [ 3987 32427  6772 11768  2935   175    14   137]
 [  573  5564  1723  3545  1681   532    51   113]
 [  953  8693  3155  8014  4571  3369   464   482]
 [  218  1256   881  1600  3753  1718   194    36]
 [   23   206   127   190    85   156    33    22]
 [    0     7     3     4     2     0     3     1]
 [    5    32     5    24     7     2     0    19]]
[[  950  1285     3   143   106     8     0     0]
 [  517 54183    24  2393  1037    55     0     6]
 [   45  9012   100  2641  1762   219     0     3]
 [   22 11551    87  9163  6746  2109     0    23]
 [    6   804    12  1140  7337   356     0     1]
 [    0    73     7   364   222   176     0     0]
 [    0     4     0    13     0     3     0     0]
 [    0    19     1    66     6     0     0     2]]
[[ 1077  1170     3    98   122    24     0     1]
 [  647 52731    32  2278  2199   282     2    44]
 [   76  8746    89  2389  1985   449    13    35]
 [   66 10387   152  8799  73

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   30.1s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 1.06170315
Iteration 2, loss = 1.00754073
Iteration 3, loss = 1.00688177
Iteration 4, loss = 1.00665206
Iteration 5, loss = 1.00653312
Iteration 6, loss = 1.00645682
Iteration 7, loss = 1.00640121
Iteration 8, loss = 1.00635704
Iteration 9, loss = 1.00631989
Iteration 10, loss = 1.00628745
Iteration 11, loss = 1.00625838
Iteration 12, loss = 1.00623177
Iteration 13, loss = 1.00620713
Iteration 14, loss = 1.00618412
Iteration 15, loss = 1.00616232
Iteration 16, loss = 1.00614160
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    1.1s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    3.6s finished


Done training predictions


[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.7s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[  901  1028   224   237    86    16     0     3]
 [ 1815 38130  8550  7949  1625    92    11    43]
 [  289  5573  2443  3363  1691   369     9    44]
 [  695  6199  5837  9099  5530  2111    54   176]
 [  130   752  1931  2313  3526   946    26    32]
 [   50   118   212   307   108    31     0    17]
 [    3     1     5     9     1     0     0     0]
 [    2    23    23    38     4     0     0     4]]
[[ 1072  1107    45   183    86     2     0     0]
 [  418 52737   986  3007  1059     8     0     0]
 [   13  7370   577  3881  1893    47     0     0]
 [   13  6569  1308 15811  5636   363     0     1]
 [    4   399   360  3747  5088    58     0     0]
 [    0    32    51   696    64     0     0     0]
 [    0     2     1    15     1     0     0     0]
 [    0    22     4    68     0     0     0     0]]
[[ 1180   983   117    95   101    14     0     5]
 [  504 47741  5871  2484  1467    91     0    57]
 [   32  7331   768  2897  2424   309     0    20]
 [   54  6269  5035  7745  81

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   30.7s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 1.06629905
Iteration 2, loss = 1.01283265
Iteration 3, loss = 1.01236282
Iteration 4, loss = 1.01214060
Iteration 5, loss = 1.01200066
Iteration 6, loss = 1.01189975
Iteration 7, loss = 1.01182110
Iteration 8, loss = 1.01175681
Iteration 9, loss = 1.01170241
Iteration 10, loss = 1.01165519
Iteration 11, loss = 1.01161352
Iteration 12, loss = 1.01157609
Iteration 13, loss = 1.01154204
Iteration 14, loss = 1.01151080
Iteration 15, loss = 1.01148186
Iteration 16, loss = 1.01145484
Iteration 17, loss = 1.01142946
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.9s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    3.1s finished


Done training predictions


[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.7s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[  992   880   272   201   122    24     1     4]
 [ 1999 36509 10577  6885  2066   157     4    18]
 [  335  5568  2306  2833  2311   387    13    28]
 [  500  5861  6411  6187  7466  2989    57   230]
 [  128  1071  3003  2875  2003   559     7    10]
 [   36   103   141   424   119    16     0     3]
 [    1     4     5     6     3     0     0     0]
 [    2    21    27    34     5     2     0     3]]
[[ 1091  1064    91   105   139     6     0     0]
 [  391 51000  2899  2463  1442    20     0     0]
 [   11  7666   633  3012  2382    77     0     0]
 [   23  7123  3709 10161  7803   881     0     1]
 [    5   506  1356  4061  3689    39     0     0]
 [    0    21     8   755    58     0     0     0]
 [    0     1     1    14     3     0     0     0]
 [    0    13     6    73     2     0     0     0]]
[[ 1131   974   183    68   130     8     0     2]
 [  426 43109  9908  2784  1941    36     0    11]
 [   27  7249  1123  2363  2862   148     0     9]
 [   88  6300  7692  5105  92

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   31.7s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 1.06069674
Iteration 2, loss = 1.00504154
Iteration 3, loss = 1.00480554
Iteration 4, loss = 1.00468897
Iteration 5, loss = 1.00461212
Iteration 6, loss = 1.00455479
Iteration 7, loss = 1.00450846
Iteration 8, loss = 1.00446930
Iteration 9, loss = 1.00443513
Iteration 10, loss = 1.00440443
Iteration 11, loss = 1.00437653
Iteration 12, loss = 1.00435088
Iteration 13, loss = 1.00432700
Iteration 14, loss = 1.00430459
Iteration 15, loss = 1.00428340
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.9s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    3.3s finished


Done training predictions


[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.7s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[  966   966   224   184   128    27     0     1]
 [ 1559 37849  8850  6319  3473   130     0    34]
 [  242  5112  2130  2949  2820   481     0    47]
 [  584  5078  5947  7396  7936  2584    11   166]
 [   98  1046  3124  3058  1877   427     8    18]
 [    5   120   225   381    93    12     0     6]
 [    1     2     3     9     4     0     0     0]
 [    0    28    21    33     5     4     0     3]]
[[ 1186   997    78    96   137     2     0     0]
 [  363 52887  1365  1819  1774     6     0     0]
 [   10  7364   538  2656  3194    19     0     0]
 [   14  7161  3480  9295  9555   197     0     0]
 [    4   571  1846  4960  2265    10     0     0]
 [    0    18    22   740    62     0     0     0]
 [    0     0     0    17     2     0     0     0]
 [    0    22     8    62     2     0     0     0]]
[[ 1297   846   140    71   128    11     0     3]
 [  458 46728  6635  1305  3025    42     0    21]
 [   20  6962   876  2083  3671   153     0    16]
 [   34  5947  6645  5439 105

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   33.1s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 1.06043166
Iteration 2, loss = 1.00364842
Iteration 3, loss = 1.00357320
Iteration 4, loss = 1.00351931
Iteration 5, loss = 1.00347541
Iteration 6, loss = 1.00343761
Iteration 7, loss = 1.00340389
Iteration 8, loss = 1.00337331
Iteration 9, loss = 1.00334511
Iteration 10, loss = 1.00331882
Iteration 11, loss = 1.00329418
Iteration 12, loss = 1.00327082
Iteration 13, loss = 1.00324864
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    1.2s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    4.1s finished


Done training predictions


[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.4s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[  918   632   745   177    21     0     0     2]
 [  652 11615 41974  3696   263     7     0     7]
 [   70  1987 10202  1363   150     6     0     3]
 [  152  4097 20973  3970   484    19     0     7]
 [   80  1358  4517  3081   601    17     0     2]
 [    8    59   318   381    71     3     0     2]
 [    0     0     8     9     3     0     0     0]
 [    3    11    31    43     5     0     0     1]]
[[ 1087   571   759    71     7     0     0     0]
 [  152  7572 50024   413    53     0     0     0]
 [    3   444 13076   233    25     0     0     0]
 [    4  1185 27245  1242    26     0     0     0]
 [    1   392  5222  3588   453     0     0     0]
 [    0    11   219   584    28     0     0     0]
 [    0     1     3    14     2     0     0     0]
 [    0    10    35    49     0     0     0     0]]
[[ 1087   429   932    37    10     0     0     0]
 [  218 11613 46052   264    66     0     0     1]
 [    2   836 12721   171    51     0     0     0]
 [    8  1416 27235   945    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [48]:
#Check the results
results

Unnamed: 0,Train_precision,Test_precision,Train_recall,Test_recall,Train_f1_score,Test_f1_score
Decision Tree,0.99995,0.213073,0.99999,0.224442,0.99997,0.196063
Random Forest,0.999973,0.316299,0.999958,0.267772,0.999966,0.252458
XGBoost,0.795255,0.291958,0.580957,0.259127,0.623074,0.239476
KNearestNeighbors,0.381528,0.311876,0.302539,0.268804,0.314318,0.272512
Multi Layer Perceptron,0.28677,0.24304,0.203047,0.202977,0.188041,0.188804


In [179]:
# Choose best model from KFold of cross validation
# Train model on whole train and predict test data
submission = test_prediction(XGBClassifier(),X,y,num_features,cat_features,data_test)
submission

no missing values to input on Number of Dependents
no missing values to input on Accident Year
no missing values to input on Accident Month
no missing values to input on Accident Day
no missing values to input on Accident DayOfWeek
no missing values to input on Alternative Dispute Resolution




Unnamed: 0_level_0,Claim Injury Type
Claim Identifier,Unnamed: 1_level_1
6165911,2. NON-COMP
6166141,2. NON-COMP
6165907,2. NON-COMP
6166047,2. NON-COMP
6166102,2. NON-COMP
...,...
6553137,3. MED ONLY
6553119,4. TEMPORARY
6553542,4. TEMPORARY
6553455,4. TEMPORARY


In [181]:
#export to csv
submission.to_csv("Submission.csv")