# Model Training and Assessment

In [1]:
import pandas as pd
import numpy as np

#model imports
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier,XGBRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

#metrics
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from statistics import mean

#useful functions
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE


In [2]:
#Define an instance of the models without hyperparameters

DT = DecisionTreeClassifier()
RF = RandomForestClassifier(verbose = 1, n_jobs=-1)
XGB = XGBClassifier()
KNN = KNeighborsClassifier(n_neighbors = 50) # n_neighbors through trial and error
MLP = MLPClassifier(activation='relu',
    solver='sgd',learning_rate='invscaling',
    learning_rate_init=0.001,
    batch_size=100,verbose = True) # Based on ML Practical Parameters


#Creating an instance of our encoder for the target
le = LabelEncoder()

In [3]:
# Load data
data= pd.read_csv("../../data/train_data_enriched.csv", index_col="Claim Identifier")
data_test = pd.read_csv("../../data/test_data_enriched.csv",index_col="Claim Identifier")

  data= pd.read_csv("../../data/train_data_enriched.csv", index_col="Claim Identifier")


## Helper functions

In [4]:
def impute_missing_values(X_train, X_val, target_column, algorithm,validation = True):

    # Separating the missing values from the non missing values
    available_data = X_train[X_train[target_column].notna()]
    missing_X_train = X_train[X_train[target_column].isna()]
    missing_X_val = X_val[X_val[target_column].isna()]

    # Making sure if there is enough data for inputing, returning it if not
    if len(missing_X_train) == 0:
        print(f"no missing values to input on {target_column}")
        return X_train, X_val

    # Separating the target column from the rest
    X_available = available_data.drop(columns=[target_column])
    y_available = available_data[target_column]

    # Making sure our columns are consistent
    X_available = X_available.select_dtypes(include=["number"])
    missing_X_train = missing_X_train.select_dtypes(include=["number"])
    missing_X_val = missing_X_val.select_dtypes(include=["number"])

    common_columns = X_available.columns.intersection(missing_X_train.columns).intersection(missing_X_val.columns)
    X_available = X_available[common_columns]
    missing_X_train = missing_X_train[common_columns]
    missing_X_val = missing_X_val[common_columns]

    # Making sure there is any column after keeping the common columns
    if X_available.shape[1] == 0:
        print(f"Without any column to input in {target_column}")
        return X_train, X_val

    # Training the model with the available data
    model = algorithm
    model.fit(X_available, y_available)

    # Prediting the missing values
    predicted_train = model.predict(missing_X_train)
    if validation:
        predicted_val = model.predict(missing_X_val)

    # Filling the training and validation with predictions. The latter is only filled if the argument is true
    X_train = X_train.copy()
    X_train.loc[X_train[target_column].isna(), target_column] = predicted_train

    if validation:
        X_val = X_val.copy()
        X_val.loc[X_val[target_column].isna(), target_column] = predicted_val

    return X_train, X_val

In [5]:
#This helper fuction was used for finding problems with later functions
def check_missing_values(data):
    print(data.isnull().sum()[data.isnull().sum() > 0])

In [6]:
def handle_outliers(data, column):
    # Handles outliers in a numerical column by replacing values outside the interquartile range (IQR) with missing values

    # Makes sure we only treat outliers in columns that have any data
    if data[column].notnull().sum() > 0: 

        # Calculating inter quantile range limits
        Q1 = data[column].quantile(0.25)
        Q3 = data[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Changing the ouliers to missing values
        data[column] = np.where(data[column] < lower_bound, np.nan, data[column])
        data[column] = np.where(data[column] > upper_bound, np.nan, data[column])

    return data

In [7]:
def scale_numerical(column, X_train, X_val, scaler):
    # Scales the given numerical value based on the training data with the given scaler
    
    # Make sure the column is numerical
    if not pd.api.types.is_numeric_dtype(X_train[column]):
        print(f"Columm '{column}' is not numerical and will be ignored")
        return

    # Scaling the data
    try:
        X_train[column] = scaler.fit_transform(X_train[[column]])
        X_val[column] = scaler.transform(X_val[[column]])
    except ValueError as e:
        print(f"Mistake scaling the column '{column}': {e}")

In [8]:
def  claim_carrier_categories(X_train, X_val):
    if 'Carrier Claim Category' not in X_train.columns:
        # Function to categorize each carrier based on its claim count for dimensionality reduction
        
        count = X_train['Carrier Name'].value_counts() # Count individual Carriers' counts only on train data
        def categorize_claims(count): # Map carrier size based on fixed thresholds decided by us
            if count >= 40000:
                return 2
            elif 4000 <= count < 40000:
                return 1
            else:
                return 0
    
        # Apply the categorization to create a mapping dictionary
        carrier_category_map = count.apply(categorize_claims)
    
        # Map the `Carrier Name` to the new `Carrier Claim Category`
        X_train['Carrier Claim Category'] = X_train['Carrier Name'].map(carrier_category_map)
        X_val['Carrier Claim Category'] = X_val['Carrier Name'].map(carrier_category_map)

        # If there is a missing value on x_val, it means that that carrier name didn't exist on x_train and is unlikely to have many claim counts
        # X_train cannot have NaN in this new feature as Carrier Name has no Missing values
        X_val['Carrier Claim Category'].fillna(0, inplace = True)
    
        return X_train.drop(["Carrier Name"], axis = 1, inplace = True) , X_val.drop(["Carrier Name"], axis = 1, inplace = True)


In [9]:
def categorical_prop_encode(X_train, X_val, feature):
    # Categorical encoder function for individual feature
    proportion = X_train[feature].value_counts(normalize = True)  # Get the porportion of each category
    X_train[feature] = X_train[feature].map(proportion)  # Map the porportions in the column
    X_val[feature] = X_val[feature].map(proportion) # Do the same for the validation subset
    X_val[feature] = X_val[feature].fillna(0)  # Handle categories in X_val not seen in X_train with 0


In [10]:
def Rfe(algorithm,X_train,y_train,X_val,y_val):
    # Function to run RFE on specified algorithm with train/val split for it to be run within every fold to avoid data leakage
    
    #Generating the variables where we will store our results
    nof_list = np.arange(1, len(X_train.columns) + 1)            
    high_score = 0
    opt_n_features = 0
    train_score_list = []
    val_score_list = []

    #Variable where we will store the optimum amount of features
    best_rfe = None

    model = algorithm

    for n in nof_list:
        rfe = RFE(estimator=model, n_features_to_select=n)
    
    # Fitting the model to rfe
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_val_rfe = rfe.transform(X_val)
    
    # Training and predicting
        model.fit(X_train_rfe, y_train)
        pred_train = model.predict(X_train_rfe)
        pred_val = model.predict(X_val_rfe)
    
    # Evaluating using the macro f1_score
        train_score = f1_score(y_train, pred_train, average="macro")
        val_score = f1_score(y_val, pred_val, average="macro")
        train_score_list.append(train_score)
        val_score_list.append(val_score)
    
    # Checking if this is the best combination of features so far
        if val_score >= high_score:
            high_score = val_score
            opt_n_features = n
            best_rfe = rfe  # Storing the rfe with the best number of features

# Checking what amount of features and which features where the best for the model
    selected_features = X_train.columns[best_rfe.support_].tolist()

    print("Optimal number of features: %d" % opt_n_features)
    print("Score with %d features: %f" % (opt_n_features, high_score))
    print("Selected Features:\n", selected_features)

    return selected_features


In [11]:
def cv_scores(X, y, num_features, cat_features, num_imputing_algorithm=XGBRegressor(), cat_imputing_algorithm=XGBClassifier(), scaling_outlier= True, scaler=MinMaxScaler(), rfe = False):
    """
    Performs stratified cross-validation on a dataset to evaluate multiple classification models, while handling 
    preprocessing steps like missing value imputation, scaling, outlier removal, feature engineering, and optional 
    feature selection using Recursive Feature Elimination (RFE).

    Parameters:
    ----------
    X : Entire data without target variable

    y : Entire data with only target variable

    num_features : List of numerical features within X

    cat_features : List of categorical features within X (non-binary)

    num_imputing_algorithm : Algorithm to be used for imputing numerical features' missing values. Defaults to XGBRegressor.

    cat_imputing_algorithm : Algorithm to be used for imputing categorical features' missing values. Defaults to XGBClassifier.

    scaling_outlier : Boolean indicating if scaling and outlier handling should be applied. Defaults to True.

    scaler : Scaling algorithm to be used for scaling. Defaults to True.

    rfe : Boolean indicating if RFE should be used within each fold to determine current most valuable features to use. Defaults to False.

    """
    skf = StratifiedKFold(n_splits=5)

    # Generating the lists to store our results
    precision_scores_train = [[],[],[],[],[]]
    precision_scores_val = [[],[],[],[],[]]  
    recall_scores_train = [[],[],[],[],[]]
    recall_scores_val = [[],[],[],[],[]]
    f1_scores_train =  [[],[],[],[],[]]
    f1_scores_val =  [[],[],[],[],[]]

    precision_scores_train_mean = []
    precision_scores_val_mean = [] 
    recall_scores_train_mean = []
    recall_scores_val_mean = []
    f1_scores_train_mean =  []
    f1_scores_val_mean =  []
    
    for train_index, test_index in skf.split(X, y):
        # Dividing our data in validation and train
        X_train, X_val = X.iloc[train_index].copy(), X.iloc[test_index].copy()
        y_train, y_val = y.iloc[train_index].copy(), y.iloc[test_index].copy()

        # Filling missing values
        for column in num_features:
            X_train, X_val = impute_missing_values(X_train,X_val, column, num_imputing_algorithm)
        
        # Removing inconsistencies on the train
        inconsistent = X_train[(X_train['Age at Injury'] > 80) | (X_train["Age at Injury"] < 16)].index
        X_train = X_train.loc[~X_train.index.isin(inconsistent)]
        y_train = y_train.loc[~y_train.index.isin(inconsistent)]

        # Performing scaling and outlier treatment dependent on the boolean
        if scaling_outlier:
            for column in num_features:
                handle_outliers(X_train, column)
                X_train, X_val = impute_missing_values(X_train,X_val, column, num_imputing_algorithm,validation= False)

            for column in num_features:
                scale_numerical(column, X_train, X_val, scaler)
                
        # Creating an ordinal variable
        claim_carrier_categories(X_train, X_val)

        #Filling missing values in the ordinal variable that might appear on X_val
        X_val, X_train = impute_missing_values(X_val,X_train, "Carrier Claim Category", cat_imputing_algorithm, validation=False)

        # Scaling special Carrier Claim Category feature
        if scaling_outlier:
            scale_numerical("Carrier Claim Category", X_train, X_val, scaler)

        # Categorical Prop Encoding
        for cat_feature in cat_features:
            categorical_prop_encode(X_train, X_val, cat_feature)
            if scaling_outlier:
                scale_numerical("Carrier Claim Category", X_train, X_val, scaler)

        # Selecting features with Rfe
        if rfe:
            Selected_features = Rfe(XGBClassifier(), X_train, y_train, X_val, y_val)
            X_train = X_train[Selected_features]
            X_val = X_val[Selected_features]

        
        # Training the classification models
        DT.fit(X_train, y_train)
        print("Done DT")
        RF.fit(X_train, y_train)
        print("Done RF")
        XGB.fit(X_train, y_train)
        print("Done XGB")
        KNN.fit(X_train, y_train)
        print("Done KNN")
        MLP.fit(X_train, y_train)
        print("Done MLP")

        # Making the predictions for the training and validation data
        pred_train_DT = DT.predict(X_train)
        pred_train_RF = RF.predict(X_train)
        pred_train_XGB = XGB.predict(X_train)
        pred_train_KNN = KNN.predict(X_train)
        pred_train_MLP = MLP.predict(X_train)
        print("Done training predictions")
        
        pred_val_DT = DT.predict(X_val)
        pred_val_RF = RF.predict(X_val)
        pred_val_XGB = XGB.predict(X_val)
        pred_val_KNN = KNN.predict(X_val)
        pred_val_MLP = MLP.predict(X_val)
        print("Done validation predictions")

        # Calculating and storing the scores
        i = 0
        for predictions in [pred_train_DT,pred_train_RF,pred_train_XGB,pred_train_KNN,pred_train_MLP]:
            precision_scores_train[i].append(precision_score(y_train, predictions, average='macro'))
            recall_scores_train[i].append(recall_score(y_train, predictions, average='macro'))
            f1_scores_train[i].append(f1_score(y_train, predictions, average='macro'))
            i+=1
        j=0
        for predictions in [pred_val_DT,pred_val_RF,pred_val_XGB,pred_val_KNN,pred_val_MLP]:
            precision_scores_val[j].append(precision_score(y_val, predictions, average='macro'))
            recall_scores_val[j].append(recall_score(y_val, predictions, average='macro'))
            f1_scores_val[j].append(f1_score(y_val, predictions, average='macro'))
            j+=1

        # Check the confusion matrixes of our predictions
        print(confusion_matrix(y_val, pred_val_DT))
        print(confusion_matrix(y_val, pred_val_RF))
        print(confusion_matrix(y_val, pred_val_XGB))
        print(confusion_matrix(y_val, pred_val_KNN))
        print(confusion_matrix(y_val, pred_val_MLP))

    # Aggregating the average results across the folds
    for l in range(0,5): 
        precision_scores_train_mean.append(mean(precision_scores_train[l]))
        precision_scores_val_mean.append(mean(precision_scores_val[l]))
        recall_scores_train_mean.append(mean(recall_scores_train[l]))
        recall_scores_val_mean.append(mean(recall_scores_val[l]))
        f1_scores_train_mean.append(mean(f1_scores_train[l]))
        f1_scores_val_mean.append(mean(f1_scores_val[l]))

    # Storing the results in a dataframe
    model_results = pd.DataFrame(data={
        'Train_precision': precision_scores_train_mean,
        'Test_precision': precision_scores_val_mean,
        'Train_recall': recall_scores_train_mean,
        'Test_recall': recall_scores_val_mean,
        'Train_f1_score': f1_scores_train_mean,
        'Test_f1_score': f1_scores_val_mean,
    }, index=["Decision Tree","Random Forest","XGBoost", "KNearestNeighbors","Multi Layer Perceptron"])

    return model_results

In [12]:
def test_prediction(model, X_train, y_train, num_features, cat_features, X_test, 
                    num_imputing_algorithm=XGBRegressor(), 
                    cat_imputing_algorithm=XGBClassifier(), scaling_outlier = False , 
                    scaler=MinMaxScaler(), missing = True,
                    secondary_model = None, y_train_secondary=None, secondary_missing = True):

    """
    Performs stratified cross-validation on a dataset to evaluate multiple classification models, while handling 
    preprocessing steps like missing value imputation, scaling, outlier removal, feature engineering, and optional 
    feature selection using Recursive Feature Elimination (RFE).

    Parameters:
    ----------
    model: algorithm to use to predict primary target variable.
    
    X_train : Entire data without target variable.

    y_train : Entire data with only target variable.

    num_features : List of numerical features within X

    cat_features : List of categorical features within X (non-binary)

    num_imputing_algorithm : Algorithm to be used for imputing numerical features' missing values. Defaults to XGBRegressor.

    cat_imputing_algorithm : Algorithm to be used for imputing categorical features' missing values. Defaults to XGBClassifier.

    scaling_outlier : Boolean indicating if scaling and outlier handling should be applied. Defaults to True.

    scaler : Scaling algorithm to be used for scaling. Defaults to True.

    rfe : Boolean indicating if RFE should be used within each fold to determine current most valuable features to use. Defaults to False.

    secondary_model: algorithm to use to predict secondary target variable. Defaults to None if secondary should not be predicted.

    y_train_secondary: Entire data with only secondary target variable. Defaults to None if secondary should not be predicted.

    """
    
    # Impute missing values
    if missing:
        for column in num_features:
            X_train, X_test = impute_missing_values(X_train,X_test,column, num_imputing_algorithm)

    # Remove inconsistencies
    inconsistent = X_train[(X_train['Age at Injury'] > 80) | (X_train["Age at Injury"] < 16)].index
    X_train.drop(inconsistent, inplace=True)
    y_train.drop(inconsistent, inplace=True)
    if secondary_model is not None and y_train_secondary is not None:
        y_train_secondary.drop(inconsistent, inplace=True)

    # Scale and remove outliers if specified
    if scaling_outlier:
        for column in num_features:
            handle_outliers(X_train, column) # Handle outliers only for training partition
            if missing:
                X_train, X_test = impute_missing_values(X_train,X_test, column, num_imputing_algorithm,validation = False)

        for column in num_features:
            scale_numerical(column, X_train, X_test, scaler)
        
    # Creating an ordinal variable
    claim_carrier_categories(X_train, X_test)

    # Scaling special Carrier Claim Category feature
    if scaling_outlier:
        scale_numerical("Carrier Claim Category", X_train, X_test, scaler)

    # Categorical Prop Encoding
    for cat_feature in cat_features:
        categorical_prop_encode(X_train, X_test, cat_feature)
        if scaling_outlier:
            scale_numerical("Carrier Claim Category", X_train, X_test, scaler)

    # Predict secondary target variable if secondary model and variable is given
    if secondary_model is not None and y_train_secondary is not None:
        if secondary_missing: # Imputation of missing values for secondary model
            X_train_secondary = X_train.copy()
            X_test_secondary = X_test.copy()
            for column in num_features:
                X_train_secondary, X_test_secondary = impute_missing_values(X_train_secondary,X_test_secondary, column, num_imputing_algorithm)

        secondary_model.fit(X_train_secondary, y_train_secondary)
        pred_secondary_test = secondary_model.predict(X_test_secondary)
        X_test["Agreement Reached"] = pred_secondary_test
        X_train["Agreement Reached"] = y_train_secondary
        
    # Fitting the model, making the predictions and reverting the claim injury types back to their string form
    model.fit(X_train, y_train)
    pred_test = model.predict(X_test)
    pred_test = le.inverse_transform(pred_test)

    # Saving the final submission dataframe with indexes of X_test
    submission_df = pd.DataFrame({
        "Claim Injury Type": pred_test
    }, index=X_test.index)
    
    return submission_df

# Dataset preparation
Label encoding and dropping variables based on multivariate exploration.

In [13]:
#Label enconding our target variable 
data["Claim Injury Type"] = le.fit_transform(data["Claim Injury Type"])

In [14]:
'''Dropping redundant variables that carry almost the same information (are extremely correlated (|0.8|))
We believe it was better to keep Age at Injury than birth year since it should be more related to the injury claim type (it will be tested later)
The same logic was applied to dropping the other two dates and two DSA variables since we believe Accident date to be more important'''

data = data.loc[:, ~data.columns.isin(['Birth Year', 'Assembly Date', 'C-2 Date', 'Assembly Date DSA', 'First Hearing Date DSA'])]
data_test = data_test.loc[:, ~data_test.columns.isin(['Birth Year', 'Assembly Date', 'C-2 Date', 'Assembly Date DSA', 'First Hearing Date DSA'])]


In [15]:
'''Since the codes always seem to provide the same or more information than the descriptions (have more categories),
and the codes are consistent (always only having 1 description for code, while descriptions may have multiple codes)
And Crámer's V says they have a very high association
we will drop the description columns.'''
data.drop(['Industry Code Description','WCIO Cause of Injury Description','WCIO Nature of Injury Description','WCIO Part Of Body Description'], axis = 1,inplace = True)
data_test.drop(['Industry Code Description','WCIO Cause of Injury Description','WCIO Nature of Injury Description','WCIO Part Of Body Description'], axis = 1,inplace = True)


In [16]:
'''Dropping redundant variables that carry almost the same information (have an association above or equal to 0.8)
We chose to keep County of Injury above Zip Code and District Name (these 3 have a high association) because it is the easist to interpret and the one we looked more in detail in the exploratory analysis
We kept the new variable we made called body section because it keeps most of the same information of the body part code but with a much lower cardinality
Lastly we only remove the variable Carrier Name in the function where we create the new variable with lower cardinality because it is need to create that new variable'''
data.drop(['Zip Code',"WCIO Part Of Body Code","District Name"], axis=1, inplace = True)
data_test.drop(["Zip Code","WCIO Part Of Body Code","District Name"], axis=1 , inplace = True)

## Definition of used numerical and categorical features

In [17]:
num_features = ['Age at Injury', 'Average Weekly Wage', 'IME-4 Count', 'Number of Dependents',
                "Accident Year","Accident Month","Accident Day","Accident DayOfWeek",
                "C-2 Date DSA","C-3 Date DSA","Accident Date","C-3 Date","First Hearing Date"]

In [18]:
cat_features = [
    "Alternative Dispute Resolution",
    "Carrier Type",
    "County of Injury",
    "Gender",
    "Industry Code",
    "Medical Fee Region",
    "WCIO Cause of Injury Code",
    "WCIO Nature of Injury Code",
    "Age at Injury Category",
    "Body Section",
]

### Quality Checks
The only not included variables are the target variables and all the binary variables (non-numeric and non-categorical) as seen below. As binary variables are not scaled nor encoded they don't appear in the definitions above.

In [19]:
len(num_features) + len(cat_features)

23

In [20]:
len(data.columns)

33

In [21]:
# Combine num_features and cat_features into a set
defined_features = set(num_features) | set(cat_features)

# Get all column names in the DataFrame
all_columns = set(data.columns)

# Find columns that are not in the defined features
undefined_columns = all_columns - defined_features

print("Columns not in num_features or cat_features:", undefined_columns)
print("Count of difference:", len(undefined_columns))

Columns not in num_features or cat_features: {'First Hearing Date_missing', 'C-3 Date_missing', 'Carrier Name', 'COVID-19 Indicator', 'Assembly Date_missing', 'Agreement Reached', 'Attorney/Representative', 'Claim Injury Type', 'Accident Date_missing', 'C-2 Date_missing'}
Count of difference: 10


## Isolation of target variables

In [22]:
X = data.drop(["Claim Injury Type","Agreement Reached"], axis = 1)

In [23]:
y = data["Claim Injury Type"]

In [24]:
y2 = data["Agreement Reached"]

# Run CV Score Loop

In [30]:
results = cv_scores(X, y, num_features, cat_features)

no missing values to input on Number of Dependents
no missing values to input on Age at Injury
no missing values to input on Number of Dependents
no missing values to input on Accident Month
no missing values to input on Accident DayOfWeek
Done DT


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   31.8s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 1.05945966
Iteration 2, loss = 1.00383479
Iteration 3, loss = 1.00373393
Iteration 4, loss = 1.00367385
Iteration 5, loss = 1.00362802
Iteration 6, loss = 1.00358934
Iteration 7, loss = 1.00355530
Iteration 8, loss = 1.00352444
Iteration 9, loss = 1.00349600
Iteration 10, loss = 1.00346950
Iteration 11, loss = 1.00344459
Iteration 12, loss = 1.00342100
Iteration 13, loss = 1.00339851
Iteration 14, loss = 1.00337703
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.9s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    4.1s finished


Done training predictions


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.4s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[  743  1093   163   300   137    34    12    13]
 [ 3967 31700  6768 12646  2775   190    22   147]
 [  653  5334  1755  3884  1497   494    40   125]
 [ 1178  7955  3215  8850  4325  3297   365   516]
 [  322  1044   954  1720  3633  1791   153    39]
 [   34   162   124   245    86   143    25    23]
 [    0     5     1    10     2     0     1     1]
 [    5    26     8    27     7     2     4    15]]
[[  886  1338     4   148   109    10     0     0]
 [  448 54257    25  2372  1048    62     0     3]
 [   45  8971   100  2753  1690   222     0     1]
 [   26 11347    69  9501  6475  2260     0    23]
 [    7   746     8  1255  7190   449     0     1]
 [    0    56     4   375   202   205     0     0]
 [    0     1     0    13     1     5     0     0]
 [    0    26     0    59     5     2     0     2]]
[[ 1077  1170     3    98   122    24     0     1]
 [  647 52731    32  2278  2199   282     2    44]
 [   76  8746    89  2389  1985   449    13    35]
 [   66 10387   152  8799  73

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   31.6s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 1.05831738
Iteration 2, loss = 1.00064623
Iteration 3, loss = 1.00043468
Iteration 4, loss = 1.00032688
Iteration 5, loss = 1.00025335
Iteration 6, loss = 1.00019698
Iteration 7, loss = 1.00015084
Iteration 8, loss = 1.00011153
Iteration 9, loss = 1.00007682
Iteration 10, loss = 1.00004553
Iteration 11, loss = 1.00001692
Iteration 12, loss = 0.99999045
Iteration 13, loss = 0.99996566
Iteration 14, loss = 0.99994231
Iteration 15, loss = 0.99992013
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    1.1s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    3.9s finished


Done training predictions


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.8s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[  889  1010   216   270    94    12     0     4]
 [ 1802 38152  8524  7813  1764   101    11    48]
 [  272  5441  2499  3359  1731   410    10    59]
 [  591  5577  6693  8625  5584  2261    69   301]
 [  117   673  2136  2146  3521   984    34    45]
 [   20    84   339   219   132    23     0    26]
 [    1     2     5     8     2     0     0     1]
 [    3    27    24    29     5     0     0     6]]
[[ 1077  1097    50   179    90     2     0     0]
 [  439 52477  1214  3017  1052    16     0     0]
 [   13  7375   645  3689  1960    99     0     0]
 [   14  6680  1551 14611  5989   855     0     1]
 [    3   416   402  3569  5151   115     0     0]
 [    0    32    62   684    63     2     0     0]
 [    0     1     1    17     0     0     0     0]
 [    0    22     3    69     0     0     0     0]]
[[ 1242   925   121   106    84    13     0     4]
 [  567 47351  6128  2640  1390    89     0    50]
 [   36  7290   817  3033  2216   356     0    33]
 [   50  6154  5058  8213  73

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   35.7s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 1.06573210
Iteration 2, loss = 1.00717049
Iteration 3, loss = 1.00700699
Iteration 4, loss = 1.00691632
Iteration 5, loss = 1.00685183
Iteration 6, loss = 1.00680128
Iteration 7, loss = 1.00675908
Iteration 8, loss = 1.00672237
Iteration 9, loss = 1.00668958
Iteration 10, loss = 1.00665979
Iteration 11, loss = 1.00663237
Iteration 12, loss = 1.00660681
Iteration 13, loss = 1.00658287
Iteration 14, loss = 1.00656017
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.9s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    3.6s finished


Done training predictions


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.8s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[  957   903   287   201   121    21     1     5]
 [ 1738 36617 10519  7011  2152   156     3    19]
 [  298  5551  2301  2848  2390   357    11    25]
 [  635  5617  6416  6257  7432  3082    52   210]
 [  119  1024  3097  2729  2073   593    10    11]
 [   27    64   183   421   125    18     0     4]
 [    2     2     4     4     6     1     0     0]
 [    5    25    23    32     5     2     0     2]]
[[ 1078  1071    90   114   139     4     0     0]
 [  380 50961  2816  2533  1506    18     0     1]
 [   12  7647   653  2937  2471    61     0     0]
 [   20  7018  3904 10155  7791   811     0     2]
 [    4   539  1483  4075  3516    39     0     0]
 [    0    21    11   747    63     0     0     0]
 [    0     2     0    13     4     0     0     0]
 [    0    14     5    72     3     0     0     0]]
[[ 1153   945   190    60   138     7     0     3]
 [  436 42907 10106  2667  2034    38     0    27]
 [   35  7235  1151  2310  2899   140     0    11]
 [   74  6362  7753  4719  95

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   29.9s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 1.05629337
Iteration 2, loss = 1.00533859
Iteration 3, loss = 1.00478610
Iteration 4, loss = 1.00454685
Iteration 5, loss = 1.00440601
Iteration 6, loss = 1.00431067
Iteration 7, loss = 1.00424018
Iteration 8, loss = 1.00418497
Iteration 9, loss = 1.00413973
Iteration 10, loss = 1.00410122
Iteration 11, loss = 1.00406758
Iteration 12, loss = 1.00403758
Iteration 13, loss = 1.00401041
Iteration 14, loss = 1.00398549
Iteration 15, loss = 1.00396243
Iteration 16, loss = 1.00394087
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.8s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    3.4s finished


Done training predictions


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.7s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[  968   963   222   180   130    30     0     3]
 [ 1423 38188  8804  6512  3112   146     0    29]
 [  200  5179  2177  2978  2715   496     0    36]
 [  426  5192  5931  7483  7835  2669    12   154]
 [   76  1035  3113  3056  1923   439     5     9]
 [    4    99   209   402   110    14     0     4]
 [    1     3     4     5     5     0     0     1]
 [    1    27    26    27     3     5     0     5]]
[[ 1160  1019    76    98   141     2     0     0]
 [  355 52964  1298  1813  1777     7     0     0]
 [   12  7397   522  2619  3208    23     0     0]
 [   18  7220  3068  9735  9427   234     0     0]
 [    4   570  1780  5094  2201     7     0     0]
 [    0    17    21   747    57     0     0     0]
 [    0     0     0    17     2     0     0     0]
 [    0    13     8    70     3     0     0     0]]
[[ 1297   846   140    71   128    11     0     3]
 [  458 46728  6635  1305  3025    42     0    21]
 [   20  6962   876  2083  3671   153     0    16]
 [   34  5947  6645  5439 105

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   33.0s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 1.05566805
Iteration 2, loss = 1.00197468
Iteration 3, loss = 1.00161870
Iteration 4, loss = 1.00147727
Iteration 5, loss = 1.00139541
Iteration 6, loss = 1.00133789
Iteration 7, loss = 1.00129329
Iteration 8, loss = 1.00125606
Iteration 9, loss = 1.00122366
Iteration 10, loss = 1.00119429
Iteration 11, loss = 1.00116741
Iteration 12, loss = 1.00114242
Iteration 13, loss = 1.00111887
Iteration 14, loss = 1.00109661
Iteration 15, loss = 1.00107541
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    1.2s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    5.0s finished


Done training predictions


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.4s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[  891   642   721   187    50     0     0     4]
 [  666 12123 41119  3709   585     4     0     8]
 [   87  1979  9702  1360   643     4     0     6]
 [  213  3856 19171  3913  2524    18     0     7]
 [   67  1337  4129  2989  1123    10     0     1]
 [   17   109   313   325    74     4     0     0]
 [    1     0     7    10     2     0     0     0]
 [    2    13    29    42     6     0     0     2]]
[[ 1093   567   759    67     9     0     0     0]
 [  177  7758 49798   428    52     0     0     1]
 [    3   446 13067   238    27     0     0     0]
 [    8  1177 27217  1274    26     0     0     0]
 [    1   395  5206  3585   469     0     0     0]
 [    0    18   222   571    31     0     0     0]
 [    0     1     3    15     1     0     0     0]
 [    0    11    41    42     0     0     0     0]]
[[ 1087   429   932    37    10     0     0     0]
 [  218 11613 46052   264    66     0     0     1]
 [    2   836 12721   171    51     0     0     0]
 [    8  1416 27235   945    

In [31]:
#Check the results
results

Unnamed: 0,Train_precision,Test_precision,Train_recall,Test_recall,Train_f1_score,Test_f1_score
Decision Tree,0.99995,0.212526,0.99999,0.221899,0.99997,0.198285
Random Forest,0.999975,0.317133,0.999953,0.266778,0.999964,0.251668
XGBoost,0.796238,0.292125,0.580005,0.261095,0.622694,0.240626
KNearestNeighbors,0.382109,0.311089,0.302306,0.268896,0.314234,0.272701
Multi Layer Perceptron,0.288944,0.253547,0.207521,0.207361,0.196247,0.194124


In [28]:
results_secondary_target = cv_scores(X, y2, num_features, cat_features)

no missing values to input on Number of Dependents
no missing values to input on Age at Injury
no missing values to input on Number of Dependents
no missing values to input on Accident Month
no missing values to input on Accident Day
no missing values to input on Accident DayOfWeek
Done DT


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   26.9s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 0.15569410
Iteration 2, loss = 0.14132858
Iteration 3, loss = 0.14131259
Iteration 4, loss = 0.14130066
Iteration 5, loss = 0.14129066
Iteration 6, loss = 0.14128188
Iteration 7, loss = 0.14127397
Iteration 8, loss = 0.14126671
Iteration 9, loss = 0.14125996
Iteration 10, loss = 0.14125363
Iteration 11, loss = 0.14124765
Iteration 12, loss = 0.14124197
Iteration 13, loss = 0.14123655
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.8s finished


Done training predictions


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[54552 54895]
 [ 2066  3292]]
[[93677 15770]
 [ 2631  2727]]
[[77894 31553]
 [ 2154  3204]]
[[107491   1956]
 [  4704    654]]
[[109366     81]
 [  5259     99]]
no missing values to input on Number of Dependents
no missing values to input on Age at Injury
no missing values to input on Number of Dependents
no missing values to input on Accident Month
no missing values to input on Accident Day
no missing values to input on Accident DayOfWeek
Done DT


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   25.6s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 0.15622841
Iteration 2, loss = 0.14291577
Iteration 3, loss = 0.14290111
Iteration 4, loss = 0.14289014
Iteration 5, loss = 0.14288094
Iteration 6, loss = 0.14287287
Iteration 7, loss = 0.14286558
Iteration 8, loss = 0.14285889
Iteration 9, loss = 0.14285268
Iteration 10, loss = 0.14284684
Iteration 11, loss = 0.14284133
Iteration 12, loss = 0.14283609
Iteration 13, loss = 0.14283109
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.9s finished


Done training predictions


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.1s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[73964 35483]
 [ 3736  1621]]
[[97726 11721]
 [ 4331  1026]]
[[73199 36248]
 [ 4057  1300]]
[[109289    158]
 [  5275     82]]
[[109436     11]
 [  5330     27]]
no missing values to input on Number of Dependents
no missing values to input on Age at Injury
no missing values to input on Number of Dependents
no missing values to input on Accident Month
no missing values to input on Accident Day
no missing values to input on Accident DayOfWeek
Done DT


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   27.3s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 0.15784769
Iteration 2, loss = 0.14388127
Iteration 3, loss = 0.14386678
Iteration 4, loss = 0.14385595
Iteration 5, loss = 0.14384688
Iteration 6, loss = 0.14383891
Iteration 7, loss = 0.14383173
Iteration 8, loss = 0.14382513
Iteration 9, loss = 0.14381899
Iteration 10, loss = 0.14381324
Iteration 11, loss = 0.14380780
Iteration 12, loss = 0.14380264
Iteration 13, loss = 0.14379770
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    1.2s finished


Done training predictions


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.1s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[70546 38901]
 [ 4283  1074]]
[[95434 14013]
 [ 4747   610]]
[[60942 48505]
 [ 4669   688]]
[[109334    113]
 [  5257    100]]
[[109418     29]
 [  5272     85]]
no missing values to input on Number of Dependents
no missing values to input on Age at Injury
no missing values to input on Number of Dependents
no missing values to input on Accident Month
no missing values to input on Accident Day
no missing values to input on Accident DayOfWeek
Done DT


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   29.1s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 0.15926771
Iteration 2, loss = 0.14278852
Iteration 3, loss = 0.14277780
Iteration 4, loss = 0.14276966
Iteration 5, loss = 0.14276285
Iteration 6, loss = 0.14275686
Iteration 7, loss = 0.14275146
Iteration 8, loss = 0.14274650
Iteration 9, loss = 0.14274190
Iteration 10, loss = 0.14273758
Iteration 11, loss = 0.14273350
Iteration 12, loss = 0.14272962
Iteration 13, loss = 0.14272592
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    1.1s finished


Done training predictions


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.1s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[71475 37972]
 [ 4471   886]]
[[90220 19227]
 [ 4864   493]]
[[62847 46600]
 [ 4462   895]]
[[109095    352]
 [  5228    129]]
[[109420     27]
 [  5336     21]]
no missing values to input on Number of Dependents
no missing values to input on Age at Injury
no missing values to input on Number of Dependents
no missing values to input on Accident Month
no missing values to input on Accident Day
no missing values to input on Accident DayOfWeek
Done DT


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   28.0s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 0.15744894
Iteration 2, loss = 0.14112732
Iteration 3, loss = 0.14111371
Iteration 4, loss = 0.14110361
Iteration 5, loss = 0.14109517
Iteration 6, loss = 0.14108777
Iteration 7, loss = 0.14108110
Iteration 8, loss = 0.14107498
Iteration 9, loss = 0.14106931
Iteration 10, loss = 0.14106398
Iteration 11, loss = 0.14105896
Iteration 12, loss = 0.14105418
Iteration 13, loss = 0.14104963
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.8s finished


Done training predictions


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[109076    370]
 [  4881    477]]
[[109437      9]
 [  5094    264]]
[[109426     20]
 [  4994    364]]
[[109429     17]
 [  5211    147]]
[[109442      4]
 [  5315     43]]


In [29]:
#Check the results
results_secondary_target

Unnamed: 0,Train_precision,Test_precision,Train_recall,Test_recall,Train_f1_score,Test_f1_score
Decision Tree,1.0,0.546958,1.0,0.484038,1.0,0.438182
Random Forest,0.999995,0.60489,0.999906,0.540068,0.999951,0.514073
XGBoost,0.907491,0.584703,0.637352,0.471547,0.698709,0.439743
KNearestNeighbors,0.895985,0.700122,0.536993,0.518383,0.557173,0.520026
Multi Layer Perceptron,0.476674,0.812743,0.5,0.504994,0.488058,0.498157


# Run Test Predictions with best model

## 1. Main Target Variable Only

### Best model for Claim Injury Type target test prediction: **XGBoost**

#### V1: XGBoost with imputation of missing values

In [None]:
"""submission_without_missing = test_prediction(
    XGBClassifier(),
    X,y,
    num_features,cat_features,
    data_test,
    scaling_outlier= True)
submission_without_missing"""

In [None]:
#export to csv
"""submission_without_missing.to_csv("submission_without_missing.csv")"""

#### V2: XGBoost without imputation of missing values

In [27]:
# using xgboost´s way of dealing with missing values (better performance)
submission_with_missing = test_prediction(
    XGBClassifier(),
    X,y,
    num_features,cat_features,
    data_test, 
    missing= False)
submission_with_missing

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val['Carrier Claim Category'].fillna(0, inplace = True)


Unnamed: 0_level_0,Claim Injury Type
Claim Identifier,Unnamed: 1_level_1
6165911,2. NON-COMP
6166141,2. NON-COMP
6165907,2. NON-COMP
6166047,2. NON-COMP
6166102,2. NON-COMP
...,...
6553137,2. NON-COMP
6553119,1. CANCELLED
6553542,1. CANCELLED
6553455,2. NON-COMP


In [28]:
#export to csv
submission_with_missing.to_csv("../../data/submission_with_missing.csv")

#### RESULT KAGGLE: 0.44011

## 2. Secondary Target Variable AND Main Target Variable

### Best model prediction of secondary target var: KNN

In [32]:
# Reload data
data= pd.read_csv("../../data/train_data_enriched.csv", index_col="Claim Identifier")
data_test = pd.read_csv("../../data/test_data_enriched.csv",index_col="Claim Identifier")
X = data.drop(["Claim Injury Type","Agreement Reached"], axis = 1)
y = data["Claim Injury Type"]
y2 = data["Agreement Reached"]

  data= pd.read_csv("../../data/train_data_enriched.csv", index_col="Claim Identifier")


In [25]:
submission_with_missing_with_v2 = test_prediction(
    XGBClassifier(), # y classifier
    X, y,# Dataset without target, target var 1
    num_features,
    cat_features,
    data_test,
    missing= False,
    secondary_model = KNeighborsClassifier(n_neighbors = 50), # y2 classifier
    y_train_secondary = y2, # target var 2
    secondary_missing = True
    )
submission_with_missing_with_v2

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val['Carrier Claim Category'].fillna(0, inplace = True)


no missing values to input on Number of Dependents


Unnamed: 0_level_0,Claim Injury Type
Claim Identifier,Unnamed: 1_level_1
6165911,2. NON-COMP
6166141,2. NON-COMP
6165907,2. NON-COMP
6166047,2. NON-COMP
6166102,2. NON-COMP
...,...
6553137,2. NON-COMP
6553119,1. CANCELLED
6553542,1. CANCELLED
6553455,2. NON-COMP


In [None]:
submission_with_missing_with_v2.to_csv("../../data/submission_with_missing_with_v2.csv")

#### RESULT KAGGLE: 0.43776

In [None]:
"""def cv_scores_hyperparameter_tuning(X, y, num_features, cat_features, num_imputing_algorithm=XGBRegressor(), cat_imputing_algorithm=XGBClassifier(), scaling_outlier= False, scaler=MinMaxScaler(), rfe = False):
    
    Takes as argument the predictors and the target, the models used for imputing numerical and categorical 
    features, if any scaling and outlier removal should be performed,what scaling method should be used and if feature selection with rfe should be used.
    Then it returns the results obtained from the stratified cross-validation for the given models.
    
    skf = StratifiedKFold(n_splits=5)

    # Generating the lists to store our results
    precision_scores_train = []
    precision_scores_val = []
    recall_scores_train = []
    recall_scores_val = []
    f1_scores_train =  []
    f1_scores_val =  []

    precision_scores_train_mean = []
    precision_scores_val_mean = [] 
    recall_scores_train_mean = []
    recall_scores_val_mean = []
    f1_scores_train_mean =  []
    f1_scores_val_mean =  []
    
    for train_index, test_index in skf.split(X, y):
        # Dividing our data in validation and train
        X_train, X_val = X.iloc[train_index].copy(), X.iloc[test_index].copy()
        y_train, y_val = y.iloc[train_index].copy(), y.iloc[test_index].copy()

        # Filling missing values
        for column in num_features:
            X_train, X_val = impute_missing_values(X_train,X_val, column, num_imputing_algorithm)
        
        # Removing inconsistencies on the train
        inconsistent = X_train[(X_train['Age at Injury'] > 80) | (X_train["Age at Injury"] < 16)].index
        X_train = X_train.loc[~X_train.index.isin(inconsistent)]
        y_train = y_train.loc[~y_train.index.isin(inconsistent)]

        # Performing scaling and outlier treatment dependent on the boolean
        if scaling_outlier:
            for column in num_features:
                handle_outliers(X_train, column)
                X_train, X_val = impute_missing_values(X_train,X_val, column, num_imputing_algorithm,validation= False)

            for column in num_features:
                scale_numerical(column, X_train, X_val, scaler)
                
        # Creating an ordinal variable
        claim_carrier_categories(X_train, X_val)

        #Filling missing values in the ordinal variable that might appear on X_val
        X_val, X_train = impute_missing_values(X_val,X_train, "Carrier Claim Category", cat_imputing_algorithm, validation=False)

        # Categorical Prop Encoding
        for cat_feature in cat_features:
            categorical_prop_encode(X_train, X_val, cat_feature)
            if scaling_outlier:
                scale_numerical("Carrier Claim Category", X_train, X_val, scaler)

        # Selecting features with Rfe
        if rfe:
            Selected_features = Rfe(XGBClassifier(), X_train, y_train, X_val, y_val)
            X_train = X_train[Selected_features]
            X_val = X_val[Selected_features]

        # Training the classification models
        XGBT.fit(X_train, y_train)
        print("Done XGBT")

        # Making the predictions for the training and validation data
        pred_train_XGBT = XGBT.predict(X_train)
        print("Done training predictions")
        
        pred_val_XGBT = XGBT.predict(X_val)
        print("Done validation predictions")

        # Calculating and storing the scores
        precision_scores_train.append(precision_score(y_train, pred_train_XGBT, average='macro'))
        recall_scores_train.append(recall_score(y_train, pred_train_XGBT, average='macro'))
        f1_scores_train.append(f1_score(y_train, pred_train_XGBT, average='macro'))
        
        precision_scores_val.append(precision_score(y_val, pred_val_XGBT, average='macro'))
        recall_scores_val.append(recall_score(y_val, pred_val_XGBT, average='macro'))
        f1_scores_val.append(f1_score(y_val, pred_val_XGBT, average='macro'))


    # Aggregating the average results across the folds
    precision_scores_train_mean.append(mean(precision_scores_train))
    precision_scores_val_mean.append(mean(precision_scores_val))
    recall_scores_train_mean.append(mean(recall_scores_train))
    recall_scores_val_mean.append(mean(recall_scores_val))
    f1_scores_train_mean.append(mean(f1_scores_train))
    f1_scores_val_mean.append(mean(f1_scores_val))

    # Storing the results in a dataframe
    scores = {'Train_precision': precision_scores_train_mean,
    'Test_precision': precision_scores_val_mean,
    'Train_recall': recall_scores_train_mean,
    'Test_recall': recall_scores_val_mean,
    'Train_f1_score': f1_scores_train_mean,
    'Test_f1_score': f1_scores_val_mean}

    print(scores)
"""

In [None]:
"""import xgboost as xgb
for gama in range(10):
    for depth in range(4,8):
        XGBT = XGBClassifier(gamma = gama, max_depth = depth)
        print(boost, gama, depth)
        cv_scores_hyperparameter_tuning(X, y,num_features,cat_features,scaling_outlier = True)
        print("------------")"""
