# Model Training and Assessment

In [1]:
import pandas as pd
import numpy as np

#model imports
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier,XGBRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

#metrics
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from statistics import mean

#useful functions
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE


# Model definitions

## Helper functions

In [2]:
def impute_missing_values(X_train, X_val, target_column, algorithm,validation = True):

    # Separating the missing values from the non missing values
    available_data = X_train[X_train[target_column].notna()]
    missing_X_train = X_train[X_train[target_column].isna()]
    missing_X_val = X_val[X_val[target_column].isna()]

    # Making sure if there is enough data for inputing, returning it if not
    if len(missing_X_train) == 0:
        print(f"no missing values to input on {target_column}")
        return X_train, X_val

    # Separating the target column from the rest
    X_available = available_data.drop(columns=[target_column])
    y_available = available_data[target_column]

    # Making sure our columns are consistent
    X_available = X_available.select_dtypes(include=["number"])
    missing_X_train = missing_X_train.select_dtypes(include=["number"])
    missing_X_val = missing_X_val.select_dtypes(include=["number"])

    common_columns = X_available.columns.intersection(missing_X_train.columns).intersection(missing_X_val.columns)
    X_available = X_available[common_columns]
    missing_X_train = missing_X_train[common_columns]
    missing_X_val = missing_X_val[common_columns]

    # Making sure there is any column after keeping the common columns
    if X_available.shape[1] == 0:
        print(f"Without any column to input in {target_column}")
        return X_train, X_val

    # Training the model with the available data
    model = algorithm
    model.fit(X_available, y_available)

    # Prediting the missing values
    predicted_train = model.predict(missing_X_train)
    if validation:
        predicted_val = model.predict(missing_X_val)

    # Filling the training and validation with predictions. The latter is only filled if the argument is true
    X_train = X_train.copy()
    X_train.loc[X_train[target_column].isna(), target_column] = predicted_train

    if validation:
        X_val = X_val.copy()
        X_val.loc[X_val[target_column].isna(), target_column] = predicted_val

    return X_train, X_val

In [3]:
#This helper fuction was used for finding problems with later functions
def check_missing_values(data):
    print(data.isnull().sum()[data.isnull().sum() > 0])

In [4]:
def handle_outliers(data, column):
    # Handles outliers in a numerical column by replacing values outside the interquartile range (IQR) with missing values

    # Makes sure we only treat outliers in columns that have any data
    if data[column].notnull().sum() > 0: 

        # Calculating inter quantile range limits
        Q1 = data[column].quantile(0.25)
        Q3 = data[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Changing the ouliers to missing values
        data[column] = np.where(data[column] < lower_bound, np.nan, data[column])
        data[column] = np.where(data[column] > upper_bound, np.nan, data[column])

    return data

In [5]:
def scale_numerical(column, X_train, X_val, scaler):
    # Scales the given numerical value based on the training data with the given scaler
    
    # Make sure the column is numerical
    if not pd.api.types.is_numeric_dtype(X_train[column]):
        print(f"Columm '{column}' is not numerical and will be ignored")
        return

    # Scaling the data
    try:
        X_train[column] = scaler.fit_transform(X_train[[column]])
        X_val[column] = scaler.transform(X_val[[column]])
    except ValueError as e:
        print(f"Mistake scaling the column '{column}': {e}")

In [6]:
def  claim_carrier_categories(X_train, X_val):
    if 'Carrier Claim Category' not in X_train.columns:
        # Function to categorize each carrier based on its claim count for dimensionality reduction
        
        count = X_train['Carrier Name'].value_counts() # Count individual Carriers' counts only on train data
        def categorize_claims(count): # Map carrier size based on fixed thresholds decided by us
            if count >= 40000:
                return 2
            elif 4000 <= count < 40000:
                return 1
            else:
                return 0
    
        # Apply the categorization to create a mapping dictionary
        carrier_category_map = count.apply(categorize_claims)
    
        # Map the `Carrier Name` to the new `Carrier Claim Category`
        X_train['Carrier Claim Category'] = X_train['Carrier Name'].map(carrier_category_map)
        X_val['Carrier Claim Category'] = X_val['Carrier Name'].map(carrier_category_map)

        # If there is a missing value on x_val, it means that that carrier name didn't exist on x_train and is unlikely to have many claim counts
        # X_train cannot have NaN in this new feature as Carrier Name has no Missing values
        X_val['Carrier Claim Category'].fillna(0, inplace = True)
    
        return X_train.drop(["Carrier Name"], axis = 1, inplace = True) , X_val.drop(["Carrier Name"], axis = 1, inplace = True)


In [7]:
def categorical_prop_encode(X_train, X_val, feature):
    # Categorical encoder function for individual feature
    proportion = X_train[feature].value_counts(normalize = True)  # Get the porportion of each category
    X_train[feature] = X_train[feature].map(proportion)  # Map the porportions in the column
    X_val[feature] = X_val[feature].map(proportion) # Do the same for the validation subset
    X_val[feature] = X_val[feature].fillna(0)  # Handle categories in X_val not seen in X_train with 0


In [8]:
def Rfe(algorithm,X_train,y_train,X_val,y_val):
    # Function to run RFE on specified algorithm with train/val split for it to be run within every fold to avoid data leakage
    
    #Generating the variables where we will store our results
    nof_list = np.arange(1, len(X_train.columns) + 1)            
    high_score = 0
    opt_n_features = 0
    train_score_list = []
    val_score_list = []

    #Variable where we will store the optimum amount of features
    best_rfe = None

    model = algorithm

    for n in nof_list:
        rfe = RFE(estimator=model, n_features_to_select=n)
    
    # Fitting the model to rfe
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_val_rfe = rfe.transform(X_val)
    
    # Training and predicting
        model.fit(X_train_rfe, y_train)
        pred_train = model.predict(X_train_rfe)
        pred_val = model.predict(X_val_rfe)
    
    # Evaluating using the macro f1_score
        train_score = f1_score(y_train, pred_train, average="macro")
        val_score = f1_score(y_val, pred_val, average="macro")
        train_score_list.append(train_score)
        val_score_list.append(val_score)
    
    # Checking if this is the best combination of features so far
        if val_score >= high_score:
            high_score = val_score
            opt_n_features = n
            best_rfe = rfe  # Storing the rfe with the best number of features

# Checking what amount of features and which features where the best for the model
    selected_features = X_train.columns[best_rfe.support_].tolist()

    print("Optimal number of features: %d" % opt_n_features)
    print("Score with %d features: %f" % (opt_n_features, high_score))
    print("Selected Features:\n", selected_features)

    return selected_features


In [9]:
def cv_scores(X, y, num_features, cat_features, num_imputing_algorithm=XGBRegressor(), cat_imputing_algorithm=XGBClassifier(), scaling_outlier= True, scaler=MinMaxScaler(), rfe = False):
    """
    Performs stratified cross-validation on a dataset to evaluate multiple classification models, while handling 
    preprocessing steps like missing value imputation, scaling, outlier removal, feature engineering, and optional 
    feature selection using Recursive Feature Elimination (RFE).

    Parameters:
    ----------
    X : Entire data without target variable

    y : Entire data with only target variable

    num_features : List of numerical features within X

    cat_features : List of categorical features within X (non-binary)

    num_imputing_algorithm : Algorithm to be used for imputing numerical features' missing values. Defaults to XGBRegressor.

    cat_imputing_algorithm : Algorithm to be used for imputing categorical features' missing values. Defaults to XGBClassifier.

    scaling_outlier : Boolean indicating if scaling and outlier handling should be applied. Defaults to True.

    scaler : Scaling algorithm to be used for scaling. Defaults to True.

    rfe : Boolean indicating if RFE should be used within each fold to determine current most valuable features to use. Defaults to False.

    """
    skf = StratifiedKFold(n_splits=5)

    # Generating the lists to store our results
    precision_scores_train = [[],[],[],[],[]]
    precision_scores_val = [[],[],[],[],[]]  
    recall_scores_train = [[],[],[],[],[]]
    recall_scores_val = [[],[],[],[],[]]
    f1_scores_train =  [[],[],[],[],[]]
    f1_scores_val =  [[],[],[],[],[]]

    precision_scores_train_mean = []
    precision_scores_val_mean = [] 
    recall_scores_train_mean = []
    recall_scores_val_mean = []
    f1_scores_train_mean =  []
    f1_scores_val_mean =  []
    
    for train_index, test_index in skf.split(X, y):
        # Dividing our data in validation and train
        X_train, X_val = X.iloc[train_index].copy(), X.iloc[test_index].copy()
        y_train, y_val = y.iloc[train_index].copy(), y.iloc[test_index].copy()

        # Filling missing values
        for column in num_features:
            X_train, X_val = impute_missing_values(X_train,X_val, column, num_imputing_algorithm)
        
        # Removing inconsistencies on the train
        inconsistent = X_train[(X_train['Age at Injury'] > 80) | (X_train["Age at Injury"] < 16)].index
        X_train = X_train.loc[~X_train.index.isin(inconsistent)]
        y_train = y_train.loc[~y_train.index.isin(inconsistent)]

        # Performing scaling and outlier treatment dependent on the boolean
        if scaling_outlier:
            for column in num_features:
                handle_outliers(X_train, column)
                X_train, X_val = impute_missing_values(X_train,X_val, column, num_imputing_algorithm,validation= False)

            for column in num_features:
                scale_numerical(column, X_train, X_val, scaler)
                
        # Creating an ordinal variable
        claim_carrier_categories(X_train, X_val)


        # Scaling special Carrier Claim Category feature
        if scaling_outlier:
            scale_numerical("Carrier Claim Category", X_train, X_val, scaler)

        # Categorical Prop Encoding
        for cat_feature in cat_features:
            categorical_prop_encode(X_train, X_val, cat_feature)
            if scaling_outlier:
                scale_numerical("Carrier Claim Category", X_train, X_val, scaler)

        # Selecting features with Rfe
        if rfe:
            Selected_features = Rfe(XGBClassifier(), X_train, y_train, X_val, y_val)
            X_train = X_train[Selected_features]
            X_val = X_val[Selected_features]

        
        # Training the classification models
        DT.fit(X_train, y_train)
        print("Done DT")
        RF.fit(X_train, y_train)
        print("Done RF")
        XGB.fit(X_train, y_train)
        print("Done XGB")
        KNN.fit(X_train, y_train)
        print("Done KNN")
        MLP.fit(X_train, y_train)
        print("Done MLP")

        # Making the predictions for the training and validation data
        pred_train_DT = DT.predict(X_train)
        pred_train_RF = RF.predict(X_train)
        pred_train_XGB = XGB.predict(X_train)
        pred_train_KNN = KNN.predict(X_train)
        pred_train_MLP = MLP.predict(X_train)
        print("Done training predictions")
        
        pred_val_DT = DT.predict(X_val)
        pred_val_RF = RF.predict(X_val)
        pred_val_XGB = XGB.predict(X_val)
        pred_val_KNN = KNN.predict(X_val)
        pred_val_MLP = MLP.predict(X_val)
        print("Done validation predictions")

        # Calculating and storing the scores
        i = 0
        for predictions in [pred_train_DT,pred_train_RF,pred_train_XGB,pred_train_KNN,pred_train_MLP]:
            precision_scores_train[i].append(precision_score(y_train, predictions, average='macro'))
            recall_scores_train[i].append(recall_score(y_train, predictions, average='macro'))
            f1_scores_train[i].append(f1_score(y_train, predictions, average='macro'))
            i+=1
        j=0
        for predictions in [pred_val_DT,pred_val_RF,pred_val_XGB,pred_val_KNN,pred_val_MLP]:
            precision_scores_val[j].append(precision_score(y_val, predictions, average='macro'))
            recall_scores_val[j].append(recall_score(y_val, predictions, average='macro'))
            f1_scores_val[j].append(f1_score(y_val, predictions, average='macro'))
            j+=1

        # Check the confusion matrixes of our predictions
        print(confusion_matrix(y_val, pred_val_DT))
        print(confusion_matrix(y_val, pred_val_RF))
        print(confusion_matrix(y_val, pred_val_XGB))
        print(confusion_matrix(y_val, pred_val_KNN))
        print(confusion_matrix(y_val, pred_val_MLP))

    # Aggregating the average results across the folds
    for l in range(0,5): 
        precision_scores_train_mean.append(mean(precision_scores_train[l]))
        precision_scores_val_mean.append(mean(precision_scores_val[l]))
        recall_scores_train_mean.append(mean(recall_scores_train[l]))
        recall_scores_val_mean.append(mean(recall_scores_val[l]))
        f1_scores_train_mean.append(mean(f1_scores_train[l]))
        f1_scores_val_mean.append(mean(f1_scores_val[l]))

    # Storing the results in a dataframe
    model_results = pd.DataFrame(data={
        'Train_precision': precision_scores_train_mean,
        'Test_precision': precision_scores_val_mean,
        'Train_recall': recall_scores_train_mean,
        'Test_recall': recall_scores_val_mean,
        'Train_f1_score': f1_scores_train_mean,
        'Test_f1_score': f1_scores_val_mean,
    }, index=["Decision Tree","Random Forest","XGBoost", "KNearestNeighbors","Multi Layer Perceptron"])

    return model_results

In [10]:
def test_prediction(model, X_train, y_train, num_features, cat_features, X_test, 
                    num_imputing_algorithm=XGBRegressor(), 
                    cat_imputing_algorithm=XGBClassifier(), scaling_outlier = False , 
                    scaler=MinMaxScaler(), missing = True,
                    secondary_model = None, y_train_secondary=None, secondary_missing = True):

    """
    Performs stratified cross-validation on a dataset to evaluate multiple classification models, while handling 
    preprocessing steps like missing value imputation, scaling, outlier removal, feature engineering, and optional 
    feature selection using Recursive Feature Elimination (RFE).

    Parameters:
    ----------
    model: algorithm to use to predict primary target variable.
    
    X_train : Entire data without target variable.

    y_train : Entire data with only target variable.

    num_features : List of numerical features within X

    cat_features : List of categorical features within X (non-binary)

    num_imputing_algorithm : Algorithm to be used for imputing numerical features' missing values. Defaults to XGBRegressor.

    cat_imputing_algorithm : Algorithm to be used for imputing categorical features' missing values. Defaults to XGBClassifier.

    scaling_outlier : Boolean indicating if scaling and outlier handling should be applied. Defaults to True.

    scaler : Scaling algorithm to be used for scaling. Defaults to True.

    rfe : Boolean indicating if RFE should be used within each fold to determine current most valuable features to use. Defaults to False.

    secondary_model: algorithm to use to predict secondary target variable. Defaults to None if secondary should not be predicted.

    y_train_secondary: Entire data with only secondary target variable. Defaults to None if secondary should not be predicted.

    """
    
    # Impute missing values
    if missing:
        for column in num_features:
            X_train, X_test = impute_missing_values(X_train,X_test,column, num_imputing_algorithm)

    # Remove inconsistencies
    inconsistent = X_train[(X_train['Age at Injury'] > 80) | (X_train["Age at Injury"] < 16)].index
    X_train.drop(inconsistent, inplace=True)
    y_train.drop(inconsistent, inplace=True)
    if secondary_model is not None and y_train_secondary is not None:
        y_train_secondary.drop(inconsistent, inplace=True)

    # Scale and remove outliers if specified
    if scaling_outlier:
        for column in num_features:
            handle_outliers(X_train, column) # Handle outliers only for training partition
            if missing:
                X_train, X_test = impute_missing_values(X_train,X_test, column, num_imputing_algorithm,validation = False)

        for column in num_features:
            scale_numerical(column, X_train, X_test, scaler)
        
    # Creating an ordinal variable
    claim_carrier_categories(X_train, X_test)

    # Scaling special Carrier Claim Category feature
    if scaling_outlier:
        scale_numerical("Carrier Claim Category", X_train, X_test, scaler)

    # Categorical Prop Encoding
    for cat_feature in cat_features:
        categorical_prop_encode(X_train, X_test, cat_feature)
        if scaling_outlier:
            scale_numerical("Carrier Claim Category", X_train, X_test, scaler)

    # Predict secondary target variable if secondary model and variable is given
    if secondary_model is not None and y_train_secondary is not None:
        if secondary_missing: # Imputation of missing values for secondary model
            X_train_secondary = X_train.copy()
            X_test_secondary = X_test.copy()
            for column in num_features:
                X_train_secondary, X_test_secondary = impute_missing_values(X_train_secondary,X_test_secondary, column, num_imputing_algorithm)

        secondary_model.fit(X_train_secondary, y_train_secondary)
        pred_secondary_test = secondary_model.predict(X_test_secondary)
        X_test["Agreement Reached"] = pred_secondary_test
        X_train["Agreement Reached"] = y_train_secondary
        
    # Fitting the model, making the predictions and reverting the claim injury types back to their string form
    model.fit(X_train, y_train)
    pred_test = model.predict(X_test)
    pred_test = le.inverse_transform(pred_test)

    # Saving the final submission dataframe with indexes of X_test
    submission_df = pd.DataFrame({
        "Claim Injury Type": pred_test
    }, index=X_test.index)
    
    return submission_df

# Dataset preparation

In [11]:
def model_load():
    #Define an instance of the models without hyperparameters

    DT = DecisionTreeClassifier()
    RF = RandomForestClassifier(verbose = 1, n_jobs=-1)
    XGB = XGBClassifier()
    KNN = KNeighborsClassifier(n_neighbors = 50) # n_neighbors through trial and error
    MLP = MLPClassifier(activation='relu',
        solver='sgd',learning_rate='invscaling',
        learning_rate_init=0.001,
        batch_size=100,verbose = True) # Based on ML Practical Parameters
    return DT , RF , XGB , KNN , MLP


In [12]:
def data_load():
    # Load data
    data= pd.read_csv("../../data/train_data_enriched.csv", index_col="Claim Identifier")
    data_test = pd.read_csv("../../data/test_data_enriched.csv",index_col="Claim Identifier")

    #Creating an instance of our encoder for the target
    le = LabelEncoder()

    #Label enconding our target variable 
    data["Claim Injury Type"] = le.fit_transform(data["Claim Injury Type"])

    '''Dropping redundant variables that carry almost the same information (are extremely correlated (|0.8|))
    We believe it was better to keep Age at Injury than birth year since it should be more related to the injury claim type (it will be tested later)
    The same logic was applied to dropping the other two dates and two DSA variables since we believe Accident date to be more important'''
    data = data.loc[:, ~data.columns.isin(['Birth Year', 'Assembly Date', 'C-2 Date', 'Assembly Date DSA', 'First Hearing Date DSA'])]
    data_test = data_test.loc[:, ~data_test.columns.isin(['Birth Year', 'Assembly Date', 'C-2 Date', 'Assembly Date DSA', 'First Hearing Date DSA'])]


    '''Since the codes always seem to provide the same or more information than the descriptions (have more categories),
    and the codes are consistent (always only having 1 description for code, while descriptions may have multiple codes)
    And Crámer's V says they have a very high association
    we will drop the description columns.'''
    data.drop(['Industry Code Description','WCIO Cause of Injury Description','WCIO Nature of Injury Description','WCIO Part Of Body Description'], axis = 1,inplace = True)
    data_test.drop(['Industry Code Description','WCIO Cause of Injury Description','WCIO Nature of Injury Description','WCIO Part Of Body Description'], axis = 1,inplace = True)

    '''Dropping redundant variables that carry almost the same information (have an association above or equal to 0.8)
    We chose to keep County of Injury above Zip Code and District Name (these 3 have a high association) because it is the easist to interpret and the one we looked more in detail in the exploratory analysis
    We kept the new variable we made called body section because it keeps most of the same information of the body part code but with a much lower cardinality
    Lastly we only remove the variable Carrier Name in the function where we create the new variable with lower cardinality because it is need to create that new variable'''
    data.drop(['Zip Code',"WCIO Part Of Body Code","District Name"], axis=1, inplace = True)
    data_test.drop(["Zip Code","WCIO Part Of Body Code","District Name"], axis=1 , inplace = True)

    # Num and Cat features
    num_features = ['Age at Injury', 'Average Weekly Wage', 'IME-4 Count', 'Number of Dependents',
                "Accident Year","Accident Month","Accident Day","Accident DayOfWeek",
                "C-2 Date DSA","C-3 Date DSA","Accident Date","C-3 Date","First Hearing Date"]

    cat_features = [
    "Alternative Dispute Resolution",
    "Carrier Type",
    "County of Injury",
    "Gender",
    "Industry Code",
    "Medical Fee Region",
    "WCIO Cause of Injury Code",
    "WCIO Nature of Injury Code",
    "Age at Injury Category",
    "Body Section",
    ]
    
    # Isolation of target vars
    X = data.drop(["Claim Injury Type","Agreement Reached"], axis = 1)
    y = data["Claim Injury Type"]
    y2 = data["Agreement Reached"]

    return data, data_test, le, num_features, cat_features, X, y, y2

In [13]:
# Reset all models, datasets and global Parameters for fresh retraining
DT , RF , XGB , KNN , MLP = model_load()
data, data_test,le, num_features, cat_features, X, y, y2 = data_load()

  data= pd.read_csv("../../data/train_data_enriched.csv", index_col="Claim Identifier")


### Quality Checks
The only not included variables are the target variables and all the binary variables (non-numeric and non-categorical) as seen below. As binary variables are not scaled nor encoded they don't appear in the definitions above.

In [14]:
len(num_features) + len(cat_features)

23

In [15]:
len(data.columns)

33

In [16]:
# Combine num_features and cat_features into a set
defined_features = set(num_features) | set(cat_features)

# Get all column names in the DataFrame
all_columns = set(data.columns)

# Find columns that are not in the defined features
undefined_columns = all_columns - defined_features

print("Columns not in num_features or cat_features:", undefined_columns)
print("Count of difference:", len(undefined_columns))

Columns not in num_features or cat_features: {'Attorney/Representative', 'COVID-19 Indicator', 'Carrier Name', 'First Hearing Date_missing', 'C-3 Date_missing', 'Assembly Date_missing', 'Accident Date_missing', 'Agreement Reached', 'Claim Injury Type', 'C-2 Date_missing'}
Count of difference: 10


# Run CV Score Loop

In [17]:
results = cv_scores(X, y, num_features, cat_features)

no missing values to input on Number of Dependents
no missing values to input on Age at Injury
no missing values to input on Number of Dependents
no missing values to input on Accident Month
no missing values to input on Accident DayOfWeek


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val['Carrier Claim Category'].fillna(0, inplace = True)


Done DT


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   16.4s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 1.05985956
Iteration 2, loss = 1.00534335
Iteration 3, loss = 1.00518388
Iteration 4, loss = 1.00510129
Iteration 5, loss = 1.00504309
Iteration 6, loss = 1.00499643
Iteration 7, loss = 1.00495634
Iteration 8, loss = 1.00492066
Iteration 9, loss = 1.00488808
Iteration 10, loss = 1.00485795
Iteration 11, loss = 1.00482974
Iteration 12, loss = 1.00480315
Iteration 13, loss = 1.00477790
Iteration 14, loss = 1.00475380
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.7s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    2.3s finished


Done training predictions


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.2s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[  755  1119   162   282   119    42     7     9]
 [ 4099 31644  6721 12503  2950   157    22   119]
 [  639  5323  1824  3781  1589   484    59    83]
 [ 1104  8313  3421  8354  4440  3231   434   404]
 [  336  1234   989  1626  3653  1619   172    27]
 [   30   182   130   213    78   155    34    20]
 [    0     5     3     7     2     0     2     1]
 [    3    30     3    28     4     3     3    20]]
[[  947  1279     3   146   109    11     0     0]
 [  508 54257    18  2244  1121    63     0     4]
 [   48  9047    90  2652  1684   258     0     3]
 [   32 11468    92  9059  6518  2504     0    28]
 [    7   786    18  1256  7108   480     0     1]
 [    0    70     3   371   180   218     0     0]
 [    0     1     0    13     2     4     0     0]
 [    0    24     0    61     5     2     0     2]]
[[ 1116  1118     7   112   132     9     0     1]
 [  656 51993    38  2418  2860   194     2    54]
 [   65  8589    78  2538  2136   323     9    44]
 [   66  9988   154  9311  78

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val['Carrier Claim Category'].fillna(0, inplace = True)


Done DT


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   16.2s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 1.05935143
Iteration 2, loss = 1.00665848
Iteration 3, loss = 1.00621494
Iteration 4, loss = 1.00606780
Iteration 5, loss = 1.00598919
Iteration 6, loss = 1.00593631
Iteration 7, loss = 1.00589561
Iteration 8, loss = 1.00586175
Iteration 9, loss = 1.00583207
Iteration 10, loss = 1.00580529
Iteration 11, loss = 1.00578066
Iteration 12, loss = 1.00575755
Iteration 13, loss = 1.00573574
Iteration 14, loss = 1.00571503
Iteration 15, loss = 1.00569525
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.7s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    2.1s finished


Done training predictions


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.4s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[  869  1039   217   258    93    14     0     5]
 [ 1812 38029  8557  8050  1602   108    12    45]
 [  286  5488  2394  3436  1698   406     8    65]
 [  669  5704  5806  9267  5692  2189    68   306]
 [  119   643  2150  2193  3445  1022    32    52]
 [   31    89   251   280   145    18     0    29]
 [    1     2     5     8     3     0     0     0]
 [    4    31    15    28     5     1     0    10]]
[[ 1089  1066    57   179    99     5     0     0]
 [  463 52313  1571  2728  1123    17     0     0]
 [   13  7424   635  3615  2012    82     0     0]
 [   13  6938  1737 14117  6158   737     0     1]
 [    3   451   481  3425  5195   101     0     0]
 [    0    29    70   674    69     1     0     0]
 [    0     1     1    16     1     0     0     0]
 [    0    15     4    75     0     0     0     0]]
[[ 1242   925   121   106    84    13     0     4]
 [  568 47350  6128  2640  1390    89     0    50]
 [   36  7290   817  3033  2216   356     0    33]
 [   50  6154  5058  8213  73

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val['Carrier Claim Category'].fillna(0, inplace = True)


Done DT


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   15.8s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 1.06556965
Iteration 2, loss = 1.00921755
Iteration 3, loss = 1.00880697
Iteration 4, loss = 1.00863612
Iteration 5, loss = 1.00853304
Iteration 6, loss = 1.00845917
Iteration 7, loss = 1.00840068
Iteration 8, loss = 1.00835176
Iteration 9, loss = 1.00830942
Iteration 10, loss = 1.00827169
Iteration 11, loss = 1.00823751
Iteration 12, loss = 1.00820615
Iteration 13, loss = 1.00817700
Iteration 14, loss = 1.00814965
Iteration 15, loss = 1.00812388
Iteration 16, loss = 1.00809942
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.7s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    2.0s finished


Done training predictions


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.4s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[  965   895   280   210   119    23     0     4]
 [ 1679 36479 10708  7101  2045   168     7    28]
 [  297  5522  2328  2889  2346   363    11    25]
 [  673  5708  6347  6154  7456  3068    67   228]
 [  110  1053  3078  2709  2105   572    11    18]
 [   30    61   178   430   122    16     0     5]
 [    1     2     2     7     7     0     0     0]
 [    4    17    24    39     6     2     0     2]]
[[ 1088  1050    85   127   141     5     0     0]
 [  411 50763  2981  2670  1375    15     0     0]
 [   12  7618   612  3110  2364    65     0     0]
 [   22  6999  3549 11146  7320   664     0     1]
 [    4   519  1370  4099  3631    33     0     0]
 [    0    23    11   745    63     0     0     0]
 [    0     2     0    13     4     0     0     0]
 [    0    10     5    78     1     0     0     0]]
[[ 1153   945   190    60   138     7     0     3]
 [  436 42907 10106  2667  2034    38     0    27]
 [   35  7235  1151  2310  2899   140     0    11]
 [   74  6362  7753  4719  95

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val['Carrier Claim Category'].fillna(0, inplace = True)


Done DT


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   16.1s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 1.05383962
Iteration 2, loss = 1.00203863
Iteration 3, loss = 1.00135433
Iteration 4, loss = 1.00110877
Iteration 5, loss = 1.00097982
Iteration 6, loss = 1.00089564
Iteration 7, loss = 1.00083428
Iteration 8, loss = 1.00078611
Iteration 9, loss = 1.00074590
Iteration 10, loss = 1.00071126
Iteration 11, loss = 1.00068047
Iteration 12, loss = 1.00065252
Iteration 13, loss = 1.00062681
Iteration 14, loss = 1.00060289
Iteration 15, loss = 1.00058044
Iteration 16, loss = 1.00055918
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.7s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    2.2s finished


Done training predictions


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.4s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[  981   969   205   191   122    27     0     1]
 [ 1556 38300  8757  6344  3021   202     0    34]
 [  221  5143  2145  3009  2630   610     1    22]
 [  434  5169  5869  7536  7498  3075    13   108]
 [   91  1033  3101  3032  1942   447     3     7]
 [    7   103   203   401   112    13     0     3]
 [    1     1     4     8     4     0     0     1]
 [    0    22    21    34     6     6     0     5]]
[[ 1181   991    69   113   137     5     0     0]
 [  369 52947  1345  1814  1733     6     0     0]
 [   10  7361   575  2688  3124    23     0     0]
 [   12  7119  3362  9702  9286   221     0     0]
 [    4   590  1835  4968  2250     9     0     0]
 [    0    16    21   750    55     0     0     0]
 [    0     0     0    18     1     0     0     0]
 [    0    16     8    68     2     0     0     0]]
[[ 1297   846   140    71   128    11     0     3]
 [  458 46728  6635  1305  3024    43     0    21]
 [   20  6962   876  2083  3671   153     0    16]
 [   34  5947  6645  5439 105

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val['Carrier Claim Category'].fillna(0, inplace = True)


Done DT


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   16.5s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 1.05477512
Iteration 2, loss = 1.00191961
Iteration 3, loss = 1.00165613
Iteration 4, loss = 1.00151744
Iteration 5, loss = 1.00142494
Iteration 6, loss = 1.00135627
Iteration 7, loss = 1.00130165
Iteration 8, loss = 1.00125651
Iteration 9, loss = 1.00121779
Iteration 10, loss = 1.00118369
Iteration 11, loss = 1.00115312
Iteration 12, loss = 1.00112517
Iteration 13, loss = 1.00109951
Iteration 14, loss = 1.00107574
Iteration 15, loss = 1.00105341
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.8s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    2.3s finished


Done training predictions


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.2s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[  916   608   745   200    24     0     0     2]
 [  678 11266 41990  3868   398     9     0     5]
 [   98  1724 10215  1407   326     6     0     5]
 [  173  3291 21353  4041   824    17     0     3]
 [   64  1173  4549  3089   772     8     0     1]
 [    7   101   336   333    62     3     0     0]
 [    0     1     6    10     3     0     0     0]
 [    3    11    34    40     5     0     0     1]]
[[ 1077   595   746    70     7     0     0     0]
 [  154  8022 49553   434    51     0     0     0]
 [    3   465 13042   243    28     0     0     0]
 [    5  1189 27182  1297    29     0     0     0]
 [    1   388  5207  3603   457     0     0     0]
 [    0    12   213   588    29     0     0     0]
 [    0     1     3    14     2     0     0     0]
 [    0     9    37    48     0     0     0     0]]
[[ 1087   429   932    37    10     0     0     0]
 [  218 11613 46052   264    66     0     0     1]
 [    2   836 12721   171    51     0     0     0]
 [    8  1416 27235   945    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
#Check the results
results

Unnamed: 0,Train_precision,Test_precision,Train_recall,Test_recall,Train_f1_score,Test_f1_score
Decision Tree,0.99995,0.214476,0.99999,0.225414,0.99997,0.197971
Random Forest,0.999964,0.314325,0.999965,0.26824,0.999965,0.252435
XGBoost,0.795557,0.290976,0.579948,0.261711,0.6224,0.240904
KNearestNeighbors,0.382109,0.311092,0.302306,0.268894,0.314234,0.272702
Multi Layer Perceptron,0.250839,0.235134,0.20397,0.205196,0.189973,0.19244


In [19]:
# Reset all models, datasets and global Parameters for fresh retraining
DT , RF , XGB , KNN , MLP = model_load()
data, data_test,le, num_features, cat_features, X, y, y2 = data_load()

  data= pd.read_csv("../../data/train_data_enriched.csv", index_col="Claim Identifier")


In [20]:
results_secondary_target = cv_scores(X, y2, num_features, cat_features)

no missing values to input on Number of Dependents
no missing values to input on Age at Injury
no missing values to input on Number of Dependents
no missing values to input on Accident Month
no missing values to input on Accident Day
no missing values to input on Accident DayOfWeek


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val['Carrier Claim Category'].fillna(0, inplace = True)


Done DT


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   12.5s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 0.15437115
Iteration 2, loss = 0.14084349
Iteration 3, loss = 0.14083148
Iteration 4, loss = 0.14082248
Iteration 5, loss = 0.14081495
Iteration 6, loss = 0.14080834
Iteration 7, loss = 0.14080237
Iteration 8, loss = 0.14079689
Iteration 9, loss = 0.14079178
Iteration 10, loss = 0.14078701
Iteration 11, loss = 0.14078249
Iteration 12, loss = 0.14077819
Iteration 13, loss = 0.14077409
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.2s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.6s finished


Done training predictions


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.1s finished


Done validation predictions
[[61352 48095]
 [ 2058  3300]]
[[95777 13670]
 [ 2671  2687]]
[[77892 31555]
 [ 2154  3204]]
[[107491   1956]
 [  4704    654]]
[[108493    954]
 [  4852    506]]
no missing values to input on Number of Dependents
no missing values to input on Age at Injury
no missing values to input on Number of Dependents
no missing values to input on Accident Month
no missing values to input on Accident Day
no missing values to input on Accident DayOfWeek


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val['Carrier Claim Category'].fillna(0, inplace = True)


Done DT


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   12.6s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 0.15906783
Iteration 2, loss = 0.14157197
Iteration 3, loss = 0.14155674
Iteration 4, loss = 0.14154543
Iteration 5, loss = 0.14153598
Iteration 6, loss = 0.14152772
Iteration 7, loss = 0.14152029
Iteration 8, loss = 0.14151349
Iteration 9, loss = 0.14150717
Iteration 10, loss = 0.14150125
Iteration 11, loss = 0.14149566
Iteration 12, loss = 0.14149036
Iteration 13, loss = 0.14148531
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.2s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.6s finished


Done training predictions


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.1s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[73390 36057]
 [ 3686  1671]]
[[96835 12612]
 [ 4420   937]]
[[73199 36248]
 [ 4057  1300]]
[[109289    158]
 [  5275     82]]
[[109430     17]
 [  5323     34]]
no missing values to input on Number of Dependents
no missing values to input on Age at Injury
no missing values to input on Number of Dependents
no missing values to input on Accident Month
no missing values to input on Accident Day
no missing values to input on Accident DayOfWeek


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val['Carrier Claim Category'].fillna(0, inplace = True)


Done DT


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   12.3s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 0.15609047
Iteration 2, loss = 0.14276226
Iteration 3, loss = 0.14274032
Iteration 4, loss = 0.14272499
Iteration 5, loss = 0.14271271
Iteration 6, loss = 0.14270232
Iteration 7, loss = 0.14269323
Iteration 8, loss = 0.14268510
Iteration 9, loss = 0.14267772
Iteration 10, loss = 0.14267094
Iteration 11, loss = 0.14266465
Iteration 12, loss = 0.14265877
Iteration 13, loss = 0.14265325
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.2s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.6s finished


Done training predictions


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.1s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[70144 39303]
 [ 4314  1043]]
[[92250 17197]
 [ 4728   629]]
[[60942 48505]
 [ 4668   689]]
[[109334    113]
 [  5257    100]]
[[109446      1]
 [  5355      2]]
no missing values to input on Number of Dependents
no missing values to input on Age at Injury
no missing values to input on Number of Dependents
no missing values to input on Accident Month
no missing values to input on Accident Day
no missing values to input on Accident DayOfWeek


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val['Carrier Claim Category'].fillna(0, inplace = True)


Done DT


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   12.1s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 0.15919880
Iteration 2, loss = 0.14469379
Iteration 3, loss = 0.14467638
Iteration 4, loss = 0.14466374
Iteration 5, loss = 0.14465332
Iteration 6, loss = 0.14464429
Iteration 7, loss = 0.14463622
Iteration 8, loss = 0.14462888
Iteration 9, loss = 0.14462209
Iteration 10, loss = 0.14461579
Iteration 11, loss = 0.14460987
Iteration 12, loss = 0.14460427
Iteration 13, loss = 0.14459893
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.2s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.6s finished


Done training predictions


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.1s finished


Done validation predictions


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[71441 38006]
 [ 4424   933]]
[[84973 24474]
 [ 4857   500]]
[[62846 46601]
 [ 4462   895]]
[[109095    352]
 [  5228    129]]
[[109445      2]
 [  5355      2]]
no missing values to input on Number of Dependents
no missing values to input on Age at Injury
no missing values to input on Number of Dependents
no missing values to input on Accident Month
no missing values to input on Accident Day
no missing values to input on Accident DayOfWeek


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val['Carrier Claim Category'].fillna(0, inplace = True)


Done DT


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   12.0s finished


Done RF
Done XGB
Done KNN
Iteration 1, loss = 0.15407354
Iteration 2, loss = 0.14019931
Iteration 3, loss = 0.14018852
Iteration 4, loss = 0.14018048
Iteration 5, loss = 0.14017375
Iteration 6, loss = 0.14016783
Iteration 7, loss = 0.14016250
Iteration 8, loss = 0.14015761
Iteration 9, loss = 0.14015306
Iteration 10, loss = 0.14014880
Iteration 11, loss = 0.14014477
Iteration 12, loss = 0.14014093
Iteration 13, loss = 0.14013728
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Done MLP


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.2s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.6s finished


Done training predictions


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.1s finished


Done validation predictions
[[109033    413]
 [  4883    475]]
[[109434     12]
 [  5026    332]]
[[109426     20]
 [  4994    364]]
[[109429     17]
 [  5211    147]]
[[109394     52]
 [  5206    152]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [21]:
#Check the results
results_secondary_target

Unnamed: 0,Train_precision,Test_precision,Train_recall,Test_recall,Train_f1_score,Test_f1_score
Decision Tree,1.0,0.545383,1.0,0.490633,1.0,0.444067
Random Forest,0.999995,0.603634,0.999897,0.532813,0.999946,0.511495
XGBoost,0.907491,0.584706,0.637352,0.471562,0.698709,0.439744
KNearestNeighbors,0.895985,0.700122,0.536993,0.518383,0.557173,0.520026
Multi Layer Perceptron,0.576674,0.769709,0.500005,0.512053,0.488068,0.509603


# Run Test Predictions with best model

## 1. Main Target Variable Only

### Best model for Claim Injury Type target test prediction: **XGBoost**

#### V1: XGBoost with imputation of missing values

In [22]:
# Reset all models, datasets and global Parameters for fresh retraining
DT , RF , XGB , KNN , MLP = model_load()
data, data_test,le, num_features, cat_features, X, y, y2 = data_load()

  data= pd.read_csv("../../data/train_data_enriched.csv", index_col="Claim Identifier")


In [23]:
"""submission_without_missing = test_prediction(
    XGBClassifier(),
    X,y,
    num_features,cat_features,
    data_test,
    scaling_outlier= True)
submission_without_missing"""

'submission_without_missing = test_prediction(\n    XGBClassifier(),\n    X,y,\n    num_features,cat_features,\n    data_test,\n    scaling_outlier= True)\nsubmission_without_missing'

In [24]:
#export to csv
"""submission_without_missing.to_csv("submission_without_missing.csv")"""

'submission_without_missing.to_csv("submission_without_missing.csv")'

#### V2: XGBoost without imputation of missing values

In [25]:
# Reset all models, datasets and global Parameters for fresh retraining
DT , RF , XGB , KNN , MLP = model_load()
data, data_test,le, num_features, cat_features, X, y, y2 = data_load()

  data= pd.read_csv("../../data/train_data_enriched.csv", index_col="Claim Identifier")


In [26]:
# using xgboost´s way of dealing with missing values (better performance)
submission_with_missing = test_prediction(
    XGBClassifier(),
    X,y,
    num_features,cat_features,
    data_test, 
    missing= False)
submission_with_missing

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val['Carrier Claim Category'].fillna(0, inplace = True)


Unnamed: 0_level_0,Claim Injury Type
Claim Identifier,Unnamed: 1_level_1
6165911,2. NON-COMP
6166141,2. NON-COMP
6165907,2. NON-COMP
6166047,2. NON-COMP
6166102,2. NON-COMP
...,...
6553137,2. NON-COMP
6553119,1. CANCELLED
6553542,1. CANCELLED
6553455,2. NON-COMP


In [27]:
#export to csv
submission_with_missing.to_csv("../../data/submission_with_missing.csv")

#### F1 MACRO SCORE KAGGLE: **0.44011**

## 2. Secondary Target Variable AND Main Target Variable

### Best model prediction of secondary target var: KNN

In [28]:
# Reset all models, datasets and global Parameters for fresh retraining
DT , RF , XGB , KNN , MLP = model_load()
data, data_test,le, num_features, cat_features, X, y, y2 = data_load()

  data= pd.read_csv("../../data/train_data_enriched.csv", index_col="Claim Identifier")


In [29]:
submission_with_missing_with_v2 = test_prediction(
    XGBClassifier(), # y classifier
    X, y,# Dataset without target, target var 1
    num_features,
    cat_features,
    data_test,
    missing= False,
    secondary_model = KNeighborsClassifier(n_neighbors = 50), # y2 classifier
    y_train_secondary = y2, # target var 2
    secondary_missing = True
    )
submission_with_missing_with_v2

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val['Carrier Claim Category'].fillna(0, inplace = True)


no missing values to input on Number of Dependents


Unnamed: 0_level_0,Claim Injury Type
Claim Identifier,Unnamed: 1_level_1
6165911,2. NON-COMP
6166141,2. NON-COMP
6165907,2. NON-COMP
6166047,2. NON-COMP
6166102,2. NON-COMP
...,...
6553137,2. NON-COMP
6553119,1. CANCELLED
6553542,1. CANCELLED
6553455,2. NON-COMP


In [30]:
submission_with_missing_with_v2.to_csv("../../data/submission_with_missing_with_v2.csv")

#### F1 MACRO SCORE KAGGLE: **0.43776**

# New attempt to hyperparam tuning

In [31]:
"""def cv_scores_hyperparameter_tuning(X, y, num_features, cat_features, num_imputing_algorithm=XGBRegressor(), cat_imputing_algorithm=XGBClassifier(), scaling_outlier= False, scaler=MinMaxScaler(), rfe = False):
    
    Takes as argument the predictors and the target, the models used for imputing numerical and categorical 
    features, if any scaling and outlier removal should be performed,what scaling method should be used and if feature selection with rfe should be used.
    Then it returns the results obtained from the stratified cross-validation for the given models.
    
    skf = StratifiedKFold(n_splits=5)

    # Generating the lists to store our results
    precision_scores_train = []
    precision_scores_val = []
    recall_scores_train = []
    recall_scores_val = []
    f1_scores_train =  []
    f1_scores_val =  []

    precision_scores_train_mean = []
    precision_scores_val_mean = [] 
    recall_scores_train_mean = []
    recall_scores_val_mean = []
    f1_scores_train_mean =  []
    f1_scores_val_mean =  []
    
    for train_index, test_index in skf.split(X, y):
        # Dividing our data in validation and train
        X_train, X_val = X.iloc[train_index].copy(), X.iloc[test_index].copy()
        y_train, y_val = y.iloc[train_index].copy(), y.iloc[test_index].copy()

        # Filling missing values
        for column in num_features:
            X_train, X_val = impute_missing_values(X_train,X_val, column, num_imputing_algorithm)
        
        # Removing inconsistencies on the train
        inconsistent = X_train[(X_train['Age at Injury'] > 80) | (X_train["Age at Injury"] < 16)].index
        X_train = X_train.loc[~X_train.index.isin(inconsistent)]
        y_train = y_train.loc[~y_train.index.isin(inconsistent)]

        # Performing scaling and outlier treatment dependent on the boolean
        if scaling_outlier:
            for column in num_features:
                handle_outliers(X_train, column)
                X_train, X_val = impute_missing_values(X_train,X_val, column, num_imputing_algorithm,validation= False)

            for column in num_features:
                scale_numerical(column, X_train, X_val, scaler)
                
        # Creating an ordinal variable
        claim_carrier_categories(X_train, X_val)

        #Filling missing values in the ordinal variable that might appear on X_val
        X_val, X_train = impute_missing_values(X_val,X_train, "Carrier Claim Category", cat_imputing_algorithm, validation=False)

        # Categorical Prop Encoding
        for cat_feature in cat_features:
            categorical_prop_encode(X_train, X_val, cat_feature)
            if scaling_outlier:
                scale_numerical("Carrier Claim Category", X_train, X_val, scaler)

        # Selecting features with Rfe
        if rfe:
            Selected_features = Rfe(XGBClassifier(), X_train, y_train, X_val, y_val)
            X_train = X_train[Selected_features]
            X_val = X_val[Selected_features]

        # Training the classification models
        XGBT.fit(X_train, y_train)
        print("Done XGBT")

        # Making the predictions for the training and validation data
        pred_train_XGBT = XGBT.predict(X_train)
        print("Done training predictions")
        
        pred_val_XGBT = XGBT.predict(X_val)
        print("Done validation predictions")

        # Calculating and storing the scores
        precision_scores_train.append(precision_score(y_train, pred_train_XGBT, average='macro'))
        recall_scores_train.append(recall_score(y_train, pred_train_XGBT, average='macro'))
        f1_scores_train.append(f1_score(y_train, pred_train_XGBT, average='macro'))
        
        precision_scores_val.append(precision_score(y_val, pred_val_XGBT, average='macro'))
        recall_scores_val.append(recall_score(y_val, pred_val_XGBT, average='macro'))
        f1_scores_val.append(f1_score(y_val, pred_val_XGBT, average='macro'))


    # Aggregating the average results across the folds
    precision_scores_train_mean.append(mean(precision_scores_train))
    precision_scores_val_mean.append(mean(precision_scores_val))
    recall_scores_train_mean.append(mean(recall_scores_train))
    recall_scores_val_mean.append(mean(recall_scores_val))
    f1_scores_train_mean.append(mean(f1_scores_train))
    f1_scores_val_mean.append(mean(f1_scores_val))

    # Storing the results in a dataframe
    scores = {'Train_precision': precision_scores_train_mean,
    'Test_precision': precision_scores_val_mean,
    'Train_recall': recall_scores_train_mean,
    'Test_recall': recall_scores_val_mean,
    'Train_f1_score': f1_scores_train_mean,
    'Test_f1_score': f1_scores_val_mean}

    print(scores)
"""

'def cv_scores_hyperparameter_tuning(X, y, num_features, cat_features, num_imputing_algorithm=XGBRegressor(), cat_imputing_algorithm=XGBClassifier(), scaling_outlier= False, scaler=MinMaxScaler(), rfe = False):\n    \n    Takes as argument the predictors and the target, the models used for imputing numerical and categorical \n    features, if any scaling and outlier removal should be performed,what scaling method should be used and if feature selection with rfe should be used.\n    Then it returns the results obtained from the stratified cross-validation for the given models.\n    \n    skf = StratifiedKFold(n_splits=5)\n\n    # Generating the lists to store our results\n    precision_scores_train = []\n    precision_scores_val = []\n    recall_scores_train = []\n    recall_scores_val = []\n    f1_scores_train =  []\n    f1_scores_val =  []\n\n    precision_scores_train_mean = []\n    precision_scores_val_mean = [] \n    recall_scores_train_mean = []\n    recall_scores_val_mean = []\n 

In [32]:
"""import xgboost as xgb
for gama in range(10):
    for depth in range(4,8):
        XGBT = XGBClassifier(gamma = gama, max_depth = depth)
        print(boost, gama, depth)
        cv_scores_hyperparameter_tuning(X, y,num_features,cat_features,scaling_outlier = True)
        print("------------")"""


'import xgboost as xgb\nfor gama in range(10):\n    for depth in range(4,8):\n        XGBT = XGBClassifier(gamma = gama, max_depth = depth)\n        print(boost, gama, depth)\n        cv_scores_hyperparameter_tuning(X, y,num_features,cat_features,scaling_outlier = True)\n        print("------------")'