## Import Packages, Training Data Subset, and Intermediate Test Data

In [1]:
import os

import numpy as np
import pandas as pd
# Show every column when displaying wide frames. The fully-qualified key is used because the
# bare 'max_columns' pattern is ambiguous on pandas versions that also define
# 'styler.render.max_columns', where it raises OptionError.
pd.set_option('display.max_columns', None)
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, RobustScaler
from sklearn.compose import ColumnTransformer 
from sklearn.decomposition import PCA

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import auc

%matplotlib inline

In [2]:
# Data directory: defaults to the original local path, but can be overridden per machine through
# the INSURANCE_DEMO_DATA_DIR environment variable so the notebook is not tied to one user's disk
data_path = os.environ.get('INSURANCE_DEMO_DATA_DIR', 'C:/Users/akipp/Documents/GitHub/Data/Insurance-Demo-Data/')
# File names are appended to data_path by plain string concatenation, so guarantee a trailing separator
if not data_path.endswith(('/', '\\')):
    data_path += '/'
train_data_name = 'exercise_40_train.csv'

In [3]:
# Load the raw training data from the configured directory
train_data = pd.read_csv(f'{data_path}{train_data_name}')

## Train/Test Split and Data Cleaning

In [5]:
def data_cleaning(df):
    """Return a cleaned copy of the raw exercise data; the input frame is left untouched.

    Every step is element-wise (no statistics are computed across rows):
      * x7  - percentage strings such as '0.0062%' or '1e-05%' -> float
      * x19 - dollar strings such as '$-908.65' -> float
      * x3  - abbreviated day names -> full day names (values outside the mapping become NaN)
      * x24, x77 - missing values -> 'other'
      * x33, x99 - missing values -> 'missing'
      * x79 - 1.0/0.0 flags -> '1'/'0' strings, anything else -> 'missing'
      * x39 - dropped (original analysis: single value, no missing values, so not useful)

    Non-string values in x7/x19 (NaN, or floats from a previous cleaning pass) are passed
    through unchanged, x79 accepts its own output labels, and x39 is only dropped when
    present, so the function can be re-applied to its own output (e.g. when a cell is re-run).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned columns.
    """
    def percent_to_float(x):
        # NaN and already-numeric values have nothing to strip
        if not isinstance(x, str):
            return x
        # float() parses scientific notation (e.g. '1e-05') natively, so no manual exponent math is needed
        return float(x.strip('%'))

    def dollar_to_float(x):
        # NaN and already-numeric values have nothing to strip
        if not isinstance(x, str):
            return x
        return float(x.strip('$'))

    # Full names map to themselves so that only the abbreviations are rewritten
    day_names = {'Monday': 'Monday', 'Tuesday': 'Tuesday', 'Wednesday': 'Wednesday', 'Thursday': 'Thursday',
                 'Friday': 'Friday', 'Saturday': 'Saturday', 'Sunday': 'Sunday',
                 'Mon': 'Monday', 'Tue': 'Tuesday', 'Wed': 'Wednesday', 'Thur': 'Thursday',
                 'Fri': 'Friday', 'Sat': 'Saturday', 'Sun': 'Sunday'}

    # String labels are included as keys so that an already-cleaned x79 column is preserved
    flag_labels = {1.0: '1', 0.0: '0', '1': '1', '0': '0', 'missing': 'missing'}

    # Work on a copy so the caller's frame is never modified
    cleaned = df.copy()

    # x7 is a percentage amount and x19 is a dollar amount; both are converted to plain floats
    cleaned['x7'] = cleaned['x7'].apply(percent_to_float)
    cleaned['x19'] = cleaned['x19'].apply(dollar_to_float)

    # Consolidate day of week names
    cleaned['x3'] = cleaned['x3'].map(day_names)

    # x24 (probably gender/sex) and x77 (probably car make/brand): missing -> 'other'
    cleaned['x24'] = cleaned['x24'].fillna('other')
    cleaned['x77'] = cleaned['x77'].fillna('other')

    # x33, x99, x79: we don't know what is missing here, so use an explicit 'missing' category.
    # x79 is mapped to strings so 'missing' can be a category and the column works with OHE.
    cleaned['x99'] = cleaned['x99'].fillna('missing')
    cleaned['x79'] = cleaned['x79'].map(flag_labels).fillna('missing')
    cleaned['x33'] = cleaned['x33'].fillna('missing')

    # errors = 'ignore' keeps this a no-op when x39 has already been removed
    cleaned = cleaned.drop(columns = ['x39'], errors = 'ignore')

    # Output initially cleaned data
    return(cleaned)

In [6]:
# Cleaning before the train/test split is safe here: every transformation in data_cleaning is
# element-wise (no statistic is computed across rows), so there is no data leakage
train_data = data_cleaning(train_data)

In [7]:
# Separate the features from the target column 'y', then hold out a stratified 10% of the rows
# as an intermediate test set (fixed seed for reproducibility)
X = train_data.drop(columns = ['y'])
y = train_data['y']
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.1,
    stratify = y,
    random_state = 42,
)

In [8]:
# Sanity check on the size of the two splits (one shape per line)
print(X_train.shape, X_test.shape, sep = '\n')

(36000, 99)
(4000, 99)


## Logistic Regression Modeling Pipeline

In [9]:
def lr_model_gscv(X_train, y_train, smote = True):
    """Grid-search a logistic regression pipeline and return the fitted GridSearchCV object.

    Pipeline steps, in order: one-hot encoding of the object-dtype columns (other columns are
    passed through), iterative imputation, optional SMOTE oversampling, Yeo-Johnson power
    transform, robust scaling, logistic regression.

    The search covers classifier__C in [0.01, 0.1, 1, 10, 100] with max_iter fixed at 1000,
    scored by ROC AUC over 5 shuffled stratified folds (seed 42), using 8 parallel jobs.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; columns with dtype 'object' are treated as categorical.
    y_train : array-like
        Training labels.
    smote : bool, default True
        When True a SMOTE step is added and an imblearn pipeline is used; otherwise a plain
        sklearn pipeline without resampling is used.

    Returns
    -------
    GridSearchCV
        The search object after fitting on (X_train, y_train).
    """
    # Categorical features are the object-dtype columns; these get one-hot encoded
    categorical_cols = X_train.columns[X_train.dtypes == 'object']

    encoder = ColumnTransformer(
        transformers = [('ohe', OneHotEncoder(handle_unknown = 'ignore', sparse = False), categorical_cols)],
        remainder = 'passthrough',
    )
    # Impute missing values with chained equations (MICE-style)
    imputer = IterativeImputer(max_iter = 20, tol = 0.01, n_nearest_features = 50, random_state = 42, verbose = 1)

    steps = [('ohe', encoder), ('mice', imputer)]
    if smote == True:
        # SMOTE creates synthetic minority-class rows. It is a resampler rather than a transformer,
        # so it needs the imblearn pipeline, which also restricts resampling to the data being
        # fitted (including inside cross-validation).
        steps.append(('smote', SMOTE(k_neighbors = 10, random_state = 42)))
        pipeline_cls = imbpipeline
    else:
        # Without SMOTE a plain sklearn pipeline is sufficient
        pipeline_cls = Pipeline

    steps.extend([
        # Yeo-Johnson transform towards normality (Box-Cox would require strictly positive data)
        ('gaussian', PowerTransformer()),
        # Robust rather than standard scaling because it is less affected by outliers
        ('scaler', RobustScaler()),
        ('classifier', LogisticRegression(random_state = 42)),
    ])
    pipeline = pipeline_cls(steps = steps)

    # Fold definition used by the grid search
    folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

    # LogisticRegression defaults: penalty 'l2', solver 'lbfgs', max_iter 100, C 1.0.
    # max_iter is raised to 1000 because the fit does not converge at 100 (original author's note).
    search_space = {'classifier__C': [0.01, 0.1, 1, 10, 100], 'classifier__max_iter': [1000]}

    grid_search = GridSearchCV(
        estimator = pipeline,
        param_grid = search_space,
        scoring = 'roc_auc',
        cv = folds,
        n_jobs = 8,
        verbose = 3,
    )
    grid_search.fit(X_train, y_train)
    return grid_search

### Logistic Regression without SMOTE

In [10]:
# Grid search for logistic regression using the plain sklearn pipeline (no SMOTE step)
lr_model = lr_model_gscv(X_train, y_train, smote = False)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[IterativeImputer] Completing matrix with shape (36000, 185)
[IterativeImputer] Change: 4762.814312581376, scaled tolerance: 1054.43357829 
[IterativeImputer] Change: 902.9395589654421, scaled tolerance: 1054.43357829 
[IterativeImputer] Early stopping criterion reached.


In [11]:
# Winning hyper-parameters and their mean cross-validated ROC AUC for the no-SMOTE search
print('Best model parameters determine during grid search:', lr_model.best_params_)
print('Best AUC score on cross-validation data:', lr_model.best_score_)

Best model parameters determine during grid search: {'classifier__C': 0.1, 'classifier__max_iter': 1000}
Best AUC score on cross-validation data: 0.7659928781183799


In [12]:
# Per-candidate timings and per-fold test scores from the grid search
pd.DataFrame(lr_model.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__C,param_classifier__max_iter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,108.387124,12.910848,0.645548,0.058682,0.01,1000,"{'classifier__C': 0.01, 'classifier__max_iter'...",0.760046,0.758299,0.769281,0.769006,0.770698,0.765466,0.0052,5
1,108.068055,13.227967,0.724269,0.069899,0.1,1000,"{'classifier__C': 0.1, 'classifier__max_iter':...",0.760664,0.75904,0.768502,0.769648,0.772109,0.765993,0.005173,1
2,110.140387,13.518889,0.734409,0.060566,1.0,1000,"{'classifier__C': 1, 'classifier__max_iter': 1...",0.760565,0.759048,0.768056,0.769493,0.772339,0.7659,0.005185,2
3,104.636041,11.953392,0.632222,0.052137,10.0,1000,"{'classifier__C': 10, 'classifier__max_iter': ...",0.760552,0.75901,0.768011,0.769428,0.772377,0.765876,0.005195,3
4,80.777431,21.026607,0.298392,0.073922,100.0,1000,"{'classifier__C': 100, 'classifier__max_iter':...",0.760553,0.759006,0.768003,0.769421,0.772386,0.765874,0.005196,4


### Logistic Regression with SMOTE

In [13]:
# Same grid search, this time with the SMOTE oversampling step (imblearn pipeline)
lr_model_smote = lr_model_gscv(X_train, y_train, smote = True)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[IterativeImputer] Completing matrix with shape (36000, 185)
[IterativeImputer] Change: 4762.814312581376, scaled tolerance: 1054.43357829 
[IterativeImputer] Change: 902.9395589654421, scaled tolerance: 1054.43357829 
[IterativeImputer] Early stopping criterion reached.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [14]:
# Winning hyper-parameters and their mean cross-validated ROC AUC for the SMOTE search
print('Best model parameters determine during grid search:', lr_model_smote.best_params_)
print('Best AUC score on cross-validation data:', lr_model_smote.best_score_)

Best model parameters determine during grid search: {'classifier__C': 100, 'classifier__max_iter': 1000}
Best AUC score on cross-validation data: 0.7656243496126398


In [15]:
# Per-candidate timings and per-fold test scores from the SMOTE grid search
pd.DataFrame(lr_model_smote.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__C,param_classifier__max_iter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,118.235202,11.422702,0.724222,0.052898,0.01,1000,"{'classifier__C': 0.01, 'classifier__max_iter'...",0.758626,0.757595,0.766974,0.768843,0.771542,0.764716,0.005595,5
1,121.653897,10.382984,0.753368,0.074432,0.1,1000,"{'classifier__C': 0.1, 'classifier__max_iter':...",0.760119,0.758592,0.767243,0.769535,0.772054,0.765509,0.005272,4
2,133.669401,11.520107,0.713796,0.165032,1.0,1000,"{'classifier__C': 1, 'classifier__max_iter': 1...",0.760452,0.758706,0.767045,0.769565,0.772027,0.765559,0.00516,3
3,154.719287,9.283014,0.594104,0.097337,10.0,1000,"{'classifier__C': 10, 'classifier__max_iter': ...",0.760588,0.758636,0.767198,0.769476,0.772137,0.765607,0.005175,2
4,127.38807,33.106193,0.302648,0.06782,100.0,1000,"{'classifier__C': 100, 'classifier__max_iter':...",0.760556,0.75879,0.767237,0.769321,0.772217,0.765624,0.00514,1


From the above results, it appears that although the classes are imbalanced, using SMOTE to artificially balance them does not provide any particular benefit, so it can be dropped from the Logistic Regression pipeline.

### Logistic Regression Intermediate Test Data Score - Without SMOTE

In [16]:
# Score the refit best pipeline on the 10% hold-out split; the search was built with
# scoring = 'roc_auc', so .score() reports ROC AUC
print('AUC on test data:', lr_model.score(X_test, y_test))

[IterativeImputer] Completing matrix with shape (4000, 185)
AUC on test data: 0.7478387779794312


## Cat Boost Classifier Modeling Pipeline

In [17]:
def cbc_model_gscv(X_train, y_train, smote = True):
    """Fit a CatBoost classifier pipeline inside GridSearchCV and return the fitted search object.

    Pipeline steps, in order: one-hot encoding of the object-dtype columns (other columns are
    passed through), iterative imputation, optional SMOTE oversampling, Yeo-Johnson power
    transform, robust scaling, CatBoostClassifier (configured for GPU training).

    The parameter grid holds a single candidate (classifier__iterations = 1000), so the search
    effectively yields a 5-fold stratified cross-validated ROC AUC estimate plus a final refit.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; columns with dtype 'object' are treated as categorical.
    y_train : array-like
        Training labels.
    smote : bool, default True
        When True a SMOTE step is added and an imblearn pipeline is used; otherwise a plain
        sklearn pipeline without resampling is used.

    Returns
    -------
    GridSearchCV
        The search object after fitting on (X_train, y_train).
    """
    # Categorical features are the object-dtype columns; these get one-hot encoded
    categorical_cols = X_train.columns[X_train.dtypes == 'object']

    encoder = ColumnTransformer(
        transformers = [('ohe', OneHotEncoder(handle_unknown = 'ignore', sparse = False), categorical_cols)],
        remainder = 'passthrough',
    )
    # Impute missing values with chained equations (MICE-style)
    imputer = IterativeImputer(max_iter = 20, tol = 0.01, n_nearest_features = 50, random_state = 42, verbose = 1)

    steps = [('ohe', encoder), ('mice', imputer)]
    if smote == True:
        # SMOTE creates synthetic minority-class rows. It is a resampler rather than a transformer,
        # so it needs the imblearn pipeline, which also restricts resampling to the data being
        # fitted (including inside cross-validation).
        steps.append(('smote', SMOTE(k_neighbors = 10, random_state = 42)))
        pipeline_cls = imbpipeline
    else:
        # Without SMOTE a plain sklearn pipeline is sufficient
        pipeline_cls = Pipeline

    steps.extend([
        # Yeo-Johnson transform towards normality (Box-Cox would require strictly positive data)
        ('gaussian', PowerTransformer()),
        # Robust rather than standard scaling because it is less affected by outliers
        ('scaler', RobustScaler()),
        ('classifier', CatBoostClassifier(task_type = 'GPU', devices = '0:1', verbose = 0)),
    ])
    pipeline = pipeline_cls(steps = steps)

    # Fold definition used by the grid search
    folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

    # Only the number of boosting iterations is specified; further candidates
    # (e.g. classifier__max_depth) can be added to this dict to widen the search
    search_space = {'classifier__iterations': [1000]}

    # NOTE(review): n_jobs = 1 is presumably because every fit targets the same GPU - confirm
    grid_search = GridSearchCV(
        estimator = pipeline,
        param_grid = search_space,
        scoring = 'roc_auc',
        cv = folds,
        n_jobs = 1,
        verbose = 1,
    )
    grid_search.fit(X_train, y_train)
    return grid_search

### Cat Boost Classifier without SMOTE

In [18]:
# Cross-validate and fit the CatBoost pipeline using the plain sklearn pipeline (no SMOTE step)
cbc_model = cbc_model_gscv(X_train, y_train, smote = False)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[IterativeImputer] Completing matrix with shape (28800, 185)
[IterativeImputer] Change: 4772.316650594342, scaled tolerance: 1054.43357829 
[IterativeImputer] Change: 955.7651025295002, scaled tolerance: 1054.43357829 
[IterativeImputer] Early stopping criterion reached.


  loglike = -n_samples / 2 * np.log(x_trans.var())


[IterativeImputer] Completing matrix with shape (7200, 185)
[IterativeImputer] Completing matrix with shape (28800, 185)
[IterativeImputer] Change: 4522.071574019754, scaled tolerance: 1054.43357829 
[IterativeImputer] Change: 1121.1487511084902, scaled tolerance: 1054.43357829 
[IterativeImputer] Change: 772.098752699685, scaled tolerance: 1054.43357829 
[IterativeImputer] Early stopping criterion reached.


  loglike = -n_samples / 2 * np.log(x_trans.var())


[IterativeImputer] Completing matrix with shape (7200, 185)
[IterativeImputer] Completing matrix with shape (28800, 185)
[IterativeImputer] Change: 4726.374474804047, scaled tolerance: 1054.43357829 
[IterativeImputer] Change: 1289.3303725232347, scaled tolerance: 1054.43357829 
[IterativeImputer] Change: 910.748209593336, scaled tolerance: 1054.43357829 
[IterativeImputer] Early stopping criterion reached.


  loglike = -n_samples / 2 * np.log(x_trans.var())


[IterativeImputer] Completing matrix with shape (7200, 185)
[IterativeImputer] Completing matrix with shape (28800, 185)
[IterativeImputer] Change: 5156.155298589655, scaled tolerance: 1054.43357829 
[IterativeImputer] Change: 1118.8323782043637, scaled tolerance: 1054.43357829 
[IterativeImputer] Change: 559.6982391417464, scaled tolerance: 1054.43357829 
[IterativeImputer] Early stopping criterion reached.


  loglike = -n_samples / 2 * np.log(x_trans.var())


[IterativeImputer] Completing matrix with shape (7200, 185)
[IterativeImputer] Completing matrix with shape (28800, 185)
[IterativeImputer] Change: 4853.875315268089, scaled tolerance: 1052.68939228 
[IterativeImputer] Change: 1400.5024189753487, scaled tolerance: 1052.68939228 
[IterativeImputer] Change: 537.7464642603584, scaled tolerance: 1052.68939228 
[IterativeImputer] Early stopping criterion reached.


  loglike = -n_samples / 2 * np.log(x_trans.var())


[IterativeImputer] Completing matrix with shape (7200, 185)
[IterativeImputer] Completing matrix with shape (36000, 185)
[IterativeImputer] Change: 4762.814312581376, scaled tolerance: 1054.43357829 
[IterativeImputer] Change: 902.9395589654421, scaled tolerance: 1054.43357829 
[IterativeImputer] Early stopping criterion reached.


In [19]:
# Parameters and mean cross-validated ROC AUC for the no-SMOTE CatBoost pipeline
print('Best model parameters determine during grid search:', cbc_model.best_params_)
print('Best AUC score on cross-validation data:', cbc_model.best_score_)

Best model parameters determine during grid search: {'classifier__iterations': 1000}
Best AUC score on cross-validation data: 0.803347346220171


In [20]:
# Timings and per-fold test scores for the no-SMOTE CatBoost pipeline
pd.DataFrame(cbc_model.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__iterations,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,69.108671,6.487655,0.226608,0.046409,1000,{'classifier__iterations': 1000},0.798605,0.802302,0.804163,0.805833,0.805834,0.803347,0.002706,1


### Cat Boost Classifier with SMOTE

In [21]:
# Same CatBoost pipeline, this time with the SMOTE oversampling step (imblearn pipeline)
cbc_model_smote = cbc_model_gscv(X_train, y_train, smote = True)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[IterativeImputer] Completing matrix with shape (28800, 185)
[IterativeImputer] Change: 4772.316650594342, scaled tolerance: 1054.43357829 
[IterativeImputer] Change: 955.7651025295002, scaled tolerance: 1054.43357829 
[IterativeImputer] Early stopping criterion reached.
[IterativeImputer] Completing matrix with shape (7200, 185)
[IterativeImputer] Completing matrix with shape (28800, 185)
[IterativeImputer] Change: 4522.071574019754, scaled tolerance: 1054.43357829 
[IterativeImputer] Change: 1121.1487511084902, scaled tolerance: 1054.43357829 
[IterativeImputer] Change: 772.098752699685, scaled tolerance: 1054.43357829 
[IterativeImputer] Early stopping criterion reached.
[IterativeImputer] Completing matrix with shape (7200, 185)
[IterativeImputer] Completing matrix with shape (28800, 185)
[IterativeImputer] Change: 4726.374474804047, scaled tolerance: 1054.43357829 
[IterativeImputer] Change: 1289.3303725232347, scaled tole

In [22]:
# Parameters and mean cross-validated ROC AUC for the SMOTE CatBoost pipeline
print('Best model parameters determine during grid search:', cbc_model_smote.best_params_)
print('Best AUC score on cross-validation data:', cbc_model_smote.best_score_)

Best model parameters determine during grid search: {'classifier__iterations': 1000}
Best AUC score on cross-validation data: 0.8044069664557026


In [23]:
# Timings and per-fold test scores for the SMOTE CatBoost pipeline
pd.DataFrame(cbc_model_smote.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__iterations,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,76.841433,2.491626,0.209445,0.014029,1000,{'classifier__iterations': 1000},0.799835,0.802491,0.805405,0.804667,0.809636,0.804407,0.003255,1


From the above results, it appears that although the classes are imbalanced, using SMOTE to artificially balance them provides little, if any, benefit, so it can be dropped from the Cat Boost Classifier pipeline.

### Cat Boost Classifier Intermediate Test Data Score - Without SMOTE

In [24]:
# Score the refit CatBoost pipeline on the 10% hold-out split; the search was built with
# scoring = 'roc_auc', so .score() reports ROC AUC
print('AUC on test data:', cbc_model.score(X_test, y_test))

[IterativeImputer] Completing matrix with shape (4000, 185)
AUC on test data: 0.7943153861665659


## Final Test Predictions

In [27]:
# File holding the final test set; it is read from the same directory as the training file
test_data_name = 'exercise_40_test.csv'

In [28]:
# Load the final test data from the configured directory
test_data = pd.read_csv(f'{data_path}{test_data_name}')

In [29]:
# Apply the same cleaning used for the training data.
# NOTE(review): this rebinds X_test, replacing the hold-out split created by train_test_split;
# re-running the earlier hold-out scoring cells after this point would pair it with the old y_test.
X_test = data_cleaning(test_data)

In [45]:
# For each model, label the two predict_proba columns and keep only the class-1 probability
lr_test_probs = pd.DataFrame(
    lr_model.predict_proba(X_test),
    columns = ['prob_class_0', 'prob_class_1'],
)[['prob_class_1']]
cbc_test_probs = pd.DataFrame(
    cbc_model.predict_proba(X_test),
    columns = ['prob_class_0', 'prob_class_1'],
)[['prob_class_1']]

[IterativeImputer] Completing matrix with shape (10000, 185)
[IterativeImputer] Completing matrix with shape (10000, 185)


In [47]:
# Write one probability per row, without header or index, for each model
lr_test_probs.to_csv(f'{data_path}glmresults.csv', index = False, header = False)
cbc_test_probs.to_csv(f'{data_path}nonglmresults.csv', index = False, header = False)