In [1]:
import csv
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

plt.style.use('ggplot')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [2]:
import scipy.stats as stats
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import make_scorer, classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, cohen_kappa_score 
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV

In [3]:
from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [5]:
f = pd.read_csv('df_data.csv')
df = pd.DataFrame(f)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41176 entries, 0 to 41175
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   euribor3m          41176 non-null  float64
 1   cons.conf.idx      41176 non-null  float64
 2   age                41176 non-null  float64
 3   previous           41176 non-null  float64
 4   blue_collar        41176 non-null  int64  
 5   student            41176 non-null  int64  
 6   retiree            41176 non-null  int64  
 7   unemployed         41176 non-null  int64  
 8   single             41176 non-null  int64  
 9   no_default         41176 non-null  int64  
 10  age_student        41176 non-null  float64
 11  age_retiree        41176 non-null  float64
 12  contact_telephone  41176 non-null  int64  
 13  month_aug          41176 non-null  int64  
 14  month_dec          41176 non-null  int64  
 15  month_jul          41176 non-null  int64  
 16  month_jun          411

The data before us is very imbalanced, with only 12.7% of the positive class. The choice of scoring metric on which to train the machine learning algorithm is therefore extremely important. The typical accuracy scorer is not the best choice with such an imbalanced dataset because one could get an apparently high accuracy score of approximately 87% if the model just simply picks the negative class. This is obviously unhelpful to the goal of predicting the positive class.

#### The F1 score tends to be the better metric for imbalanced datasets as it evaluates both the precision and recall rates, so it is focused on how good the model is at predicting the positive (minority in this case) class. Another good metric is Cohen's Kappa, which takes into account how much agreement would be expected by chance.

In [7]:
f1_scorer = make_scorer(f1_score)

# Scaling the Explanatory Variables

In [8]:
y = df.pop('target')

In [9]:
X = df

In [10]:
X.columns

Index(['euribor3m', 'cons.conf.idx', 'age', 'previous', 'blue_collar',
       'student', 'retiree', 'unemployed', 'single', 'no_default',
       'age_student', 'age_retiree', 'contact_telephone', 'month_aug',
       'month_dec', 'month_jul', 'month_jun', 'month_mar', 'month_may',
       'month_nov', 'month_oct', 'month_sep', 'day_of_week_mon',
       'day_of_week_thu', 'day_of_week_tue', 'day_of_week_wed'],
      dtype='object')

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=8)

Will use MinMax scaler to scale the variables because (1) I don't need normally distributed data because not using OLS, (2) plus this is a sizeable dataset, and (2) the prevalence of dummy variables among the explanatory variables.

In [12]:
scaler = MinMaxScaler()
X_train_mm = scaler.fit_transform(X_train)
X_test_mm = scaler.transform(X_test)

In [13]:
X_train = pd.DataFrame(X_train_mm, columns=['euribor3m', 'cons.conf.idx', 'age', 'previous', 'blue_collar',
       'student', 'retiree', 'unemployed', 'single', 'no_default', 'age_student', 'age_retiree', 'contact_telephone', 
        'month_aug', 'month_dec', 'month_jul', 'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct', 
        'month_sep', 'day_of_week_mon', 'day_of_week_thu', 'day_of_week_tue', 'day_of_week_wed'])

In [14]:
X_test = pd.DataFrame(X_test_mm, columns=['euribor3m', 'cons.conf.idx', 'age', 'previous', 'blue_collar',
       'student', 'retiree', 'unemployed', 'single', 'no_default', 'age_student', 'age_retiree', 'contact_telephone', 
        'month_aug', 'month_dec', 'month_jul', 'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct', 
        'month_sep', 'day_of_week_mon', 'day_of_week_thu', 'day_of_week_tue', 'day_of_week_wed'])

In [15]:
# Confirms that the max-min scaling has taken place
X_train.describe()

Unnamed: 0,euribor3m,cons.conf.idx,age,previous,blue_collar,student,retiree,unemployed,single,no_default,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed
count,28823.0,28823.0,28823.0,28823.0,28823.0,28823.0,28823.0,28823.0,28823.0,28823.0,...,28823.0,28823.0,28823.0,28823.0,28823.0,28823.0,28823.0,28823.0,28823.0,28823.0
mean,0.677648,0.430109,0.283754,0.028796,0.222912,0.020747,0.040731,0.024217,0.281546,0.792561,...,0.129515,0.012906,0.33279,0.099886,0.017521,0.0136,0.207265,0.209728,0.196718,0.197516
std,0.392972,0.193246,0.128394,0.082555,0.416207,0.14254,0.197671,0.153724,0.449761,0.405479,...,0.335774,0.112873,0.47122,0.299852,0.131203,0.115826,0.405354,0.407122,0.397524,0.398132
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.160961,0.338912,0.185185,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.957379,0.376569,0.259259,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.980957,0.60251,0.37037,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Gridsearch Logit, Random Forest & SVC models

In [16]:
kf = StratifiedKFold(n_splits=5)

### Logistic Regression

In [17]:
logit = LogisticRegression(penalty='l2', C=1, solver='liblinear', class_weight='balanced', random_state=8)

In [18]:
logit_gs_params = {'penalty': ['l1', 'l2'],
                   'solver': ['liblinear', 'saga'],
                   'C': [0.001, 0.01, 0.1, 1, 10, 100],
                   'class_weight': [None, 'balanced']}

scorers = {
    'kappa': make_scorer(cohen_kappa_score),
    'f1': make_scorer(f1_score)
}

In [19]:
gs_logit = GridSearchCV(logit, logit_gs_params, scoring=scorers, refit='f1', cv=5, verbose=2, n_jobs=-1)
gs_logit = gs_logit.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   15.4s finished


In [20]:
gs_logit.best_score_

0.4122191012059292

In [21]:
gs_logit.best_params_

{'C': 10, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'liblinear'}

In [22]:
gs_logit.best_estimator_

LogisticRegression(C=10, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=8, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
# Using the "best estimator" logit model
logit = LogisticRegression(C=10, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=8, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
logit.fit(X_train, y_train)
y_hat_train_lr = logit.predict(X_train)
y_hat_test_lr = logit.predict(X_test)
scores = cross_val_score(logit, X_train, y_train, scoring=f1_scorer, cv=kf)

print("Cross-validation F1 scores:", scores)
print("Mean CV F1 scores:", np.mean(scores))

Cross-validation F1 scores: [0.40718563 0.42480469 0.42436548 0.40897999 0.39575972]
Mean CV F1 scores: 0.4122191012059292


In [25]:
print("Train set accuracy score:", logit.score(X_train, y_train))
print("Test set accuracy score:", logit.score(X_test, y_test))
print("Test set F1 score on Class 1:", f1_score(y_test, y_hat_test_lr, average='binary'))
print("Test set Cohen's kappa:", cohen_kappa_score(y_test, y_hat_test_lr))

Train set accuracy score: 0.7964472816847656
Test set accuracy score: 0.7966485873876791
Test set F1 score on Class 1: 0.41662796098467253
Test set Cohen's kappa: 0.31164329851287864


In [26]:
# The F1 score on the test set has more than doubled from the initial logistic regression results
print(classification_report(y_test, y_hat_test_lr))

              precision    recall  f1-score   support

           0       0.95      0.82      0.88     10961
           1       0.31      0.64      0.42      1392

    accuracy                           0.80     12353
   macro avg       0.63      0.73      0.65     12353
weighted avg       0.88      0.80      0.83     12353



### Random Forest

In [27]:
# Entropy criterion could help imbalanced datasets as it computes the logarithm of the probabilities of each class
rf = RandomForestClassifier(n_estimators=200, criterion='entropy', max_features='auto', min_samples_split=5, 
                            class_weight='balanced', random_state=8)

In [28]:
rf_gs_params = {'n_estimators': [200, 300, 400, 500, 800],
                'max_features': ['log2', 'sqrt'],
                'min_samples_split': [2, 5, 10, 20],
                'class_weight': [None, 'balanced']}

scorers = {
    'kappa': make_scorer(cohen_kappa_score),
    'f1': make_scorer(f1_score)
}

In [29]:
gs_rf = GridSearchCV(rf, rf_gs_params, scoring=scorers, refit='f1', cv=5, verbose=2, n_jobs=-1)
gs_rf = gs_rf.fit(X_train, y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 14.4min finished


In [30]:
gs_rf.best_score_

0.465726129853416

In [31]:
gs_rf.best_params_

{'class_weight': 'balanced',
 'max_features': 'sqrt',
 'min_samples_split': 20,
 'n_estimators': 500}

In [32]:
gs_rf.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='entropy', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=20,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=8, verbose=0,
                       warm_start=False)

In [33]:
rf = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='entropy', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=20,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=8, verbose=0,
                       warm_start=False)

In [34]:
rf.fit(X_train, y_train)
y_hat_train_rf = rf.predict(X_train)
y_hat_test_rf = rf.predict(X_test)
scores = cross_val_score(rf, X_train, y_train, scoring=f1_scorer, cv=kf)

print("Cross-validation F1 scores:", scores)
print("Mean CV F1 scores:", np.mean(scores))

Cross-validation F1 scores: [0.46126341 0.48339483 0.46626506 0.47613293 0.44157442]
Mean CV F1 scores: 0.465726129853416


In [35]:
print("Train set accuracy score:", rf.score(X_train, y_train))
print("Test set accuracy score:", rf.score(X_test, y_test))
print("Test set F1 score on Class 1:", f1_score(y_test, y_hat_test_rf, average='binary'))
print("Test set Cohen's kappa:", cohen_kappa_score(y_test, y_hat_test_rf))

Train set accuracy score: 0.874926274156056
Test set accuracy score: 0.8490245284546264
Test set F1 score on Class 1: 0.47627071047458586
Test set Cohen's kappa: 0.392937575560525


In [36]:
# The Random Forest's F1 score on the test set is much better than logistic regression above, 
# and overall accuracy score is also higher
print(classification_report(y_test, y_hat_test_rf))

              precision    recall  f1-score   support

           0       0.95      0.88      0.91     10961
           1       0.39      0.61      0.48      1392

    accuracy                           0.85     12353
   macro avg       0.67      0.74      0.69     12353
weighted avg       0.88      0.85      0.86     12353



### Support Vector Machine Classifier

In [37]:
# Need to fix max_iter at a finite value to reduce the time it takes for the gridsearch to complete.
svc = SVC(C=1, kernel='rbf', gamma=0.01, max_iter=150000, class_weight='balanced', random_state=8)

In [39]:
svc_gs_params = {'C': [1, 10, 100, 1000, 10000],
                 'kernel': ['poly', 'rbf'],
                 'gamma': [0.001, 0.01, 0.1, 1],
                 'class_weight': [None, 'balanced']}

scorers = {
    'kappa': make_scorer(cohen_kappa_score),
    'f1': make_scorer(f1_score)
}

In [40]:
gs_svc = GridSearchCV(svc, svc_gs_params, scoring=scorers, refit='f1', cv=5, verbose=2, n_jobs=-1)
gs_svc = gs_svc.fit(X_train, y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 18.7min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 50.3min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 59.9min finished


In [41]:
gs_svc.best_score_

0.46307030638941216

In [42]:
gs_svc.best_params_

{'C': 1000, 'class_weight': 'balanced', 'gamma': 0.01, 'kernel': 'rbf'}

In [43]:
gs_svc.best_estimator_

SVC(C=1000, break_ties=False, cache_size=200, class_weight='balanced',
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.01,
    kernel='rbf', max_iter=150000, probability=False, random_state=8,
    shrinking=True, tol=0.001, verbose=False)

In [44]:
svc = SVC(C=1000, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=150000, probability=False, random_state=8, shrinking=True,
    tol=0.001, verbose=False)

In [45]:
svc.fit(X_train, y_train)
y_hat_train_svc = svc.predict(X_train)
y_hat_test_svc = svc.predict(X_test)
scores = cross_val_score(svc, X_train, y_train, scoring=f1_scorer, cv=kf)

print("Cross-validation F1 scores:", scores)
print("Mean CV F1 scores:", np.mean(scores))



Cross-validation F1 scores: [0.45265589 0.49044978 0.4636472  0.46217009 0.44642857]
Mean CV F1 scores: 0.46307030638941216


In [46]:
print("Train set accuracy score:", svc.score(X_train, y_train))
print("Test set accuracy score:", svc.score(X_test, y_test))
print("Test set F1 score on Class 1:", f1_score(y_test, y_hat_test_svc, average='binary'))
print("Test set Cohen's kappa:", cohen_kappa_score(y_test, y_hat_test_svc))

Train set accuracy score: 0.8436665163237692
Test set accuracy score: 0.8421436088399579
Test set F1 score on Class 1: 0.46399120395821886
Test set Cohen's kappa: 0.37735849056603776


In [47]:
print(metrics.classification_report(y_test, y_hat_test_svc))

              precision    recall  f1-score   support

           0       0.95      0.87      0.91     10961
           1       0.38      0.61      0.46      1392

    accuracy                           0.84     12353
   macro avg       0.66      0.74      0.69     12353
weighted avg       0.88      0.84      0.86     12353



#### Random Forest had the best F1 score and Cohen's kappa on the test set of the three models in the Gridsearch. Random Forest and SVC both outperformed logistic regression handily, possibly due to non-linear relationships in the dataset. The CV F1 scores on the train set were rather similar between the RF and SVC models.

In terms of the F1 score on the test set, the rankings from lowest to highest are: Logit (0.42), SVC (0.46) & Random Forest (0.48).

In terms of the Kappa score on the test set, the rankings from lowest to highest are: Logit (0.31), SVC(0.38) & Random Forest (0.39).

Time now to resample the training data to balance up the minority class to see that helps with improving the various models'predictive power on the test set. I will use the combination SMOTE-Tomek links resampling technique.

# Resampling the Train Set & Gridsearch Again

#### Oversampling the minority positive class with SMOTE, and undersampling the majority negative class Tomek links.

In [48]:
smto = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'), random_state=8)
X_train_st, y_train_st = smto.fit_resample(X_train, y_train)

In [49]:
X_train.shape

(28823, 26)

In [50]:
X_train_st.shape

(50482, 26)

In [51]:
y_train.shape

(28823,)

In [52]:
y_train_st.shape

(50482,)

### Logistic Regression with resampling

In [53]:
logit = LogisticRegression(penalty='l2', C=1, solver='liblinear', class_weight='balanced', random_state=8)

In [54]:
logit_params = {'class__penalty': ['l1', 'l2'],
                'class__solver': ['liblinear', 'saga'],
                'class__C': [0.001, 0.01, 0.1, 1, 10, 100]}

pipeline = Pipeline([('sampling', smto), ('class', logit)])

In [55]:
resample_logit = GridSearchCV(pipeline, logit_params, scoring=f1_scorer, cv=5, verbose=2, n_jobs=-1)
resample_logit = resample_logit.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  8.7min finished


In [56]:
resample_logit.best_score_

0.405996978373535

In [57]:
resample_logit.best_estimator_

Pipeline(memory=None,
         steps=[('sampling',
                 SMOTETomek(n_jobs=None, random_state=8,
                            sampling_strategy='auto', smote=None,
                            tomek=TomekLinks(n_jobs=None,
                                             sampling_strategy='majority'))),
                ('class',
                 LogisticRegression(C=100, class_weight='balanced', dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l1', random_state=8,
                                    solver='liblinear', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [58]:
logit_re = LogisticRegression(C=100, class_weight='balanced', dual=False, fit_intercept=True, intercept_scaling=1,
                           l1_ratio=None, max_iter=100, multi_class='auto', n_jobs=None, penalty='l1', random_state=8,
                           solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [59]:
logit_re.fit(X_train_st, y_train_st)
y_hat_train_lr = logit_re.predict(X_train_st)
y_hat_test_lr = logit_re.predict(X_test)
scores = cross_val_score(logit_re, X_train_st, y_train_st, scoring=f1_scorer, cv=kf)

print("Cross-validation F1 scores:", scores)
print("Mean CV F1 scores:", np.mean(scores))

Cross-validation F1 scores: [0.70755423 0.71027837 0.70377948 0.70883641 0.7050993 ]
Mean CV F1 scores: 0.707109558301049


In [60]:
print("Train set accuracy score:", logit_re.score(X_train_st, y_train_st))
print("Test set accuracy score:", logit_re.score(X_test, y_test))
print("Test set F1 score on Class 1:", f1_score(y_test, y_hat_test_lr, average='binary'))
print("Test set Cohen's kappa:", cohen_kappa_score(y_test, y_hat_test_lr))

Train set accuracy score: 0.7286161404064815
Test set accuracy score: 0.7900914757548774
Test set F1 score on Class 1: 0.4102797361837617
Test set Cohen's kappa: 0.3029124060143258


#### Much higher CV F1 scores on the train set (~0.71 vs ~0.41) after the resampling, but the results from the test set are slightly worse than the original scores from Logistic regression

### Random Forest with resampling

In [61]:
rf = RandomForestClassifier(n_estimators=200, criterion='entropy', max_features='auto', min_samples_split=5, 
                            class_weight='balanced', random_state=8)

In [62]:
rf_params = {'class__n_estimators': [200, 300, 400, 500, 800],
             'class__max_features': ['log2', 'sqrt'],
             'class__min_samples_split': [2, 5, 10, 20]}

pipeline = Pipeline([('sampling', smto), ('class', rf)])

In [63]:
resample_rf = GridSearchCV(pipeline, rf_params, scoring=f1_scorer, cv=5, verbose=2, n_jobs=-1)
resample_rf = resample_rf.fit(X_train_st, y_train_st)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 28.7min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 37.7min finished


In [64]:
resample_rf.best_score_

0.8991214669347718

In [65]:
resample_rf.best_estimator_

Pipeline(memory=None,
         steps=[('sampling',
                 SMOTETomek(n_jobs=None, random_state=8,
                            sampling_strategy='auto', smote=None,
                            tomek=TomekLinks(n_jobs=None,
                                             sampling_strategy='majority'))),
                ('class',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight='balanced',
                                        criterion='entropy', max_depth=None,
                                        max_features='sqrt',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=5,
                                        min_weight_fraction_leaf=0.0,
                            

In [66]:
rf_re = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight='balanced',
                                        criterion='entropy', max_depth=None,
                                        max_features='sqrt',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=5,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=800, n_jobs=None,
                                        oob_score=False, random_state=8,
                                        verbose=0, warm_start=False)

In [67]:
rf_re.fit(X_train_st, y_train_st)
y_hat_train_rf = rf_re.predict(X_train_st)
y_hat_test_rf = rf_re.predict(X_test)
scores = cross_val_score(rf_re, X_train_st, y_train_st, scoring=f1_scorer, cv=kf)

print("Cross-validation F1 scores:", scores)
print("Mean CV F1 scores:", np.mean(scores))

Cross-validation F1 scores: [0.79977949 0.92910807 0.9237911  0.92780723 0.92178609]
Mean CV F1 scores: 0.900454397817698


In [68]:
print("Train set accuracy score:", rf_re.score(X_train_st, y_train_st))
print("Test set accuracy score:", rf_re.score(X_test, y_test))
print("Test set F1 score on Class 1:", f1_score(y_test, y_hat_test_rf, average='binary'))
print("Test set Cohen's kappa:", cohen_kappa_score(y_test, y_hat_test_rf))

Train set accuracy score: 0.9677904995840101
Test set accuracy score: 0.851210232332227
Test set F1 score on Class 1: 0.40169270833333337
Test set Cohen's kappa: 0.3175855247718634


#### Similarly improved performance on the train set following resampling, but worse scores on the test set with Random Forest.

### Support Vector Machine Classifier with resampling

In [69]:
svc = SVC(C=1, kernel='rbf', gamma=0.01, max_iter=150000, class_weight='balanced', random_state=8)

In [70]:
svc_params = {'class__C': [1, 10, 100, 1000, 10000],
              'class__kernel': ['poly', 'rbf'],
              'class__gamma': [0.001, 0.01, 0.1, 1]}

pipeline = Pipeline([('sampling', smto), ('class', svc)])

In [71]:
resample_svc = GridSearchCV(pipeline, svc_params, scoring=f1_scorer, cv=5, verbose=2, n_jobs=-1)
resample_svc = resample_svc.fit(X_train_st, y_train_st)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 20.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 95.6min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 128.2min finished


In [72]:
resample_svc.best_score_

0.8089049802758371

In [73]:
resample_svc.best_estimator_

Pipeline(memory=None,
         steps=[('sampling',
                 SMOTETomek(n_jobs=None, random_state=8,
                            sampling_strategy='auto', smote=None,
                            tomek=TomekLinks(n_jobs=None,
                                             sampling_strategy='majority'))),
                ('class',
                 SVC(C=10000, break_ties=False, cache_size=200,
                     class_weight='balanced', coef0=0.0,
                     decision_function_shape='ovr', degree=3, gamma=1,
                     kernel='rbf', max_iter=150000, probability=False,
                     random_state=8, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)

In [74]:
svc_re = SVC(C=10000, break_ties=False, cache_size=200,
                     class_weight='balanced', coef0=0.0,
                     decision_function_shape='ovr', degree=3, gamma=1,
                     kernel='rbf', max_iter=150000, probability=False,
                     random_state=8, shrinking=True, tol=0.001,
                     verbose=False)

In [75]:
svc_re.fit(X_train_st, y_train_st)
y_hat_train_svc = svc_re.predict(X_train_st)
y_hat_test_svc = svc_re.predict(X_test)
scores = cross_val_score(svc_re, X_train_st, y_train_st, scoring=f1_scorer, cv=kf)

print("Cross-validation F1 scores:", scores)
print("Mean CV F1 scores:", np.mean(scores))



Cross-validation F1 scores: [0.75232131 0.82730515 0.8184138  0.82168217 0.82345913]
Mean CV F1 scores: 0.8086363104392564


In [76]:
print("Train set accuracy score:", svc_re.score(X_train_st, y_train_st))
print("Test set accuracy score:", svc_re.score(X_test, y_test))
print("Test set F1 score on Class 1:", f1_score(y_test, y_hat_test_svc, average='binary'))
print("Test set Cohen's kappa:", cohen_kappa_score(y_test, y_hat_test_svc))

Train set accuracy score: 0.8369121667128878
Test set accuracy score: 0.7936533635554116
Test set F1 score on Class 1: 0.3048813744205072
Test set Cohen's kappa: 0.19189227747848947


#### Rather poor outcome with SVC on the test set. It has the worst test set results post-resampling in terms of the F1 score and Cohen's kappa among the three models.

### Comparison of the models' post-resampling classification reports

In [77]:
# Logistic regression
print(classification_report(y_test, y_hat_test_lr))

              precision    recall  f1-score   support

           0       0.95      0.81      0.87     10961
           1       0.30      0.65      0.41      1392

    accuracy                           0.79     12353
   macro avg       0.62      0.73      0.64     12353
weighted avg       0.87      0.79      0.82     12353



In [78]:
# Random forest
print(classification_report(y_test, y_hat_test_rf))

              precision    recall  f1-score   support

           0       0.93      0.90      0.92     10961
           1       0.37      0.44      0.40      1392

    accuracy                           0.85     12353
   macro avg       0.65      0.67      0.66     12353
weighted avg       0.86      0.85      0.86     12353



In [79]:
# SVC
print(classification_report(y_test, y_hat_test_svc))

              precision    recall  f1-score   support

           0       0.92      0.84      0.88     10961
           1       0.25      0.40      0.30      1392

    accuracy                           0.79     12353
   macro avg       0.58      0.62      0.59     12353
weighted avg       0.84      0.79      0.81     12353



All three models' "best estimators" on the train set produced much higher F1 scores following the resampling procedure. The Random Forest model achieved the highest cross-validation F1 scores in particular. 

#### However, the use of the combination SMOTE and Tomek links algorithms failed to boost the predictions from all three models on the test data. In short, the resampling procedure did not manage to improve the performance of the models on the test set, so the improvements seen in the training data were not generalisable beyond it.

#### I will use Random Forest as the algorithm of choice for the All-variables predictive model (without resampling) because it was the top model to emerge from the Gridsearch procedures above. Furthermore, one could explore how much each explanatory variable contributed to the overall model prediction with Random Forest, unlike the black-box SVC . So if model interpretability and transparency are important, these are additional reasons to pick RF over SVC.

Given the poor test results, I will not be using resampling for the All-variables predictive model of choice. Other oversampling and undersampling techniques could be analysed to see if they might result in more generalisable improvements to the predictive models. But a lot more time would be needed to do it comprehensively.