In [1]:
import csv
import numpy as np
import pandas as pd

In [2]:
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, cohen_kappa_score, average_precision_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import xgboost as xgb

In [4]:
f = pd.read_csv('C:\\df_data.csv')
df = pd.DataFrame(f)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41176 entries, 0 to 41175
Data columns (total 37 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   41176 non-null  float64
 1   campaign              41176 non-null  float64
 2   previous              41176 non-null  float64
 3   cons.conf.idx         41176 non-null  float64
 4   euribor3m             41176 non-null  float64
 5   target                41176 non-null  int64  
 6   blue_collar           41176 non-null  int64  
 7   student               41176 non-null  int64  
 8   retiree               41176 non-null  int64  
 9   unemployed            41176 non-null  int64  
 10  illiterate            41176 non-null  int64  
 11  unknown_edu           41176 non-null  int64  
 12  university            41176 non-null  int64  
 13  basic4                41176 non-null  int64  
 14  basic_other           41176 non-null  int64  
 15  single             

The data before us is very imbalanced, with only 12.7% of the positive class. The choice of scoring metric on which to train the machine learning algorithm is therefore extremely important. The typical accuracy scorer is not the best choice with such an imbalanced dataset because one could get an apparently high accuracy score of approximately 87% if the model just simply picks the negative class. This is obviously unhelpful to the goal of predicting the positive class.

#### The F1 score tends to be the better metric for imbalanced datasets as it evaluates both the precision and recall rates, so it is focused on how good the model is at predicting the positive (minority in this case) class. Another good metric is Cohen's Kappa, which takes into account how much agreement would be expected by chance.

In [6]:
#f1_scorer = make_scorer(f1_score)
# Set scorers using F1 score and Cohen's kappa
scorers = {
    'kappa': make_scorer(cohen_kappa_score),
    'f1': make_scorer(f1_score)
}

# Scaling & Train-Test Split

In [7]:
y = df.pop('target')

In [8]:
X = df

In [9]:
X.columns

Index(['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m',
       'blue_collar', 'student', 'retiree', 'unemployed', 'illiterate',
       'unknown_edu', 'university', 'basic4', 'basic_other', 'single',
       'no_default', 'age_retiree', 'age_student', 'age_basic4',
       'age_illiterate', 'poutcome_nonexistent', 'poutcome_success',
       'contact_telephone', 'month_aug', 'month_dec', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
       'day_of_week_mon', 'day_of_week_thu', 'day_of_week_tue',
       'day_of_week_wed'],
      dtype='object')

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=8)

Use **MinMax scaler** to scale the variables because (1) don't need normally distributed data given that we are not using OLS, (2) this is a sizeable dataset, and (3) the prevalence of **dummy variables** among the explanatory variables.

In [11]:
scaler = MinMaxScaler()
X_train_mm = scaler.fit_transform(X_train)
X_test_mm = scaler.transform(X_test)

In [12]:
X_train = pd.DataFrame(X_train_mm, columns=['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m', 
                                            'blue_collar', 'student', 'retiree', 'unemployed', 'illiterate',
                                            'unknown_edu', 'university', 'basic4','basic_other', 'single', 
                                            'no_default', 'age_retiree', 'age_student', 'age_basic4', 
                                            'age_illiterate', 'poutcome_nonexistent', 'poutcome_success', 
                                            'contact_telephone', 'month_aug', 'month_dec', 'month_jul', 
                                            'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct', 
                                            'month_sep', 'day_of_week_mon', 'day_of_week_thu', 'day_of_week_tue', 
                                            'day_of_week_wed'])

In [13]:
X_test = pd.DataFrame(X_test_mm, columns=['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m', 
                                          'blue_collar', 'student', 'retiree', 'unemployed', 'illiterate',
                                          'unknown_edu', 'university', 'basic4','basic_other', 'single',
                                          'no_default', 'age_retiree', 'age_student', 'age_basic4', 
                                          'age_illiterate', 'poutcome_nonexistent', 'poutcome_success', 
                                          'contact_telephone', 'month_aug', 'month_dec', 'month_jul', 
                                          'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct', 
                                          'month_sep', 'day_of_week_mon', 'day_of_week_thu', 'day_of_week_tue', 
                                          'day_of_week_wed'])

In [14]:
# Confirms that the max-min scaling has taken place
X_train.describe()

Unnamed: 0,age,campaign,previous,cons.conf.idx,euribor3m,blue_collar,student,retiree,unemployed,illiterate,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed
count,28823.0,28823.0,28823.0,28823.0,28823.0,28823.0,28823.0,28823.0,28823.0,28823.0,...,28823.0,28823.0,28823.0,28823.0,28823.0,28823.0,28823.0,28823.0,28823.0,28823.0
mean,0.283754,0.037291,0.028796,0.430109,0.677648,0.222912,0.020747,0.040731,0.024217,0.000486,...,0.129515,0.012906,0.33279,0.099886,0.017521,0.0136,0.207265,0.209728,0.196718,0.197516
std,0.128394,0.065496,0.082555,0.193246,0.392972,0.416207,0.14254,0.197671,0.153724,0.022034,...,0.335774,0.112873,0.47122,0.299852,0.131203,0.115826,0.405354,0.407122,0.397524,0.398132
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.185185,0.0,0.0,0.338912,0.160961,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.259259,0.02381,0.0,0.376569,0.957379,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.37037,0.047619,0.0,0.60251,0.980957,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Preliminary Estimations

#### Will evaluate 5 classification models: _Logistic regression, NaiveBayes, Random Forest, SVC & XGBoost_

In [15]:
# Dictionary of classifiers, setting class-weight='balanced' where possible, set NB prior to reflect the imbalance,
# and scale_pos_weight in XGBoost to 8 to reflect the approximate ratio of negative class to positive class
clf_dict = {
    'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=3000, random_state=8),
    'Naive Bayes': GaussianNB(priors=[0.113, 0.887]), 
    'Random Forest': RandomForestClassifier(criterion='entropy', class_weight='balanced', random_state=8),
    'SV Classification': SVC(class_weight='balanced', random_state=8), 
    'XG Boost' : xgb.XGBClassifier(scale_pos_weight=7.85, use_label_encoder=False, seed=8)    
}

In [16]:
def batch_clf(X_train, y_train, X_test, y_test, clf_dict, verbose=True):
    '''
    Fits a dictionary of algorithms, tests them and returns metrics
    '''
    train_f1_scores = []
    test_f1_scores = []
    train_cohen_scores = []
    test_cohen_scores = []
    train_pre_scores = []
    test_pre_scores = []
    train_acc_scores = []
    test_acc_scores = []
    
    # Loop through dictionary items
    for key, clf in clf_dict.items():
   
        # Fit classifier
        clf_fitted = clf.fit(X_train, y_train)
        
        # Get predictions
        train_preds = clf_fitted.predict(X_train)
        test_preds = clf_fitted.predict(X_test)

        #Get F1 scores
        train_f1 = f1_score(y_train, train_preds, average='binary')
        train_f1_scores.append(round(train_f1, 4))
        test_f1 = f1_score(y_test, test_preds, average='binary')
        test_f1_scores.append(round(test_f1, 4))
        
        # Get Cohen's kappa
        train_cohen = cohen_kappa_score(y_train, train_preds)
        train_cohen_scores.append(round(train_cohen, 4))
        test_cohen = cohen_kappa_score(y_test, test_preds)
        test_cohen_scores.append(round(test_cohen, 4))
        
        # Get AUC of precision-recall curves
        train_pre = average_precision_score(y_train, train_preds, average='macro')
        train_pre_scores.append(round(train_pre, 4))
        test_pre = average_precision_score(y_test, test_preds, average='macro')
        test_pre_scores.append(round(test_pre, 4))        
                
        #Get accuracy scores
        train_acc = accuracy_score(y_train, train_preds)
        train_acc_scores.append(round(train_acc,4))
        test_acc = accuracy_score(y_test, test_preds)
        test_acc_scores.append(round(test_acc,4))
        
    # Create results dataframe
    results = pd.DataFrame({'Model': list(clf_dict.keys()), 
                            'Train F1': train_f1_scores,
                            'Test F1': test_f1_scores,
                            "Train Cohen's kappa" : train_cohen_scores,
                            "Test Cohen's kappa" : test_cohen_scores,
                            'Train PR-AUC': train_pre_scores,
                            'Test PR-AUC': test_pre_scores,
                            'Train Accuracy': train_acc_scores,
                            'Test Accuracy': test_acc_scores,
                            })

    return results

In [17]:
results = batch_clf(X_train, y_train, X_test, y_test, clf_dict)
results



Unnamed: 0,Model,Train F1,Test F1,Train Cohen's kappa,Test Cohen's kappa,Train PR-AUC,Test PR-AUC,Train Accuracy,Test Accuracy
0,Logistic Regression,0.4339,0.4356,0.3346,0.3361,0.2507,0.2523,0.8111,0.8103
1,Naive Bayes,0.4074,0.3971,0.3245,0.3104,0.2296,0.2219,0.8535,0.8464
2,Random Forest,0.9216,0.3647,0.9109,0.2996,0.8545,0.2149,0.9811,0.8782
3,SV Classification,0.4763,0.4635,0.3899,0.3748,0.2833,0.2725,0.8412,0.8368
4,XG Boost,0.5767,0.449,0.5075,0.3593,0.3805,0.2601,0.8734,0.8361


This simple cross-model evaluation finds **SVC as the top performer** across the **test F1, Cohen's kappa and Precision-Recall AUC scores**, even though it did not have the highest test accuracy score. **XGBoost** was the second best in the test F1, Cohen's kappa and PR AUC scores, while **Logistic regression** had the third highest scores across these metrics. **Random Forest** had the highest **accuracy scores** on the train and test sets, but did poorly in the other metrics, indicating that it was focused on predicting the **majority class** of 0.

# Gridsearch Hyper-Parameters

In [18]:
# Use stratified KFold for the cross-validation given the imbalanced data
kf = StratifiedKFold(n_splits=3, random_state=10)



### Logistic Regression

In [19]:
logit = LogisticRegression(penalty='l2', C=1, solver='liblinear', class_weight='balanced', max_iter=3000, 
                           random_state=8)

In [20]:
logit_gs_params = {'penalty': [None, 'l1', 'l2', 'elasticnet'],
                   'solver': ['lbfgs', 'liblinear', 'saga'],
                   'C': [0.001, 0.01, 0.1, 1, 10],
                   'class_weight': [None, 'balanced']}

In [21]:
gs_logit = GridSearchCV(logit, logit_gs_params, scoring=scorers, refit='f1', cv=kf, verbose=2, n_jobs=-1)
gs_logit = gs_logit.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 120 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:   24.5s finished


In [22]:
gs_logit.best_score_

0.4334378709571261

In [23]:
gs_logit.best_params_

{'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear'}

In [24]:
gs_logit.best_estimator_

LogisticRegression(C=0.1, class_weight='balanced', max_iter=3000, penalty='l1',
                   random_state=8, solver='liblinear')

### Naive Bayes

In [25]:
gnb = GaussianNB(var_smoothing=1e-9, priors=[0.11, 0.89])

In [26]:
gnb_gs_params = {'priors': [None, [0.113, 0.887]],
                 'var_smoothing': [1e-3, 1e-4, 1e-5, 1e-7, 1e-9]}

In [27]:
gs_gnb = GridSearchCV(gnb, gnb_gs_params, scoring=scorers, refit='f1', cv=kf, verbose=2, n_jobs=-1)
gs_gnb = gs_gnb.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.4s finished


In [28]:
gs_gnb.best_score_

0.408307843844616

In [29]:
gs_gnb.best_params_

{'priors': None, 'var_smoothing': 0.0001}

In [30]:
gs_gnb.best_estimator_

GaussianNB(var_smoothing=0.0001)

### Random Forest

In [31]:
# Entropy criterion could help imbalanced datasets as it computes the logarithm of the probabilities of each class
rf = RandomForestClassifier(n_estimators=100, criterion='entropy', max_features='log2', min_samples_split=20, 
                            class_weight='balanced', random_state=8)

In [32]:
rf_gs_params = {'n_estimators': [100, 125, 150, 175, 200],
                'criterion': ['gini', 'entropy'],
                'max_features': ['log2', 'sqrt'],
                'min_samples_split': [20, 25, 30, 35, 40],
                'class_weight': [None, 'balanced', 'balanced_subsample']}

In [33]:
gs_rf = GridSearchCV(rf, rf_gs_params, scoring=scorers, refit='f1', cv=kf, verbose=2, n_jobs=-1)
gs_rf = gs_rf.fit(X_train, y_train)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  9.3min finished


In [34]:
gs_rf.best_score_

0.494108163710905

In [35]:
gs_rf.best_params_

{'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_features': 'log2',
 'min_samples_split': 30,
 'n_estimators': 175}

In [36]:
gs_rf.best_estimator_

RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_features='log2', min_samples_split=30,
                       n_estimators=175, random_state=8)

### Support Vector Classifier

In [37]:
# Need to fix max_iter at a finite value to reduce the time it takes for the gridsearch to complete.
svc = SVC(C=100, kernel='rbf', gamma=0.01, class_weight='balanced', max_iter=200000, random_state=8)

In [38]:
svc_gs_params = {'C': [1, 10, 100, 1000, 10000],
                 'kernel': ['poly', 'rbf'],
                 'gamma': [0.001, 0.01, 0.1, 1],
                 'class_weight': [None, 'balanced']}

In [39]:
gs_svc = GridSearchCV(svc, svc_gs_params, scoring=scorers, refit='f1', cv=kf, verbose=2, n_jobs=-1)
gs_svc = gs_svc.fit(X_train, y_train)

Fitting 3 folds for each of 64 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed: 18.9min
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed: 28.0min finished


In [40]:
gs_svc.best_score_

0.47550073486355293

In [41]:
gs_svc.best_params_

{'C': 1000, 'class_weight': 'balanced', 'gamma': 0.01, 'kernel': 'rbf'}

In [42]:
gs_svc.best_estimator_

SVC(C=1000, class_weight='balanced', gamma=0.01, max_iter=200000,
    random_state=8)

### XGBoost

In [43]:
xgb = xgb.XGBClassifier(booster='gbtree', gamma=1, n_estimators=20, learning_rate=0.01, objective='binary:logistic', 
                        scale_pos_weight=7.85, eval_metric='logloss', seed=8)  

In [44]:
xgb_gs_params = {'gamma': [1, 10, 100],
                 'n_estimators': [10, 20, 25, 30],
                 'learning_rate': [0.01, 0.1, 0.2],
                 'scale_pos_weight': [3, 5, 7.85],
                 'eval_metric': ['logloss', 'error', 'aucpr']}

In [45]:
gs_xgb = GridSearchCV(xgb, xgb_gs_params, scoring=scorers, refit='f1', cv=kf, verbose=2, n_jobs=-1)
gs_xgb = gs_xgb.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 324 candidates, totalling 972 fits


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 972 out of 972 | elapsed: 14.6min finished


In [46]:
gs_xgb.best_score_

0.5004594784679534

In [47]:
gs_xgb.best_params_

{'eval_metric': 'logloss',
 'gamma': 10,
 'learning_rate': 0.1,
 'n_estimators': 20,
 'scale_pos_weight': 5}

In [48]:
gs_xgb.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=10, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=20, n_jobs=12,
              num_parallel_tree=1, random_state=8, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=5, seed=8, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

The _**best_score**_ from the GridsearchCV on the training data was from **XGB**, followed by **Random Forest** and **SVC**. Let's follow up with scoring the test data using the optimized hyper-parameter settings found through the Gridsearch above.

### The analysis is continued in the third file of the series