In [59]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

## Reading preprocessed data file 

In [3]:
data_model= pd.read_csv('Preprocessed_data.csv')
data_model.shape

(41176, 18)

In [5]:
X_train, X_test, y_train, y_test=train_test_split(data_model.drop('y',axis=1), data_model['y'],test_size=0.3,
                                                  stratify=data_model['y'],random_state=42)
print('Train Data',X_train.shape,y_train.shape)
print('Test Data',X_test.shape,y_test.shape)

Train Data (28823, 17) (28823,)
Test Data (12353, 17) (12353,)


### Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression


def print_metrics_by_model(model, x_test,y_test):
    print('Accuracy_Score', accuracy_score(y_test,model.predict(x_test)))
    print('Confusion Matrix')
    print(pd.crosstab(y_test,model.predict(x_test),rownames=['Actual'],colnames=['Predicted'],margins=True))
    print("Classification Report")
    print(classification_report(y_test,model.predict(x_test)))

    
mod_log= LogisticRegression(max_iter=3000,C=1, penalty='l1',class_weight='balanced',solver='liblinear')
mod_log.fit(X_train,y_train)
print_metrics_by_model(mod_log,X_test,y_test)

Accuracy_Score 0.7789201003804744
Confusion Matrix
Predicted     0     1    All
Actual                      
0          8652  2309  10961
1           422   970   1392
All        9074  3279  12353
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.79      0.86     10961
           1       0.30      0.70      0.42      1392

    accuracy                           0.78     12353
   macro avg       0.62      0.74      0.64     12353
weighted avg       0.88      0.78      0.81     12353



* HyperParameter Finetuning for C and Solver parameters using 
GridSearchCV

In [22]:
def my_custom_loss_func(y_true, y_pred):
    print('Confusion Matrix on Cross Validation Set')
    print(pd.crosstab(y_true,y_pred,rownames=['Actual'],colnames=['Predicted'],margins=True))
    return f1_score(y_true,y_pred)

parameters = {'solver':['liblinear','lbfgs'],
             "C":[0.001, 0.01,0.1,1]}
mod_log= LogisticRegression(max_iter=1000,class_weight='balanced')
clf= GridSearchCV(mod_log,parameters,verbose=3,scoring=make_scorer(my_custom_loss_func,greater_is_better=True))
clf.fit(X_train,y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Confusion Matrix on Cross Validation Set
Predicted     0     1   All
Actual                     
0          3823  1293  5116
1           220   429   649
All        4043  1722  5765
[CV 1/5] END .........C=0.001, solver=liblinear;, score=0.362 total time=   0.0s
Confusion Matrix on Cross Validation Set
Predicted     0     1   All
Actual                     
0          3807  1308  5115
1           215   435   650
All        4022  1743  5765
[CV 2/5] END .........C=0.001, solver=liblinear;, score=0.364 total time=   0.0s
Confusion Matrix on Cross Validation Set
Predicted     0     1   All
Actual                     
0          3826  1289  5115
1           222   428   650
All        4048  1717  5765
[CV 3/5] END .........C=0.001, solver=liblinear;, score=0.362 total time=   0.0s
Confusion Matrix on Cross Validation Set
Predicted     0     1   All
Actual                     
0          3827  1288  5115
1           203   446   649
A

Predicted     0     1   All
Actual                     
0          4048  1067  5115
1           211   438   649
All        4259  1505  5764
[CV 4/5] END .............C=1, solver=liblinear;, score=0.407 total time=   0.0s
Confusion Matrix on Cross Validation Set
Predicted     0     1   All
Actual                     
0          4069  1046  5115
1           212   437   649
All        4281  1483  5764
[CV 5/5] END .............C=1, solver=liblinear;, score=0.410 total time=   0.0s
Confusion Matrix on Cross Validation Set
Predicted     0     1   All
Actual                     
0          4012  1104  5116
1           230   419   649
All        4242  1523  5765
[CV 1/5] END .................C=1, solver=lbfgs;, score=0.386 total time=   0.4s
Confusion Matrix on Cross Validation Set
Predicted     0     1   All
Actual                     
0          4020  1095  5115
1           219   431   650
All        4239  1526  5765
[CV 2/5] END .................C=1, solver=lbfgs;, score=0.396 total time= 

GridSearchCV(estimator=LogisticRegression(class_weight='balanced',
                                          max_iter=1000),
             param_grid={'C': [0.001, 0.01, 0.1, 1],
                         'solver': ['liblinear', 'lbfgs']},
             scoring=make_scorer(my_custom_loss_func), verbose=3)

* Accuracy of best model obtained by gridsearch

In [23]:
print_metrics_by_model(clf.best_estimator_,X_test,y_test)

Accuracy_Score 0.7790010523759411
Confusion Matrix
Predicted     0     1    All
Actual                      
0          8653  2308  10961
1           422   970   1392
All        9075  3278  12353
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.79      0.86     10961
           1       0.30      0.70      0.42      1392

    accuracy                           0.78     12353
   macro avg       0.62      0.74      0.64     12353
weighted avg       0.88      0.78      0.81     12353



### SGD Classifier

In [24]:
from sklearn.linear_model import SGDClassifier
mod_sgd=SGDClassifier(max_iter=1000, tol=1e-3, random_state=42,)
mod_sgd.fit(X_train,y_train)
print_metrics_by_model(mod_sgd,X_test,y_test)

Accuracy_Score 0.8725815591354327
Confusion Matrix
Predicted      0     1    All
Actual                       
0          10060   901  10961
1            673   719   1392
All        10733  1620  12353
Classification Report
              precision    recall  f1-score   support

           0       0.94      0.92      0.93     10961
           1       0.44      0.52      0.48      1392

    accuracy                           0.87     12353
   macro avg       0.69      0.72      0.70     12353
weighted avg       0.88      0.87      0.88     12353



### K-Nearest Neighbors 

In [25]:
from sklearn.neighbors import KNeighborsClassifier
mod_knn=KNeighborsClassifier(weights='distance', n_neighbors=4)
mod_knn.fit(X_train,y_train)
print_metrics_by_model(mod_knn, X_test,y_test)

Accuracy_Score 0.882376750586902
Confusion Matrix
Predicted      0    1    All
Actual                      
0          10475  486  10961
1            967  425   1392
All        11442  911  12353
Classification Report
              precision    recall  f1-score   support

           0       0.92      0.96      0.94     10961
           1       0.47      0.31      0.37      1392

    accuracy                           0.88     12353
   macro avg       0.69      0.63      0.65     12353
weighted avg       0.86      0.88      0.87     12353



### Decision Tree 

In [41]:
def print_feature_importances(model):
    print('Feature Importances')
    return pd.DataFrame(np.c_[model.feature_names_in_,model.feature_importances_],
                columns=['Feature_Name', 'Feature Importances']).sort_values('Feature Importances',ascending=False)

mod_tree=DecisionTreeClassifier()
mod_tree.fit(X_train,y_train)
print_metrics_by_model(mod_tree, X_test,y_test)
print_feature_importances(mod_tree)

Accuracy_Score 0.8729863191127661
Confusion Matrix
Predicted      0     1    All
Actual                       
0          10368   593  10961
1            976   416   1392
All        11344  1009  12353
Classification Report
              precision    recall  f1-score   support

           0       0.91      0.95      0.93     10961
           1       0.41      0.30      0.35      1392

    accuracy                           0.87     12353
   macro avg       0.66      0.62      0.64     12353
weighted avg       0.86      0.87      0.86     12353

Feature Importances


Unnamed: 0,Feature_Name,Feature Importances
11,age,0.275434
13,pdays,0.172115
12,campaign,0.135202
16,cons.conf.idx,0.104666
0,job,0.086521
15,emp.var.rate,0.074149
2,marital_married,0.052229
1,month,0.024806
14,previous,0.020517
3,default_no,0.013863


In [42]:
from sklearn.ensemble import RandomForestClassifier
mod_forest=RandomForestClassifier(class_weight='balanced',n_estimators=150)
mod_forest.fit(X_train,y_train)
print_metrics_by_model(mod_forest,X_test,y_test)
print_feature_importances(mod_forest)

Accuracy_Score 0.8372055371164899
Confusion Matrix
Predicted      0     1    All
Actual                       
0           9850  1111  10961
1            900   492   1392
All        10750  1603  12353
Classification Report
              precision    recall  f1-score   support

           0       0.92      0.90      0.91     10961
           1       0.31      0.35      0.33      1392

    accuracy                           0.84     12353
   macro avg       0.61      0.63      0.62     12353
weighted avg       0.85      0.84      0.84     12353

Feature Importances


Unnamed: 0,Feature_Name,Feature Importances
11,age,0.382654
12,campaign,0.134654
15,emp.var.rate,0.122998
16,cons.conf.idx,0.079225
0,job,0.069237
1,month,0.050232
13,pdays,0.044669
2,marital_married,0.026313
10,poutcome_success,0.021053
14,previous,0.014061


### Naive Baye's

In [43]:
from sklearn.naive_bayes import GaussianNB
mod_nb= GaussianNB()
mod_nb.fit(X_train,y_train)
print_metrics_by_model(mod_nb, X_test,y_test)

Accuracy_Score 0.7985914352788797
Confusion Matrix
Predicted     0     1    All
Actual                      
0          9044  1917  10961
1           571   821   1392
All        9615  2738  12353
Classification Report
              precision    recall  f1-score   support

           0       0.94      0.83      0.88     10961
           1       0.30      0.59      0.40      1392

    accuracy                           0.80     12353
   macro avg       0.62      0.71      0.64     12353
weighted avg       0.87      0.80      0.82     12353



### Kernel Based SVM

In [46]:
best_model= SVC(kernel='rbf',class_weight='balanced',C=1, gamma=50)
best_model.fit(X_train,y_train)
print_metrics_by_model(best_model, X_test,y_test)

Accuracy_Score 0.8199627620820853
Confusion Matrix
Predicted     0     1    All
Actual                      
0          9286  1675  10961
1           549   843   1392
All        9835  2518  12353
Classification Report
              precision    recall  f1-score   support

           0       0.94      0.85      0.89     10961
           1       0.33      0.61      0.43      1392

    accuracy                           0.82     12353
   macro avg       0.64      0.73      0.66     12353
weighted avg       0.88      0.82      0.84     12353



### GridSearchCV 

In [None]:

from sklearn.metrics import make_scorer
def my_custom_loss_func(y_true, y_pred):
    print(pd.crosstab(y_true,y_pred,rownames=['Actual'],colnames=['Predicted'],margins=True))
    return f1_score(y_true,y_pred)

parameters = {'C':[0.1,1,10],'gamma':[0.001,0.1,1,5],
                'kernel':['linear', 'poly', 'rbf']}
grid_svc= SVC(kernel='rbf',class_weight='balanced')
clf= GridSearchCV(grid_svc,parameters,verbose=3,scoring=make_scorer(my_custom_loss_func,greater_is_better=True))
clf.fit(X_train,y_train)

In [None]:
from xgboost import XGBClassifier
mod_xgb= XGBClassifier(n_estimators= 1000,learning_rate=0.01, reg_alpha=1.5,
                       booster='dart',
                       scale_pos_weight=sum(y_train==0)/sum(y_train==1))
mod_xgb.fit(X_train,y_train)
print_metrics_by_model(mod_xgb,X_test,y_test)

In [47]:
y_test.value_counts()

0    10961
1     1392
Name: y, dtype: int64

* From above we can see data is very imbalanced due to which
Accuracy (recall for Positive Class is very less)
* Hence in order to deal with class imbalanced we use oversampling technique
called SMOTE [Synthetic Minority Oversampling Technique]

In [None]:
!pip install imblearn

In [80]:
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SVMSMOTE
oversample = SVMSMOTE(sampling_strategy = 0.5)
X,y= oversample.fit_resample(data_model.drop('y',axis=1), data_model['y'])
print(X.shape,y.shape)
data_model.y.value_counts(), y.value_counts()

(54805, 17) (54805,)


(0    36537
 1     4639
 Name: y, dtype: int64,
 0    36537
 1    18268
 Name: y, dtype: int64)

In [81]:
X_train, X_test, y_train, y_test=train_test_split(X, y,test_size=0.3,
                                                  stratify=y,random_state=42)
print('Train Data',X_train.shape,y_train.shape)
print('Test Data',X_test.shape,y_test.shape)

Train Data (38363, 17) (38363,)
Test Data (16442, 17) (16442,)


In [84]:
print('Before Oversampling..')
print(data_model.y.value_counts())
print('After Oversampling..')
print(y.value_counts())

Before Oversampling..
0    36537
1     4639
Name: y, dtype: int64
After Oversampling..
0    36537
1    18268
Name: y, dtype: int64


### Logistic Regression on Oversampled Dataset

In [57]:
mod_log= LogisticRegression(max_iter=1000,C=1.0, penalty='l1',class_weight='balanced',solver='liblinear')
mod_log.fit(X_train,y_train)
print_metrics_by_model(mod_log,X_test,y_test)

Accuracy_Score 0.8205814377812918
Confusion Matrix
Predicted      0     1    All
Actual                       
0           9298  1663  10961
1           1287  4194   5481
All        10585  5857  16442
Classification Report
              precision    recall  f1-score   support

           0       0.88      0.85      0.86     10961
           1       0.72      0.77      0.74      5481

    accuracy                           0.82     16442
   macro avg       0.80      0.81      0.80     16442
weighted avg       0.82      0.82      0.82     16442



### Random Forest 

In [85]:
from sklearn.ensemble import RandomForestClassifier
mod_forest=RandomForestClassifier()
mod_forest.fit(X_train,y_train)
print_metrics_by_model(mod_forest,X_test,y_test)

Accuracy_Score 0.8629728743461866
Confusion Matrix
Predicted      0     1    All
Actual                       
0          10124   837  10961
1           1416  4065   5481
All        11540  4902  16442
Classification Report
              precision    recall  f1-score   support

           0       0.88      0.92      0.90     10961
           1       0.83      0.74      0.78      5481

    accuracy                           0.86     16442
   macro avg       0.85      0.83      0.84     16442
weighted avg       0.86      0.86      0.86     16442



In [86]:
from xgboost import XGBClassifier
mod_xgb= XGBClassifier(scale_pos_weight=sum(y_train==0)/sum(y_train==1))#scale_pos_weight=sum(y_train==0)/sum(y_train==1)
mod_xgb.fit(X_train,y_train)
print_metrics_by_model(mod_xgb,X_test,y_test)



Accuracy_Score 0.9067023476462718
Confusion Matrix
Predicted      0     1    All
Actual                       
0          10256   705  10961
1            829  4652   5481
All        11085  5357  16442
Classification Report
              precision    recall  f1-score   support

           0       0.93      0.94      0.93     10961
           1       0.87      0.85      0.86      5481

    accuracy                           0.91     16442
   macro avg       0.90      0.89      0.89     16442
weighted avg       0.91      0.91      0.91     16442



#### By using Oversampling technique, F1_score for Positive class has increased from 0.42 to 0.86. 