# **Santander Bank Customer Satisfaction - Classification**


In [None]:
!pip install feature_engine

Collecting feature_engine
  Downloading feature_engine-1.3.0-py2.py3-none-any.whl (260 kB)
[?25l[K     |█▎                              | 10 kB 26.5 MB/s eta 0:00:01[K     |██▌                             | 20 kB 26.8 MB/s eta 0:00:01[K     |███▊                            | 30 kB 18.0 MB/s eta 0:00:01[K     |█████                           | 40 kB 15.7 MB/s eta 0:00:01[K     |██████▎                         | 51 kB 7.2 MB/s eta 0:00:01[K     |███████▌                        | 61 kB 8.5 MB/s eta 0:00:01[K     |████████▉                       | 71 kB 7.8 MB/s eta 0:00:01[K     |██████████                      | 81 kB 8.7 MB/s eta 0:00:01[K     |███████████▎                    | 92 kB 9.6 MB/s eta 0:00:01[K     |████████████▋                   | 102 kB 8.0 MB/s eta 0:00:01[K     |█████████████▉                  | 112 kB 8.0 MB/s eta 0:00:01[K     |███████████████                 | 122 kB 8.0 MB/s eta 0:00:01[K     |████████████████▍               | 133 kB 8.0 M

In [None]:
# Import the libraries, please only use the libraries imported below
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score,classification_report, roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE
import pickle
import joblib
from sklearn.inspection import permutation_importance
from feature_engine.selection import DropDuplicateFeatures, DropConstantFeatures
import warnings
warnings.filterwarnings('ignore')

# **1. Download the Dataset from Shared Google Drive ID's**
1.1 Google Drive ID for Train Data is given below and execute the following command to download the data from Google Drive


```
!gdown 1_NPqplpJLHl28cbAhuL2k6jJObJF9wwU
```
1.2 To download Test Data execute the following command


```
!gdown 1LdvVrsrWLNO-UAJf0Z9S-SxnV1nozcsu
```

**Important:** Please note that **!** represents the linux commands being executed on the drive. Don't use **!** while using at your system. 

In [None]:
!gdown 1_NPqplpJLHl28cbAhuL2k6jJObJF9wwU

Downloading...
From: https://drive.google.com/uc?id=1_NPqplpJLHl28cbAhuL2k6jJObJF9wwU
To: /content/train.csv
100% 59.4M/59.4M [00:00<00:00, 142MB/s]


In [None]:
!gdown 1LdvVrsrWLNO-UAJf0Z9S-SxnV1nozcsu


Downloading...
From: https://drive.google.com/uc?id=1LdvVrsrWLNO-UAJf0Z9S-SxnV1nozcsu
To: /content/test.csv
100% 59.1M/59.1M [00:00<00:00, 131MB/s]


# **2. Load and Preprocess the Data**

Load the train and test dataset and Preprocess train data with the following objectives.

> 1. have zero variance

> 2. duplicated column

> 3. very sparse features






In [None]:
df = pd.read_csv('train.csv')
df.head(2)

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0


In [None]:
Y = df.TARGET
X = df.drop("TARGET",axis=1)

In [None]:
X.shape

(76020, 370)

In [None]:
const = DropConstantFeatures(tol=1)
X = const.fit_transform(X)

dup = DropDuplicateFeatures()
X = dup.fit_transform(X)

q_cons = DropConstantFeatures(tol=0.99)
X = q_cons.fit_transform(X)

# **3. Training the Different Models**

Train different models with the following configurations and try to achieve maximum recall score or balanced precision/recall scores.

# 3.1 Simple Logistic Regression
Train simple logistic regression model and evaluate it on the validation set and print precision, recall, classification_report and ROC_AUC at the end.

In [None]:
X.shape

(76020, 143)

In [None]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,stratify=Y,train_size = 0.8)

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train,Y_train)

LogisticRegression()

In [None]:
y_pred = logreg.predict(X_test)
acc = accuracy_score(Y_test,y_pred)
acc

0.9604051565377533

In [None]:
from sklearn.metrics import classification_report
print(classification_report(Y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     14602
           1       0.00      0.00      0.00       602

    accuracy                           0.96     15204
   macro avg       0.48      0.50      0.49     15204
weighted avg       0.92      0.96      0.94     15204



# 3.2 Train Vanilla kNN
Train simple kNN model and evaluate it on the validation set and print precision, recall, classification_report and ROC_AUC at the end.

In [None]:
# Your code here
knnWala  = KNeighborsClassifier()
knnWala.fit(X_train,Y_train)


KNeighborsClassifier()

In [None]:
y_pred = knnWala.predict(X_test)
acc = accuracy_score(Y_test,y_pred)
acc

0.9593528018942383

In [None]:
from sklearn.metrics import classification_report
print(classification_report(Y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     14602
           1       0.06      0.00      0.00       602

    accuracy                           0.96     15204
   macro avg       0.51      0.50      0.49     15204
weighted avg       0.92      0.96      0.94     15204



# 3.3 Train kNN for k=2 and k=4 and evaluate it
Train kNN model with k values above and evaluate it on the validation set and print precision, recall, classification_report and ROC_AUC at the end.

In [None]:
# Your code here
knnWala  = KNeighborsClassifier(n_neighbors=2)
knnWala.fit(X_train,Y_train)



KNeighborsClassifier(n_neighbors=2)

In [None]:
y_pred = knnWala.predict(X_test)
acc = accuracy_score(Y_test,y_pred)
acc

0.9586293080768219

In [None]:
from sklearn.metrics import classification_report
print(classification_report(Y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     14602
           1       0.11      0.01      0.01       602

    accuracy                           0.96     15204
   macro avg       0.54      0.50      0.50     15204
weighted avg       0.93      0.96      0.94     15204



##For k = 4

In [None]:
# Your code here
knnWala  = KNeighborsClassifier(n_neighbors=4)
knnWala.fit(X_train,Y_train)



KNeighborsClassifier(n_neighbors=4)

In [None]:
y_pred = knnWala.predict(X_test)
acc = accuracy_score(Y_test,y_pred)
acc

0.9600105235464351

In [None]:
from sklearn.metrics import classification_report
print(classification_report(Y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     14602
           1       0.00      0.00      0.00       602

    accuracy                           0.96     15204
   macro avg       0.48      0.50      0.49     15204
weighted avg       0.92      0.96      0.94     15204



# 3.4 Training a Tuned Logistic Regression Model with Upsampling using SMOTE
Train tuned Logistic Regression model with following configurations and evaluate it on the validation set and print precision, recall, classification_report and ROC_AUC at the end.



```
logreg_tuned_upsampled = LogisticRegression(max_iter=50000,class_weight='balanced',C=100,
                                         fit_intercept=True, penalty='l2',solver='newton-cg')
```

SMOTE stands for Synthetic Minority Oversampling Technique.

SMOTE first selects a minority class instance a at random and finds its k nearest minority class neighbors. The synthetic instance is then created by choosing one of the k nearest neighbors b at random and connecting a and b to form a line segment in the feature space. The synthetic instances are generated as a convex combination of the two chosen instances a and b

Credits : https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/

Important: Please umsample the data before training this model.


In [None]:
# Your code here
sm = SMOTE(random_state=1)
X_res , Y_res = sm.fit_resample(X_train,Y_train)
ltu= LogisticRegression(max_iter=50000,class_weight='balanced',C=100,
                                         fit_intercept=True, penalty='l2',solver='newton-cg',verbose=2)
ltu.fit(X_res,Y_res)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 20.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 20.2min finished


LogisticRegression(C=100, class_weight='balanced', max_iter=50000,
                   solver='newton-cg', verbose=2)

In [None]:
y_pred_ltu = ltu.predict(X_test)
acc = accuracy_score(Y_test,y_pred_ltu)
acc

0.7122467771639043

In [None]:
from sklearn.metrics import classification_report
print(classification_report(Y_test,y_pred_ltu))

              precision    recall  f1-score   support

           0       0.98      0.71      0.83     14602
           1       0.09      0.69      0.16       602

    accuracy                           0.71     15204
   macro avg       0.54      0.70      0.49     15204
weighted avg       0.95      0.71      0.80     15204



# 3.5 Training a Tuned Logistic Regression Model with Class Weights
Train tuned Logistic Regression model with following configurations and evaluate it on the validation set and print precision, recall, classification_report and ROC_AUC at the end.



```
log_tuned_class_weight = LogisticRegression(max_iter=50000,class_weight={0:1,1:25},C=100,fit_intercept=True,
                               penalty='l2',solver='newton-cg')
```



In [None]:
# Your code here
sm = SMOTE(random_state=1)
X_res , Y_res = sm.fit_resample(X_train,Y_train)

log_tuned_class_weight = LogisticRegression(max_iter=50000,class_weight={0:1,1:25},C=100,fit_intercept=True,
                               penalty='l2',solver='newton-cg')
log_tuned_class_weight.fit(X_res,Y_res)

LogisticRegression(C=100, class_weight={0: 1, 1: 25}, max_iter=50000,
                   solver='newton-cg')

In [None]:
y_pred_log_tuned_class_weight = log_tuned_class_weight.predict(X_test)
acc = accuracy_score(Y_test,y_pred_log_tuned_class_weight)
acc

0.08649039726387793

In [None]:
from sklearn.metrics import classification_report
print(classification_report(Y_test,y_pred_log_tuned_class_weight))

              precision    recall  f1-score   support

           0       1.00      0.05      0.09     14602
           1       0.04      1.00      0.08       602

    accuracy                           0.09     15204
   macro avg       0.52      0.52      0.09     15204
weighted avg       0.96      0.09      0.09     15204



# 3.6 Training a Tuned Random Forest Model with Class Weights
Train Random Forest Classifier model with following configurations and evaluate it on the validation set and print precision, recall, classification_report and ROC_AUC at the end.


```
rfc_tuned = RandomForestClassifier(random_state=1, n_jobs=-1, class_weight={0:1, 1:25}, criterion='gini', max_depth= 6, min_samples_split= 12, n_estimators= 400, warm_start=True)

```



In [None]:
# Your code here
rfc_tuned = RandomForestClassifier(random_state=1, n_jobs=-1, class_weight={0:1, 1:25}, criterion='gini', max_depth= 6, min_samples_split= 12, n_estimators= 400, warm_start=True)
rfc_tuned.fit(X_res,Y_res)


RandomForestClassifier(class_weight={0: 1, 1: 25}, max_depth=6,
                       min_samples_split=12, n_estimators=400, n_jobs=-1,
                       random_state=1, warm_start=True)

In [None]:
y_pred_rfc = rfc_tuned.predict(X_test)
acc = accuracy_score(Y_test,y_pred_rfc)
acc

0.1882399368587214

In [None]:
from sklearn.metrics import classification_report
print(classification_report(Y_test,y_pred_rfc))

              precision    recall  f1-score   support

           0       1.00      0.15      0.27     14602
           1       0.05      1.00      0.09       602

    accuracy                           0.19     15204
   macro avg       0.52      0.57      0.18     15204
weighted avg       0.96      0.19      0.26     15204



# 3.5 Training a Tuned XGBoost Classifier Model with Class Weights
Train XGBoost Classifier model with following configurations and evaluate it on the validation set and print precision, recall, classification_report and ROC_AUC at the end.



```
# Lets train the XGBoost with best hyper parameters
# based on scoring='Recall'
# {'booster': 'dart', 'eta': 0.01, 'max_depth': 2, 'n_estimators': 150}
# These are the best parameters we got
```



```
xgb_tuned = XGBClassifier(scale_pos_weight = 25, eval_metric = 'logloss', seed =0, 
               objective='binary:logistic', 
              nthreads=-1, early_stopping_rounds=15, booster='dart', scoring='Recall',
              eta=0.01, max_depth=2, n_estimators=150)
```



In [None]:
# Your code here
xgb_tuned = XGBClassifier(scale_pos_weight = 25, eval_metric = 'logloss', seed =0, 
               objective='binary:logistic', 
              nthreads=-1, early_stopping_rounds=15, booster='dart', scoring='Recall',
              eta=0.01, max_depth=2, n_estimators=150)


xgb_tuned.fit(X_res,Y_res)

XGBClassifier(booster='dart', early_stopping_rounds=15, eta=0.01,
              eval_metric='logloss', max_depth=2, n_estimators=150, nthreads=-1,
              scale_pos_weight=25, scoring='Recall', seed=0)

In [None]:
y_pred_xgb = xgb_tuned.predict(X_test)
acc = accuracy_score(Y_test,y_pred_xgb)
acc

0.44277821625887925

In [None]:
from sklearn.metrics import classification_report
print(classification_report(Y_test,y_pred_xgb))

              precision    recall  f1-score   support

           0       0.99      0.42      0.59     14602
           1       0.06      0.94      0.12       602

    accuracy                           0.44     15204
   macro avg       0.53      0.68      0.36     15204
weighted avg       0.96      0.44      0.57     15204



# **4. Plot the ROC-AUC Curves of all the models**

In [None]:
# Your code here
roc_ltu = roc_auc_score(Y_test,y_pred_ltu)
print("roc_auc_score  Tuned Logistic Regression Model with Upsampling using SMOTE :  ",roc_ltu)

roc_log_tuned_class_weight = roc_auc_score(Y_test,y_pred_log_tuned_class_weight)
print("roc_auc_score of Tuned Logistic Regression Model with Class Weights :  ",roc_log_tuned_class_weight)

roc_rfc = roc_auc_score(Y_test,y_pred_rfc)
print("roc_auc_score Tuned Random Forest Model with Class Weights :  ",roc_rfc)

roc_rgb = roc_auc_score(Y_test,y_pred_xgb)
print("roc_auc_score Tuned XGBoost Classifier Model with Class Weights :  ",roc_rgb)

roc_auc_score  Tuned Logistic Regression Model with Upsampling using SMOTE :   0.7020756952695235
roc_auc_score of Tuned Logistic Regression Model with Class Weights :   0.5220254950739466
roc_auc_score Tuned Random Forest Model with Class Weights :   0.5749976906635917
roc_auc_score Tuned XGBoost Classifier Model with Class Weights :   0.6828277744686138


**Conclusion**

XGB is better among these because its accuracy is better and ROC to. :)