[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/)
- Click "Upload" and select this file and the data file.

# Bank customers: SVM

Build a classification algorithm for bank customers' response to marketing campaign.
- deposit: whether the client purchased a term deposit (yes, no)
- age
- job: type of job (admin, bluecollar, entrepreneur, housemaid, management, retired, selfemployed, services, student, technician, unemployed)
- marital: marital status (divorced/widowed, married, single)
- education (primary, secondary, tertiary)
- default: has credit in default? (yes, no)
- balance: average yearly balance, in euros
- housing: has housing loan? (yes, no)
- loan: has personal loan? (yes, no)
- day, month: last contact day and month
- duration: last contact duration, in seconds
- campaign: number of contacts performed during this campaign and for the client
- passdays: number of days that passed by after the client was last contacted from a previous campaign
- previous: number of contacts performed before this campaign

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the data

df = pd.read_csv('Bank customer.csv')
df.head()

Unnamed: 0,deposit,age,job,marital,education,default,balance,housing,loan,day,month,duration,campaign,passdays,previous
0,no,58,management,married,tertiary,no,2143,yes,no,5,may,261,1,-1,0
1,no,44,technician,single,secondary,no,29,yes,no,5,may,151,1,-1,0
2,no,33,entrepreneur,married,secondary,no,2,yes,yes,5,may,76,1,-1,0
3,no,35,management,married,tertiary,no,231,yes,no,5,may,139,1,-1,0
4,no,28,management,single,tertiary,no,447,yes,yes,5,may,217,1,-1,0


In [3]:
# Encode 'deposit' (binary variable)

df['deposit'] = df['deposit'].apply(lambda x: 1 if x=='yes' else 0)
df.head()

Unnamed: 0,deposit,age,job,marital,education,default,balance,housing,loan,day,month,duration,campaign,passdays,previous
0,0,58,management,married,tertiary,no,2143,yes,no,5,may,261,1,-1,0
1,0,44,technician,single,secondary,no,29,yes,no,5,may,151,1,-1,0
2,0,33,entrepreneur,married,secondary,no,2,yes,yes,5,may,76,1,-1,0
3,0,35,management,married,tertiary,no,231,yes,no,5,may,139,1,-1,0
4,0,28,management,single,tertiary,no,447,yes,yes,5,may,217,1,-1,0


In [4]:
# Encode 'month' (change into numbers)

mth = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
df['month'] = df['month'].apply(lambda x: mth.index(x)+1)
df.head()

Unnamed: 0,deposit,age,job,marital,education,default,balance,housing,loan,day,month,duration,campaign,passdays,previous
0,0,58,management,married,tertiary,no,2143,yes,no,5,5,261,1,-1,0
1,0,44,technician,single,secondary,no,29,yes,no,5,5,151,1,-1,0
2,0,33,entrepreneur,married,secondary,no,2,yes,yes,5,5,76,1,-1,0
3,0,35,management,married,tertiary,no,231,yes,no,5,5,139,1,-1,0
4,0,28,management,single,tertiary,no,447,yes,yes,5,5,217,1,-1,0


In [5]:
# Create dummies from the categorical variables

df = pd.get_dummies(df, columns=['job','marital','education','default','housing','loan'], drop_first=True)
df.head()

Unnamed: 0,deposit,age,balance,day,month,duration,campaign,passdays,previous,job_blue-collar,...,job_student,job_technician,job_unemployed,marital_married,marital_single,education_secondary,education_tertiary,default_yes,housing_yes,loan_yes
0,0,58,2143,5,5,261,1,-1,0,False,...,False,False,False,True,False,False,True,False,True,False
1,0,44,29,5,5,151,1,-1,0,False,...,False,True,False,False,True,True,False,False,True,False
2,0,33,2,5,5,76,1,-1,0,False,...,False,False,False,True,False,True,False,False,True,True
3,0,35,231,5,5,139,1,-1,0,False,...,False,False,False,True,False,False,True,False,True,False
4,0,28,447,5,5,217,1,-1,0,False,...,False,False,False,False,True,False,True,False,True,True


In [6]:
# Define x and y. Split into train, test data

y=df.deposit
x=df.drop('deposit', axis=1)

xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=1)

### Fit a decision tree model on the training data and predict the dependent variable.

In [10]:
dt = DecisionTreeClassifier(random_state=42).fit(xtrain,ytrain)
pred1 = dt.predict(xtest)

print("Decision Tree Accuracy", accuracy_score(ytest, pred1))
print("Decision Tree Confusion Matrix \n", confusion_matrix(ytest, dt.predict(xtest)))
print("Decision Tree Classification Report \n", classification_report(ytest, pred1))

Decision Tree Accuracy 0.8738772108528567
Decision Tree Confusion Matrix 
 [[8854  714]
 [ 648  583]]
Decision Tree Classification Report 
               precision    recall  f1-score   support

           0       0.93      0.93      0.93      9568
           1       0.45      0.47      0.46      1231

    accuracy                           0.87     10799
   macro avg       0.69      0.70      0.69     10799
weighted avg       0.88      0.87      0.88     10799



### Fit a random forests model on the training data and predict the dependent variable.

In [13]:
rf = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=1).fit(xtrain, ytrain)
pred2 = rf.predict(xtest)

print("Random Forest Accuracy", accuracy_score(ytest, pred2))
print("Random Forest Confusion Matrix \n", confusion_matrix(ytest, pred2))
print("Random Forest Classification Report \n", classification_report(ytest, pred2))

Random Forest Accuracy 0.9034169830539864
Random Forest Confusion Matrix 
 [[9286  282]
 [ 761  470]]
Random Forest Classification Report 
               precision    recall  f1-score   support

           0       0.92      0.97      0.95      9568
           1       0.62      0.38      0.47      1231

    accuracy                           0.90     10799
   macro avg       0.77      0.68      0.71     10799
weighted avg       0.89      0.90      0.89     10799



### Fit a support vector machine model on the training data and predict the dependent variable.

In [12]:
# RBF kernel

svm_rbf = SVC(C=1, kernel='rbf', gamma=1).fit(xtrain, ytrain)
pred3 = svm_rbf.predict(xtest)

print("RBF Kernel Accuracy", accuracy_score(ytest, pred3))
print("RBF Kernel Confusion Matrix \n", confusion_matrix(ytest, pred3))
print("RBF Kernel Classification Report \n", classification_report(ytest, pred3))

RBF Kernel Accuracy 0.8860079637003426
RBF Kernel Confusion Matrix 
 [[9568    0]
 [1231    0]]
RBF Kernel Classification Report 
               precision    recall  f1-score   support

           0       0.89      1.00      0.94      9568
           1       0.00      0.00      0.00      1231

    accuracy                           0.89     10799
   macro avg       0.44      0.50      0.47     10799
weighted avg       0.79      0.89      0.83     10799



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# Sigmoid kernel

svm_sm = SVC(kernel='sigmoid', C=0.1).fit(xtrain, ytrain)
pred4 = svm_sm.predict(xtest)

print("Sigmoid Kernel Accuracy", accuracy_score(ytest, pred4))
print("Sigmoid Kernel Confusion Matrix \n", confusion_matrix(ytest, pred4))
print("Sigmoid Kernel Classification Report \n", classification_report(ytest, pred4))

Sigmoid Kernel Accuracy 0.8703583665154181
Sigmoid Kernel Confusion Matrix 
 [[9220  348]
 [1052  179]]
Sigmoid Kernel Classification Report 
               precision    recall  f1-score   support

           0       0.90      0.96      0.93      9568
           1       0.34      0.15      0.20      1231

    accuracy                           0.87     10799
   macro avg       0.62      0.55      0.57     10799
weighted avg       0.83      0.87      0.85     10799



# (Optional code for practice) Hyperparameter tuning

In [16]:
# Define the SVC model pipeline
svc = make_pipeline(StandardScaler(), SVC())

# Set up the grid search parameter grid
param = {'svc__C': [1, 10], 'svc__kernel': ['rbf', 'poly']}

# Initialize the GridSearchCV object with cross-validation
search = GridSearchCV(svc, param, cv=5, scoring=['accuracy', 'f1'], refit='f1', verbose=2).fit(xtrain, ytrain)

# Print the best parameters and the highest cross-validated accuracy
print("Best parameters:", search.best_params_)
print("Best cross-validated accuracy:", search.cv_results_['mean_test_accuracy'][search.best_index_])
print("Best cross-validated F1 score:", search.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ..........................svc__C=1, svc__kernel=rbf; total time=   6.0s
[CV] END ..........................svc__C=1, svc__kernel=rbf; total time=   5.9s
[CV] END ..........................svc__C=1, svc__kernel=rbf; total time=   5.9s
[CV] END ..........................svc__C=1, svc__kernel=rbf; total time=   6.0s
[CV] END ..........................svc__C=1, svc__kernel=rbf; total time=   6.0s
[CV] END .........................svc__C=1, svc__kernel=poly; total time=   6.3s
[CV] END .........................svc__C=1, svc__kernel=poly; total time=   6.1s
[CV] END .........................svc__C=1, svc__kernel=poly; total time=   5.9s
[CV] END .........................svc__C=1, svc__kernel=poly; total time=   6.2s
[CV] END .........................svc__C=1, svc__kernel=poly; total time=   5.8s
[CV] END .........................svc__C=10, svc__kernel=rbf; total time=   9.1s
[CV] END .........................svc__C=10, svc_