In [1]:
import sys
import os
import joblib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbl_pipe
from imblearn.over_sampling import SMOTE

from sklearn.svm import SVC 

In [2]:
df = pd.read_csv('../Churn_Modelling.csv')
df =df.drop(columns=['RowNumber','CustomerId','Surname'])
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# Seperate Dataframe into features and response
X, y = df.loc[:,df.columns != 'Exited'], df.Exited
print(X.shape, y.shape)

(10000, 10) (10000,)


In [4]:
# List numerical features
num_columns = X.select_dtypes(include='number').columns.tolist()
num_columns

['CreditScore',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary']

In [5]:
# List categorical features
cat_columns = X.select_dtypes(include='object').columns.tolist()
cat_columns

['Geography', 'Gender']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size =0.7, random_state = 1)
X_train.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
2228,644,France,Female,37,8,0.0,2,1,0,20968.88
5910,481,France,Female,39,6,0.0,1,1,1,24677.54
1950,680,France,Female,37,10,123806.28,1,1,0,81776.84
2119,690,France,Male,29,5,0.0,2,1,0,108577.97
5947,656,France,Female,45,7,145933.27,1,1,1,199392.14


In [7]:
num_features = [] 

for i in num_columns:
    location = X.columns.get_loc(i)
    num_features.append(location)
print(num_features)  

cat_features = []

for i in cat_columns:
    location = X.columns.get_loc(i)
    cat_features.append(location)
print(cat_features) 
preprocess = make_column_transformer(
    (MinMaxScaler(), num_features),
    (OneHotEncoder(sparse=False), cat_features)
)
preprocess

[0, 3, 4, 5, 6, 7, 8, 9]
[1, 2]


ColumnTransformer(transformers=[('minmaxscaler', MinMaxScaler(),
                                 [0, 3, 4, 5, 6, 7, 8, 9]),
                                ('onehotencoder', OneHotEncoder(sparse=False),
                                 [1, 2])])

In [8]:
# without SMOTE
model = make_pipeline(preprocess, SVC())
model

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('minmaxscaler',
                                                  MinMaxScaler(),
                                                  [0, 3, 4, 5, 6, 7, 8, 9]),
                                                 ('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  [1, 2])])),
                ('svc', SVC())])

In [9]:
param_grid = {'svc__kernel' : ['linear', 'rbf', 'poly', 'sigmoid'],
              'svc__C': [0.0005,0.001, 0.01, 0.1, 0.5],
              'svc__gamma': [5, 1, 0.1, 0.01]}

In [10]:
svm_grid = GridSearchCV(model, param_grid, cv= 10, scoring='accuracy')
svm_grid.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('minmaxscaler',
                                                                         MinMaxScaler(),
                                                                         [0, 3,
                                                                          4, 5,
                                                                          6, 7,
                                                                          8,
                                                                          9]),
                                                                        ('onehotencoder',
                                                                         OneHotEncoder(sparse=False),
                                                                         [1,
                                                                          2]

In [11]:
print(svm_grid.best_params_)
print(svm_grid.best_score_)

{'svc__C': 0.5, 'svc__gamma': 5, 'svc__kernel': 'poly'}
0.8614285714285714


In [12]:
print(f"Training Data Score: {svm_grid.score(X_train, y_train)}")
print(f"Testing Data Score: {svm_grid.score(X_test, y_test)}")

Training Data Score: 0.8767142857142857
Testing Data Score: 0.8583333333333333


In [13]:
predictions = svm_grid.predict(X_test)
cm = confusion_matrix(y_test, predictions)
print(cm)
print(classification_report(y_test, predictions))

[[2275   98]
 [ 327  300]]
              precision    recall  f1-score   support

           0       0.87      0.96      0.91      2373
           1       0.75      0.48      0.59       627

    accuracy                           0.86      3000
   macro avg       0.81      0.72      0.75      3000
weighted avg       0.85      0.86      0.85      3000



In [14]:
# with SMOTE
model2 = imbl_pipe(preprocess,
                  SMOTE(sampling_strategy='auto', random_state= 1),
                  SVC(random_state= 1))
model2

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('minmaxscaler',
                                                  MinMaxScaler(),
                                                  [0, 3, 4, 5, 6, 7, 8, 9]),
                                                 ('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  [1, 2])])),
                ('smote', SMOTE(random_state=1)),
                ('svc', SVC(random_state=1))])

In [15]:
#svm_grid = GridSearchCV(model2, param_grid, cv= 10, scoring='accuracy')
#svm_grid.fit(X_train, y_train)

In [16]:
#print(svm_grid.best_params_)
#print(svm_grid.best_score_)

In [17]:
#print(f"Training Data Score: {svm_grid.score(X_train, y_train)}")
#print(f"Testing Data Score: {svm_grid.score(X_test, y_test)}")

In [18]:
#p#redictions = svm_grid.predict(X_test)
#cm = confusion_matrix(y_test, predictions)
#print(cm)
#print(classification_report(y_test, predictions))

In [19]:
model2 = imbl_pipe(preprocess,
                  SMOTE(sampling_strategy='auto', random_state=1),
                  SVC(random_state=1))

model2

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('minmaxscaler',
                                                  MinMaxScaler(),
                                                  [0, 3, 4, 5, 6, 7, 8, 9]),
                                                 ('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  [1, 2])])),
                ('smote', SMOTE(random_state=1)),
                ('svc', SVC(random_state=1))])

In [20]:
svm_grid = GridSearchCV(model2, param_grid, verbose=3, cv= 10)

In [21]:
svm_grid.fit(X_train, y_train)

Fitting 10 folds for each of 80 candidates, totalling 800 fits
[CV 1/10] END svc__C=0.0005, svc__gamma=5, svc__kernel=linear;, score=0.677 total time=   4.0s
[CV 2/10] END svc__C=0.0005, svc__gamma=5, svc__kernel=linear;, score=0.687 total time=   4.0s
[CV 3/10] END svc__C=0.0005, svc__gamma=5, svc__kernel=linear;, score=0.681 total time=   4.1s
[CV 4/10] END svc__C=0.0005, svc__gamma=5, svc__kernel=linear;, score=0.683 total time=   4.0s
[CV 5/10] END svc__C=0.0005, svc__gamma=5, svc__kernel=linear;, score=0.651 total time=   4.0s
[CV 6/10] END svc__C=0.0005, svc__gamma=5, svc__kernel=linear;, score=0.663 total time=   3.9s
[CV 7/10] END svc__C=0.0005, svc__gamma=5, svc__kernel=linear;, score=0.679 total time=   4.0s
[CV 8/10] END svc__C=0.0005, svc__gamma=5, svc__kernel=linear;, score=0.680 total time=   3.9s
[CV 9/10] END svc__C=0.0005, svc__gamma=5, svc__kernel=linear;, score=0.670 total time=   4.1s
[CV 10/10] END svc__C=0.0005, svc__gamma=5, svc__kernel=linear;, score=0.690 total

[CV 8/10] END svc__C=0.0005, svc__gamma=0.1, svc__kernel=linear;, score=0.680 total time=   3.8s
[CV 9/10] END svc__C=0.0005, svc__gamma=0.1, svc__kernel=linear;, score=0.670 total time=   4.1s
[CV 10/10] END svc__C=0.0005, svc__gamma=0.1, svc__kernel=linear;, score=0.690 total time=   4.1s
[CV 1/10] END svc__C=0.0005, svc__gamma=0.1, svc__kernel=rbf;, score=0.677 total time=   6.5s
[CV 2/10] END svc__C=0.0005, svc__gamma=0.1, svc__kernel=rbf;, score=0.680 total time=   6.5s
[CV 3/10] END svc__C=0.0005, svc__gamma=0.1, svc__kernel=rbf;, score=0.681 total time=   6.5s
[CV 4/10] END svc__C=0.0005, svc__gamma=0.1, svc__kernel=rbf;, score=0.683 total time=   6.6s
[CV 5/10] END svc__C=0.0005, svc__gamma=0.1, svc__kernel=rbf;, score=0.651 total time=   6.5s
[CV 6/10] END svc__C=0.0005, svc__gamma=0.1, svc__kernel=rbf;, score=0.661 total time=   6.4s
[CV 7/10] END svc__C=0.0005, svc__gamma=0.1, svc__kernel=rbf;, score=0.676 total time=   6.5s
[CV 8/10] END svc__C=0.0005, svc__gamma=0.1, svc__

[CV 4/10] END svc__C=0.001, svc__gamma=5, svc__kernel=rbf;, score=0.710 total time=   7.4s
[CV 5/10] END svc__C=0.001, svc__gamma=5, svc__kernel=rbf;, score=0.714 total time=   7.6s
[CV 6/10] END svc__C=0.001, svc__gamma=5, svc__kernel=rbf;, score=0.700 total time=   8.2s
[CV 7/10] END svc__C=0.001, svc__gamma=5, svc__kernel=rbf;, score=0.749 total time=   7.2s
[CV 8/10] END svc__C=0.001, svc__gamma=5, svc__kernel=rbf;, score=0.697 total time=   7.2s
[CV 9/10] END svc__C=0.001, svc__gamma=5, svc__kernel=rbf;, score=0.709 total time=   7.5s
[CV 10/10] END svc__C=0.001, svc__gamma=5, svc__kernel=rbf;, score=0.744 total time=   7.2s
[CV 1/10] END svc__C=0.001, svc__gamma=5, svc__kernel=poly;, score=0.757 total time=   4.1s
[CV 2/10] END svc__C=0.001, svc__gamma=5, svc__kernel=poly;, score=0.780 total time=   4.2s
[CV 3/10] END svc__C=0.001, svc__gamma=5, svc__kernel=poly;, score=0.791 total time=   4.4s
[CV 4/10] END svc__C=0.001, svc__gamma=5, svc__kernel=poly;, score=0.771 total time=  

[CV 2/10] END svc__C=0.001, svc__gamma=0.1, svc__kernel=poly;, score=0.531 total time=   4.5s
[CV 3/10] END svc__C=0.001, svc__gamma=0.1, svc__kernel=poly;, score=0.511 total time=   4.4s
[CV 4/10] END svc__C=0.001, svc__gamma=0.1, svc__kernel=poly;, score=0.554 total time=   4.5s
[CV 5/10] END svc__C=0.001, svc__gamma=0.1, svc__kernel=poly;, score=0.531 total time=   4.6s
[CV 6/10] END svc__C=0.001, svc__gamma=0.1, svc__kernel=poly;, score=0.520 total time=   4.5s
[CV 7/10] END svc__C=0.001, svc__gamma=0.1, svc__kernel=poly;, score=0.530 total time=   4.6s
[CV 8/10] END svc__C=0.001, svc__gamma=0.1, svc__kernel=poly;, score=0.520 total time=   4.5s
[CV 9/10] END svc__C=0.001, svc__gamma=0.1, svc__kernel=poly;, score=0.504 total time=   4.6s
[CV 10/10] END svc__C=0.001, svc__gamma=0.1, svc__kernel=poly;, score=0.537 total time=   4.6s
[CV 1/10] END svc__C=0.001, svc__gamma=0.1, svc__kernel=sigmoid;, score=0.674 total time=   5.6s
[CV 2/10] END svc__C=0.001, svc__gamma=0.1, svc__kernel=

[CV 9/10] END svc__C=0.01, svc__gamma=5, svc__kernel=poly;, score=0.776 total time=   7.2s
[CV 10/10] END svc__C=0.01, svc__gamma=5, svc__kernel=poly;, score=0.794 total time=   7.2s
[CV 1/10] END svc__C=0.01, svc__gamma=5, svc__kernel=sigmoid;, score=0.201 total time=   7.6s
[CV 2/10] END svc__C=0.01, svc__gamma=5, svc__kernel=sigmoid;, score=0.201 total time=   7.8s
[CV 3/10] END svc__C=0.01, svc__gamma=5, svc__kernel=sigmoid;, score=0.201 total time=   7.7s
[CV 4/10] END svc__C=0.01, svc__gamma=5, svc__kernel=sigmoid;, score=0.201 total time=   7.7s
[CV 5/10] END svc__C=0.01, svc__gamma=5, svc__kernel=sigmoid;, score=0.201 total time=   7.7s
[CV 6/10] END svc__C=0.01, svc__gamma=5, svc__kernel=sigmoid;, score=0.201 total time=   7.7s
[CV 7/10] END svc__C=0.01, svc__gamma=5, svc__kernel=sigmoid;, score=0.201 total time=   7.7s
[CV 8/10] END svc__C=0.01, svc__gamma=5, svc__kernel=sigmoid;, score=0.201 total time=   7.7s
[CV 9/10] END svc__C=0.01, svc__gamma=5, svc__kernel=sigmoid;, sc

[CV 8/10] END svc__C=0.01, svc__gamma=0.1, svc__kernel=sigmoid;, score=0.687 total time=   6.3s
[CV 9/10] END svc__C=0.01, svc__gamma=0.1, svc__kernel=sigmoid;, score=0.673 total time=   5.9s
[CV 10/10] END svc__C=0.01, svc__gamma=0.1, svc__kernel=sigmoid;, score=0.703 total time=   6.7s
[CV 1/10] END svc__C=0.01, svc__gamma=0.01, svc__kernel=linear;, score=0.700 total time=   4.2s
[CV 2/10] END svc__C=0.01, svc__gamma=0.01, svc__kernel=linear;, score=0.710 total time=   4.2s
[CV 3/10] END svc__C=0.01, svc__gamma=0.01, svc__kernel=linear;, score=0.730 total time=   4.1s
[CV 4/10] END svc__C=0.01, svc__gamma=0.01, svc__kernel=linear;, score=0.714 total time=   4.2s
[CV 5/10] END svc__C=0.01, svc__gamma=0.01, svc__kernel=linear;, score=0.687 total time=   4.2s
[CV 6/10] END svc__C=0.01, svc__gamma=0.01, svc__kernel=linear;, score=0.676 total time=   4.0s
[CV 7/10] END svc__C=0.01, svc__gamma=0.01, svc__kernel=linear;, score=0.684 total time=   4.8s
[CV 8/10] END svc__C=0.01, svc__gamma=0

[CV 6/10] END svc__C=0.1, svc__gamma=1, svc__kernel=linear;, score=0.707 total time=   3.6s
[CV 7/10] END svc__C=0.1, svc__gamma=1, svc__kernel=linear;, score=0.706 total time=   3.6s
[CV 8/10] END svc__C=0.1, svc__gamma=1, svc__kernel=linear;, score=0.696 total time=   3.7s
[CV 9/10] END svc__C=0.1, svc__gamma=1, svc__kernel=linear;, score=0.703 total time=   3.9s
[CV 10/10] END svc__C=0.1, svc__gamma=1, svc__kernel=linear;, score=0.703 total time=   3.7s
[CV 1/10] END svc__C=0.1, svc__gamma=1, svc__kernel=rbf;, score=0.696 total time=   5.9s
[CV 2/10] END svc__C=0.1, svc__gamma=1, svc__kernel=rbf;, score=0.700 total time=   5.8s
[CV 3/10] END svc__C=0.1, svc__gamma=1, svc__kernel=rbf;, score=0.734 total time=   5.7s
[CV 4/10] END svc__C=0.1, svc__gamma=1, svc__kernel=rbf;, score=0.706 total time=   5.6s
[CV 5/10] END svc__C=0.1, svc__gamma=1, svc__kernel=rbf;, score=0.673 total time=   5.7s
[CV 6/10] END svc__C=0.1, svc__gamma=1, svc__kernel=rbf;, score=0.683 total time=   5.6s
[CV 7

[CV 5/10] END svc__C=0.1, svc__gamma=0.01, svc__kernel=rbf;, score=0.690 total time=   6.5s
[CV 6/10] END svc__C=0.1, svc__gamma=0.01, svc__kernel=rbf;, score=0.694 total time=   6.3s
[CV 7/10] END svc__C=0.1, svc__gamma=0.01, svc__kernel=rbf;, score=0.723 total time=   6.5s
[CV 8/10] END svc__C=0.1, svc__gamma=0.01, svc__kernel=rbf;, score=0.690 total time=   6.3s
[CV 9/10] END svc__C=0.1, svc__gamma=0.01, svc__kernel=rbf;, score=0.700 total time=   6.7s
[CV 10/10] END svc__C=0.1, svc__gamma=0.01, svc__kernel=rbf;, score=0.717 total time=   6.8s
[CV 1/10] END svc__C=0.1, svc__gamma=0.01, svc__kernel=poly;, score=0.497 total time=   5.6s
[CV 2/10] END svc__C=0.1, svc__gamma=0.01, svc__kernel=poly;, score=0.531 total time=   5.1s
[CV 3/10] END svc__C=0.1, svc__gamma=0.01, svc__kernel=poly;, score=0.511 total time=   4.9s
[CV 4/10] END svc__C=0.1, svc__gamma=0.01, svc__kernel=poly;, score=0.554 total time=   4.9s
[CV 5/10] END svc__C=0.1, svc__gamma=0.01, svc__kernel=poly;, score=0.531 t

[CV 5/10] END svc__C=0.5, svc__gamma=1, svc__kernel=poly;, score=0.769 total time=   5.5s
[CV 6/10] END svc__C=0.5, svc__gamma=1, svc__kernel=poly;, score=0.777 total time=   5.6s
[CV 7/10] END svc__C=0.5, svc__gamma=1, svc__kernel=poly;, score=0.767 total time=   5.7s
[CV 8/10] END svc__C=0.5, svc__gamma=1, svc__kernel=poly;, score=0.753 total time=   5.2s
[CV 9/10] END svc__C=0.5, svc__gamma=1, svc__kernel=poly;, score=0.767 total time=   5.6s
[CV 10/10] END svc__C=0.5, svc__gamma=1, svc__kernel=poly;, score=0.789 total time=   5.5s
[CV 1/10] END svc__C=0.5, svc__gamma=1, svc__kernel=sigmoid;, score=0.533 total time=   5.1s
[CV 2/10] END svc__C=0.5, svc__gamma=1, svc__kernel=sigmoid;, score=0.567 total time=   4.8s
[CV 3/10] END svc__C=0.5, svc__gamma=1, svc__kernel=sigmoid;, score=0.567 total time=   4.9s
[CV 4/10] END svc__C=0.5, svc__gamma=1, svc__kernel=sigmoid;, score=0.510 total time=   4.6s
[CV 5/10] END svc__C=0.5, svc__gamma=1, svc__kernel=sigmoid;, score=0.527 total time=  

[CV 4/10] END svc__C=0.5, svc__gamma=0.01, svc__kernel=sigmoid;, score=0.703 total time=   6.8s
[CV 5/10] END svc__C=0.5, svc__gamma=0.01, svc__kernel=sigmoid;, score=0.689 total time=   6.1s
[CV 6/10] END svc__C=0.5, svc__gamma=0.01, svc__kernel=sigmoid;, score=0.667 total time=   5.6s
[CV 7/10] END svc__C=0.5, svc__gamma=0.01, svc__kernel=sigmoid;, score=0.680 total time=   5.7s
[CV 8/10] END svc__C=0.5, svc__gamma=0.01, svc__kernel=sigmoid;, score=0.651 total time=   6.1s
[CV 9/10] END svc__C=0.5, svc__gamma=0.01, svc__kernel=sigmoid;, score=0.670 total time=   5.7s
[CV 10/10] END svc__C=0.5, svc__gamma=0.01, svc__kernel=sigmoid;, score=0.699 total time=   5.2s


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('minmaxscaler',
                                                                         MinMaxScaler(),
                                                                         [0, 3,
                                                                          4, 5,
                                                                          6, 7,
                                                                          8,
                                                                          9]),
                                                                        ('onehotencoder',
                                                                         OneHotEncoder(sparse=False),
                                                                         [1,
                                                                          2]

In [22]:
print(svm_grid.best_params_)
print(svm_grid.best_score_)

{'svc__C': 0.5, 'svc__gamma': 5, 'svc__kernel': 'poly'}
0.7892857142857144


In [23]:
print(f"Training Data Score: {svm_grid.score(X_train, y_train)}")
print(f"Testing Data Score: {svm_grid.score(X_test, y_test)}")

Training Data Score: 0.8151428571428572
Testing Data Score: 0.7883333333333333


In [24]:
predictions = svm_grid.predict(X_test)
cm = confusion_matrix(y_test, predictions)
print(cm)
print(classification_report(y_test, predictions))

[[1911  462]
 [ 173  454]]
              precision    recall  f1-score   support

           0       0.92      0.81      0.86      2373
           1       0.50      0.72      0.59       627

    accuracy                           0.79      3000
   macro avg       0.71      0.76      0.72      3000
weighted avg       0.83      0.79      0.80      3000



In [25]:
filename = '../Models/svm.sav'
joblib.dump(svm_grid, filename)
svm_model = joblib.load(filename)
print(svm_model.score(X_test, y_test))

0.7883333333333333
