In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('Churn_Modelling.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
df.drop(["RowNumber","CustomerId","Surname"],axis=1,inplace=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
CreditScore        10000 non-null int64
Geography          10000 non-null object
Gender             10000 non-null object
Age                10000 non-null int64
Tenure             10000 non-null int64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
IsActiveMember     10000 non-null int64
EstimatedSalary    10000 non-null float64
Exited             10000 non-null int64
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


In [5]:
df.isnull().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [6]:
df.describe()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [7]:
x=df.iloc[:,:-1]
y=df.iloc[:,-1]

In [8]:
x.Gender.value_counts()

Male      5457
Female    4543
Name: Gender, dtype: int64

In [9]:
x.Geography.value_counts()

France     5014
Germany    2509
Spain      2477
Name: Geography, dtype: int64

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers = [('encoder', OneHotEncoder(),["Geography", "Gender"])], remainder='passthrough')

x = np.array(ct.fit_transform(x))

In [11]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.30,random_state=1)

In [12]:
from sklearn.preprocessing import StandardScaler
sc= StandardScaler()
xtrain=sc.fit_transform(xtrain)
xtest=sc.transform(xtest)

In [13]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix

In [14]:
def mymodel(model):
    model.fit(xtrain,ytrain)
    ypred=model.predict(xtest)
    print(classification_report(ytest,ypred))
    return model

In [15]:
l=LogisticRegression()
knn=KNeighborsClassifier()
svm=SVC()

In [16]:
mymodel(knn)

              precision    recall  f1-score   support

           0       0.85      0.95      0.89      2373
           1       0.63      0.35      0.45       627

    accuracy                           0.82      3000
   macro avg       0.74      0.65      0.67      3000
weighted avg       0.80      0.82      0.80      3000



KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [17]:
mymodel(l)

              precision    recall  f1-score   support

           0       0.82      0.97      0.89      2373
           1       0.66      0.22      0.32       627

    accuracy                           0.81      3000
   macro avg       0.74      0.59      0.61      3000
weighted avg       0.79      0.81      0.77      3000



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
mymodel(svm)

              precision    recall  f1-score   support

           0       0.86      0.98      0.92      2373
           1       0.84      0.40      0.54       627

    accuracy                           0.86      3000
   macro avg       0.85      0.69      0.73      3000
weighted avg       0.86      0.86      0.84      3000



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

# GridSearchCV

In [19]:
params={"C":[1,10,100,1000,10000],"gamma":[1,0.1,0.01,0.001,0.0001],"kernel":["rbf"]}

In [20]:
from sklearn.model_selection import GridSearchCV
grid=GridSearchCV(SVC(),params,refit=True,verbose=3)
grid.fit(x,y)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] C=1, gamma=1, kernel=rbf ........................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............ C=1, gamma=1, kernel=rbf, score=0.796, total=  18.2s
[CV] C=1, gamma=1, kernel=rbf ........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   18.1s remaining:    0.0s


[CV] ............ C=1, gamma=1, kernel=rbf, score=0.796, total=  17.8s
[CV] C=1, gamma=1, kernel=rbf ........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   36.0s remaining:    0.0s


[CV] ............ C=1, gamma=1, kernel=rbf, score=0.796, total=  16.8s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.796, total=  16.9s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.796, total=  16.6s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.796, total=  16.6s
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.796, total=  17.4s
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.796, total=  17.3s
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.796, total=  17.0s
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV] .

[CV] ........ C=10000, gamma=1, kernel=rbf, score=0.796, total=  22.9s
[CV] C=10000, gamma=1, kernel=rbf ....................................
[CV] ........ C=10000, gamma=1, kernel=rbf, score=0.796, total=  20.5s
[CV] C=10000, gamma=1, kernel=rbf ....................................
[CV] ........ C=10000, gamma=1, kernel=rbf, score=0.796, total=  20.6s
[CV] C=10000, gamma=0.1, kernel=rbf ..................................
[CV] ...... C=10000, gamma=0.1, kernel=rbf, score=0.796, total=  22.2s
[CV] C=10000, gamma=0.1, kernel=rbf ..................................
[CV] ...... C=10000, gamma=0.1, kernel=rbf, score=0.796, total=  23.1s
[CV] C=10000, gamma=0.1, kernel=rbf ..................................
[CV] ...... C=10000, gamma=0.1, kernel=rbf, score=0.796, total=  22.2s
[CV] C=10000, gamma=0.01, kernel=rbf .................................
[CV] ..... C=10000, gamma=0.01, kernel=rbf, score=0.796, total=  19.9s
[CV] C=10000, gamma=0.01, kernel=rbf .................................
[CV] .

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed: 23.4min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 10, 100, 1000, 10000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [21]:
grid.best_params_

{'C': 1, 'gamma': 1, 'kernel': 'rbf'}

In [23]:
ypred=grid.predict(x)
print(classification_report(y,ypred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7963
           1       1.00      1.00      1.00      2037

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000

