In [1]:
import pandas as pd

In [3]:
df = pd.read_csv("Churn_Modelling.csv")

In [4]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
df.isnull().sum().sum()

0

In [6]:
df.shape

(10000, 14)

In [7]:
df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [9]:
X = df.iloc[:, 3:13]   # independent features
y = df.iloc[:, 13]    # dependent features

In [10]:
X.columns

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'],
      dtype='object')

In [12]:
X.shape, y.shape

((10000, 10), (10000,))

In [13]:
#Create dummy variables
geography=pd.get_dummies(X["Geography"],drop_first=True)
gender=pd.get_dummies(X['Gender'],drop_first=True)


In [14]:
## Concatenate the Data Frames

X=pd.concat([X,geography,gender],axis=1)


In [15]:
## Drop Unnecessary columns
X=X.drop(['Geography','Gender'],axis=1)


In [16]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [17]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

## Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(X_train_sc,y_train)
y_pred__LR = LR.predict(X_test_sc)

from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred__LR)

0.811

###### Logistic Regression gives - 81.1 % accuracy

## Support Vector Classification

In [21]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train_sc,y_train)
y_pred_svc = svc.predict(X_test_sc)

accuracy_score(y_test,y_pred_svc)

0.8635

In [23]:
#optimize SVC
kernels = ["rbf","linear","poly"]

max_ker="rbf"
max_c = 0
max_acc = 0

for ker in kernels :
    for c in range(1,11) :
        svc = SVC(kernel=ker, C=c)
        svc.fit(X_train_sc,y_train)
        y_pred_svc = svc.predict(X_test_sc)

        acc = accuracy_score(y_test,y_pred_svc)
        if acc > max_acc :
            max_acc = acc
            max_ker = ker
            max_c = c
            
            
print("SVC gives accuracy of {} % , with value of C= {} & kernel= {}".format(max_acc*100,max_c,max_ker))
        

SVC gives accuracy of 86.5 % , with value of C= 2 & kernel= rbf


## Random Forest Classifier

In [24]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred_rf = rf.predict(X_test)

accuracy_score(y_test,y_pred_rf)

0.8655

In [45]:
# hyperparameter tuning
params_grid = {
    "n_estimators" : [50,100,120,150,200,250,300,400,450,500,550,600,700,800],
    "max_depth":[3, 4, 6, 8, 10, 12, 15,18,20],
    "min_samples_split":[2,3,5,7,9,11,13,15],
    "min_samples_leaf":[1,3,5,7,9,11,13,15],
}

In [46]:
from sklearn.model_selection import RandomizedSearchCV
rf_classifier = RandomForestClassifier()

random_search_rf = RandomizedSearchCV(rf_classifier,param_distributions=params_grid,n_iter=5,scoring='roc_auc',n_jobs=-1,cv=5,verbose=3)

In [47]:
random_search_rf.fit(X_train,y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:    8.2s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    8.4s finished


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=5,
                   n_jobs=-1,
                   param_distributions={'max_depth': [3, 4, 6, 8, 10, 12, 15,
                                                      18, 20],
                                        'min_samples_leaf': [1, 3, 5, 7, 9, 11,
                                                             13, 15],
                                        'min_samples_split': [2, 3, 5, 7, 9, 11,
                                                              13, 15],
                                        'n_estimators': [50, 100, 120, 150, 200,
                                                         250, 300, 400, 450,
                                                         500, 550, 600, 700,
                                                         800]},
                   scoring='roc_auc', verbose=3)

In [48]:
random_search_rf.best_params_, random_search_rf.best_score_

({'n_estimators': 400,
  'min_samples_split': 11,
  'min_samples_leaf': 5,
  'max_depth': 18},
 0.8633624196780765)

In [49]:
random_search_rf_model = RandomForestClassifier(n_estimators= 400,min_samples_split= 11,
                          min_samples_leaf= 5, max_depth= 18)

In [50]:
random_search_rf_model.fit(X_train,y_train)
rf_random_pred = random_search_rf_model.predict(X_test)
accuracy_score(y_test,rf_random_pred)

0.8695

###### Random Forest Classification gives accuracy of : 86.95 %

## XgBoost Classification

In [51]:
## Hyper Parameter Optimization

params={
 "learning_rate"    : [0.03,0.05, 0.07,0.10,0.125, 0.15 ] ,
 "max_depth"        : [ 3, 5, 7, 9, 11, 13,15 ],
 "min_child_weight" : [ 1, 3, 5, 7],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ,0.5],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7,0.9 ],
  "booster" :["gbtree","gblinear"],
  "n_estimators":[50,100,150,180,200,250,300,400],
    "base_score" :[0.3,0.5,0.8]
}


In [52]:
import xgboost
classifier=xgboost.XGBClassifier()

In [53]:
random_search=RandomizedSearchCV(classifier,param_distributions=params,n_iter=5,scoring='roc_auc',n_jobs=-1,cv=5,verbose=3)

In [54]:
random_search.fit(X_train,y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:   17.5s remaining:    5.5s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   19.9s finished




RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...
                   n_iter=5, n_jobs=-1,
                   param_distributions={'base_score': [0.3, 0.5, 0.8],
                                        'booster': ['gbtree', 'gblinea

In [55]:
random_search.best_estimator_

XGBClassifier(base_score=0.8, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.15, max_delta_step=0, max_depth=5,
              min_child_weight=7, missing=nan, monotone_constraints='()',
              n_estimators=250, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [56]:
import numpy as np

classifier=xgboost.XGBClassifier(base_score=0.8, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.15, max_delta_step=0, max_depth=5,
              min_child_weight=7, missing=np.nan, monotone_constraints='()',
              n_estimators=250, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [57]:
classifier.fit(X_train,y_train)
xg_pred = classifier.predict(X_test)
accuracy_score(y_test,xg_pred)



0.855

##### XgBoost gives accuracy of : 85.5 %