# Project: Churn Modelling
#### author: Nabajyoti Majumdar
## Course : Machine Learning A-Z™: Hands-On Python & R In Data Science (Udemy)

### Importing important libraries

In [1]:
import pandas as p
import numpy as n
import matplotlib.pyplot as pl

In [2]:
#loading Dataset
dataset = p.read_csv('CM.csv')

In [3]:
dataset.keys()

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [4]:

X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values

### dataset.head() gives us the overview of the dataset

In [5]:
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


#### We have taken the columns from CreditScore to Estimated Salary as our feature set while the variable 'Exited' is the target variable

### Encoding categorical data

In [6]:

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelenco_X_1 = LabelEncoder()


In [7]:
X[:, 1] = labelenco_X_1.fit_transform(X[:, 1])


In [8]:
labelencoder_X2 = LabelEncoder()

In [9]:
X[:, 2] = labelencoder_X2.fit_transform(X[:, 2])

In [10]:
X

array([[619, 0, 0, ..., 1, 1, 101348.88],
       [608, 2, 0, ..., 0, 1, 112542.58],
       [502, 0, 0, ..., 1, 0, 113931.57],
       ...,
       [709, 0, 0, ..., 0, 1, 42085.58],
       [772, 1, 1, ..., 1, 0, 92888.52],
       [792, 0, 0, ..., 1, 0, 38190.78]], dtype=object)

In [11]:
from sklearn.compose import ColumnTransformer 
onehotencoder = OneHotEncoder()
columnTransformer = ColumnTransformer([('onehotencoder', 
                                        OneHotEncoder(), 
                                        [1])], 
                                      remainder='passthrough')


In [12]:
X= n.array(columnTransformer.fit_transform(X), dtype = n.str) 

In [13]:
X

array([['1.0', '0.0', '0.0', ..., '1', '1', '101348.88'],
       ['0.0', '0.0', '1.0', ..., '0', '1', '112542.58'],
       ['1.0', '0.0', '0.0', ..., '1', '0', '113931.57'],
       ...,
       ['1.0', '0.0', '0.0', ..., '0', '1', '42085.58'],
       ['0.0', '1.0', '0.0', ..., '1', '0', '92888.52'],
       ['1.0', '0.0', '0.0', ..., '1', '0', '38190.78']], dtype='<U9')

In [14]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [15]:

# Fitting XGBoost to the Training set
from xgboost import XGBClassifier
clas = XGBClassifier()
clas.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [16]:
y_pred = clas.predict(X_test)

In [17]:

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [18]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = clas, X = X_train, y = y_train, cv = 10)

In [19]:
cm

array([[1508,   77],
       [ 204,  211]], dtype=int64)

In [20]:
y_pred


array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [21]:
y_test

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [22]:
print(accuracies)

[0.85875 0.8625  0.865   0.84875 0.8475  0.83375 0.86    0.87    0.85125
 0.84125]


In [23]:
print(max(accuracies))

0.87
