# Machine Learning with DecisionTreeClassifier !

* The customer churn, also known as customer attrition, refers to the phenomenon whereby a customer leaves a service provider.

* It is advantageous for banks to know what leads a client towards the decision to leave the company.

* Churn prevention allows companies to develop loyalty programs and retention campaigns to keep as many customers as possible.

* In this code challenge, we use customer data from a bank to construct a predictive model using classification algo for the likely churn customers.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# reading  data from csv 
dataset = pd.read_csv('Churn_Modelling.csv')

In [3]:
# let's perform some investigation !

dataset.shape

(10000, 14)

In [4]:
dataset.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [5]:
dataset.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [6]:
dataset.columns.tolist()

['RowNumber',
 'CustomerId',
 'Surname',
 'CreditScore',
 'Geography',
 'Gender',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary',
 'Exited']

In [7]:
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [8]:
# separate out features and labels data !

features=dataset.iloc[:,3:-1].values

labels=dataset.iloc[:,-1].values

In [9]:
features

array([[619, 'France', 'Female', ..., 1, 1, 101348.88],
       [608, 'Spain', 'Female', ..., 0, 1, 112542.58],
       [502, 'France', 'Female', ..., 1, 0, 113931.57],
       ...,
       [709, 'France', 'Female', ..., 0, 1, 42085.58],
       [772, 'Germany', 'Male', ..., 1, 0, 92888.52],
       [792, 'France', 'Female', ..., 1, 0, 38190.78]], dtype=object)

In [10]:
labels

array([1, 0, 1, ..., 1, 1, 0], dtype=int64)

In [12]:
dataset.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

# Convert Categorical values to Numeric Format !

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [14]:
ColumnTransformer=ColumnTransformer([('encoder',OneHotEncoder(),[1,2])],remainder="passthrough")
features=np.array(ColumnTransformer.fit_transform(features),dtype=np.float32)

In [15]:
# handling the dummy variables trape !

features = features[:,1:]

In [16]:
# Splitting into Training and Testing sets !

from sklearn.model_selection import train_test_split
features_train,features_test,labels_train,labels_test = train_test_split(features,labels,test_size =0.4 ,random_state=0)

# Training the Model !

In [17]:
from sklearn.linear_model import LinearRegression 

In [18]:
regressor = LinearRegression()

In [19]:
regressor.fit(features_train,labels_train)

LinearRegression()

In [20]:
pred = regressor.predict(features_test)
print(pd.DataFrame(zip(pred,labels_test)))

             0  1
0     0.270987  0
1     0.323162  1
2     0.196044  0
3     0.131724  0
4     0.209020  0
...        ... ..
3995  0.362551  0
3996  0.241607  0
3997 -0.036071  0
3998  0.159186  0
3999  0.069441  0

[4000 rows x 2 columns]


In [21]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

In [22]:
# Applying DecisionTreeClassifier !

classi = DecisionTreeClassifier(criterion="entropy",random_state=0)
classi.fit(features_train,labels_train)
pred = classi.predict(features_test)
print(pd.DataFrame(zip(pred,labels_test)))

      0  1
0     0  0
1     0  1
2     0  0
3     0  0
4     0  0
...  .. ..
3995  0  0
3996  0  0
3997  0  0
3998  0  0
3999  0  0

[4000 rows x 2 columns]


In [23]:
# Building Confusion Matrix !

CM = confusion_matrix(pred,labels_test)

In [24]:
# Getting Accuracy Score of the Model !

Score = accuracy_score(labels_test,pred)

In [25]:
print ("model accuracy using confusion matrix"+str(CM))
print ("model accuracy using .score() function"+str(round(Score*100,2))+"%")

model accuracy using confusion matrix[[2739  359]
 [ 433  469]]
model accuracy using .score() function80.2%


In [26]:
#Make prediction using own data !

x=[600,"Spain","Male",43,5,77000,3,1,1,9000]
#x=[608,"Spain","Female",41,1,83807.86,1,0,1,112542.58]

In [27]:
x=np.array(x)
x=x.reshape(1,-1)
x=np.array(ColumnTransformer.transform(x),dtype=np.float32)

In [28]:
x=x[:,1:]
classi.predict(x)

array([1], dtype=int64)