In [5]:
# Step 1: read the churn dataset into a pandas DataFrame.
import pandas as pd

# Adjust the path below if the CSV is stored somewhere else.
data = pd.read_csv(filepath_or_buffer="Churn_Modelling.csv")

In [6]:
# Preview the first five rows of the freshly loaded dataset.
data.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [7]:
# Step 2: remove columns that do not help in prediction (the target we will
# be predicting in this dataset is the 'Exited' column).
# Reassign instead of using inplace=True so the frame is not mutated behind
# the back of anything that already displayed or referenced it.
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])
# Display the dataset after dropping the identifier columns.
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [8]:
# ML algorithms need numeric inputs, so every column has to hold numbers.
# Encode Gender as a binary flag: 1 for 'Female', 0 for any other value.
data["Gender"] = data["Gender"].eq('Female').astype("int64")

In [9]:
# One-hot encode Geography: each distinct value becomes its own 'Geo_<value>'
# indicator column and the original Geography column is removed.
# dtype=int forces 0/1 integer indicators; without it newer pandas versions
# emit boolean columns, which cannot be used in the subtraction-based
# min-max normalization performed later in this notebook.
data = pd.get_dummies(data, columns=['Geography'], prefix="Geo", dtype=int)
# Display the dataset with the new indicator columns.
data.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geo_France,Geo_Germany,Geo_Spain
0,619,1,42,2,0.0,1,1,1,101348.88,1,1,0,0
1,608,1,41,1,83807.86,1,0,1,112542.58,0,0,0,1
2,502,1,42,8,159660.8,3,1,0,113931.57,1,1,0,0
3,699,1,39,1,0.0,2,0,0,93826.63,0,1,0,0
4,850,1,43,2,125510.82,1,1,1,79084.1,0,0,0,1


In [11]:
# Separate the target from the features:
#   y -> the 'Exited' labels as a NumPy array
#   x -> every remaining column of the dataset
y = data["Exited"].to_numpy()
x = data.drop(columns="Exited")

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for every feature column.
x.describe()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geo_France,Geo_Germany,Geo_Spain
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,650.5288,0.4543,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.5014,0.2509,0.2477
std,96.653299,0.497932,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.500023,0.433553,0.431698
min,350.0,0.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0,0.0,0.0
25%,584.0,0.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0,0.0,0.0
50%,652.0,0.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,1.0,0.0,0.0
75%,718.0,1.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,1.0,1.0,0.0
max,850.0,1.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0,1.0,1.0


In [13]:
# Before applying ML algorithms, rescale every feature to the [0, 1] range
# with min-max normalization: (value - column_min) / (column_max - column_min).
import numpy as np

# Work on a float copy so indicator columns (integer or boolean) can be
# subtracted and divided like the other features.
x_float = x.astype("float64")

# Reduce explicitly along axis=0 (one min/max per column). np.min(x)/np.max(x)
# on a DataFrame forward axis=None, which newer pandas versions interpret as
# "reduce over the whole frame" and would scale every column by one global range.
col_min = x_float.min(axis=0)
col_max = x_float.max(axis=0)

# NOTE(review): the min/max are computed on the full dataset, before the
# train/test split further down, so test rows influence the scaling.
x_norm = (x_float - col_min) / (col_max - col_min)

In [14]:
# x_norm is the min-max normalized version of the feature table x.
# The summary below confirms that, after normalization, every column has min 0 and max 1.
x_norm.describe()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geo_France,Geo_Germany,Geo_Spain
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,0.601058,0.4543,0.282727,0.50128,0.304848,0.176733,0.7055,0.5151,0.500441,0.5014,0.2509,0.2477
std,0.193307,0.497932,0.141727,0.289217,0.248696,0.193885,0.45584,0.499797,0.28758,0.500023,0.433553,0.431698
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.468,0.0,0.189189,0.3,0.0,0.0,0.0,0.0,0.254977,0.0,0.0,0.0
50%,0.604,0.0,0.256757,0.5,0.387402,0.0,1.0,1.0,0.50096,1.0,0.0,0.0
75%,0.736,1.0,0.351351,0.7,0.508749,0.333333,1.0,1.0,0.746955,1.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
# Split the normalized features and labels into training and testing sets.
# 30% of the rows are held out for testing; random_state fixes the split.
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(
    x_norm,
    y,
    test_size=0.3,
    random_state=7,
)

In [17]:
#decision tree
from sklearn import tree
from sklearn.metrics import accuracy_score

# Seed the classifier: without random_state the fitted tree (and therefore
# the reported accuracy) can change from run to run, unlike the other seeded
# steps in this notebook.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(xtrain, ytrain)

# Predict on the held-out rows and report the fraction classified correctly.
y_pred = clf.predict(xtest)
accuracy_score(ytest, y_pred)


0.7876666666666666

In [19]:
# Random forest: 100 trees with a fixed seed.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(n_estimators=100, random_state=5).fit(xtrain, ytrain)

# Score the forest on the held-out test rows.
y_rpred = rf.predict(xtest)
accuracy_score(y_true=ytest, y_pred=y_rpred)

0.8626666666666667

In [21]:
# Gaussian Naive Bayes classifier.
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB().fit(xtrain, ytrain)

# Score the model on the held-out test rows.
y_nbpred = gnb.predict(xtest)
accuracy_score(y_true=ytest, y_pred=y_nbpred)

0.821

In [22]:
# Support vector machine with a polynomial kernel.
from sklearn import svm
from sklearn.metrics import accuracy_score

s = svm.SVC(kernel='poly').fit(xtrain, ytrain)

# Score the model on the held-out test rows.
y_svmpred = s.predict(xtest)
accuracy_score(y_true=ytest, y_pred=y_svmpred)



0.7983333333333333

In [24]:
#For the SVM above, you can try other kernels by changing the kernel argument, e.g. 'sigmoid' or 'rbf'.

In [25]:
# k-nearest neighbours classifier using the 7 closest training points.
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=7).fit(xtrain, ytrain)

# Score the model on the held-out test rows.
y_knnpred = knn.predict(xtest)
accuracy_score(y_true=ytest, y_pred=y_knnpred)

0.815

In [26]:
# Logistic regression with a fixed seed.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression(random_state=0).fit(xtrain, ytrain)

# Score the model on the held-out test rows.
y_lrpred = lr.predict(xtest)
accuracy_score(y_true=ytest, y_pred=y_lrpred)



0.8146666666666667