Importing Dependencies or Libararies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

Loading Data

In [2]:
dataset=pd.read_csv('Churn_Modelling.csv')

In [3]:
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
dataset.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [5]:
dataset['Geography'].value_counts()

Geography
France     5014
Germany    2509
Spain      2477
Name: count, dtype: int64

Label Encoding

In [6]:
dataset.replace({"Geography":{'France':0,'Germany':1,'Spain':2}},inplace=True)

In [7]:
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,0,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,2,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,0,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,0,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,2,Female,43,2,125510.82,1,1,1,79084.1,0


In [8]:
dataset.replace({"Gender":{'Male':1,'Female':0}},inplace=True)

In [9]:
dataset.shape

(10000, 14)

In [10]:
dataset.tail()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9995,9996,15606229,Obijiaku,771,0,1,39,5,0.0,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,0,1,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,0,0,36,7,0.0,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,1,1,42,3,75075.31,2,1,0,92888.52,1
9999,10000,15628319,Walker,792,0,0,28,4,130142.79,1,1,0,38190.78,0


Splitting Data

In [76]:
X=dataset.drop(columns=['RowNumber','CustomerId','Surname','Exited'])

In [77]:
Y=dataset['Exited']

In [78]:
print(X)

      CreditScore  Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0             619          0       0   42       2       0.00              1   
1             608          2       0   41       1   83807.86              1   
2             502          0       0   42       8  159660.80              3   
3             699          0       0   39       1       0.00              2   
4             850          2       0   43       2  125510.82              1   
...           ...        ...     ...  ...     ...        ...            ...   
9995          771          0       1   39       5       0.00              2   
9996          516          0       1   35      10   57369.61              1   
9997          709          0       0   36       7       0.00              1   
9998          772          1       1   42       3   75075.31              2   
9999          792          0       0   28       4  130142.79              1   

      HasCrCard  IsActiveMember  EstimatedSalary  


In [79]:
print(Y)

0       1
1       0
2       1
3       0
4       0
       ..
9995    0
9996    0
9997    1
9998    1
9999    0
Name: Exited, Length: 10000, dtype: int64


Standardization of values

In [80]:
scaler=StandardScaler()

In [81]:
scaler.fit(X)

In [82]:
dataset_scaler=scaler.transform(X)

In [83]:
print(dataset_scaler)

[[-0.32622142 -0.90188624 -1.09598752 ...  0.64609167  0.97024255
   0.02188649]
 [-0.44003595  1.51506738 -1.09598752 ... -1.54776799  0.97024255
   0.21653375]
 [-1.53679418 -0.90188624 -1.09598752 ...  0.64609167 -1.03067011
   0.2406869 ]
 ...
 [ 0.60498839 -0.90188624 -1.09598752 ... -1.54776799  0.97024255
  -1.00864308]
 [ 1.25683526  0.30659057  0.91241915 ...  0.64609167 -1.03067011
  -0.12523071]
 [ 1.46377078 -0.90188624 -1.09598752 ...  0.64609167 -1.03067011
  -1.07636976]]


In [84]:
X=dataset_scaler

In [85]:
print(X)

[[-0.32622142 -0.90188624 -1.09598752 ...  0.64609167  0.97024255
   0.02188649]
 [-0.44003595  1.51506738 -1.09598752 ... -1.54776799  0.97024255
   0.21653375]
 [-1.53679418 -0.90188624 -1.09598752 ...  0.64609167 -1.03067011
   0.2406869 ]
 ...
 [ 0.60498839 -0.90188624 -1.09598752 ... -1.54776799  0.97024255
  -1.00864308]
 [ 1.25683526  0.30659057  0.91241915 ...  0.64609167 -1.03067011
  -0.12523071]
 [ 1.46377078 -0.90188624 -1.09598752 ...  0.64609167 -1.03067011
  -1.07636976]]


Splitting data for test and train

In [86]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=0)

In [87]:
print(X.shape,X_train.shape,X_test.shape)

(10000, 10) (8000, 10) (2000, 10)


In [88]:
#model=LogisticRegression()

In [89]:
#model.fit(X_train,Y_train)

In [90]:
#model evalutaion
#accuracy on the train
#X_train_prediction=model.predict(X_train)
#training_data_accuracy=accuracy_score(X_train_prediction,Y_train)

In [91]:
#print('Accuracy on training data',training_data_accuracy)

Accuracy on training data 0.80875


In [92]:
#X_test_prediction=model.predict(X_test)
#test_data_accuracy=accuracy_score(X_test_prediction,Y_test)

In [93]:
#print('Accuracy on test data',test_data_accuracy)

Accuracy on test data 0.8045


In [96]:
#Making Predictive system
#input_data=(709,0,0,36,7,0,1,0,1,42085.58)
#input_data_as_numpy_array=np.asarray(input_data)

#reshape the numpy array as we are predicting forr one instance
#input_data_reshaped=input_data_as_numpy_array.reshape(1,-1)

#standardized the input data

#std_data=scaler.transform(input_data_reshaped)
#print(std_data)

#prediction=model.predict(std_data)
#print(prediction)

[[ 0.60498839 -0.90188624 -1.09598752 -0.27860412  0.68712986 -1.22584767
  -0.91158349 -1.54776799  0.97024255 -1.00864308]]
[0]




In [97]:
from sklearn.ensemble import RandomForestClassifier

Model Applying

In [98]:
clf=RandomForestClassifier()

In [99]:
clf.fit(X_train,Y_train)

In [107]:
X_train_prediction=clf.predict(X_train)
training_data_accuracy=accuracy_score(X_train_prediction,Y_train)

In [108]:
print('Accuracy on training data',training_data_accuracy)

Accuracy on training data 1.0


In [109]:
X_test_prediction=clf.predict(X_test)
test_data_accuracy=accuracy_score(X_test_prediction,Y_test)

In [110]:
print('Accuracy on test data',test_data_accuracy)

Accuracy on test data 0.8575


In [112]:
#Making Predictive system
input_data=(792,0,0,28,4,130142.79,1,1,0,38190.78)
input_data_as_numpy_array=np.asarray(input_data)

#reshape the numpy array as we are predicting forr one instance
input_data_reshaped=input_data_as_numpy_array.reshape(1,-1)

#standardized the input data

std_data=scaler.transform(input_data_reshaped)
print(std_data)

prediction=clf.predict(std_data)
print(prediction)

[[ 1.46377078 -0.90188624 -1.09598752 -1.04143285 -0.35020386  0.85996499
  -0.91158349  0.64609167 -1.03067011 -1.07636976]]
[0]


