# Importing Libraries

In [1]:
import numpy as np
import pandas as pd

# Importing Dataset

In [2]:
df = pd.read_csv('churn.csv')

In [3]:
df.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [4]:
df.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [5]:
df['Exited'].value_counts()
#print(Counter(df['Exited']))

0    7963
1    2037
Name: Exited, dtype: int64

## The Data is a little bit imabalanced (20%)

# Random Undersampling

# Data Cleansing

In [6]:
df.drop("CustomerId", axis=1, inplace=True)
df.drop("Surname", axis=1, inplace=True)
#df.drop("HasCrCard", axis=1, inplace=True)
df.drop("RowNumber", axis=1, inplace = True)
# Take all Row, Take all column until -1
X = df.iloc[:,:-1].values
# Take all Row, Take only -1 column
y = df.iloc[:,-1].values

In [7]:
print(X)

[[619 'France' 'Female' ... 1 1 101348.88]
 [608 'Spain' 'Female' ... 0 1 112542.58]
 [502 'France' 'Female' ... 1 0 113931.57]
 ...
 [709 'France' 'Female' ... 0 1 42085.58]
 [772 'Germany' 'Male' ... 1 0 92888.52]
 [792 'France' 'Female' ... 1 0 38190.78]]


In [8]:
print(X[0,:])

[619 'France' 'Female' 42 2 0.0 1 1 1 101348.88]


In [9]:
print(y)

[1 0 1 ... 1 1 0]


# Encoding Gender and One Hot Encoder for Country

In [10]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [11]:
le = LabelEncoder()
X[:, 2] = le.fit_transform(X[:, 2])
print(X)

[[619 'France' 0 ... 1 1 101348.88]
 [608 'Spain' 0 ... 0 1 112542.58]
 [502 'France' 0 ... 1 0 113931.57]
 ...
 [709 'France' 0 ... 0 1 42085.58]
 [772 'Germany' 1 ... 1 0 92888.52]
 [792 'France' 0 ... 1 0 38190.78]]


In [12]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
print(X)

[[1.0 0.0 0.0 ... 1 1 101348.88]
 [0.0 0.0 1.0 ... 0 1 112542.58]
 [1.0 0.0 0.0 ... 1 0 113931.57]
 ...
 [1.0 0.0 0.0 ... 0 1 42085.58]
 [0.0 1.0 0.0 ... 1 0 92888.52]
 [1.0 0.0 0.0 ... 1 0 38190.78]]


# Splitting Dataset

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 0)

# Feature Scalling

In [14]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Random Undersampling

In [15]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

In [16]:
print("Before undersampling: ", Counter(y_train))

Before undersampling:  Counter({0: 6368, 1: 1632})


In [17]:
undersample = RandomUnderSampler(sampling_strategy='majority')
X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)

In [18]:
print("After undersampling: ", Counter(y_train_under))

After undersampling:  Counter({0: 1632, 1: 1632})


# Logistic Regression

In [19]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [20]:
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(random_state = 0)
lr_clf.fit(X_train_under, y_train_under)
y_lr_pred = lr_clf.predict(X_test)

lr_cm = confusion_matrix(y_test, y_lr_pred)
print(lr_cm)
accuracy = accuracy_score(y_test, y_lr_pred)
print(accuracy)

[[1109  486]
 [ 115  290]]
0.6995


# KNN

In [21]:
from sklearn.neighbors import KNeighborsClassifier
clknn = KNeighborsClassifier(n_neighbors = 5, metric='minkowski', p = 2)
clknn.fit(X_train_under, y_train_under)
y_clknn_pred = clknn.predict(X_test)

cm_knn = confusion_matrix(y_test, y_clknn_pred)
print(cm_knn)
accuracy_knn = accuracy_score(y_test, y_clknn_pred)
print(accuracy_knn)

[[1137  458]
 [ 112  293]]
0.715


# SVM

In [22]:
from sklearn.svm import SVC
clf_svm = SVC(kernel = 'sigmoid', random_state = 0)
clf_svm.fit(X_train_under, y_train_under)
y_svm_pred = clf_svm.predict(X_test)

cm_svm = confusion_matrix(y_test, y_svm_pred)
print(cm_svm)
accuracy_svm = accuracy_score(y_test, y_svm_pred)
print(accuracy_svm)

[[910 685]
 [171 234]]
0.572


# Kernel SVM

In [23]:
from sklearn.svm import SVC
clf_ksvm = SVC(kernel = 'rbf', random_state = 0)
clf_ksvm.fit(X_train_under, y_train_under)
y_ksvm_pred = clf_ksvm.predict(X_test)

cm_ksvm = confusion_matrix(y_test, y_ksvm_pred)
print(cm_ksvm)
accuracy_ksvm = accuracy_score(y_test, y_ksvm_pred)
print(accuracy_ksvm)

[[1209  386]
 [  86  319]]
0.764


# Naive Bayess

In [24]:
from sklearn.naive_bayes import GaussianNB
clf_gauss = GaussianNB()
clf_gauss.fit(X_train_under, y_train_under)
y_gauss_pred = clf_gauss.predict(X_test)

cm_gauss = confusion_matrix(y_test, y_gauss_pred)
print(cm_gauss)
accuracy_gauss = accuracy_score(y_test, y_gauss_pred)
print(accuracy_gauss)

[[1140  455]
 [ 124  281]]
0.7105


# Decision Tree

In [25]:
from sklearn.tree import DecisionTreeClassifier
clf_tree = DecisionTreeClassifier(criterion = 'entropy')
clf_tree.fit(X_train_under, y_train_under)
y_tree_pred = clf_tree.predict(X_test)

cm_tree = confusion_matrix(y_test, y_tree_pred)
print(cm_tree)
accuracy_tree = accuracy_score(y_test, y_tree_pred)
print(accuracy_tree)

[[1096  499]
 [ 109  296]]
0.696


# Bagging Decision Tree (Ensamble Bagging)

In [26]:
from sklearn.ensemble import BaggingClassifier
bagging = BaggingClassifier(DecisionTreeClassifier(), n_estimators=100, max_samples=0.8, oob_score=True)
bagging.fit(X_train_under, y_train_under)
y_treeb_pred = bagging.predict(X_test)

cm_treeb = confusion_matrix(y_test, y_treeb_pred)
print(cm_treeb)
accuracy_treeb = accuracy_score(y_test, y_treeb_pred)
print(accuracy_treeb)
#print(bagging.score(X_test, y_test))

[[1233  362]
 [  88  317]]
0.775


In [27]:
bagging.oob_score_

0.7699142156862745

# Random Forest (Ensamble Forests of Randomized Trees)

In [28]:
from sklearn.ensemble import RandomForestClassifier
clf_forest = RandomForestClassifier(n_estimators = 50, criterion = 'entropy')
clf_forest.fit(X_train_under, y_train_under)
y_forest_pred = clf_forest.predict(X_test)

cm_forest = confusion_matrix(y_test, y_forest_pred)
print(cm_forest)
accuracy_forest = accuracy_score(y_test, y_forest_pred)
print(accuracy_forest)

[[1232  363]
 [  84  321]]
0.7765


# Voting Classifier (Ensamble Boosting Voting Classifier)

# Use LogisticRegression, Naive, KNN, KernelSVM, RandomForest

In [29]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

In [30]:
clf1 = LogisticRegression(random_state = 1)
clf2 = GaussianNB()
clf3 = KNeighborsClassifier(n_neighbors = 5, metric='minkowski', p = 2)
clf4 = SVC(kernel = 'rbf', random_state = 1)
clf5 = RandomForestClassifier(n_estimators = 50, criterion = 'entropy', random_state = 1)

eclf = VotingClassifier(estimators=[('lr', clf1),('gnb', clf2),('knn',clf3),('svc', clf4),('rf', clf5)], voting='hard')

In [31]:
for clf, label in zip([clf1, clf2, clf3, clf4, clf5, eclf], ['Logistic Regression', 'Naive Bayes', 'KNN', 'SVC', 'Random Forest', 'Ensemble Voting']):
    scores = cross_val_score(clf,X, y, scoring='accuracy', cv=5)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.79 (+/- 0.00) [Logistic Regression]
Accuracy: 0.78 (+/- 0.00) [Naive Bayes]
Accuracy: 0.77 (+/- 0.00) [KNN]
Accuracy: 0.80 (+/- 0.00) [SVC]
Accuracy: 0.86 (+/- 0.01) [Random Forest]
Accuracy: 0.80 (+/- 0.00) [Ensemble Voting]


In [32]:
eclf.fit(X_train_under, y_train_under)
y_eclf_pred = eclf.predict(X_test)

cm_eclf = confusion_matrix(y_test, y_eclf_pred)
print(cm_eclf)
accuracy_eclf = accuracy_score(y_test, y_eclf_pred)
print(accuracy_eclf)

[[1192  403]
 [  84  321]]
0.7565


# ANN

In [33]:
import tensorflow as tf

In [34]:
ann = tf.keras.models.Sequential()

In [35]:
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

In [36]:
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

In [37]:
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

In [38]:
ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [39]:
ann.fit(X_train_under, y_train_under, batch_size=32,epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/20

Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<tensorflow.python.keras.callbacks.History at 0x278c0741bb0>

In [40]:
y_pred = ann.predict(X_test)
y_pred = (y_pred > 0.5)

In [41]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[1222  373]
 [  90  315]]


0.7685

# Single Prediction

In [42]:
geography = 'Germany'
credit_score = 650
gender = 'Female'
age = 38
tenure = 9
balance = 137843.23
number_prod = 1
cr_card = 1
active_member = 1
estimated_salary = 117622.77

In [43]:
X_test_new = np.array([[credit_score, geography, gender, age, tenure, balance, number_prod, cr_card, active_member, estimated_salary]])

In [44]:
X_test_new[:, 2] = le.transform(X_test_new[:, 2])
print(X_test_new)

[['650' 'Germany' '0' '38' '9' '137843.23' '1' '1' '1' '117622.77']]


In [45]:
X_test_new = ct.transform(X_test_new)
print(X_test_new)

[['0.0' '1.0' '0.0' '650' '0' '38' '9' '137843.23' '1' '1' '1'
  '117622.77']]


In [46]:
sp_forest = clf_forest.predict(sc.transform(X_test_new))
if sp_forest == 0:
    sp_forest = "stay"
else:
    sp_forest = "leave"
print("Single prediction with random forest, the customer will ", sp_forest )

Single prediction with random forest, the customer will  leave


In [47]:
sp_bag = bagging.predict(sc.transform(X_test_new))
if sp_bag == 0:
    sp_bag = "stay"
else:
    sp_bag = "leave"
print("Single prediction with bagging, the customer will ", sp_bag )

Single prediction with bagging, the customer will  leave


In [48]:
sp_ann = ann.predict(sc.transform(X_test_new)) > 0.5
if sp_ann == False:
    sp_ann = "stay"
else:
    sp_ann = "leave"
print("Single prediction with ann, the customer will ", sp_ann)

Single prediction with ann, the customer will  leave


In [49]:
print(Counter(df['Geography']))

Counter({'France': 5014, 'Germany': 2509, 'Spain': 2477})


In [50]:
print(Counter(df['Gender']))

Counter({'Male': 5457, 'Female': 4543})


# Save model

In [51]:
import pickle
from keras.models import model_from_json

In [52]:
filename1 = 'randomforest.sav'
filename2 = 'bag.sav'
filename3 = 'ann.sav'

pickle.dump(clf_forest, open(filename1, 'wb'))
pickle.dump(bagging, open(filename2, 'wb'))

ann_model_json = ann.to_json()
with open("ann_model.json", "w") as json_file:
    json_file.write(ann_model_json)
# serialize weights to HDF5
ann.save_weights("model.h5")

In [53]:
json_file = open('ann_model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model_ann = model_from_json(loaded_model_json)
# load weights into new model
loaded_model_ann.load_weights("model.h5")
print("Loaded model from disk")

# evaluate loaded model on test data
loaded_model_ann.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
score = loaded_model.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

Loaded model from disk


NameError: name 'loaded_model' is not defined

In [None]:
filename1 = 'randomforest.sav'
filename2 = 'bag.sav'
filename3 = 'ann.sav'

loaded_model_rf = pickle.load(open(filename1, 'rb'))
#result = loaded_model_rf.score(X_test, y_test)
#print(result)

loaded_model_bag = pickle.load(open(filename2, 'rb'))
#result = loaded_model_bag.score(X_test, y_test)
#print(result)

json_file = open('ann_model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model_ann = model_from_json(loaded_model_json)
# load weights into new model
loaded_model_ann.load_weights("model.h5")

In [None]:
res1 = loaded_model_rf.predict(sc.transform(X_test_new))
res2 = loaded_model_bag.predict(sc.transform(X_test_new))
res3 = loaded_model_ann.predict(sc.transform(X_test_new)) > 0.5

In [None]:
print(res1)
print(res2)
print(res3)