# Importing Libraries

In [1]:
import numpy as np
import pandas as pd

# Importing Dataset

In [2]:
# Load the raw churn table from the CSV that sits next to the notebook.
DATA_PATH = 'churn.csv'
df = pd.read_csv(DATA_PATH)

# Data Cleansing

In [3]:
# Remove the CustomerId, Surname, HasCrCard and RowNumber columns in one call.
# The frame is rebound instead of mutated with four inplace drops, and
# errors="ignore" skips columns that are already absent, so re-running this
# cell no longer raises KeyError on the second execution.
df = df.drop(
    columns=["CustomerId", "Surname", "HasCrCard", "RowNumber"],
    errors="ignore",
)
# Features: every row, every column except the last one.
X = df.iloc[:, :-1].values
# Target: every row, only the last column.
y = df.iloc[:, -1].values

In [4]:
# Sanity check: show the feature matrix (NumPy abbreviates long arrays with "...").
print(X)

[[619 'France' 'Female' ... 1 1 101348.88]
 [608 'Spain' 'Female' ... 1 1 112542.58]
 [502 'France' 'Female' ... 3 0 113931.57]
 ...
 [709 'France' 'Female' ... 1 1 42085.58]
 [772 'Germany' 'Male' ... 2 0 92888.52]
 [792 'France' 'Female' ... 1 0 38190.78]]


In [5]:
# Sanity check: show the target vector taken from the last column of df.
print(y)

[1 0 1 ... 1 1 0]


# Label Encoding for Gender and One-Hot Encoding for Country

In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [7]:
# Column 2 of X holds the 'Female'/'Male' strings; replace them in place with
# integer codes learned from that same column.
GENDER_COL = 2
le = LabelEncoder()
le.fit(X[:, GENDER_COL])
X[:, GENDER_COL] = le.transform(X[:, GENDER_COL])

In [8]:
# One-hot encode column 1 (the country strings) and pass every other column
# through untouched. The encoded columns are placed first in the result.
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), [1]),
    ],
    remainder='passthrough',
)
encoded = ct.fit_transform(X)
X = np.array(encoded)

# Splitting Dataset

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Feature Scaling

In [10]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to both splits (the test split is never used for fitting).
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

# Logistic Regression

In [11]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [12]:
from sklearn.linear_model import LogisticRegression
# Logistic regression: fit on the scaled training split, predict the held-out
# split. fit() returns the estimator, so construction and training are chained.
clr = LogisticRegression(random_state=0).fit(X_train, y_train)
y_lr_pred = clr.predict(X_test)

# Confusion matrix first, overall accuracy second, one per line.
cm = confusion_matrix(y_test, y_lr_pred)
accuracy = accuracy_score(y_test, y_lr_pred)
print(cm, accuracy, sep='\n')

[[1525   70]
 [ 309   96]]
0.8105


# KNN

In [13]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest neighbours with k=5; Minkowski distance with p=2.
clknn = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
clknn.fit(X_train, y_train)
y_clknn_pred = clknn.predict(X_test)

# Evaluate on the held-out split: confusion matrix, then accuracy.
cm_knn, accuracy_knn = (
    confusion_matrix(y_test, y_clknn_pred),
    accuracy_score(y_test, y_clknn_pred),
)
print(cm_knn)
print(accuracy_knn)

[[1496   99]
 [ 223  182]]
0.839


# SVM

In [14]:
from sklearn.svm import SVC
# Support vector classifier with a sigmoid kernel.
# NOTE(review): the section header says plain "SVM" while the next section is
# "Kernel SVM" — confirm that kernel='sigmoid' (rather than 'linear') is intended.
clf_svm = SVC(random_state=0, kernel='sigmoid').fit(X_train, y_train)
y_svm_pred = clf_svm.predict(X_test)

# Evaluate on the held-out split and print matrix and accuracy on separate lines.
cm_svm = confusion_matrix(y_test, y_svm_pred)
accuracy_svm = accuracy_score(y_test, y_svm_pred)
print(f"{cm_svm}\n{accuracy_svm}")

[[1300  295]
 [ 313   92]]
0.696


# Kernel SVM

In [15]:
from sklearn.svm import SVC
# Support vector classifier with an RBF kernel, trained on the scaled training split.
clf_ksvm = SVC(
    kernel='rbf',
    random_state=0,
)
clf_ksvm.fit(X_train, y_train)
y_ksvm_pred = clf_ksvm.predict(X_test)

# Held-out evaluation: confusion matrix followed by accuracy.
cm_ksvm = confusion_matrix(y_test, y_ksvm_pred)
accuracy_ksvm = accuracy_score(y_test, y_ksvm_pred)
print(cm_ksvm, accuracy_ksvm, sep='\n')

[[1546   49]
 [ 219  186]]
0.866


# Naive Bayes

In [16]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings; fit() returns the estimator itself.
clf_gauss = GaussianNB().fit(X_train, y_train)
y_gauss_pred = clf_gauss.predict(X_test)

# Score the predictions against the held-out labels.
accuracy_gauss = accuracy_score(y_test, y_gauss_pred)
cm_gauss = confusion_matrix(y_test, y_gauss_pred)
print(cm_gauss)
print(accuracy_gauss)

[[1464  131]
 [ 238  167]]
0.8155


# Decision Tree

In [17]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree using the entropy split criterion.
# random_state is pinned to 0, matching the other seeded models in this
# notebook: the tree permutes features at each split, so without a seed ties
# between equally good splits can resolve differently and the metrics printed
# below change from run to run.
clf_tree = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
clf_tree.fit(X_train, y_train)
y_tree_pred = clf_tree.predict(X_test)

# Evaluate on the held-out split: confusion matrix, then overall accuracy.
cm_tree = confusion_matrix(y_test, y_tree_pred)
print(cm_tree)
accuracy_tree = accuracy_score(y_test, y_tree_pred)
print(accuracy_tree)

[[1376  219]
 [ 184  221]]
0.7985


# Random Forest (Ensemble of bagged trees)

In [18]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 20 entropy-criterion trees.
# random_state is pinned to 0, matching the other seeded models in this
# notebook: bootstrap sampling and per-split feature sampling are random, so
# an unseeded forest produces different metrics on every run.
clf_forest = RandomForestClassifier(n_estimators = 20, criterion = 'entropy', random_state = 0)
clf_forest.fit(X_train, y_train)
y_forest_pred = clf_forest.predict(X_test)

# Evaluate on the held-out split: confusion matrix, then overall accuracy.
cm_forest = confusion_matrix(y_test, y_forest_pred)
print(cm_forest)
accuracy_forest = accuracy_score(y_test, y_forest_pred)
print(accuracy_forest)

[[1516   79]
 [ 199  206]]
0.861


In [20]:
# Print the repr of the scaler object.
# NOTE(review): looks like a leftover inspection cell (the execution count
# jumps from 18 to 20) — confirm whether it is still needed.
print(sc)

StandardScaler()
