Importing the libraries

In [1]:
import pandas as pd

In [2]:
# Load the raw customer-churn table from the CSV next to this notebook.
data = pd.read_csv(filepath_or_buffer='Churn_Modelling.csv')

In [3]:
# Preview the first five rows to sanity-check the load.
data.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
# Print the column dtypes and non-null counts for the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [5]:
# Count missing values per column.
data.isna().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

Data Preprocessing

In [6]:
# Remove the row counter, customer id and surname columns before modelling.
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])
data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [7]:
# List the distinct values of the Geography column before encoding it.
data.loc[:, 'Geography'].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [8]:
# One-hot encode the remaining object columns (Geography, Gender), dropping
# the first level of each to avoid perfectly collinear dummy columns.
# dtype=int pins the dummies to 0/1 integers: pandas >= 2.0 would otherwise
# emit bool columns, which changes how the resampling and scaling steps see
# these features.
data = pd.get_dummies(data, drop_first=True, dtype=int)

In [9]:
data.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,1,0


In [10]:
# Target: the Exited column; features: every other column.
y = data.loc[:, 'Exited']
x = data.drop(columns=['Exited'])

Handling Data Imbalance

In [11]:
from imblearn.over_sampling import SMOTE

x_res, y_res = SMOTE().fit_resample(x, y)

Splitting the data

In [12]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_res, y_res, test_size = 0.2, random_state=42)

Feature Scaling

In [13]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training fold
# only, then apply that same transformation to both folds.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

Classification Models

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [16]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Baseline linear classifier with default hyper-parameters.
lr = LogisticRegression()
lr.fit(x_train, y_train)

y_pred = lr.predict(x_test)

# Report the four evaluation metrics on the held-out fold, one per line.
for label, metric in (
    ("accuracy", accuracy_score),
    ("precision", precision_score),
    ("recall score", recall_score),
    ("f1 score", f1_score),
):
    print(label + ": " + str(metric(y_test, y_pred)))

accuracy: 0.7846829880728186
precision: 0.7738471257106759
recall score: 0.7887958789439794
f1 score: 0.78125


In [17]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Seed the forest: bootstrap sampling and feature sub-sampling are random, so
# without random_state the metrics below change on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

# Report the four evaluation metrics on the held-out fold, one per line.
for label, metric in (
    ("accuracy", accuracy_score),
    ("precision", precision_score),
    ("recall score", recall_score),
    ("f1 score", f1_score),
):
    print(label + ": " + str(metric(y_test, y_pred)))

accuracy: 0.8581293157564344
precision: 0.8504137492043284
recall score: 0.8602704443013522
f1 score: 0.8553137003841229


In [18]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

# Seed the booster so repeated runs of this cell give the same model and the
# same reported metrics.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(x_train, y_train)

y_pred = gbc.predict(x_test)

# Report the four evaluation metrics on the held-out fold, one per line.
for label, metric in (
    ("accuracy", accuracy_score),
    ("precision", precision_score),
    ("recall score", recall_score),
    ("f1 score", f1_score),
):
    print(label + ": " + str(metric(y_test, y_pred)))

accuracy: 0.844632768361582
precision: 0.8412903225806452
recall score: 0.8396651641983258
f1 score: 0.8404769577827909
