In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [2]:
data = pd.read_csv('bank_churn_data_exp.csv')
data.head()

Unnamed: 0,AGE,CUS_Month_Income,CUS_Gender,CUS_Marital_Status,YEARS_WITH_US,total_debit_tran_s1,total_debit_tran_s2,total_debit_tran_s3,total debit amount for S1,total debit amount for S2,...,total credit amount for S2,total credit amount for S3,total debit amount,total debit transactions,total credit amount,total credit transactions,total transactions,CUS_Target,TAR_Desc,Status
0,48,3750000.0,FEMALE,SINGLE,14,40,48,33,50302.37,72102.31,...,78986.72,59854.68,175007.95,121,196499.28,46,167,2223,LOW,0
1,53,10000000.0,MALE,SINGLE,14,3,17,52,4209.0,38719.6,...,109058.0,92939.0,68690.6,72,317436.0,25,97,2223,LOW,0
2,43,2000000.0,MALE,SINGLE,14,5,14,20,5311.83,57480.3,...,0.0,0.0,120146.98,39,0.0,0,39,2223,LOW,0
3,43,486.0,MALE,SINGLE,14,1,1,0,3657.91,175.0,...,84356.94,0.0,3832.91,2,84356.94,1,3,2222,MIDLE,0
4,27,180000.0,FEMALE,SINGLE,14,0,7,1,0.0,72143.6,...,12000.0,0.0,72643.6,8,12000.0,2,10,2222,MIDLE,1


In [3]:
data.TAR_Desc.value_counts()

LOW          1312
MIDLE         609
EXECUTIVE      60
PLATINUM       49
Name: TAR_Desc, dtype: int64

In [4]:
le = LabelEncoder()

data['CUS_Marital_Status'] = le.fit_transform(data['CUS_Marital_Status'])
data['CUS_Gender'] = le.fit_transform(data['CUS_Gender'])
data['TAR_Desc'] = le.fit_transform(data['TAR_Desc'])

In [43]:
print(data.CUS_Gender.value_counts())
print(data.CUS_Marital_Status.value_counts())
print(data.TAR_Desc.value_counts())
print(data.Status.value_counts())

1    1269
0     761
Name: CUS_Gender, dtype: int64
4    1506
1     469
0      37
5      16
2       1
3       1
Name: CUS_Marital_Status, dtype: int64
1    1312
2     609
0      60
3      49
Name: TAR_Desc, dtype: int64
0    1015
1    1015
Name: Status, dtype: int64


In [6]:
data.head()

Unnamed: 0,AGE,CUS_Month_Income,CUS_Gender,CUS_Marital_Status,YEARS_WITH_US,total_debit_tran_s1,total_debit_tran_s2,total_debit_tran_s3,total debit amount for S1,total debit amount for S2,...,total credit amount for S2,total credit amount for S3,total debit amount,total debit transactions,total credit amount,total credit transactions,total transactions,CUS_Target,TAR_Desc,Status
0,48,3750000.0,0,4,14,40,48,33,50302.37,72102.31,...,78986.72,59854.68,175007.95,121,196499.28,46,167,2223,1,0
1,53,10000000.0,1,4,14,3,17,52,4209.0,38719.6,...,109058.0,92939.0,68690.6,72,317436.0,25,97,2223,1,0
2,43,2000000.0,1,4,14,5,14,20,5311.83,57480.3,...,0.0,0.0,120146.98,39,0.0,0,39,2223,1,0
3,43,486.0,1,4,14,1,1,0,3657.91,175.0,...,84356.94,0.0,3832.91,2,84356.94,1,3,2222,2,0
4,27,180000.0,0,4,14,0,7,1,0.0,72143.6,...,12000.0,0.0,72643.6,8,12000.0,2,10,2222,2,1


In [8]:
X = data.drop(['Status'], axis=1)
y = data['Status']

In [9]:
X.shape

(2030, 24)

In [10]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y, train_size= 0.8, random_state= 42)

In [11]:
rfc = RandomForestClassifier(n_estimators= 30, n_jobs= -1)

In [12]:
rfc.fit(Xtrain,ytrain)

RandomForestClassifier(n_estimators=30, n_jobs=-1)

In [13]:
ypred = rfc.predict(Xtest)

In [14]:
print('Accuracy Score: ', accuracy_score(ypred, ytest))

Accuracy Score:  0.9655172413793104


In [15]:
print(confusion_matrix(ypred, ytest))

[[195   4]
 [ 10 197]]


In [16]:
print(classification_report(ypred, ytest))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97       199
           1       0.98      0.95      0.97       207

    accuracy                           0.97       406
   macro avg       0.97      0.97      0.97       406
weighted avg       0.97      0.97      0.97       406



The accuracy increased by 1%, from 94% to 96% and this could be the issue of overfitting and multi colinearity so we have to drop the less important features from the dataset.

In [24]:
data.columns

Index(['AGE', 'CUS_Month_Income', 'CUS_Gender', 'CUS_Marital_Status',
       'YEARS_WITH_US', 'total_debit_tran_s1', 'total_debit_tran_s2',
       'total_debit_tran_s3', 'total debit amount for S1',
       'total debit amount for S2', 'total debit amount for S3',
       'total_cred_trans_s1', 'total_cred_trans_s2', 'total_cred_trans_s3',
       'total credit amount for S1', 'total credit amount for S2',
       'total credit amount for S3', 'total debit amount',
       'total debit transactions', 'total credit amount',
       'total credit transactions', 'total transactions', 'CUS_Target',
       'TAR_Desc', 'Status'],
      dtype='object')

In [34]:
data_imp_feat = data[['CUS_Month_Income', 'total_debit_tran_s1', 'total_debit_tran_s3',
                           'total debit amount for S3','TAR_Desc', 'Status']]

In [35]:
data_imp_feat.head()

Unnamed: 0,CUS_Month_Income,total_debit_tran_s1,total_debit_tran_s3,total debit amount for S3,TAR_Desc,Status
0,3750000.0,40,33,52603.27,1,0
1,10000000.0,3,52,25762.0,1,0
2,2000000.0,5,20,57354.85,1,0
3,486.0,1,0,0.0,2,0
4,180000.0,0,1,500.0,2,1


In [36]:
X = data_imp_feat.drop(['Status'], axis = 1)
y = data_imp_feat['Status']

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size= 0.8, random_state= 42)

In [38]:
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)

In [40]:
print('Accuracy Score: ', accuracy_score(y_pred, y_test))

Accuracy Score:  0.9433497536945813


In [41]:
print(confusion_matrix(y_pred, y_test))

[[186   4]
 [ 19 197]]


In [42]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.91      0.98      0.94       190
           1       0.98      0.91      0.94       216

    accuracy                           0.94       406
   macro avg       0.94      0.95      0.94       406
weighted avg       0.95      0.94      0.94       406



In [46]:
X_test.to_csv('test.csv', index= False)

In [44]:
import pickle

pickle.dump(rfc, open('churn_pred.pkl', 'wb'))

In [45]:
model = pickle.load(open('churn_pred.pkl', 'rb'))