In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,recall_score
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

In [2]:
# Read tel_churn.csv (path is relative to the working directory) and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='tel_churn.csv')
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56.95,1889.5,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53.85,108.15,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42.3,1840.75,0,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70.7,151.65,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


In [3]:
# pandas labels header-less CSV columns "Unnamed: N"; in the preview above they
# appear to be saved row counters rather than customer attributes, so remove all
# of them before modelling. Building the list from the current columns (instead
# of hard-coding one name) also keeps this cell safe to re-run: a second run
# finds nothing to drop instead of raising KeyError.
unnamed_cols = [col for col in data.columns if str(col).startswith('Unnamed')]
data = data.drop(columns=unnamed_cols)

In [4]:
# Preview the first five rows again to check the column layout after the drop.
data.head(n=5)

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,0,1,0,0,1,1,0,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.5,0,0,1,1,0,1,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,1,0,1,1,0,1,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.3,1840.75,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0
4,0,70.7,151.65,1,1,0,1,0,1,0,...,0,0,1,0,1,0,0,0,0,0


In [5]:
# Feature matrix: every column except the 'Churn' target.
X = data.drop(columns='Churn')

In [6]:
# Target vector: the 'Churn' column.
y = data.loc[:, 'Churn']

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [8]:
# Decision tree baseline: Gini impurity, depth capped at 6, at least 8 samples per leaf.
model_tree = DecisionTreeClassifier(
    criterion="gini",
    random_state=100,
    max_depth=6,
    min_samples_leaf=8,
)

In [9]:
# Fit the baseline tree on the training split (fit returns the estimator, which the cell displays).
model_tree.fit(X=X_train, y=y_train)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [10]:
# Class predictions of the baseline tree for the held-out rows.
pred_tree = model_tree.predict(X=X_test)

In [11]:
# Mean accuracy of the baseline tree on the held-out rows.
model_tree.score(X=X_test, y=y_test)

0.7839374555792467

In [12]:
# Per-class precision / recall / F1 of the baseline tree for labels 0 and 1.
report_tree = classification_report(y_test, pred_tree, labels=[0, 1])
print(report_tree)

              precision    recall  f1-score   support

           0       0.83      0.88      0.86      1038
           1       0.60      0.51      0.55       369

    accuracy                           0.78      1407
   macro avg       0.72      0.70      0.71      1407
weighted avg       0.77      0.78      0.78      1407



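Accuracy is misleading here because the dataset is imbalanced: the model reaches 78% accuracy overall, yet recall for the minority class (1) is only 0.51.
We therefore apply a resampling method (SMOTEENN) to balance the classes.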

In [13]:
# SMOTEENN generates synthetic samples at random; pin the seed so the resampled
# data (and every score computed from it) is repeatable across kernel restarts.
sm = SMOTEENN(random_state=100)

In [14]:
X_resampled, y_resampled = sm.fit_resample(X,y)

In [15]:
Xr_train,Xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [16]:
# Decision tree with the same hyper-parameters as the baseline, to be trained on resampled data.
model_smote = DecisionTreeClassifier(
    criterion="gini",
    random_state=100,
    max_depth=6,
    min_samples_leaf=8,
)

In [17]:
# Train on Xr_train, then print the accuracy and the per-class report for Xr_test.
model_smote.fit(X=Xr_train, y=yr_train)
yr_predict = model_smote.predict(X=Xr_test)
model_score_r = model_smote.score(X=Xr_test, y=yr_test)
report_r = metrics.classification_report(yr_test, yr_predict)
print(model_score_r, report_r, sep="\n")

0.9180743243243243
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       535
           1       0.92      0.93      0.93       649

    accuracy                           0.92      1184
   macro avg       0.92      0.92      0.92      1184
weighted avg       0.92      0.92      0.92      1184



In [18]:
# Confusion matrix for the tree trained on resampled data (rows: true class, columns: predicted class).
conf_matrix_r = metrics.confusion_matrix(y_true=yr_test, y_pred=yr_predict)
print(conf_matrix_r)

[[482  53]
 [ 44 605]]


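Now we see much better results in the output above: 92% accuracy, with very good recall, precision and F1 score for the minority class. Note that these scores were measured on a SMOTEENN-resampled split, so they are not directly comparable with the 78% obtained earlier on the original test set.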

In [19]:
# Next, evaluate a Random Forest classifier on the same data.
from sklearn.ensemble import RandomForestClassifier

In [20]:
# Random forest baseline: 100 Gini trees, depth capped at 6, at least 8 samples per leaf.
model_forest = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    random_state=100,
    max_depth=6,
    min_samples_leaf=8,
)

In [21]:
# Fit the forest on the original training split (fit returns the estimator, which the cell displays).
model_forest.fit(X=X_train, y=y_train)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [22]:
# Class predictions of the forest for the held-out rows.
y_pred = model_forest.predict(X=X_test)

In [23]:
# Mean accuracy of the forest on the held-out rows.
model_forest.score(X=X_test, y=y_test)

0.7960199004975125

In [24]:
# Per-class precision / recall / F1 of the forest for labels 0 and 1.
report_forest = classification_report(y_test, y_pred, labels=[0, 1])
print(report_forest)

              precision    recall  f1-score   support

           0       0.82      0.92      0.87      1038
           1       0.67      0.44      0.53       369

    accuracy                           0.80      1407
   macro avg       0.75      0.68      0.70      1407
weighted avg       0.78      0.80      0.78      1407



In [25]:
# Fresh sampler for the Random Forest experiment. SMOTEENN generates synthetic
# samples at random, so pin the seed to make the resampled data repeatable.
sm = SMOTEENN(random_state=100)

In [26]:
X_resampled, y_resampled = sm.fit_resample(X,y)

In [27]:
Xr_train,Xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [28]:
# Random forest with the same hyper-parameters as model_forest, to be trained on resampled data.
model_forest_smote = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    random_state=100,
    max_depth=6,
    min_samples_leaf=8,
)

In [29]:
# Fit the forest on Xr_train (fit returns the estimator, which the cell displays).
model_forest_smote.fit(X=Xr_train, y=yr_train)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [30]:
# Class predictions of the resampled-data forest for Xr_test.
yr_predict = model_forest_smote.predict(X=Xr_test)

In [31]:
# Mean accuracy of the resampled-data forest on Xr_test.
model_score_r1 = model_forest_smote.score(X=Xr_test, y=yr_test)

In [32]:
# Print the accuracy followed by the per-class report for the resampled-data forest.
report_r1 = metrics.classification_report(yr_test, yr_predict)
print(model_score_r1, report_r1, sep="\n")

0.9336170212765957
              precision    recall  f1-score   support

           0       0.96      0.89      0.92       512
           1       0.92      0.97      0.94       663

    accuracy                           0.93      1175
   macro avg       0.94      0.93      0.93      1175
weighted avg       0.93      0.93      0.93      1175



In [33]:
# Confusion matrix for the resampled-data forest (rows: true class, columns: predicted class).
conf_matrix_r1 = metrics.confusion_matrix(y_true=yr_test, y_pred=yr_predict)
print(conf_matrix_r1)

[[455  57]
 [ 21 642]]


With the Random Forest classifier we also get quite good results on the resampled split shown above; in fact, they are slightly better than the Decision Tree's (93% vs. 92% accuracy).

In [34]:
import pickle


In [35]:
# Path (relative to the working directory) where the trained model is pickled.
filename = "model.sav"

In [36]:
# Serialize the forest trained on resampled data. The context manager closes the
# file handle, so the bytes are flushed to disk before the file is read back.
with open(filename, 'wb') as model_file:
    pickle.dump(model_forest_smote, model_file)

In [37]:
# Reload the model to verify the save/load round trip. The context manager closes
# the file handle once loading is done.
# NOTE: pickle.load can execute arbitrary code; only load files written by this
# notebook or obtained from a trusted source.
with open(filename, 'rb') as model_file:
    load_model = pickle.load(model_file)

In [38]:
# Score the reloaded model on the same test data used for model_forest_smote.
model_score_load = load_model.score(X=Xr_test, y=yr_test)

In [39]:
# Should match model_score_r1 if the pickle round trip preserved the model.
print(model_score_load)

0.9336170212765957
