In [2]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN # first install imbalanced-learn

In [3]:
df=pd.read_csv('tel_churn.csv')
df.head(3)

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56.95,1889.5,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53.85,108.15,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0


In [8]:
df=df.drop('Unnamed: 0',axis=1)
df.head(4)

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,0,1,0,0,1,1,0,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.5,0,0,1,1,0,1,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,1,0,1,1,0,1,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.3,1840.75,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0


In [9]:
x=df.drop('Churn', axis=1)
x.head(3)

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.5,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [10]:
x.shape

(7032, 50)

In [11]:
y=df['Churn']
y.head(3)

0    0
1    0
2    1
Name: Churn, dtype: int64

In [12]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

# decision tree classifier

In [13]:
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [14]:
model_dt.fit(x_train,y_train)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [15]:
y_pred=model_dt.predict(x_test)
y_pred

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [16]:
model_dt.score(x_test,y_test)

0.7874911158493249

In [19]:
print(confusion_matrix(y_test,y_pred))

[[951  91]
 [208 157]]


In [17]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86      1042
           1       0.63      0.43      0.51       365

    accuracy                           0.79      1407
   macro avg       0.73      0.67      0.69      1407
weighted avg       0.77      0.79      0.77      1407



As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.
Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.
Hence, moving ahead to call SMOTEENN (UpSampling + ENN)

In [21]:
sm=SMOTEENN()

In [26]:
from imblearn.over_sampling import SMOTE
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [27]:

xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [28]:

model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [29]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.934931506849315
              precision    recall  f1-score   support

           0       0.94      0.92      0.93       542
           1       0.93      0.95      0.94       626

    accuracy                           0.93      1168
   macro avg       0.94      0.93      0.93      1168
weighted avg       0.93      0.93      0.93      1168



In [30]:
print(confusion_matrix(yr_test,yr_predict))

[[500  42]
 [ 34 592]]


#from above we can see model has good accuracy of 93%

# Random Forest classifier

In [31]:
from sklearn.ensemble import RandomForestClassifier


In [32]:
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [33]:

model_rf.fit(x_train,y_train)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [35]:
y_pred=model_rf.predict(x_test)

In [36]:

model_rf.score(x_test,y_test)

0.7846481876332623

In [37]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.81      0.92      0.86      1042
           1       0.63      0.40      0.49       365

    accuracy                           0.78      1407
   macro avg       0.72      0.66      0.68      1407
weighted avg       0.77      0.78      0.77      1407



In [39]:
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x,y)

In [40]:
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.2)

In [41]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [42]:
model_rf_smote.fit(xr_train1,yr_train1)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [43]:

yr_predict1 = model_rf_smote.predict(xr_test1)

In [44]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)

In [45]:

print(model_score_r1)
print(metrics.classification_report(yr_test1, yr_predict1))

0.9395744680851064
              precision    recall  f1-score   support

           0       0.95      0.91      0.93       523
           1       0.93      0.96      0.95       652

    accuracy                           0.94      1175
   macro avg       0.94      0.94      0.94      1175
weighted avg       0.94      0.94      0.94      1175



In [46]:

print(metrics.confusion_matrix(yr_test1, yr_predict1))

[[475  48]
 [ 23 629]]



With RF Classifier, also we are able to get quite good results, infact better than Decision Tree.as it has 94% accuracy

# pickel the model

In [47]:

import pickle

In [48]:

filename = 'model.sav'

In [49]:

pickle.dump(model_rf_smote, open(filename, 'wb'))

In [50]:

load_model = pickle.load(open(filename, 'rb'))

In [51]:

model_score_r1 = load_model.score(xr_test1, yr_test1)

In [52]:

model_score_r1

0.9395744680851064

Our final model i.e. RF Classifier with SMOTEENN, is now ready and dumped in model.sav, which we will use and prepare API's so that we can access our model from UI.