In [61]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN


In [62]:
pip install -U imbalanced-learn




In [63]:
df=pd.read_csv("tel_churn.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56.95,1889.5,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53.85,108.15,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42.3,1840.75,0,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70.7,151.65,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


In [64]:
df=df.drop('Unnamed: 0',axis=1)

In [65]:
x=df.drop('Churn',axis=1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.50,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.30,1840.75,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70.70,151.65,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,0,1,0,1,0,1,0,...,0,0,0,1,0,1,0,0,0,0
7028,0,103.20,7362.90,1,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
7029,0,29.60,346.45,1,0,0,1,0,1,1,...,0,0,1,0,1,0,0,0,0,0
7030,1,74.40,306.60,0,1,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [66]:
y=df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

In [67]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [68]:
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)


In [69]:
model_dt.fit(x_train,y_train)

In [70]:
y_pred=model_dt.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [71]:
model_dt.score(x_test,y_test)

0.7903340440653873

In [72]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.82      0.92      0.87      1036
           1       0.65      0.44      0.52       371

    accuracy                           0.79      1407
   macro avg       0.74      0.68      0.70      1407
weighted avg       0.78      0.79      0.78      1407



As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.
Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.
Hence, moving ahead to call SMOTEENN (UpSampling + ENN)

In [73]:
from imblearn.combine import SMOTEENN

sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x, y)

In [74]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [75]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [76]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.9407725321888412
              precision    recall  f1-score   support

           0       0.94      0.93      0.93       527
           1       0.94      0.95      0.95       638

    accuracy                           0.94      1165
   macro avg       0.94      0.94      0.94      1165
weighted avg       0.94      0.94      0.94      1165



In [77]:
print(metrics.confusion_matrix(yr_test, yr_predict))

[[488  39]
 [ 30 608]]


RANDOM CLASSIFIER

In [78]:
from sklearn.ensemble import RandomForestClassifier

In [79]:
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [80]:
model_rf.fit(x_train,y_train)


In [81]:
y_pred=model_rf.predict(x_test)

In [82]:
model_rf.score(x_test,y_test)

0.7945984363894811

In [83]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.82      0.93      0.87      1036
           1       0.68      0.41      0.51       371

    accuracy                           0.79      1407
   macro avg       0.75      0.67      0.69      1407
weighted avg       0.78      0.79      0.78      1407



In [84]:
from imblearn.combine import SMOTEENN
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x,y)

In [85]:
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.2)


In [86]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)


In [87]:
model_rf_smote.fit(xr_train1,yr_train1)

In [88]:
yr_predict1 = model_rf_smote.predict(xr_test1)

In [89]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)

In [90]:
print(model_score_r1)
print(metrics.classification_report(yr_test1, yr_predict1))

0.9525021204410518
              precision    recall  f1-score   support

           0       0.97      0.92      0.94       521
           1       0.94      0.98      0.96       658

    accuracy                           0.95      1179
   macro avg       0.95      0.95      0.95      1179
weighted avg       0.95      0.95      0.95      1179



In [91]:
print(metrics.confusion_matrix(yr_test1, yr_predict1))

[[480  41]
 [ 15 643]]


With RF Classifier, also we are able to get quite good results, infact better than Decision Tree.
We can now further go ahead and create multiple classifiers to see how the model performance is, but that's not covered here, so you can do it by yourself :)

Performing PCA


In [92]:
# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(0.9)
xr_train_pca = pca.fit_transform(xr_train1)
xr_test_pca = pca.transform(xr_test1)
explained_variance = pca.explained_variance_ratio_

In [93]:
model=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [94]:
model.fit(xr_train_pca,yr_train1)


In [95]:
yr_predict_pca = model.predict(xr_test_pca)


In [96]:
model_score_r_pca = model.score(xr_test_pca, yr_test1)

In [97]:
print(model_score_r_pca)
print(metrics.classification_report(yr_test1, yr_predict_pca))

0.729431721798134
              precision    recall  f1-score   support

           0       0.72      0.64      0.68       521
           1       0.74      0.80      0.77       658

    accuracy                           0.73      1179
   macro avg       0.73      0.72      0.72      1179
weighted avg       0.73      0.73      0.73      1179



With PCA, we couldn't see any better results, hence let's finalise the model which was created by RF Classifier, and save the model so that we can use it in a later stage :)

Pickling the model

In [98]:
import pickle

In [99]:
filename = 'model.sav'


In [100]:
pickle.dump(model_rf_smote, open(filename, 'wb'))


In [101]:
load_model = pickle.load(open(filename, 'rb'))

In [102]:
model_score_r1 = load_model.score(xr_test1, yr_test1)

In [103]:
model_score_r1

0.9525021204410518