# Churn Prediction Model


In [3]:
#Importing libraries
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

In [5]:
# Read the dataset from df_tel.csv, then preview its first rows
df = pd.read_csv("df_tel.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,Churn,total_charges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1-12,tenure_group_13-24,tenure_group_25-36,tenure_group_37-48,tenure_group_49-60,tenure_group_61-72
0,0,0,29.85,0,29.85,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56.95,0,1889.5,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53.85,1,108.15,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42.3,0,1840.75,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70.7,1,151.65,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


In [9]:
# Remove every leftover index column rather than a single hard-coded one.
# The preview of the CSV shows two of them ('Unnamed: 0.1' and 'Unnamed: 0');
# dropping only 'Unnamed: 0' would leave a row counter in the feature matrix.
# Selecting by prefix also makes this cell safe to re-run: once the columns
# are gone the list is empty and drop() is a no-op instead of a KeyError.
unnamed_cols = [col for col in df.columns if str(col).startswith('Unnamed')]
df = df.drop(columns=unnamed_cols)

In [36]:
# Count missing values per column (isna is the same check as isnull)
df.isna().sum()

SeniorCitizen                              0
MonthlyCharges                             0
Churn                                      0
total_charges                              0
gender_Female                              0
gender_Male                                0
Partner_No                                 0
Partner_Yes                                0
Dependents_No                              0
Dependents_Yes                             0
PhoneService_No                            0
PhoneService_Yes                           0
MultipleLines_No                           0
MultipleLines_No phone service             0
MultipleLines_Yes                          0
InternetService_DSL                        0
InternetService_Fiber optic                0
InternetService_No                         0
OnlineSecurity_No                          0
OnlineSecurity_No internet service         0
OnlineSecurity_Yes                         0
OnlineBackup_No                            0
OnlineBack

In [35]:
# Discard rows containing any missing value.
# Rebinding the name instead of using inplace=True keeps the step explicit
# and avoids mutating a frame that earlier cells have already displayed.
df = df.dropna()

In [37]:
# Separate the label column (target vector) from the predictors (feature matrix)
target = 'Churn'
y = df[target]
X = df.drop(columns=[target])

In [38]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [39]:
# Baseline accuracy: the share of the most frequent class in the training labels,
# i.e. the score obtained by always predicting that class
class_shares = y_train.value_counts(normalize=True)
y_base = class_shares.max()
print(f"baseline accuracy is {y_base}")

baseline accuracy is 0.7342222222222222


### Modelling

#### Decision tree for classification

In [55]:
# Decision tree capped at depth 5, seeded for repeatable fits
model = DecisionTreeClassifier(max_depth=5, random_state=42)

In [56]:
# Train the tree on the training split
model.fit(X_train, y_train)

In [57]:
# Predicted labels for the test split
y_pred = model.predict(X_test)

In [58]:
# Compare the majority-class baseline with the tree's accuracy on the test split
test_accuracy = model.score(X_test, y_test)
print(f"Baseline accuracy is {y_base}")
print(f"Accuracy on Test is {test_accuracy}")

Baseline accuracy is 0.7342222222222222
Accuracy on Test is 0.7690120824449183


In [59]:
# Per-class precision, recall and F1 for the tree on the test split
tree_report = classification_report(y_test, y_pred)
print(tree_report)

              precision    recall  f1-score   support

           0       0.86      0.82      0.84      1033
           1       0.56      0.63      0.59       374

    accuracy                           0.77      1407
   macro avg       0.71      0.72      0.72      1407
weighted avg       0.78      0.77      0.77      1407



In [60]:
# Confusion matrix: rows are true labels, columns are predicted labels
tree_confusion = confusion_matrix(y_test, y_pred)
print(tree_confusion)


[[847 186]
 [139 235]]


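The model produces a large number of false positives (186) and false negatives (139), which is expected because the classes are imbalanced (roughly 73% of customers did not churn). We will resample the data with SMOTEENN (SMOTE over-sampling combined with Edited Nearest Neighbours cleaning) to deal with it.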

#### SMOTEENN Analysis (Decision Tree)

In [61]:
# Rebalance the classes with SMOTEENN. The resampler is stochastic, so the
# seed is pinned; without it every run yields different resampled data and
# therefore different scores.
# NOTE(review): the full X, y is resampled here and only split into train/test
# afterwards, so the test rows are themselves resampled data. Scores measured
# on that split are likely optimistic compared with untouched data - consider
# fitting the resampler on the training split only.
sm = SMOTEENN(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

In [62]:
# 80/20 split of the resampled data, seeded for repeatability
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    X_res,
    y_res,
    test_size=0.2,
    random_state=42,
)

In [63]:
# Same tree configuration as before, trained on the resampled training split
model_r = DecisionTreeClassifier(max_depth=5, random_state=42)
model_r.fit(Xr_train, yr_train)

In [67]:
# Predicted labels for the resampled test split
yr_pred = model_r.predict(Xr_test)

In [65]:
# Accuracy of the resampled-data tree on the resampled test split
resampled_accuracy = model_r.score(Xr_test, yr_test)
print(f"Accuracy on Test is {resampled_accuracy}")

Accuracy on Test is 0.9356775300171527


In [68]:
# Per-class precision, recall and F1 on the resampled test split
resampled_report = classification_report(yr_test, yr_pred)
print(resampled_report)

              precision    recall  f1-score   support

           0       0.93      0.93      0.93       538
           1       0.94      0.94      0.94       628

    accuracy                           0.94      1166
   macro avg       0.94      0.94      0.94      1166
weighted avg       0.94      0.94      0.94      1166



In [69]:
# Confusion matrix on the resampled test split (rows: true, columns: predicted)
resampled_confusion = confusion_matrix(yr_test, yr_pred)
print(resampled_confusion)

[[502  36]
 [ 39 589]]


#### Random Forest Classifier


In [70]:
from sklearn.ensemble import RandomForestClassifier

In [78]:
# Random forest of 150 trees, each limited to depth 6, fitted on the original
# (non-resampled) training split
model_rf = RandomForestClassifier(
    n_estimators=150,
    max_depth=6,
    random_state=42,
)
model_rf.fit(X_train, y_train)

In [79]:
# Forest predictions for the original test split
y_pred_rf = model_rf.predict(X_test)

In [83]:
# Accuracy of the forest on the original test split
rf_accuracy = model_rf.score(X_test, y_test)
print(f"Accuracy on Test is {rf_accuracy}")

Accuracy on Test is 0.7853589196872779


In [84]:
# Confusion matrix for the forest (rows: true labels, columns: predictions)
rf_confusion = confusion_matrix(y_test, y_pred_rf)
print(rf_confusion)

[[942  91]
 [211 163]]


In [85]:
# Per-class precision, recall and F1 for the forest on the original test split
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)

              precision    recall  f1-score   support

           0       0.82      0.91      0.86      1033
           1       0.64      0.44      0.52       374

    accuracy                           0.79      1407
   macro avg       0.73      0.67      0.69      1407
weighted avg       0.77      0.79      0.77      1407



#### SMOTEENN Analysis (Random Forest)

In [99]:
# Rebalance the classes with SMOTEENN for the random-forest experiment.
# The resampler is stochastic; pinning the seed makes the resampled data, and
# the scores computed from it, repeatable across runs.
# NOTE(review): the full X, y is resampled before the train/test split that
# follows, so scores on that split are likely optimistic - consider
# resampling the training split only.
sm = SMOTEENN(random_state=42)
X_resampled1, y_resampled1 = sm.fit_resample(X, y)

In [100]:
# 80/20 split of the resampled data. A fixed random_state is supplied (as in
# the other splits in this notebook) so the partition, and every metric
# reported on it, is the same on each run.
xr_train1, xr_test1, yr_train1, yr_test1 = train_test_split(
    X_resampled1,
    y_resampled1,
    test_size=0.2,
    random_state=42,
)

In [101]:
# Forest for the resampled data: 100 gini trees, depth capped at 6, and at
# least 8 samples required in every leaf
model_rf_smote = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=6,
    min_samples_leaf=8,
    random_state=100,
)

In [102]:
# Train the forest on the resampled training split
model_rf_smote.fit(xr_train1, yr_train1)

In [103]:
# Forest predictions for the resampled test split
yr_predict1 = model_rf_smote.predict(xr_test1)

In [104]:
# Mean accuracy of the forest on the resampled test split
model_score_r1 = model_rf_smote.score(X=xr_test1, y=yr_test1)

In [105]:
# Show the accuracy followed by the per-class precision/recall/F1 table
report_r1 = metrics.classification_report(yr_test1, yr_predict1)
print(model_score_r1)
print(report_r1)

0.9269949066213922
              precision    recall  f1-score   support

           0       0.97      0.87      0.92       551
           1       0.90      0.98      0.93       627

    accuracy                           0.93      1178
   macro avg       0.93      0.92      0.93      1178
weighted avg       0.93      0.93      0.93      1178



In [106]:
# Confusion matrix on the resampled test split (rows: true, columns: predicted)
confusion_r1 = metrics.confusion_matrix(yr_test1, yr_predict1)
print(confusion_r1)

[[480  71]
 [ 15 612]]


#### Saving Model

In [108]:
import pickle

In [110]:
# Serialise the decision tree trained on resampled data to disk
with open("model_telco-tc.pkl", 'wb') as model_file:
    pickle.dump(model_r, model_file)

In [111]:
# Serialise the random forest trained on resampled data to disk
with open("model_telco-rf.pkl", 'wb') as model_file:
    pickle.dump(model_rf_smote, model_file)

In [112]:
# Reload the saved forest to confirm the pickle round-trips.
# A context manager is used so the file handle is closed deterministically
# instead of being left open until garbage collection.
# NOTE: pickle.load executes code embedded in the file - only load pickles
# from a trusted source, such as the file written by this notebook.
with open("model_telco-rf.pkl", 'rb') as f:
    load_model = pickle.load(f)

In [114]:
# The reloaded model should reproduce the accuracy measured before saving
load_model.score(X=xr_test1, y=yr_test1)

0.9269949066213922