In [2]:
#importing libraries
from sklearn import metrics
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN 



In [3]:
# Mounting colab with google drive to access the csv from there
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
df = pd.read_csv("/content/drive/MyDrive/telco_churn.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,0,1,0,0,1,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0
1,1,0,0,0,1,1,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0
2,2,0,1,0,1,1,0,1,0,0,1,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0
3,3,0,0,0,1,1,0,1,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0
4,4,0,1,1,0,1,0,1,0,0,1,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0


In [5]:
df = df.drop("Unnamed: 0", axis=1)

In [6]:
#Creating x and y variables
x = df.drop("Churn", axis = 1)
y = df["Churn"]

In [7]:
#Getting train and test data
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.2)

**Decision Tree Classifier**

In [8]:
model_dt = DecisionTreeClassifier(criterion='gini',random_state=100,splitter='best', max_depth=7, min_samples_split=2)

In [9]:
model_dt.fit(x_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=7, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=100, splitter='best')

In [10]:
y_pred = model_dt.predict(x_test)

In [11]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.82      0.89      0.85      1020
           1       0.63      0.50      0.56       387

    accuracy                           0.78      1407
   macro avg       0.73      0.69      0.70      1407
weighted avg       0.77      0.78      0.77      1407



Whenever we view the classification report we must pay special attention to the minority data. In this case that is churner (value = 1). We can see that for value=1 the precision and recall and other values are low. That is because the data of churner and non churner is imbalanced.

In [12]:
print(confusion_matrix(y_test, y_pred))

[[905 115]
 [194 193]]


You can see that the accuracy is decent for this model. However, its of no use because of the reason mentioned above.

To deal with this problem we would be using SMOTEENN. Over-sampling using SMOTE and cleaning using ENN.

In [13]:
sm = SMOTEENN()
x_resampled, y_resampled = sm.fit_sample(x,y)



In [15]:
#Getting resampled train and test data
xr_train,xr_test,yr_train,yr_test = train_test_split(x_resampled,y_resampled, test_size = 0.2)

In [16]:
model_dt_smote = DecisionTreeClassifier(criterion='gini',random_state=100,splitter='best', max_depth=7, min_samples_split=2)

In [17]:
model_dt_smote.fit(xr_train, yr_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=7, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=100, splitter='best')

In [20]:
yr_pred = model_dt_smote.predict(xr_test)

In [21]:
print(classification_report(yr_test, yr_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93       596
           1       0.92      0.94      0.93       604

    accuracy                           0.93      1200
   macro avg       0.93      0.93      0.93      1200
weighted avg       0.93      0.93      0.93      1200



In [22]:
print(confusion_matrix(yr_test, yr_pred))

[[546  50]
 [ 36 568]]


We can see that the precision, recall and f1-score have definetely improved. We will now try and use Random Forest Clasifier 

**Random Forest Classifier**

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
# Again by using the normal classifier we won't be get accurate results. Hence, using the SMOTEEN dataset 

In [24]:
sm1 = SMOTEENN()
x_resampled1, y_resampled1 = sm1.fit_sample(x,y)



In [25]:
#Getting resampled train and test data
xr_train1,xr_test1,yr_train1,yr_test1 = train_test_split(x_resampled1,y_resampled1, test_size = 0.2)

In [29]:
model_dt_smote1 = RandomForestClassifier(n_estimators=100,criterion='gini',random_state=100, max_depth=7, min_samples_split=2)

In [30]:
model_dt_smote1.fit(xr_train1, yr_train1)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=7, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=100,
                       verbose=0, warm_start=False)

In [31]:
yr_pred1 = model_dt_smote1.predict(xr_test1)

In [32]:
print(classification_report(yr_test1, yr_pred1, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93       630
           1       0.91      0.94      0.93       581

    accuracy                           0.93      1211
   macro avg       0.93      0.93      0.93      1211
weighted avg       0.93      0.93      0.93      1211



In [33]:
print(confusion_matrix(yr_test1, yr_pred1))

[[577  53]
 [ 34 547]]


We can see almost similiar results with Decision Tree and Random Forest. We can create many more models using different techniques and check their permormance.

We will Select Decision Tree as our model and we would save the model in a pickle fiie.

**Pickling the model**

In [34]:
import pickle

In [35]:
filename = 'model.sav'

In [36]:
pickle.dump(model_dt_smote, open(filename, 'wb'))

In [37]:
load_model = pickle.load(open(filename,'rb'))

In [38]:
load_model_score = load_model.score(xr_test,yr_test)

In [39]:
load_model_score

0.9283333333333333

Our final model i.e Decision tree with SMOTEEN is ready to be dumped in model.sav file which we will use along with API's to access this model from UI