In [1]:
# Importing the required libraries
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
import sklearn
%matplotlib inline

In [2]:
# Importing required packages for visualization
from IPython.display import Image  
from six import StringIO  
from sklearn.tree import export_graphviz
import pydotplus, graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
churn_df = pd.read_csv('data/churn_data.csv', delimiter=',', skipinitialspace=True)

In [4]:
internet_df = pd.read_csv('data/internet_data.csv', delimiter=',', skipinitialspace=True)

In [5]:
customer_df = pd.read_csv('data/customer_data.csv', delimiter=',', skipinitialspace=True)

In [6]:
churn_df.columns


Index(['customerID', 'tenure', 'PhoneService', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [7]:
internet_df.columns

Index(['customerID', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies'],
      dtype='object')

In [8]:
df=pd.merge(churn_df, internet_df, on="customerID")

In [9]:
df.columns

Index(['customerID', 'tenure', 'PhoneService', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn',
       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies'],
      dtype='object')

In [10]:
df=pd.merge(df, customer_df, on="customerID")

In [11]:
df.columns

Index(['customerID', 'tenure', 'PhoneService', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn',
       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'gender', 'SeniorCitizen', 'Partner', 'Dependents'],
      dtype='object')

In [12]:
df=df.drop(['Partner','Dependents','PaperlessBilling','PaymentMethod','TotalCharges','customerID'], axis=1)

In [13]:
df.columns

Index(['tenure', 'PhoneService', 'Contract', 'MonthlyCharges', 'Churn',
       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'gender', 'SeniorCitizen'],
      dtype='object')

In [14]:
df.shape

(7043, 15)

In [15]:
df['PhoneService'] = df['PhoneService'].map({'Yes': 1, 'No': 0})
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
df['MultipleLines'] = df['MultipleLines'].map({'Yes': 1, 'No': 0,'No phone service':0})
df['OnlineSecurity'] = df['OnlineSecurity'].map({'Yes': 1, 'No': 0, 'No internet service': 0})
df['OnlineBackup'] = df['OnlineBackup'].map({'Yes': 1, 'No': 0,'No internet service': 0})
df['DeviceProtection'] = df['DeviceProtection'].map({'Yes': 1, 'No': 0, 'No internet service': 0})
df['TechSupport'] = df['TechSupport'].map({'Yes': 1, 'No': 0, 'No internet service': 0})
df['StreamingTV'] = df['StreamingTV'].map({'Yes': 1, 'No': 0, 'No internet service': 0})
df['StreamingMovies'] = df['StreamingMovies'].map({'Yes': 1, 'No': 0, 'No internet service': 0})
df['gender'] = df['gender'].map({'Female': 1, 'Male': 0})

In [16]:
df.head()


Unnamed: 0,tenure,PhoneService,Contract,MonthlyCharges,Churn,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,gender,SeniorCitizen
0,1,0,Month-to-month,29.85,0,0,DSL,0,1,0,0,0,0,1,0
1,34,1,One year,56.95,0,0,DSL,1,0,1,0,0,0,0,0
2,2,1,Month-to-month,53.85,1,0,DSL,1,1,0,0,0,0,0,0
3,45,0,One year,42.3,0,0,DSL,1,0,1,1,0,0,0,0
4,2,1,Month-to-month,70.7,1,0,Fiber optic,0,0,0,0,0,0,1,0


In [17]:
nan_values = df.isna()
nan_columns = nan_values.any()

columns_with_nan = df.columns[nan_columns].tolist()
print(columns_with_nan)

[]


In [18]:
df['InternetService'].unique()

array(['DSL', 'Fiber optic', 'No'], dtype=object)

In [19]:
df['Contract'].unique()

array(['Month-to-month', 'One year', 'Two year'], dtype=object)

In [20]:
def evaluate_model(dt_classifier):
    print("Train Accuracy :", accuracy_score(y_train, dt_classifier.predict(X_train)))
    print("Train Confusion Matrix:")
    print(confusion_matrix(y_train, dt_classifier.predict(X_train)))
    print("-"*50)
    print("Test Accuracy :", accuracy_score(y_test, dt_classifier.predict(X_test)))
    print("Test Confusion Matrix:")
    print(confusion_matrix(y_test, dt_classifier.predict(X_test)))

In [21]:
status = pd.get_dummies(df['InternetService'],drop_first=True)
df = pd.concat([df,status],axis=1)
df.drop(['InternetService'],axis=1,inplace=True)

In [22]:
status = pd.get_dummies(df['Contract'],drop_first=True)
df = pd.concat([df,status],axis=1)
df.drop(['Contract'],axis=1,inplace=True)

In [23]:
# Putting feature variable to X
X = df.drop('Churn',axis=1)

# Putting response variable to y
y = df['Churn']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)
X_train.shape, X_test.shape

((4930, 16), (2113, 16))

In [25]:
dt_default = DecisionTreeClassifier(random_state=42)
dt_default.fit(X_train, y_train)

DecisionTreeClassifier(random_state=42)

In [26]:
evaluate_model(dt_default)

Train Accuracy : 0.9935091277890467
Train Confusion Matrix:
[[3630    5]
 [  27 1268]]
--------------------------------------------------
Test Accuracy : 0.7411263606247042
Test Confusion Matrix:
[[1281  258]
 [ 289  285]]


In [27]:
dt = DecisionTreeClassifier(random_state=42)

In [28]:
from sklearn.model_selection import GridSearchCV

In [29]:
# Create the parameter grid based on the results of random search 
params = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100]
}

In [30]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=dt, 
                           param_grid=params, 
                           cv=4, n_jobs=-1, verbose=1, scoring = "accuracy")

In [31]:
%%time
grid_search.fit(X_train, y_train)

Fitting 4 folds for each of 25 candidates, totalling 100 fits
CPU times: user 283 ms, sys: 128 ms, total: 411 ms
Wall time: 2.68 s


GridSearchCV(cv=4, estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [2, 3, 5, 10, 20],
                         'min_samples_leaf': [5, 10, 20, 50, 100]},
             scoring='accuracy', verbose=1)

In [32]:
score_df = pd.DataFrame(grid_search.cv_results_)
score_df.nlargest(3,"mean_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
14,0.013334,0.002156,0.00315,0.000194,5,100,"{'max_depth': 5, 'min_samples_leaf': 100}",0.803731,0.780211,0.783279,0.783279,0.787625,0.009383,1
18,0.017371,0.003141,0.003495,0.000287,10,50,"{'max_depth': 10, 'min_samples_leaf': 50}",0.794809,0.776156,0.788961,0.783279,0.785801,0.006902,2
23,0.016903,0.002473,0.003551,0.000192,20,50,"{'max_depth': 20, 'min_samples_leaf': 50}",0.794809,0.776156,0.788961,0.783279,0.785801,0.006902,2


In [33]:
dt_depth = DecisionTreeClassifier(max_depth=5, min_samples_leaf=100)
dt_depth.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=5, min_samples_leaf=100)

In [34]:
evaluate_model(dt_depth)

Train Accuracy : 0.7973630831643002
Train Confusion Matrix:
[[3319  316]
 [ 683  612]]
--------------------------------------------------
Test Accuracy : 0.7936583057264552
Test Confusion Matrix:
[[1397  142]
 [ 294  280]]
