In [80]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [58]:
telco_raw = pd.read_csv('./data/telco.csv')
telco_raw['TotalCharges'] = pd.to_numeric(telco_raw['TotalCharges'], errors = 'coerce')

# ML Strategies and Use Cases

# Preparation for Modeling

In [59]:
telco_raw.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


Majority of our columns contain strings. We will have to transform these strings so our model can use it.

In [60]:
# telco_raw.dtypes

**Step 1:** Separate the identifier (customer ID) and target variable (churn flag). 

**Step 2:** store in separate lists to use later.

**Step 3:** separate categorical column names by defining a column as categorical if it has less than 10 unique values. ----- good practice to explore data to see if there are variables with more unique values.

**Step 4:** remove target variable from the list so we dont' do transformations on it

**Step 5:** store remaining columns named into a list called numerical. Use list comprehensions 

In [61]:
custid = ['customerID']
target = ['Churn']

In [62]:
categorical = telco_raw.nunique()[telco_raw.nunique()<10].keys().tolist()
categorical.remove(target[0])
numerical = [col for col in telco_raw.columns if col not in custid+target+categorical]

One hot encoding: convert these variables into binary columns with ones and zeros.

In [63]:
#drop first so it wont be redundant
telco_raw = pd.get_dummies(data=telco_raw, columns =categorical, drop_first = True)

**Scaling Numerical Features:** 

In [64]:
#intialize instance
scaler = StandardScaler()
#fit scaler to numerical columns
scaled_numerical = scaler.fit_transform(telco_raw[numerical])
#build df
scaled_numerical = pd.DataFrame(scaled_numerical, columns = numerical)

In [65]:
#drop non sclaed numerical columns
telco_raw = telco_raw.drop(columns = numerical, axis = 1)
#merge non numerical with the scaled numerical data
telco = telco_raw.merge(right = scaled_numerical, how = 'left', left_index = True, right_index = True)
telco.dropna(inplace = True)

# ML Modeling Steps

Steps to building Supervised learning model
- **Split** data to training and testing. This is important as we want to "train" the model on one set of data, and then measure its performance on unseen values or testing dataset to make sure it works well on unseed data.
- **Intialize** the model
- **Fit** the model on the training data
- **Predict** values on the testing data
- **Measure** model performance on testing data

In [67]:
X = telco.loc[:, ~telco.columns.isin(['customerID', 'Churn', 'TotalCharges'])]
Y = telco['Churn']

In [68]:
#example 1
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size = 0.25) #defines the percentage to be reserved to testing dataset
mytree=tree.DecisionTreeClassifier()
treemodel = mytree.fit(train_X, train_Y)
pred_Y = treemodel.predict(test_X)
accuracy_score(test_Y, pred_Y)

0.7155858930602957

## Datacamp example

In [69]:
#example 2 - advance
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size = 0.25) #defines the percentage to be reserved to testing dataset
#ensure train df has 75%
print(train_X.shape[0]/X.shape[0])
print(test_X.shape[0]/ X.shape[0])

0.75
0.25


In [70]:
#intialize, fit, predict, 
mytree = tree.DecisionTreeClassifier(max_depth = 5)
treemodel = mytree.fit(train_X, train_Y)
pred_Y = treemodel.predict(test_X)
#measure model perfomrance on testing data
accuracy_score(test_Y, pred_Y)

0.7764505119453925

**Predict Churn WIth Decision Tree**

In [71]:
clf = tree.DecisionTreeClassifier(max_depth = 7, criterion = 'gini', splitter = 'best')
#fit and predict
clf = clf.fit(train_X, train_Y)
pred_Y = clf.predict(test_X)
# Print accuracy values
print("Training accuracy: ", np.round(clf.score(train_X, train_Y), 3)) 
print("Test accuracy: ", np.round(accuracy_score(test_Y, pred_Y), 3))

Training accuracy:  0.825
Test accuracy:  0.779


Unsupervised
- **Initialize** the model
- **Fit** the model
- **Assign** cluster values
- **Explore** Results

In [72]:
kmeans = KMeans(n_clusters = 3)
kmeans.fit(telco.loc[:, ~telco.columns.isin(['customerID', 'Churn', 'TotalCharges'])])
# telco.assign(Cluster = kmeans.labels_)
# telco.groupby().mean()

KMeans(n_clusters=3)

# Churn Predictions and Fundamentals

In [73]:
#print unique Churn values
set(telco['Churn'])

{'No', 'Yes'}

In [74]:
#calculate ratio size of churn groups
telco.groupby(['Churn']).size() / telco.shape[0] * 100

Churn
No     73.421502
Yes    26.578498
dtype: float64

In [75]:
train, test = train_test_split(telco, test_size = 0.25)

In [76]:
target = ['Churn']
custid = ['customerID']
cols = [col for col in telco.columns if col not in custid + target]

In [77]:
#build training and testing datasets
train_X = train[cols]
train_Y = train[target]
test_X = test[cols]
test_Y = test[target]

Predict Churn with Logistic Regression -- revisit modeling steps

In [78]:
logreg = LogisticRegression()

In [79]:
logreg.fit(train_X, train_Y)

  return f(**kwargs)


LogisticRegression()

Key Metrics
- **Accuracy:** The % of correctly predicted labels
- **Precision:** The % of total model's positive class predictions that were correctly classified
- **Recall:** The % of total positive class samples that were correctly classified

In [81]:
pred_train_Y = logreg.predict(train_X)
pred_test_Y = logreg.predict(test_X)
train_accuracy = accuracy_score(train_Y, pred_train_Y)
test_accuracy = accuracy_score(test_Y, pred_test_Y)
print('Training accuracy:', round(train_accuracy, 4))
print('Test accuracy:', round(test_accuracy, 4))

Training accuracy: 0.8075
Test accuracy: 0.8003


In [98]:
from sklearn.metrics import precision_score, recall_score
train_precision =  round(precision_score(train_Y, pred_train_Y, pos_label="Yes"), 4)
test_precision = round(precision_score(test_Y, pred_test_Y,  pos_label="Yes"), 4)
train_recall = round(recall_score(train_Y, pred_train_Y,  pos_label="Yes"), 4)
test_recall = round(recall_score(test_Y, pred_test_Y,  pos_label="Yes"), 4)
print('Training precision: {}, Training recall: {}'.format(train_precision, train_recall))
print('Testing precision: {}, Testing recall: {}'.format(test_precision, test_recall))

Training precision: 0.6641, Training recall: 0.5554
Testing precision: 0.6715, Testing recall: 0.4957


L1 (aka lasso) regularization and feature selection

In [99]:
logreg = LogisticRegression(penalty = 'l1', C = 0.1, solver = 'liblinear')
logreg.fit(train_X, train_Y)

  return f(**kwargs)


LogisticRegression(C=0.1, penalty='l1', solver='liblinear')

In [107]:
# checking which C values are best
C = [1, .5, .25, .1, .05, .025, .01, .005, .0025]
l1_metrics = np.zeros((len(C), 5))
l1_metrics[:,0] = C
for index in range(0, len(C)):
    logreg = LogisticRegression(penalty='l1', C=C[index], solver='liblinear')
    logreg.fit(train_X, train_Y)
    pred_test_Y = logreg.predict(test_X)
    l1_metrics[index,1] = np.count_nonzero(logreg.coef_)
    l1_metrics[index,2] = accuracy_score(test_Y, pred_test_Y)
    l1_metrics[index,3] = precision_score(test_Y, pred_test_Y, pos_label="Yes")
    l1_metrics[index,4] = recall_score(test_Y, pred_test_Y, pos_label="Yes")

col_names = ['C','Non-Zero Coeffs','Accuracy','Precision','Recall']
pd.DataFrame(l1_metrics, columns = col_names)

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


Unnamed: 0,C,Non-Zero Coeffs,Accuracy,Precision,Recall
0,1.0,28.0,0.801479,0.674352,0.497872
1,0.5,21.0,0.80091,0.674419,0.493617
2,0.25,20.0,0.80091,0.674419,0.493617
3,0.1,18.0,0.800341,0.675516,0.487234
4,0.05,15.0,0.796928,0.668657,0.476596
5,0.025,14.0,0.793515,0.670927,0.446809
6,0.01,8.0,0.793515,0.698885,0.4
7,0.005,3.0,0.782139,0.712195,0.310638
8,0.0025,2.0,0.731513,0.0,0.0


**Predict Churn with Decision Trees**

In [109]:
from sklearn.tree import DecisionTreeClassifier
mytree = DecisionTreeClassifier()
treemodel = mytree.fit(train_X, train_Y)

In [111]:
pred_train_Y = mytree.predict(train_X)
pred_test_Y = mytree.predict(test_X)
train_accuracy = accuracy_score(train_Y, pred_train_Y)
test_accuracy = accuracy_score(test_Y, pred_test_Y)
print('Training Accuracy:', round(train_accuracy, 4))
print('Test Accurracy:', round(test_accuracy, 4))

Training Accuracy: 0.9985
Test Accurracy: 0.723


This indicates that the tree memorized the patterns and rules for the training data almost perfectly, but failed to generalize the rules for the testing data.

In [112]:
train_precision =  round(precision_score(train_Y, pred_train_Y, pos_label="Yes"), 4)
test_precision = round(precision_score(test_Y, pred_test_Y,  pos_label="Yes"), 4)
train_recall = round(recall_score(train_Y, pred_train_Y,  pos_label="Yes"), 4)
test_recall = round(recall_score(test_Y, pred_test_Y,  pos_label="Yes"), 4)
print('Training precision: {}, Training recall: {}'.format(train_precision, train_recall))
print('Testing precision: {}, Testing recall: {}'.format(test_precision, test_recall))

Training precision: 1.0, Training recall: 0.9943
Testing precision: 0.4815, Testing recall: 0.4702


Tree Depth Parameter Tuning

In [115]:
depth_list = list(range(2,15))
depth_tuning = np.zeros((len(depth_list), 4))
depth_tuning[:,0] = depth_list
for index in range(len(depth_list)):
    mytree = DecisionTreeClassifier(max_depth=depth_list[index])
    mytree.fit(train_X, train_Y)
    pred_test_Y = mytree.predict(test_X)
    depth_tuning[index,1] = accuracy_score(test_Y, pred_test_Y)
    depth_tuning[index,2] = precision_score(test_Y, pred_test_Y,pos_label="Yes")
    depth_tuning[index,3] = recall_score(test_Y, pred_test_Y,pos_label="Yes")

In [116]:
col_names = ['Max_Depth','Accuracy','Precision','Recall']
print(pd.DataFrame(depth_tuning, columns=col_names))

    Max_Depth  Accuracy  Precision    Recall
0         2.0  0.779295   0.713542  0.291489
1         3.0  0.779295   0.713542  0.291489
2         4.0  0.778726   0.661355  0.353191
3         5.0  0.790102   0.630491  0.519149
4         6.0  0.792947   0.659639  0.465957
5         7.0  0.792378   0.654867  0.472340
6         8.0  0.775313   0.608696  0.446809
7         9.0  0.767349   0.583106  0.455319
8        10.0  0.762230   0.564356  0.485106
9        11.0  0.753697   0.545012  0.476596
10       12.0  0.744596   0.527273  0.431915
11       13.0  0.742321   0.519630  0.478723
12       14.0  0.741183   0.517241  0.478723


**Identifying and Interpreting Churn Drivers**

In [122]:
from sklearn import tree
import graphviz

In [124]:
exported = tree.export_graphviz(decision_tree = mytree, out_file = None,
                               feature_names = cols, precision = 1,
                               class_names = ['Not churn', 'Churn'], filled= True)
graph = graphviz.Source(exported)
# display(graph)

In [127]:
coefficients = pd.concat([pd.DataFrame(train_X.columns),
                          pd.DataFrame(np.transpose(logreg.coef_))],axis = 1)
coefficients.columns = ['Feature','Coefficient']
coefficients['Exp_Coefficient'] = np.exp(coefficients['Coefficient'])
coefficients = coefficients[coefficients['Coefficient']!=0]
print(coefficients.sort_values(by=['Coefficient']))

           Feature  Coefficient  Exp_Coefficient
27          tenure    -0.403667         0.667866
28  MonthlyCharges     0.156126         1.168973
