In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
telco_raw = pd.read_csv('./data/telco.csv')
telco_raw['TotalCharges'] = pd.to_numeric(telco_raw['TotalCharges'], errors = 'coerce')

# ML Strategies and Use Cases

# Preparation for Modeling

In [3]:
telco_raw.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


Majority of our columns contain strings. We will have to transform these strings so our model can use it.

In [4]:
# telco_raw.dtypes

**Step 1:** Separate the identifier (customer ID) and target variable (churn flag). 

**Step 2:** store in separate lists to use later.

**Step 3:** separate categorical column names by defining a column as categorical if it has less than 10 unique values. ----- good practice to explore data to see if there are variables with more unique values.

**Step 4:** remove target variable from the list so we dont' do transformations on it

**Step 5:** store remaining columns named into a list called numerical. Use list comprehensions 

In [5]:
custid = ['customerID']
target = ['Churn']

In [6]:
categorical = telco_raw.nunique()[telco_raw.nunique()<10].keys().tolist()
categorical.remove(target[0])
numerical = [col for col in telco_raw.columns if col not in custid+target+categorical]

One hot encoding: convert these variables into binary columns with ones and zeros.

In [7]:
#drop first so it wont be redundant
telco_raw = pd.get_dummies(data=telco_raw, columns =categorical, drop_first = True)

**Scaling Numerical Features:** 

In [8]:
#intialize instance
scaler = StandardScaler()
#fit scaler to numerical columns
scaled_numerical = scaler.fit_transform(telco_raw[numerical])
#build df
scaled_numerical = pd.DataFrame(scaled_numerical, columns = numerical)

In [9]:
#drop non sclaed numerical columns
telco_raw = telco_raw.drop(columns = numerical, axis = 1)
#merge non numerical with the scaled numerical data
telco = telco_raw.merge(right = scaled_numerical, how = 'left', left_index = True, right_index = True)
telco.dropna(inplace = True)

# ML Modeling Steps

Steps to building Supervised learning model
- **Split** data to training and testing. This is important as we want to "train" the model on one set of data, and then measure its performance on unseen values or testing dataset to make sure it works well on unseed data.
- **Intialize** the model
- **Fit** the model on the training data
- **Predict** values on the testing data
- **Measure** model performance on testing data

In [10]:
X = telco.loc[:, ~telco.columns.isin(['customerID', 'Churn', 'TotalCharges'])]
Y = telco['Churn']

In [11]:
#example 1
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size = 0.25) #defines the percentage to be reserved to testing dataset
mytree=tree.DecisionTreeClassifier()
treemodel = mytree.fit(train_X, train_Y)
pred_Y = treemodel.predict(test_X)
accuracy_score(test_Y, pred_Y)

0.7093287827076223

## Datacamp example

In [12]:
#example 2 - advance
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size = 0.25) #defines the percentage to be reserved to testing dataset
#ensure train df has 75%
print(train_X.shape[0]/X.shape[0])
print(test_X.shape[0]/ X.shape[0])

0.75
0.25


In [13]:
#intialize, fit, predict, 
mytree = tree.DecisionTreeClassifier(max_depth = 5)
treemodel = mytree.fit(train_X, train_Y)
pred_Y = treemodel.predict(test_X)
#measure model perfomrance on testing data
accuracy_score(test_Y, pred_Y)

0.7861205915813424

**Predict Churn WIth Decision Tree**

In [14]:
clf = tree.DecisionTreeClassifier(max_depth = 7, criterion = 'gini', splitter = 'best')
#fit and predict
clf = clf.fit(train_X, train_Y)
pred_Y = clf.predict(test_X)
# Print accuracy values
print("Training accuracy: ", np.round(clf.score(train_X, train_Y), 3)) 
print("Test accuracy: ", np.round(accuracy_score(test_Y, pred_Y), 3))

Training accuracy:  0.82
Test accuracy:  0.768


Unsupervised
- **Initialize** the model
- **Fit** the model
- **Assign** cluster values
- **Explore** Results

In [15]:
kmeans = KMeans(n_clusters = 3)
kmeans.fit(telco.loc[:, ~telco.columns.isin(['customerID', 'Churn', 'TotalCharges'])])
# telco.assign(Cluster = kmeans.labels_)
# telco.groupby().mean()

KMeans(n_clusters=3)

# Churn Predictions and Fundamentals

In [16]:
#print unique Churn values
set(telco['Churn'])

{'No', 'Yes'}

In [17]:
#calculate ratio size of churn groups
telco.groupby(['Churn']).size() / telco.shape[0] * 100

Churn
No     73.421502
Yes    26.578498
dtype: float64

In [18]:
train, test = train_test_split(telco, test_size = 0.25)

In [19]:
target = ['Churn']
custid = ['customerID']
cols = [col for col in telco.columns if col not in custid + target]

In [20]:
#build training and testing datasets
train_X = train[cols]
train_Y = train[target]
test_X = test[cols]
test_Y = test[target]

Predict Churn with Logistic Regression -- revisit modeling steps

In [21]:
logreg = LogisticRegression()

In [22]:
logreg.fit(train_X, train_Y)

  return f(**kwargs)


LogisticRegression()

Key Metrics
- **Accuracy:** The % of correctly predicted labels
- **Precision:** The % of total model's positive class predictions that were correctly classified
- **Recall:** The % of total positive class samples that were correctly classified

In [23]:
pred_train_Y = logreg.predict(train_X)
pred_test_Y = logreg.predict(test_X)
train_accuracy = accuracy_score(train_Y, pred_train_Y)
test_accuracy = accuracy_score(test_Y, pred_test_Y)
print('Training accuracy:', round(train_accuracy, 4))
print('Test accuracy:', round(test_accuracy, 4))

Training accuracy: 0.8104
Test accuracy: 0.8026


In [24]:
from sklearn.metrics import precision_score, recall_score
train_precision =  round(precision_score(train_Y, pred_train_Y, pos_label="Yes"), 4)
test_precision = round(precision_score(test_Y, pred_test_Y,  pos_label="Yes"), 4)
train_recall = round(recall_score(train_Y, pred_train_Y,  pos_label="Yes"), 4)
test_recall = round(recall_score(test_Y, pred_test_Y,  pos_label="Yes"), 4)
print('Training precision: {}, Training recall: {}'.format(train_precision, train_recall))
print('Testing precision: {}, Testing recall: {}'.format(test_precision, test_recall))

Training precision: 0.6739, Training recall: 0.5536
Testing precision: 0.6533, Testing recall: 0.5544


L1 (aka lasso) regularization and feature selection

In [25]:
logreg = LogisticRegression(penalty = 'l1', C = 0.1, solver = 'liblinear')
logreg.fit(train_X, train_Y)

  return f(**kwargs)


LogisticRegression(C=0.1, penalty='l1', solver='liblinear')

In [26]:
# checking which C values are best
C = [1, .5, .25, .1, .05, .025, .01, .005, .0025]
l1_metrics = np.zeros((len(C), 5))
l1_metrics[:,0] = C
for index in range(0, len(C)):
    logreg = LogisticRegression(penalty='l1', C=C[index], solver='liblinear')
    logreg.fit(train_X, train_Y)
    pred_test_Y = logreg.predict(test_X)
    l1_metrics[index,1] = np.count_nonzero(logreg.coef_)
    l1_metrics[index,2] = accuracy_score(test_Y, pred_test_Y)
    l1_metrics[index,3] = precision_score(test_Y, pred_test_Y, pos_label="Yes")
    l1_metrics[index,4] = recall_score(test_Y, pred_test_Y, pos_label="Yes")

col_names = ['C','Non-Zero Coeffs','Accuracy','Precision','Recall']
pd.DataFrame(l1_metrics, columns = col_names)

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,C,Non-Zero Coeffs,Accuracy,Precision,Recall
0,1.0,22.0,0.802048,0.650873,0.556503
1,0.5,21.0,0.799204,0.645,0.550107
2,0.25,20.0,0.80091,0.649123,0.552239
3,0.1,19.0,0.802617,0.655612,0.547974
4,0.05,14.0,0.798066,0.651596,0.522388
5,0.025,13.0,0.800341,0.673529,0.488273
6,0.01,7.0,0.800341,0.697987,0.443497
7,0.005,3.0,0.789534,0.746269,0.319829
8,0.0025,2.0,0.73322,0.0,0.0


**Predict Churn with Decision Trees**

In [27]:
from sklearn.tree import DecisionTreeClassifier
mytree = DecisionTreeClassifier()
treemodel = mytree.fit(train_X, train_Y)

In [28]:
pred_train_Y = mytree.predict(train_X)
pred_test_Y = mytree.predict(test_X)
train_accuracy = accuracy_score(train_Y, pred_train_Y)
test_accuracy = accuracy_score(test_Y, pred_test_Y)
print('Training Accuracy:', round(train_accuracy, 4))
print('Test Accurracy:', round(test_accuracy, 4))

Training Accuracy: 0.9981
Test Accurracy: 0.7162


This indicates that the tree memorized the patterns and rules for the training data almost perfectly, but failed to generalize the rules for the testing data.

In [29]:
train_precision =  round(precision_score(train_Y, pred_train_Y, pos_label="Yes"), 4)
test_precision = round(precision_score(test_Y, pred_test_Y,  pos_label="Yes"), 4)
train_recall = round(recall_score(train_Y, pred_train_Y,  pos_label="Yes"), 4)
test_recall = round(recall_score(test_Y, pred_test_Y,  pos_label="Yes"), 4)
print('Training precision: {}, Training recall: {}'.format(train_precision, train_recall))
print('Testing precision: {}, Testing recall: {}'.format(test_precision, test_recall))

Training precision: 0.9993, Training recall: 0.9936
Testing precision: 0.4696, Testing recall: 0.4947


Tree Depth Parameter Tuning

In [30]:
depth_list = list(range(2,15))
depth_tuning = np.zeros((len(depth_list), 4))
depth_tuning[:,0] = depth_list
for index in range(len(depth_list)):
    mytree = DecisionTreeClassifier(max_depth=depth_list[index])
    mytree.fit(train_X, train_Y)
    pred_test_Y = mytree.predict(test_X)
    depth_tuning[index,1] = accuracy_score(test_Y, pred_test_Y)
    depth_tuning[index,2] = precision_score(test_Y, pred_test_Y,pos_label="Yes")
    depth_tuning[index,3] = recall_score(test_Y, pred_test_Y,pos_label="Yes")

In [31]:
col_names = ['Max_Depth','Accuracy','Precision','Recall']
print(pd.DataFrame(depth_tuning, columns=col_names))

    Max_Depth  Accuracy  Precision    Recall
0         2.0  0.788965   0.664430  0.422175
1         3.0  0.788965   0.664430  0.422175
2         4.0  0.782708   0.680498  0.349680
3         5.0  0.786689   0.658784  0.415778
4         6.0  0.781570   0.622478  0.460554
5         7.0  0.788965   0.621891  0.533049
6         8.0  0.763936   0.573370  0.449893
7         9.0  0.764505   0.563805  0.518124
8        10.0  0.752560   0.540670  0.481876
9        11.0  0.759386   0.548936  0.550107
10       12.0  0.750853   0.533191  0.530917
11       13.0  0.732651   0.499022  0.543710
12       14.0  0.728669   0.491770  0.509595


**Identifying and Interpreting Churn Drivers**
- log odds is difficult to interpret 
- solution: calculate expotent of the coeffiencets.
- This gives us the change in ODDS associated with 1 unite increase in the future

In [32]:
from sklearn import tree
import graphviz

In [33]:
exported = tree.export_graphviz(decision_tree = mytree, out_file = None,
                               feature_names = cols, precision = 1,
                               class_names = ['Not churn', 'Churn'], filled= True)
graph = graphviz.Source(exported)
# display(graph)

In [34]:
coefficients = pd.concat([pd.DataFrame(train_X.columns),
                          pd.DataFrame(np.transpose(logreg.coef_))],axis = 1)
coefficients.columns = ['Feature','Coefficient']
coefficients['Exp_Coefficient'] = np.exp(coefficients['Coefficient'])
coefficients = coefficients[coefficients['Coefficient']!=0]
print(coefficients.sort_values(by=['Coefficient']))

           Feature  Coefficient  Exp_Coefficient
27          tenure    -0.403825         0.667761
28  MonthlyCharges     0.128546         1.137173


## CLV BASICS
**No Dataset Available**

- *Calculate monthly spend per customer:* -monthly_revenue = online.groupby(['customerID', 'InvoiceMonth'])['TotalSum'].sum().mean()
- *Calculate average monthly spend* : monthly_revenue = np.mean(monthly_revenue)
- lifespan_months = 36
- clv_basic = monthly_revenue * lifepsan_months

**Granular CLV Calculation - invoice level data points to calculate the granular customer lifetime value**
- *Calculate average Reveune per invoice* : - revenue_per_purchase = online.groupby(['InvoiceNo'])['TotalSum'].mean().mean()
- *Calculate average number of unique invoices per customer per month* - freq = online.groupby(['CustomerID', 'InvoiceMonth'])['InvoiceNo'].nunique().mean()
- lifespan_months = 26
- clv_grannular = revenue_per_purchase * freq * lifespan_months

**Traditional CLV Calculation -  does not require lifespan to be defined, and instead uses retention to churn rate to assess customer life expectancy.**
- *calculate monthly spend per customer:* monthly_revenue = online.groupby(['customerID', 'InvoiceMonth'])['TotalSum'].sum().mean()
- *calculate average monthly retention rate:* retention_rate = retention.iloc[:,1:].mean().mean()
- *average monthly churn rate:* churn_rate = 1 - retention_rate
- clv_traditional = monthly_revenue * (retention_rate/churn_rate)

# CUSTOMER SEGMENTATION BASICS

In [45]:
import warnings
warnings.filterwarnings('ignore')

In [46]:
wholesale.agg(['mean', 'std']).round(0)
#get the statistics
averages = wholesale.mean()
st_dev = wholesale.std()
x_names = wholesale.columns
x_ix = np.arange(wholesale.shape[1])

plt.bar(x_ix-0.2, averages, color = 'grey', label = 'Average', width = 0.4)
plt.bar(x_i+0.2, st_dv, color = 'orange', label= 'Standard Deviation', width = 0.4)
plt.xticks(x_ix, x_names, rotation = 90)
plt.legend()
plt.show()

NameError: name 'wholesale' is not defined

**Unskewing Data**

Option 1 - log transformation

In [47]:
wholesale_log = np.log(wholesale)
sns.pairplot(wholesale_log, diag_kind = 'kde')

NameError: name 'wholesale' is not defined

Option 2 - Box Cox Transformation

In [48]:
from scipy import stats
def boxcox_df(x):
    x_boxcox, _ = stats.boxcox(x)
    return x_boxcox

wholesale_boxcox = wholesale.apply(boxcox_df, axis = 0)
sns.pairplot(wholesale_boxcox, diag_kind = 'kde')

NameError: name 'wholesale' is not defined

Scale the data - expected for kmeans method.

In [49]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(wholesale_boxcox)
wholesale_scaled = scaler.transform(wholesale_boxcox)
wholesale_scaled_df = pd.DataFrame(data = wholesale_scaled, index = wholesale_boxcox.index,
                                  columns = wholesale_boxcox.columns)
wholesale_scaled_df.agg(['mean', 'std']).round()

NameError: name 'wholesale_boxcox' is not defined

**BUILD CUSTOMER PRODUCT SEGMENTATION**

Segmentation with Kmeans

In [50]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = k)
kmeans.fit(wholesale_scaled_df)
wholesale_kmeans = wholesale.assign(segment = kmeans.labels_)

NameError: name 'wholesale_scaled_df' is not defined

In [51]:
from sklearn.decomposition import NMF
nmf = NMF(k)
nmf.fit(wholesale)
compontents = pd.DataFrame(nmf.compontents_, columns = wholesale.columns)

#extracting segment assignment:
segments_weights = pd.DataFrame(nmf.transform(wholesale, columns = compontents.index))
segments_weights.index = wholesale.index
wholesale_nmf = wholesale.assign(segment = segment_weights.idxmax(axis = 1))

NameError: name 'wholesale' is not defined

**Calculate sum of squared errors and plot results**

In [52]:
sse = {}
for k in range(1, 11):
    kmeans = KMeans(n_cluslters = k, random_state = 333)
    kmeans.fit(wholesale_scaled_df)
    sse[k] = kmeans.inertia_
    
plt.title('Elbow criterion method chart')
sns.pointplot(x=list(sse.keys()), y = list(sse.values()))
plt.show()

TypeError: __init__() got an unexpected keyword argument 'n_cluslters'

NMF SEGMENTATION AVERAGES

In [53]:
#create W Matrix
w = pd.DataFrame(data = nmf.transform(wholesale), columns = compontents.index)
w.index = wholesale.index
#assign column name where the corresponding values is the largest
wholesale_nmf3 = wholesale.assign(segment = w.idxmax(axis = 1))
#calculate the average column values per each segment
nmf3_averages = wholesale_nmf3.groupby('segment').mean().round(0)
#plot the average values as a heatmap
sns.heatmap(nmf3.averages.T, cmap = 'YlGnBu')

NameError: name 'wholesale' is not defined

## visualize and interprete segmentation solutions

In [54]:
kmeans_averages = wholesale_kmeans.groupby(['segment']).mean().round(0)

sns.heatmap(kmeans_averages.T, smap = 'YlGuBu')

#plot average NMF segmentation attributes
nmf_averages = wholesale_nmf.groupby('segment').mean()
sns.heatmap(nmf_averages)

NameError: name 'wholesale_kmeans' is not defined