In [60]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans

In [25]:
# Load the raw Telco customer data.
# The CSV carries a saved row index as an unnamed first column (pandas shows it
# as "Unnamed: 0"). It is dropped here so that it is not later classified as a
# numerical feature, scaled, and fed to the models. errors='ignore' keeps the
# cell working if the file is re-exported without that column.
telco_raw = pd.read_csv('./data/telco.csv').drop(columns=['Unnamed: 0'], errors='ignore')
# Convert TotalCharges to a numeric dtype; errors='coerce' turns any entry that
# cannot be parsed as a number into NaN instead of raising.
telco_raw['TotalCharges'] = pd.to_numeric(telco_raw['TotalCharges'], errors='coerce')

# ML Strategies and Use Cases

# Preparation for Modeling

In [26]:
# Preview the first five rows of the raw data.
telco_raw.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


The majority of our columns contain strings. We will have to transform these strings so that our model can use them.

In [27]:
# telco_raw.dtypes

**Step 1:** Separate the identifier (customer ID) and target variable (churn flag). 

**Step 2:** Store them in separate lists to use later.

**Step 3:** Separate the categorical column names by defining a column as categorical if it has fewer than 10 unique values. It is good practice to explore the data to check whether any categorical variables have more unique values than that.

**Step 4:** Remove the target variable from the categorical list so we don't apply transformations to it.

**Step 5:** Store the remaining column names in a list called `numerical`, using a list comprehension.

In [28]:
# Column-name lists for the customer identifier and the prediction target;
# kept as lists so they can be concatenated with other column lists later.
custid, target = ['customerID'], ['Churn']

In [29]:
# Count distinct values per column once and reuse the result.
unique_counts = telco_raw.nunique()
# A column is treated as categorical when it has fewer than 10 distinct values.
categorical = unique_counts[unique_counts < 10].index.tolist()
# The target column must not be transformed, so take it out of the list.
categorical.remove(target[0])
# Whatever is neither the identifier, the target, nor categorical is numerical.
non_numerical = custid + target + categorical
numerical = [col for col in telco_raw.columns if col not in non_numerical]

One-hot encoding: convert these categorical variables into binary columns of ones and zeros.

In [30]:
# One-hot encode every categorical column. drop_first=True omits the first
# level of each column, since it is implied by the remaining indicator columns
# and would otherwise be redundant.
telco_raw = pd.get_dummies(
    telco_raw,
    columns=categorical,
    drop_first=True,
)

**Scaling Numerical Features:** 

In [32]:
# Standardise the numerical columns with scikit-learn's StandardScaler.
scaler = StandardScaler()
# Fit the scaler on the numerical columns and transform them in one step.
# The result is wrapped back into a DataFrame below, so it is kept under its
# own name rather than reusing `scaled_numerical` for two different objects.
scaled_values = scaler.fit_transform(telco_raw[numerical])
# Rebuild a DataFrame that carries telco_raw's own index. The scaled columns are
# later joined back onto telco_raw by index, so both frames must share the same
# row labels; a fresh default index would only line up by coincidence.
scaled_numerical = pd.DataFrame(scaled_values, columns=numerical, index=telco_raw.index)

In [33]:
# Remove the unscaled numerical columns from the encoded frame.
telco_raw = telco_raw.drop(columns=numerical)
# Attach the scaled numerical columns with an index-aligned left join, keeping
# every row of telco_raw.
telco = telco_raw.join(scaled_numerical, how='left')

# ML Modeling Steps

Steps to building a supervised learning model:
- **Split** data into training and testing sets. This is important because we want to "train" the model on one set of data and then measure its performance on unseen values (the testing dataset) to make sure it works well on unseen data.
- **Initialize** the model
- **Fit** the model on the training data
- **Predict** values on the testing data
- **Measure** model performance on testing data

In [48]:
# Feature matrix: every column except the identifier, the target and TotalCharges.
# NOTE(review): TotalCharges is presumably left out because coercing it to
# numeric can introduce NaN values - confirm.
non_features = ['customerID', 'Churn', 'TotalCharges']
X = telco[[col for col in telco.columns if col not in non_features]]
# Target vector: the churn flag.
Y = telco.loc[:, 'Churn']

In [49]:
# Example 1: decision tree with default hyper-parameters.
# Reserve 25% of the rows for testing. A fixed random_state makes the split,
# and therefore the accuracy reported below, repeatable across runs.
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.25, random_state=42)
# The tree is seeded as well so that fitting is deterministic.
mytree = tree.DecisionTreeClassifier(random_state=42)
treemodel = mytree.fit(train_X, train_Y)
pred_Y = treemodel.predict(test_X)
# Share of test rows whose churn label was predicted correctly.
accuracy_score(test_Y, pred_Y)

0.7365133446905168

## Datacamp example

In [57]:
# Example 2 (advanced): re-split the data, reserving 25% of the rows for testing.
# random_state pins the shuffle so the same rows land in each set on every run.
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.25, random_state=42)
# Sanity check: the training set should hold about 75% of the rows and the
# testing set about 25%.
print(train_X.shape[0] / X.shape[0])
print(test_X.shape[0] / X.shape[0])

0.7499645037626012
0.25003549623739885


In [58]:
# Initialize, fit and predict with a tree limited to 5 levels.
# random_state seeds the tree so repeated fits give the same model.
mytree = tree.DecisionTreeClassifier(max_depth=5, random_state=42)
treemodel = mytree.fit(train_X, train_Y)
pred_Y = treemodel.predict(test_X)
# Measure model performance on the testing data.
accuracy_score(test_Y, pred_Y)

0.7898921067575241

**Predict Churn With Decision Tree**

In [61]:
# Decision tree limited to 7 levels, using Gini impurity and best-split search.
# random_state seeds the tree so the accuracies printed below are repeatable.
clf = tree.DecisionTreeClassifier(
    max_depth=7,
    criterion='gini',
    splitter='best',
    random_state=42,
)
# Fit on the training data and predict on the testing data.
clf.fit(train_X, train_Y)
pred_Y = clf.predict(test_X)
# Print accuracy values; comparing the two shows how far training accuracy
# runs ahead of accuracy on unseen rows.
print("Training accuracy: ", np.round(clf.score(train_X, train_Y), 3))
print("Test accuracy: ", np.round(accuracy_score(test_Y, pred_Y), 3))

Training accuracy:  0.823
Test accuracy:  0.785


Unsupervised
- **Initialize** the model
- **Fit** the model
- **Assign** cluster values
- **Explore** Results

In [56]:
# Initialize a 3-cluster k-means model. random_state pins the centroid
# initialisation so cluster assignments are repeatable across runs.
kmeans = KMeans(n_clusters=3, random_state=42)
# Cluster on the same columns used as features for the supervised models:
# everything except the identifier, the target and TotalCharges.
cluster_features = telco.loc[:, ~telco.columns.isin(['customerID', 'Churn', 'TotalCharges'])]
kmeans.fit(cluster_features)
# TODO(review): the "Assign" and "Explore" steps are not implemented yet, e.g.
#   clustered = telco.assign(Cluster=kmeans.labels_)
#   clustered.groupby('Cluster').mean(numeric_only=True)

KMeans(n_clusters=3)

# Churn Predictions and Fundamentals