In [1]:
# pre-process: 
# remove is_cancel = 1 and success = 1 (bad data)
# One Hot Encode categorical columns
# Add One Hot Buckets for 1 Day


# Build Pipeline
# Split into 7-day vs. 30-day.
# Split 7-day into is_cancel vs. not_cancel.
# Split 30-day into is_cancel vs. not_cancel.
# 1. 7-day + is_cancel = predict 0
# 2. 30-day + is_cancel = predict 0
# 3. 7-day ML
# 4. 30-day ML
# 5. Combine
# 6. Get scores: Accuracy, Precision, Recall, F1.

In [2]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import OneHotEncoder


In [3]:
# Read the combined CSV and replace every missing value with 0.0 in one step.
combo_df = pd.read_csv('../../data/skillshare_combined.csv').fillna(0.0)

In [4]:
# PREPROCESSING
# Rows flagged both success == 1 and is_cancel_during_trial == 1 are treated
# as bad data and dropped; every other row is kept.
bad_rows = (combo_df['success'] == 1) & (combo_df['is_cancel_during_trial'] == 1)
combo_df = combo_df.loc[~bad_rows]

In [5]:
# One Hot Encode categorical columns
# A single encoder instance is re-fit on each categorical column below.
# drop='first' omits the first category of each feature; dense output is
# requested so the result can be wrapped directly in a DataFrame.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new spelling first and fall back for older releases.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)

In [6]:
# One-hot encode payment_provider_cat_codes and attach the indicator columns.
# combo_df has already been row-filtered, so its index has gaps. The encoded
# frame must reuse that index; with a default 0..n-1 index, join() would pair
# encoded rows with the wrong records and leave NaNs on the rest.
encoder_df = pd.DataFrame(
    encoder.fit_transform(combo_df[['payment_provider_cat_codes']]),
    index=combo_df.index,
)
cols_names = ['pay_provider'+str(x) for x in list(encoder_df.columns)]
encoder_df.columns = cols_names
combo_df = combo_df.join(encoder_df)
# The raw code column is replaced by its indicator columns.
del combo_df['payment_provider_cat_codes']

In [7]:
# One-hot encode payment_ux_cat_codes and attach the indicator columns.
# The encoded frame is built on combo_df's own index so that join() lines up
# row-for-row even though combo_df's index is no longer 0..n-1 after filtering.
encoder_df = pd.DataFrame(
    encoder.fit_transform(combo_df[['payment_ux_cat_codes']]),
    index=combo_df.index,
)
cols_names = ['pay_ux'+str(x) for x in list(encoder_df.columns)]
encoder_df.columns = cols_names
combo_df = combo_df.join(encoder_df)
# The raw code column is replaced by its indicator columns.
del combo_df['payment_ux_cat_codes']

In [8]:
# Channel indicators.
# A full one-hot encoding of sub_utm_channel_cat_codes produced too many
# columns, so only the top 5 channels get their own 0/1 flag and everything
# else is folded into Other_Channel.
#
# Each flag is built directly from a boolean comparison rather than with
# chained assignment (combo_df[col][mask] = 1), which pandas does not guarantee
# writes back to the frame and which is a no-op under copy-on-write.
channel_codes = combo_df['sub_utm_channel_cat_codes']

top_channels = {
    'Is_YT': 17,      # 1. YTInfluencer
    'Is_PSb': 10,     # 2. Paid Search (Brand)
    'Is_PSnb': 11,    # 3. Paid Search (Non-Brand)
    'Is_OS': 4,       # 4. Organic Search
    'Is_Direct': 1,   # 5. Direct
}
for flag_name, channel_code in top_channels.items():
    combo_df[flag_name] = (channel_codes == channel_code).astype('int64')

# 6. Other: any channel code not listed above.
above = list(top_channels.values())
combo_df['Other_Channel'] = (~channel_codes.isin(above)).astype('int64')

# The raw code column is no longer needed once the flags exist.
del combo_df['sub_utm_channel_cat_codes']

In [9]:
# Geography indicators.
# Full one-hot encodings of signup_country_name_cat_codes and
# signup_geo_region_cat_codes produced too many columns, so the country code
# is collapsed into a handful of 0/1 flags and the region code is dropped.
#
# NOTE(review): the saved output of this cell is
# KeyError: 'signup_country_name_cat_codes' — confirm that column exists in
# the loaded CSV before relying on these features.
#
# Flags are built from boolean comparisons instead of chained assignment
# (combo_df[col][mask] = 1), which pandas does not guarantee writes back to the
# frame and which is a no-op under copy-on-write.
country_codes = combo_df['signup_country_name_cat_codes']

# 1. Is_USA
combo_df['Is_USA'] = (country_codes == 214).astype('int64')

# 2. Is_UK
combo_df['Is_UK'] = (country_codes == 213).astype('int64')

# 3. Is_WestEuro
weuro = [185, 69, 75, 99, 161, 55, 142]
combo_df['Is_WestEuro'] = country_codes.isin(weuro).astype('int64')

# 4. Is_CA
combo_df['Is_CA'] = (country_codes == 35).astype('int64')

# 5. Is_IN
combo_df['Is_IN'] = (country_codes == 92).astype('int64')

# 6. Other: any country code not covered by the flags above.
above = [214, 213] + weuro + [35, 92]
combo_df['Other_Geo'] = (~country_codes.isin(above)).astype('int64')

# The raw code columns are no longer needed once the flags exist.
del combo_df['signup_country_name_cat_codes']
del combo_df['signup_geo_region_cat_codes']

KeyError: 'signup_country_name_cat_codes'

In [None]:
# One-hot encode signup_platform_cat_codes and attach the indicator columns.
# The encoded frame is built on combo_df's own index so that join() lines up
# row-for-row even though combo_df's index is no longer 0..n-1 after filtering.
encoder_df = pd.DataFrame(
    encoder.fit_transform(combo_df[['signup_platform_cat_codes']]),
    index=combo_df.index,
)
cols_names = ['platform'+str(x) for x in list(encoder_df.columns)]
encoder_df.columns = cols_names
combo_df = combo_df.join(encoder_df)
# The raw code column is replaced by its indicator columns.
del combo_df['signup_platform_cat_codes']

In [None]:
# Replace any missing values left in the frame with 0.0.
combo_df = combo_df.fillna(value=0.0)

In [None]:
# Drop a column that is not used as a feature.
combo_df = combo_df.drop(columns=['sub_utm_source_cat_codes'])

In [None]:
# Cumulative minutes-watched columns.
# A single running total is accumulated day by day (day-1, then + day-2, ...)
# and written out after day 3, day 7 and day 30, so the additions happen in
# the same left-to-right order for each column.
snapshots = {3: 'd3_cumm', 7: 'd7_cumm', 30: 'd30_cumm'}
running_total = combo_df['day-1']
for day in range(2, 31):
    running_total = running_total + combo_df['day-' + str(day)]
    if day in snapshots:
        combo_df[snapshots[day]] = running_total


In [None]:
# bucket the minutes watched based on the correlation visual.
# Builds mutually exclusive 0/1 indicator columns for the 'day-1' value.
# Fixes over the previous version:
#   * 'd1_60' was assigned twice, so the 1800-3600 bucket was overwritten by
#     the ">= 3600" bucket; the open-ended top bucket is now 'd1_60plus'.
#   * a value exactly on a boundary (60, 300, ...) used to be flagged in two
#     buckets; intervals are now lower-inclusive / upper-exclusive.
#   * flags come from boolean masks instead of chained assignment, which is
#     not guaranteed to write back to the frame.
# NOTE(review): the suffixes look like minutes while the thresholds look like
# seconds (60 -> _1, 300 -> _5, ...) — confirm the units of the source column.
col = 'day-1'
watched = combo_df[col]

# Exactly zero gets its own flag.
combo_df['d1_0'] = (watched == 0.0).astype('int64')

# First non-zero bucket excludes 0 itself, which is covered by d1_0.
combo_df['d1_1'] = ((watched > 0.0) & (watched < 60.0)).astype('int64')

# Remaining bounded buckets: lower <= value < upper.
for bucket_name, lower, upper in [
    ('d1_5', 60.0, 300.0),
    ('d1_15', 300.0, 900.0),
    ('d1_30', 900.0, 1800.0),
    ('d1_60', 1800.0, 3600.0),
]:
    combo_df[bucket_name] = ((watched >= lower) & (watched < upper)).astype('int64')

# Open-ended top bucket.
combo_df['d1_60plus'] = (watched >= 3600.0).astype('int64')

In [None]:
# Builds mutually exclusive 0/1 indicator columns for the 'd3_cumm' value.
# Fixes over the previous version:
#   * 'd3_60' was assigned twice, so the 1800-3600 bucket was overwritten by
#     the ">= 3600" bucket; the open-ended top bucket is now 'd3_60plus'.
#   * a value exactly on a boundary (60, 300, ...) used to be flagged in two
#     buckets; intervals are now lower-inclusive / upper-exclusive.
#   * flags come from boolean masks instead of chained assignment, which is
#     not guaranteed to write back to the frame.
col = 'd3_cumm'
watched = combo_df[col]

# Exactly zero gets its own flag.
combo_df['d3_0'] = (watched == 0.0).astype('int64')

# First non-zero bucket excludes 0 itself, which is covered by d3_0.
combo_df['d3_1'] = ((watched > 0.0) & (watched < 60.0)).astype('int64')

# Remaining bounded buckets: lower <= value < upper.
for bucket_name, lower, upper in [
    ('d3_5', 60.0, 300.0),
    ('d3_15', 300.0, 900.0),
    ('d3_30', 900.0, 1800.0),
    ('d3_60', 1800.0, 3600.0),
]:
    combo_df[bucket_name] = ((watched >= lower) & (watched < upper)).astype('int64')

# Open-ended top bucket.
combo_df['d3_60plus'] = (watched >= 3600.0).astype('int64')

In [None]:
# Builds mutually exclusive 0/1 indicator columns for the 'd7_cumm' value.
# Fixes over the previous version:
#   * 'd7_60' was assigned twice, so the 1800-3600 bucket was overwritten by
#     the ">= 3600" bucket; the open-ended top bucket is now 'd7_60plus'.
#   * a value exactly on a boundary (60, 300, ...) used to be flagged in two
#     buckets; intervals are now lower-inclusive / upper-exclusive.
#   * flags come from boolean masks instead of chained assignment, which is
#     not guaranteed to write back to the frame.
col = 'd7_cumm'
watched = combo_df[col]

# Exactly zero gets its own flag.
combo_df['d7_0'] = (watched == 0.0).astype('int64')

# First non-zero bucket excludes 0 itself, which is covered by d7_0.
combo_df['d7_1'] = ((watched > 0.0) & (watched < 60.0)).astype('int64')

# Remaining bounded buckets: lower <= value < upper.
for bucket_name, lower, upper in [
    ('d7_5', 60.0, 300.0),
    ('d7_15', 300.0, 900.0),
    ('d7_30', 900.0, 1800.0),
    ('d7_60', 1800.0, 3600.0),
]:
    combo_df[bucket_name] = ((watched >= lower) & (watched < upper)).astype('int64')

# Open-ended top bucket.
combo_df['d7_60plus'] = (watched >= 3600.0).astype('int64')

In [None]:
# Builds mutually exclusive 0/1 indicator columns for the 'd30_cumm' value.
# Fixes over the previous version:
#   * 'd30_60' was assigned twice, so the 1800-3600 bucket was overwritten by
#     the ">= 3600" bucket; the open-ended top bucket is now 'd30_60plus'.
#   * a value exactly on a boundary (60, 300, ...) used to be flagged in two
#     buckets; intervals are now lower-inclusive / upper-exclusive.
#   * flags come from boolean masks instead of chained assignment, which is
#     not guaranteed to write back to the frame.
col = 'd30_cumm'
watched = combo_df[col]

# Exactly zero gets its own flag.
combo_df['d30_0'] = (watched == 0.0).astype('int64')

# First non-zero bucket excludes 0 itself, which is covered by d30_0.
combo_df['d30_1'] = ((watched > 0.0) & (watched < 60.0)).astype('int64')

# Remaining bounded buckets: lower <= value < upper.
for bucket_name, lower, upper in [
    ('d30_5', 60.0, 300.0),
    ('d30_15', 300.0, 900.0),
    ('d30_30', 900.0, 1800.0),
    ('d30_60', 1800.0, 3600.0),
]:
    combo_df[bucket_name] = ((watched >= lower) & (watched < upper)).astype('int64')

# Open-ended top bucket.
combo_df['d30_60plus'] = (watched >= 3600.0).astype('int64')

In [None]:
# Remove the raw per-day columns 'day-1' through 'day-31' in a single drop.
daily_columns = ['day-' + str(day) for day in range(1, 32)]
combo_df = combo_df.drop(columns=daily_columns)

In [None]:
# Display the current column labels of combo_df.
combo_df.columns

In [None]:
# Partition by trial length: code 0 goes to the 30-day frame and code 1 to
# the 7-day frame.
trial_code = combo_df['trial_length_offer_cat_codes']
month_df = combo_df.loc[trial_code == 0]
week_df = combo_df.loc[trial_code == 1]

In [None]:
# 7-day rows: separate those that cancelled during the trial (flag == 1)
# from those that did not (flag == 0).
week_cancel_flag = week_df['is_cancel_during_trial']
week_df_cancel = week_df.loc[week_cancel_flag == 1]
week_df_elig = week_df.loc[week_cancel_flag == 0]

In [None]:
# 30-day rows: separate those that cancelled during the trial (flag == 1)
# from those that did not (flag == 0).
month_cancel_flag = month_df['is_cancel_during_trial']
month_df_cancel = month_df.loc[month_cancel_flag == 1]
month_df_elig = month_df.loc[month_cancel_flag == 0]

In [None]:
# 1. 7-day cancellations: no model is used, the prediction is always 0.
# Keeps the true 'success' label alongside the constant 'predict' column.
y_w_c = week_df_cancel[['success']].assign(predict=0)

In [None]:
# 2. 30-day cancellations: no model is used, the prediction is always 0.
# Keeps the true 'success' label alongside the constant 'predict' column.
y_m_c = month_df_cancel[['success']].assign(predict=0)

In [None]:
# Display the column labels from position 6 onward — the same slice the
# 7-day and 30-day models take as their feature matrix.
# NOTE(review): which six leading columns are skipped depends on the CSV's
# column order, which is not visible here — confirm that 'success' and
# 'is_cancel_during_trial' are among them so they do not leak into X.
week_df_elig.columns[6:]

In [None]:
# 3. 7 day ML
# Target is the 'success' column; features are every column from position 6
# onward of the non-cancelled 7-day frame.
y = week_df_elig['success']
X = week_df_elig.iloc[:, 6:]

In [None]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and every score derived from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# run a Decision Tree Classifier
# Depth is capped at 6; random_state is fixed so the fitted tree (and the
# printed scores) are reproducible across runs.
dtclf = DecisionTreeClassifier(max_depth=6, random_state=42).fit(X_train, y_train)
y_pred = dtclf.predict(X_test)
cm_dt = confusion_matrix(y_test, y_pred)
# Prints the confusion matrix followed by accuracy on the held-out rows.
print(cm_dt, dtclf.score(X_test, y_test))

In [None]:
# Pair the held-out 7-day labels with the model's predictions, keeping the
# index of y_test.
y_w_e = y_test.to_frame(name='success').assign(predict=y_pred)

In [None]:
# 4. 30 day ML
# Features are every column from position 6 onward of the non-cancelled
# 30-day frame; target is the 'success' column.
X = month_df_elig.iloc[:, 6:]
y = month_df_elig['success']

In [None]:
# run a decision tree classifier
# Hold out 20% of the 30-day rows, fit a depth-6 tree on the rest, and report
# the confusion matrix and accuracy on the held-out rows. random_state is
# fixed on both the split and the tree so the results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
dtclf = DecisionTreeClassifier(max_depth=6, random_state=42).fit(X_train, y_train)
y_pred = dtclf.predict(X_test)
cm_dt = confusion_matrix(y_test, y_pred)
print(cm_dt, dtclf.score(X_test, y_test))

In [None]:
# Pair the held-out 30-day labels with the model's predictions, keeping the
# index of y_test.
y_m_e = y_test.to_frame(name='success').assign(predict=y_pred)

In [None]:
# 5. Combine
# Stack the four (success, predict) frames: rule-based 7-day and 30-day
# cancellations, then the held-out 7-day and 30-day model predictions.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; row
# order and index labels are the same as with the chained appends.
# NOTE(review): the cancellation frames contain every cancelled row while the
# model frames contain only the 20% held-out rows, so the combined metrics
# weight the rule-based predictions more heavily than their share of the
# population — confirm this is intended.
y_df = pd.concat([y_w_c, y_m_c, y_w_e, y_m_e])

In [None]:
# Report the headline metrics for the combined predictions, one per line,
# each formatted to three decimals.
y_true = y_df['success']
y_hat = y_df['predict']
for template, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(template % metric(y_true, y_hat))

In [None]:
# Confusion matrix over the combined frame (true 'success' vs. 'predict');
# the bare name on the last line displays it as the cell output.
cm_overall = confusion_matrix(y_true=y_df['success'], y_pred=y_df['predict'])
cm_overall

In [None]:
# Size of the current held-out set, the sum of its 'success' labels, and one
# minus the mean label (the share of rows whose label is 0 when labels are 0/1).
n_test = len(y_test)
n_success = y_test.sum()
print(n_test, n_success, 1 - n_success / n_test)

In [None]:
# try KNeighbors?
# Fits an 8-neighbour classifier on whatever train/test split is currently in
# scope (in top-to-bottom order, the 30-day split) and prints the confusion
# matrix followed by held-out accuracy.
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=8).fit(X_train, y_train)
y_pred = neigh.predict(X_test)
knn_accuracy = neigh.score(X_test, y_test)
cm_dt = confusion_matrix(y_test, y_pred)
print(cm_dt, knn_accuracy)

In [None]:
# Try Random Forest?
# Fits a depth-6 random forest on whatever train/test split is currently in
# scope (in top-to-bottom order, the 30-day split) and prints the confusion
# matrix followed by held-out accuracy.
from sklearn.ensemble import RandomForestClassifier
# random_state is fixed so the forest and its score are reproducible.
rf = RandomForestClassifier(max_depth=6, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
cm_dt = confusion_matrix(y_test, y_pred)
# Score the forest itself; the previous version printed the KNN model's
# accuracy (neigh.score) next to the forest's confusion matrix.
print(cm_dt, rf.score(X_test, y_test))
