In [None]:
# pre-process: remove rows where is_cancel = 1 and success = 1 (bad data)


# split into 7-day vs. 30-day trials.
# split 7-day into is_cancel vs. not_cancel
# split 30-day into is_cancel vs. not_cancel
# 1. 7-day + is_cancel = predict 0
# 2. 30-day + is_cancel = predict 0
# 3. 7-day ML
# 4. 30-day ML
# 5. Combine
# 6. Get scores: Accuracy, Precision, Recall, F1.

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [2]:
# Read the combined CSV and replace every missing value with 0.0 in one
# chained expression.
combo_df = pd.read_csv('~/capstone_data/skillshare_combined.csv').fillna(0.0)

In [3]:
# pre-process: drop rows flagged with both success == 1 and
# is_cancel_during_trial == 1 (treated as bad data).
# Written in De Morgan form: a row is kept when at least one flag is not 1.
not_success = combo_df['success'] != 1
not_cancelled = combo_df['is_cancel_during_trial'] != 1
combo_df = combo_df[not_success | not_cancelled]

In [4]:
# Split by trial length.
# NOTE(review): going by the frame names, code 0 is the 30-day offer and
# code 1 the 7-day offer - confirm against the category encoding.
trial_code = combo_df['trial_length_offer_cat_codes']
month_df = combo_df.loc[trial_code == 0]
week_df = combo_df.loc[trial_code == 1]

In [5]:
# Partition the 7-day frame on the cancel flag: rows with flag 1 go to the
# rule-based branch, rows with flag 0 stay eligible for the model.
week_cancel_flag = week_df['is_cancel_during_trial']
week_df_cancel = week_df.loc[week_cancel_flag == 1]
week_df_elig = week_df.loc[week_cancel_flag == 0]

In [6]:
# Partition the 30-day frame on the cancel flag: rows with flag 1 go to the
# rule-based branch, rows with flag 0 stay eligible for the model.
month_cancel_flag = month_df['is_cancel_during_trial']
month_df_cancel = month_df.loc[month_cancel_flag == 1]
month_df_elig = month_df.loc[month_cancel_flag == 0]

In [7]:
# 1. 7 day + is_cancel = predict 0
# Rule-based branch: keep the actual label and attach a constant prediction
# of 0. assign() builds a new frame, so no column is written into a slice of
# week_df_cancel.
y_w_c = week_df_cancel[['success']].assign(predict=0)

In [8]:
# 2. 30 day + is_cancel = predict 0
# Rule-based branch: keep the actual label and attach a constant prediction
# of 0. assign() builds a new frame, so no column is written into a slice of
# month_df_cancel.
y_m_c = month_df_cancel[['success']].assign(predict=0)

In [9]:
# 3. 7 day ML
# Target and feature matrix for the 7-day model, taken from the rows that did
# not cancel during the trial.
day_cols = ['day-%d' % d for d in range(1, 32)]  # 'day-1' ... 'day-31'
feature_cols = (
    ['payment_provider_cat_codes', 'payment_ux_cat_codes',
     # The next column and is_cancel_during_trial are constant in this subset,
     # because week_df_elig was filtered on both of them.
     'trial_length_offer_cat_codes', 'sub_utm_channel_cat_codes',
     'sub_utm_source_cat_codes',
     'is_cancel_during_trial']
    + day_cols
    + ['comment_volume',
       'comment_score', 'discussion_volume', 'discussion_score',
       'follow_volume', 'projects_volume', 'projects_score', 'review_volume',
       'rating_avg', 'signup_country_name_cat_codes',
       'signup_geo_region_cat_codes', 'signup_platform_cat_codes']
)
X = week_df_elig[feature_cols]
y = week_df_elig['success']

In [10]:
# Hold out 20% of the 7-day eligible rows for evaluation. random_state pins
# the shuffle so the split, and every score derived from it, is identical on
# each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [11]:
# Fit a depth-6 decision tree on the training split and score it on the
# held-out rows. random_state fixes the estimator's internal randomness so a
# re-run on the same split produces the same tree and predictions.
dtclf = DecisionTreeClassifier(max_depth=6, random_state=42).fit(X_train, y_train)
y_pred = dtclf.predict(X_test)
cm_dt = confusion_matrix(y_test, y_pred)
# Prints the confusion matrix followed by the hold-out accuracy.
print(cm_dt, dtclf.score(X_test, y_test))

[[4581 3224]
 [2321 4115]] 0.6106312758935468


In [12]:
# Store the 7-day hold-out labels next to their predictions. This must run
# before the 30-day cells, which rebind y_test and y_pred.
y_w_e = y_test.to_frame(name='success').assign(predict=y_pred)

In [13]:
# 4. 30 day ML
# Target and feature matrix for the 30-day model, taken from the rows that did
# not cancel during the trial. Rebinds X and y used by the 7-day cells.
day_cols = ['day-%d' % d for d in range(1, 32)]  # 'day-1' ... 'day-31'
feature_cols = (
    ['payment_provider_cat_codes', 'payment_ux_cat_codes',
     # The next column and is_cancel_during_trial are constant in this subset,
     # because month_df_elig was filtered on both of them.
     'trial_length_offer_cat_codes', 'sub_utm_channel_cat_codes',
     'sub_utm_source_cat_codes',
     'is_cancel_during_trial']
    + day_cols
    + ['comment_volume',
       'comment_score', 'discussion_volume', 'discussion_score',
       'follow_volume', 'projects_volume', 'projects_score', 'review_volume',
       'rating_avg', 'signup_country_name_cat_codes',
       'signup_geo_region_cat_codes', 'signup_platform_cat_codes']
)
X = month_df_elig[feature_cols]
y = month_df_elig['success']

In [14]:
# Hold out 20% of the 30-day eligible rows, fit a depth-6 decision tree on the
# rest and score it on the hold-out. Both random_state arguments are fixed so
# the split, the fitted tree and the printed scores are identical on every
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
dtclf = DecisionTreeClassifier(max_depth=6, random_state=42).fit(X_train, y_train)
y_pred = dtclf.predict(X_test)
cm_dt = confusion_matrix(y_test, y_pred)
# Prints the confusion matrix followed by the hold-out accuracy.
print(cm_dt, dtclf.score(X_test, y_test))

[[23364  2410]
 [10287  5545]] 0.6948276690861895


In [15]:
# Store the 30-day hold-out labels next to their predictions.
y_m_e = y_test.to_frame(name='success').assign(predict=y_pred)

In [16]:
# 5. Combine
# Stack the four actual-vs-predicted frames (rule-based 7-day and 30-day
# cancels, then the 7-day and 30-day model hold-outs) in a single concat.
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same rows in
# the same order and keeps each frame's original index.
# NOTE(review): the two cancel frames contribute all of their rows, while the
# model frames contribute only the 20% hold-out. Because rows with cancel == 1
# and success == 1 were dropped earlier, the constant 0 prediction should be
# correct for every cancel row (assuming success is binary), so the combined
# accuracy is weighted towards the rule-based rows - confirm this is the
# intended evaluation set.
y_df = pd.concat([y_w_c, y_m_c, y_w_e, y_m_e])

In [17]:
# 6. Get Scores.  Accuracy, Precision, Recall, F1.


In [18]:
# Report the four headline metrics on the combined frame, one line each,
# rounded to three decimals.
actual = y_df['success']
predicted = y_df['predict']
for template, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(template % metric(actual, predicted))

Accuracy: 0.927
Precision: 0.632
Recall: 0.434
F1 Score: 0.514


In [None]:
# Display the combined frame of actual ('success') vs. predicted ('predict')
# labels as the cell's output.
y_df