In [9]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

In [10]:
# Load the raw 2022 subscription-starts export into starts_df.
# NOTE(review): the saved output of this cell is a truncated warning; if it is a
# pandas DtypeWarning (mixed-type columns), consider passing explicit dtype= here.
starts_df = pd.read_csv(
    filepath_or_buffer='~/capstone_data/skillshare_2022_starts.csv',
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [11]:
# Filter starts_df down to the parameters needed for this analysis.
# All conditions are evaluated against the same (unfiltered) frame and AND-ed
# together, which keeps exactly the rows that survive applying them one by one.
# Rows where a flag is missing (NaN) compare unequal to both True and False,
# so they are dropped by the corresponding condition.
keep_rows = (
    (starts_df['plan_length'] == 12)              # only annual
    & (starts_df['is_team'] == False)             # not B2B
    & (starts_df['is_scholarship'] == False)      # no scholarships
    & (starts_df['is_direct_to_paid'] == False)   # has a free trial ...
    & (starts_df['had_trial'] == True)            # ... and actually had one
    # no special trial lengths
    & starts_df['trial_length_offer'].isin(['One Week', 'One Month'])
)
starts_df = starts_df[keep_rows]

In [12]:
# These are the columns that have some potential value for prediction.
prediction_cols = ['user_uid', 'create_time', 'first_payment_time', 'last_payment_attempt',
                   'last_failed_payment_attempt', 'user_cancellation_time', 'cancellation_time', 
                   'refund_time', 'coupon_id', 'coupon_trial_length', 'payment_provider', 'payment_ux', 
                   'is_refunded', 'is_cancelled', 'has_paid', 'trial_end', 'first_payment_currency_code', 'original_trial_end', 
                   'extended_trial_end', 'was_trial_extended', 'is_trial_extension', 'is_split_trial', 
                   'trial_length_days', 'trial_length_offer', 'sub_utm_source', 'sub_utm_campaign', 
                   'sub_utm_medium', 'sub_utm_term', 'sub_utm_channel', 'referral_source', 'eligible_trial_number']

# Take an explicit copy: clean_df gets new columns and in-place edits in the
# following cells, and it must be an independent frame rather than a selection
# derived from starts_df (which is what pandas' SettingWithCopy machinery tracks).
clean_df = starts_df[prediction_cols].copy()

In [18]:
# Add a successful-conversion column: 1 only for users who paid and were not refunded.
# Assignments go through .loc[mask, column] instead of chained indexing
# (clean_df[column][mask] = value): chained assignment may write to a temporary
# object, and under pandas Copy-on-Write it never updates clean_df at all.
clean_df['success'] = 0

# set to 1 if they paid
clean_df.loc[clean_df['first_payment_time'].notnull(), 'success'] = 1

# return to 0 if they got a refund.
clean_df.loc[clean_df['is_refunded'] == 1, 'success'] = 0

# Parse the cancellation, trial-end and trial-start columns into datetimes.
# NOTE(review): despite the *_day names, pd.to_datetime keeps any time-of-day
# component; nothing here floors the values to a calendar day -- confirm intended.
clean_df['cancellation_day'] = pd.to_datetime(clean_df['cancellation_time'])
clean_df['trial_end_day'] = pd.to_datetime(clean_df['original_trial_end'])
clean_df['trial_start_day'] = pd.to_datetime(clean_df['create_time'])

# Need to break down whether the cancellation came before or after the first payment.
# Rows with no cancellation time (NaT) compare False and therefore stay 0.
clean_df['is_cancel_during_trial'] = 0
clean_df.loc[
    clean_df['cancellation_day'] <= clean_df['trial_end_day'],
    'is_cancel_during_trial',
] = 1

In [19]:
# user_uid is loaded as a float on this dataframe; convert it to an integer column.
# astype(int) raises if any user_uid is missing or non-finite.
clean_df = clean_df.astype({'user_uid': int})

In [20]:
# For each of the most relevant predictive columns, add two derived columns:
#   <name>_cat        -- the column converted to pandas 'category' dtype
#   <name>_cat_codes  -- the integer code of each category (-1 for missing values)
for source_col in ('payment_provider', 'payment_ux', 'trial_length_offer',
                   'sub_utm_channel', 'sub_utm_source'):
    as_category = clean_df[source_col].astype('category')
    clean_df[f'{source_col}_cat'] = as_category
    clean_df[f'{source_col}_cat_codes'] = as_category.cat.codes

# Same integer cast of user_uid that the previous cell performs; repeated here
# so this cell leaves user_uid as an int regardless.
clean_df['user_uid'] = clean_df['user_uid'].astype(int)

In [21]:
# Export a lookup table (label -> category code, with row volume) for each
# encoded column, for use in EDA.
def _lookup_with_volume(label_col, code_col):
    """Count clean_df rows for every (label, category code) pair.

    Returns a frame with the columns ``label_col``, ``code_col`` and ``volume``,
    where ``volume`` is the count of non-null ``user_uid`` values in the group.
    Rows whose label is missing (category code -1) are not included, because
    groupby drops NaN keys by default.
    """
    grouped = clean_df.groupby(by=[label_col, code_col])
    counts = grouped.agg(volume=pd.NamedAgg(column='user_uid', aggfunc='count'))
    return counts.reset_index()


payment_provider_lookup_df = _lookup_with_volume('payment_provider', 'payment_provider_cat_codes')
payment_provider_lookup_df.to_csv('~/capstone_data/lookup_payment_providers.csv', index=False)

payment_ux_df = _lookup_with_volume('payment_ux', 'payment_ux_cat_codes')
payment_ux_df.to_csv('~/capstone_data/lookup_payment_ux.csv', index=False)

trial_length_df = _lookup_with_volume('trial_length_offer', 'trial_length_offer_cat_codes')
trial_length_df.to_csv('~/capstone_data/lookup_trial_length_offer.csv', index=False)

sub_utm_channel_df = _lookup_with_volume('sub_utm_channel', 'sub_utm_channel_cat_codes')
sub_utm_channel_df.to_csv('~/capstone_data/lookup_sub_utm_channel.csv', index=False)

sub_utm_source_df = _lookup_with_volume('sub_utm_source', 'sub_utm_source_cat_codes')
sub_utm_source_df.to_csv('~/capstone_data/lookup_sub_utm_source.csv', index=False)

In [22]:
# Write the cleaned, feature-augmented starts table to disk (index not written).
clean_df.to_csv(
    path_or_buf='~/capstone_data/skillshare_2022_starts_clean.csv',
    index=False,
)