In [1]:
import pandas as pd
# Switch off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
# NOTE(review): this only hides the warning; it does not make chained assignments safe.
pd.options.mode.chained_assignment = None
import numpy as np

In [2]:
# Load the raw starts data from the capstone data folder.
starts_df = pd.read_csv(filepath_or_buffer='../capstone_data/skillshare_2022_starts.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
# Filter starts_df down to the parameters we need, using one combined row mask.
keep_mask = (
    starts_df['plan_length'].eq(12)              # only annual
    & starts_df['is_team'].eq(False)             # not B2B
    & starts_df['is_scholarship'].eq(False)      # no scholarships
    & starts_df['is_direct_to_paid'].eq(False)   # has a free trial ...
    & starts_df['had_trial'].eq(True)            # ... and actually had one
    # no special trial lengths
    & starts_df['trial_length_offer'].isin(['One Week', 'One Month'])
)
starts_df = starts_df[keep_mask]

In [4]:
# These are the columns that have some potential value for prediction.
prediction_cols = [
    'user_uid', 'create_time', 'first_payment_time', 'last_payment_attempt',
    'last_failed_payment_attempt', 'user_cancellation_time', 'cancellation_time',
    'refund_time', 'coupon_id', 'coupon_trial_length', 'payment_provider', 'payment_ux',
    'is_refunded', 'is_cancelled', 'has_paid', 'trial_end', 'first_payment_currency_code',
    'original_trial_end', 'extended_trial_end', 'was_trial_extended', 'is_trial_extension',
    'is_split_trial', 'trial_length_days', 'trial_length_offer', 'sub_utm_source',
    'sub_utm_campaign', 'sub_utm_medium', 'sub_utm_term', 'sub_utm_channel',
    'referral_source', 'eligible_trial_number',
]

# Take an explicit copy: later cells add and overwrite columns on clean_df, and those
# writes should land on an independent frame rather than on a selection of starts_df.
clean_df = starts_df[prediction_cols].copy()

In [5]:
# Add a successful-conversion column, defaulting to 0.
clean_df['success'] = 0

# The masks are applied with .loc rather than chained indexing
# (clean_df['success'][mask] = ...). Chained assignment writes through an intermediate
# object and is not guaranteed to update clean_df; under pandas Copy-on-Write it never does.

# Set to 1 if they paid (a first payment time is present).
clean_df.loc[clean_df['first_payment_time'].notnull(), 'success'] = 1

# Return to 0 if they got a refund.
clean_df.loc[clean_df['is_refunded'] == 1, 'success'] = 0

In [6]:
# For each of the most relevant predictive columns, add a categorical version
# ('<column>_cat') and its integer category codes ('<column>_cat_codes').
for source_col in ('payment_provider', 'payment_ux', 'trial_length_offer',
                   'sub_utm_channel', 'sub_utm_source'):
    as_category = clean_df[source_col].astype('category')
    clean_df[source_col + '_cat'] = as_category
    clean_df[source_col + '_cat_codes'] = as_category.cat.codes

# Store user_uid as an integer.
clean_df['user_uid'] = clean_df['user_uid'].astype(int)

In [7]:
# Export all of the lookup tables for EDA analysis: one row per (label, category code)
# pair with the number of user_uid values counted in it.
def _build_lookup(label_col):
    """Return the (label, code, volume) lookup frame for `label_col` of clean_df."""
    grouped = clean_df.groupby(by=[label_col, label_col + '_cat_codes'])
    return grouped.agg(volume=('user_uid', 'count')).reset_index()


payment_provider_lookup_df = _build_lookup('payment_provider')
payment_provider_lookup_df.to_csv('lookup_payment_providers.csv')

payment_ux_df = _build_lookup('payment_ux')
payment_ux_df.to_csv('lookup_payment_ux.csv')

trial_length_df = _build_lookup('trial_length_offer')
trial_length_df.to_csv('lookup_trial_length_offer.csv')

sub_utm_channel_df = _build_lookup('sub_utm_channel')
sub_utm_channel_df.to_csv('lookup_sub_utm_channel.csv')

sub_utm_source_df = _build_lookup('sub_utm_source')
sub_utm_source_df.to_csv('lookup_sub_utm_source.csv')

In [12]:
# Make a df of only the columns we want for prediction.
p_cols = ['user_uid', 'create_time', 'success', 'is_cancelled', 'payment_provider_cat_codes',
          'payment_ux_cat_codes', 'trial_length_offer_cat_codes', 'sub_utm_channel_cat_codes',
          'sub_utm_source_cat_codes']

# Remove duplicates by keeping the most recent subscription for any user_uid.
# - The calls are chained and non-inplace, so the selection taken from clean_df is never
#   sorted in place.
# - mergesort is a stable sort: rows with an identical create_time keep their existing
#   relative order, so the row picked by keep='last' is the same on every run.
# NOTE(review): create_time is sorted as loaded; "most recent" assumes that ordering is
# chronological (e.g. ISO-8601 strings) - confirm.
mlready_df = (
    clean_df[p_cols]
    .sort_values(by='create_time', kind='mergesort')
    .drop_duplicates(subset=['user_uid'], keep='last')
)

In [14]:
# Video views data, to be appended onto the starts.
# The file itself is created in data_combine_v1.py.
vviews_df = (
    pd.read_csv('../capstone_data/skillshare_2022_all_views.csv')
    .rename(columns={'uid': 'user_uid'})   # rename uid to the shared key name
    .drop(columns=['Unnamed: 0'])          # remove the unneeded column
)

In [63]:
# Left-join the video views onto the starts data, then replace every missing value with 0.
combo_df = pd.merge(mlready_df, vviews_df, how='left', on='user_uid').fillna(0.0)

In [52]:
#### Begin translating non-video activity so it can be merged.

# Non-video activity must fall inside the user's trial, so we need a dataframe holding
# the trial date range for each user.

# Make day-level versions of the trial start and end dates.
starts_df['trial_end_day'] = pd.to_datetime(starts_df['original_trial_end']).dt.date
starts_df['trial_start_day'] = pd.to_datetime(starts_df['create_time']).dt.date

# user_uid is a float on this dataframe; change it to an int.
starts_df['user_uid'] = starts_df['user_uid'].astype(int)

# Make a df for merging the trial start and end, with exactly one trial window per user.
# starts_df can hold several subscriptions for the same user_uid. Without deduplicating,
# every activity row merged onto trial_ends is repeated once per subscription and can be
# counted more than once, or against a different trial than the one used for prediction.
# Keep the most recent subscription, the same rule used when building mlready_df.
# (mergesort is stable, so ties on create_time resolve the same way on every run.)
trial_ends = (
    starts_df
    .sort_values(by='create_time', kind='mergesort')
    .drop_duplicates(subset=['user_uid'], keep='last')
    .loc[:, ['user_uid', 'trial_start_day', 'trial_end_day']]
)

In [65]:
### Merge on Comments Data.
# Pull the comments data and rename user_id so it shares the merge key.
comments_df = pd.read_csv('../capstone_data/skillshare_2022_comments.csv')
comments_df = comments_df.rename(columns={'user_id': 'user_uid'})

# Merge on the trial start and end.
comments_df = comments_df.merge(trial_ends, how='left', on='user_uid')

# Round create_time to create_day.
comments_df['create_day'] = pd.to_datetime(comments_df['create_time']).dt.date

# Filter to comments that happened during the user's trial.
# NOTE(review): both bounds are strict, so comments made on the trial's first or last
# day are excluded - confirm this is intended.
comment_in_trial = (
    (comments_df['create_day'] > comments_df['trial_start_day'])
    & (comments_df['create_day'] < comments_df['trial_end_day'])
)
comments_df = comments_df[comment_in_trial]

# Group by user_uid: number of comments and total comment score.
comment_gb_df = (
    comments_df
    .groupby(by=['user_uid'])
    .agg(comment_volume=('id', 'count'), comment_score=('score', 'sum'))
    .reset_index()
)

# Merge onto the main dataframe.
# - Columns left by an earlier run of this cell are dropped first, so re-running the cell
#   does not create comment_volume_x / comment_volume_y duplicates.
# - Users with no comments during their trial get 0 instead of NaN, matching how missing
#   video-view data is filled.
comment_cols = ['comment_volume', 'comment_score']
combo_df = combo_df.drop(columns=comment_cols, errors='ignore').merge(
    comment_gb_df, how='left', on='user_uid')
combo_df[comment_cols] = combo_df[comment_cols].fillna(0.0)

In [66]:
### Merge on Discussions Data.
# Same process as for comments: load, attach the trial window, keep in-trial rows,
# aggregate per user, merge onto the main dataframe.
discussions_df = pd.read_csv('../capstone_data/skillshare_2022_discussions.csv')
discussions_df = discussions_df.rename(columns={'user_id': 'user_uid'})
discussions_df = discussions_df.merge(trial_ends, how='left', on='user_uid')
discussions_df['create_day'] = pd.to_datetime(discussions_df['create_time']).dt.date

# NOTE(review): both bounds are strict, so discussions created on the trial's first or
# last day are excluded - confirm this is intended.
discussion_in_trial = (
    (discussions_df['create_day'] > discussions_df['trial_start_day'])
    & (discussions_df['create_day'] < discussions_df['trial_end_day'])
)
discussions_df = discussions_df[discussion_in_trial]

# Per user: number of discussions and total discussion score.
discussions_gb_df = (
    discussions_df
    .groupby(by=['user_uid'])
    .agg(discussion_volume=('id', 'count'), discussion_score=('score', 'sum'))
    .reset_index()
)

# Drop columns left by an earlier run of this cell so a re-run does not create _x/_y
# duplicates, then fill users with no in-trial discussions with 0 instead of NaN
# (matching how missing video-view data is filled).
discussion_cols = ['discussion_volume', 'discussion_score']
combo_df = combo_df.drop(columns=discussion_cols, errors='ignore').merge(
    discussions_gb_df, how='left', on='user_uid')
combo_df[discussion_cols] = combo_df[discussion_cols].fillna(0.0)

In [70]:
### Merge on Follows Data.
# Same process as for comments, keyed on the follower's uid and the follow time.
follows_df = pd.read_csv('../capstone_data/skillshare_2022_follows.csv')
follows_df = follows_df.rename(columns={'follower_uid': 'user_uid'})
follows_df = follows_df.merge(trial_ends, how='left', on='user_uid')
follows_df['follow_day'] = pd.to_datetime(follows_df['follow_time']).dt.date

# NOTE(review): both bounds are strict, so follows made on the trial's first or last
# day are excluded - confirm this is intended.
follow_in_trial = (
    (follows_df['follow_day'] > follows_df['trial_start_day'])
    & (follows_df['follow_day'] < follows_df['trial_end_day'])
)
follows_df = follows_df[follow_in_trial]

# Group by user_uid and count the number of follows.
follows_gb_df = (
    follows_df
    .groupby(by=['user_uid'])
    .agg(follow_volume=('target_uid', 'count'))
    .reset_index()
)

# Drop the column left by an earlier run of this cell so a re-run does not create _x/_y
# duplicates, then fill users with no in-trial follows with 0 instead of NaN
# (matching how missing video-view data is filled).
follow_cols = ['follow_volume']
combo_df = combo_df.drop(columns=follow_cols, errors='ignore').merge(
    follows_gb_df, how='left', on='user_uid')
combo_df[follow_cols] = combo_df[follow_cols].fillna(0.0)

In [83]:
### Merge on Projects Data.
# Same process as for comments, applied to projects.
projects_df = pd.read_csv('../capstone_data/skillshare_2022_projects.csv')
projects_df = projects_df.rename(columns={'uid': 'user_uid'})
projects_df = projects_df.merge(trial_ends, how='left', on='user_uid')
projects_df['create_day'] = pd.to_datetime(projects_df['create_time']).dt.date

# NOTE(review): both bounds are strict, so projects created on the trial's first or
# last day are excluded - confirm this is intended.
project_in_trial = (
    (projects_df['create_day'] > projects_df['trial_start_day'])
    & (projects_df['create_day'] < projects_df['trial_end_day'])
)
projects_df = projects_df[project_in_trial]

# Per user: number of projects and the sum of their num_up values.
projects_gb_df = (
    projects_df
    .groupby(by=['user_uid'])
    .agg(projects_volume=('id', 'count'), projects_score=('num_up', 'sum'))
    .reset_index()
)

# Drop columns left by an earlier run of this cell so a re-run does not create _x/_y
# duplicates, then fill users with no in-trial projects with 0 instead of NaN
# (matching how missing video-view data is filled).
project_cols = ['projects_volume', 'projects_score']
combo_df = combo_df.drop(columns=project_cols, errors='ignore').merge(
    projects_gb_df, how='left', on='user_uid')
combo_df[project_cols] = combo_df[project_cols].fillna(0.0)

In [88]:
### Merge on Reviews Data.
# Same process as for comments, applied to reviews.
reviews_df = pd.read_csv('../capstone_data/skillshare_2022_reviews.csv')
reviews_df = reviews_df.rename(columns={'uid': 'user_uid'})
reviews_df = reviews_df.merge(trial_ends, how='left', on='user_uid')
reviews_df['create_day'] = pd.to_datetime(reviews_df['create_time']).dt.date

# NOTE(review): both bounds are strict, so reviews created on the trial's first or
# last day are excluded - confirm this is intended.
review_in_trial = (
    (reviews_df['create_day'] > reviews_df['trial_start_day'])
    & (reviews_df['create_day'] < reviews_df['trial_end_day'])
)
reviews_df = reviews_df[review_in_trial]

# Group by user: volume of reviews and average review rating.
reviews_gb_df = (
    reviews_df
    .groupby(by=['user_uid'])
    .agg(review_volume=('review_id', 'count'), rating_avg=('rating', 'mean'))
    .reset_index()
)

# Drop columns left by an earlier run of this cell so a re-run does not create _x/_y
# duplicates. Users with no in-trial reviews get a review_volume of 0 (matching how
# missing video-view data is filled); rating_avg stays NaN for them because an average
# over zero reviews is undefined.
review_cols = ['review_volume', 'rating_avg']
combo_df = combo_df.drop(columns=review_cols, errors='ignore').merge(
    reviews_gb_df, how='left', on='user_uid')
combo_df['review_volume'] = combo_df['review_volume'].fillna(0.0)

In [90]:
# Write the combined dataframe out as a CSV file in the working directory.
combo_df.to_csv(path_or_buf='skillshare_combined.csv')