In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

In [2]:
# Load the cleaned subscription-starts table. Later cells read user_uid,
# create_time and original_trial_end from it, and a user_uid may appear on
# more than one row (duplicates are dropped when mlready_df is built).
clean_df = pd.read_csv('~/capstone_data/skillshare_2022_starts_clean.csv')

In [18]:
# Make a df of only the columns we want for prediction.
p_cols = [
    'user_uid', 'create_time', 'success', 'is_cancelled',
    'payment_provider_cat_codes', 'payment_ux_cat_codes',
    'trial_length_offer_cat_codes', 'sub_utm_channel_cat_codes',
    'sub_utm_source_cat_codes', 'is_cancel_during_trial',
]

# Remove duplicates by keeping the most recent subscription for any user_uid.
# The sort is chained (returns a new frame) instead of being applied in place
# to a frame sliced out of clean_df, so the result does not rely on the
# chained-assignment warning being switched off.
# NOTE(review): create_time is sorted exactly as loaded from the CSV; if it is
# a string column this is chronological only for ISO-style timestamps -- confirm.
mlready_df = (
    clean_df[p_cols]
    .sort_values(by='create_time')
    .drop_duplicates(subset=['user_uid'], keep='last')
)

In [19]:
# Video-view features to append onto the starts.
# (The views file itself is produced by data_combine_v1.py.)
# Load it, align the key column name with the starts data, and drop the
# leftover index column that came along in the CSV.
vviews_df = (
    pd.read_csv('~/capstone_data/skillshare_2022_all_views.csv')
    .rename(columns={'uid': 'user_uid'})
    .drop(columns=['Unnamed: 0'])
)

In [20]:
# Attach the video-view columns to every start (left join keeps all starts),
# then replace every missing value in the combined frame with 0.0.
# Note the fill is frame-wide: it also touches any NaN that was already
# present in the starts columns, not only the newly merged view columns.
combo_df = (
    mlready_df
    .merge(vviews_df, how='left', on='user_uid')
    .fillna(0.0)
)

In [21]:
#### Begin to translate non-video activity to merge.
# Build the per-user trial window (start day, end day) that each activity
# table is filtered against before it is aggregated.

# When saved as csv, the date columns tend to revert to str, so to be sure:
# make a day version of the trial start and end dates.
clean_df['trial_end_day'] = pd.to_datetime(clean_df.original_trial_end).dt.date
clean_df['trial_start_day'] = pd.to_datetime(clean_df.create_time).dt.date

# clean_df can contain several subscriptions for one user_uid. Keep only the
# most recent one -- the same rule used to build mlready_df -- so that:
#   * left-merging trial_ends onto an activity table cannot fan each activity
#     row out into one copy per subscription, and
#   * activity is only matched against the trial window of the subscription
#     that actually survives in the modelling frame.
trial_ends = (
    clean_df
    .sort_values(by='create_time')
    .drop_duplicates(subset=['user_uid'], keep='last')
    .loc[:, ['user_uid', 'trial_start_day', 'trial_end_day']]
)

In [22]:
### Merge on Comments Data.
# Load the comments, align the key name with the starts data, attach each
# user's trial window, derive the calendar day of the comment, and keep only
# comments that fall inside the trial.
# Both bounds are strict, so comments made on the trial's first or last
# calendar day are excluded. Rows without a trial window never satisfy the
# comparison and are dropped as well.
comments_df = (
    pd.read_csv('~/capstone_data/skillshare_2022_comments.csv')
    .rename(columns={'user_id': 'user_uid'})
    .merge(trial_ends, how='left', on='user_uid')
    .assign(create_day=lambda d: pd.to_datetime(d['create_time']).dt.date)
    .loc[lambda d: (d['create_day'] > d['trial_start_day'])
                   & (d['create_day'] < d['trial_end_day'])]
)

# One row per user_uid: number of comments and summed comment score.
comment_gb_df = (
    comments_df
    .groupby(by=['user_uid'])
    .agg(comment_volume=('id', 'count'),
         comment_score=('score', 'sum'))
    .reset_index()
)

# Attach to the main dataframe. Users with no in-trial comments get NaN in
# both new columns (they are not filled here).
combo_df = combo_df.merge(comment_gb_df, how='left', on='user_uid')

In [23]:
### Merge on Discussions Data.
# Same pipeline as the comments: load, rename the key, attach the trial
# window, derive the creation day, keep rows strictly between the trial start
# and end days (first and last calendar day excluded).
discussions_df = (
    pd.read_csv('~/capstone_data/skillshare_2022_discussions.csv')
    .rename(columns={'user_id': 'user_uid'})
    .merge(trial_ends, how='left', on='user_uid')
    .assign(create_day=lambda d: pd.to_datetime(d['create_time']).dt.date)
    .loc[lambda d: (d['create_day'] > d['trial_start_day'])
                   & (d['create_day'] < d['trial_end_day'])]
)

# One row per user_uid: number of discussions and summed discussion score.
discussions_gb_df = (
    discussions_df
    .groupby(by=['user_uid'])
    .agg(discussion_volume=('id', 'count'),
         discussion_score=('score', 'sum'))
    .reset_index()
)

# Users with no in-trial discussions get NaN in the new columns.
combo_df = combo_df.merge(discussions_gb_df, how='left', on='user_uid')

In [24]:
### Merge on Follows Data.
# Same pipeline as the comments, keyed on the follower: load, rename
# follower_uid to user_uid, attach the trial window, derive the follow day,
# keep rows strictly between the trial start and end days.
follows_df = (
    pd.read_csv('~/capstone_data/skillshare_2022_follows.csv')
    .rename(columns={'follower_uid': 'user_uid'})
    .merge(trial_ends, how='left', on='user_uid')
    .assign(follow_day=lambda d: pd.to_datetime(d['follow_time']).dt.date)
    .loc[lambda d: (d['follow_day'] > d['trial_start_day'])
                   & (d['follow_day'] < d['trial_end_day'])]
)

# One row per user_uid: how many follows they made (non-null target_uid count).
follows_gb_df = (
    follows_df
    .groupby(by=['user_uid'])
    .agg(follow_volume=('target_uid', 'count'))
    .reset_index()
)

# Users with no in-trial follows get NaN in follow_volume.
combo_df = combo_df.merge(follows_gb_df, how='left', on='user_uid')

In [25]:
### Merge on Projects Data.
# Same pipeline as the comments, applied to projects: load, rename uid to
# user_uid, attach the trial window, derive the creation day, keep rows
# strictly between the trial start and end days.
projects_df = (
    pd.read_csv('~/capstone_data/skillshare_2022_projects.csv')
    .rename(columns={'uid': 'user_uid'})
    .merge(trial_ends, how='left', on='user_uid')
    .assign(create_day=lambda d: pd.to_datetime(d['create_time']).dt.date)
    .loc[lambda d: (d['create_day'] > d['trial_start_day'])
                   & (d['create_day'] < d['trial_end_day'])]
)

# One row per user_uid: number of projects and the sum of their num_up values.
projects_gb_df = (
    projects_df
    .groupby(by=['user_uid'])
    .agg(projects_volume=('id', 'count'),
         projects_score=('num_up', 'sum'))
    .reset_index()
)

# Users with no in-trial projects get NaN in the new columns.
combo_df = combo_df.merge(projects_gb_df, how='left', on='user_uid')

In [26]:
### Merge on Reviews Data.
# Same pipeline as the comments, applied to reviews: load, rename uid to
# user_uid, attach the trial window, derive the creation day, keep rows
# strictly between the trial start and end days.
reviews_df = (
    pd.read_csv('~/capstone_data/skillshare_2022_reviews.csv')
    .rename(columns={'uid': 'user_uid'})
    .merge(trial_ends, how='left', on='user_uid')
    .assign(create_day=lambda d: pd.to_datetime(d['create_time']).dt.date)
    .loc[lambda d: (d['create_day'] > d['trial_start_day'])
                   & (d['create_day'] < d['trial_end_day'])]
)

# One row per user_uid: number of reviews and the mean rating they gave.
reviews_gb_df = (
    reviews_df
    .groupby(by=['user_uid'])
    .agg(review_volume=('review_id', 'count'),
         rating_avg=('rating', 'mean'))
    .reset_index()
)

# Users with no in-trial reviews get NaN in the new columns.
combo_df = combo_df.merge(reviews_gb_df, how='left', on='user_uid')

In [27]:
#### Begin to merge on subscriber location meta data.
# Load the per-subscriber metadata table. Later cells merge it on user_uid and
# read signup_country_name, signup_geo_region and signup_platform from it.
subs_meta_df = pd.read_csv('~/capstone_data/skillshare_subs_meta.csv')

In [28]:
# Left-join the subscriber metadata so every row of combo_df is kept.
# This rebinds combo_df to the merged result, so re-running the cell without
# rebuilding combo_df would merge the metadata columns a second time.
combo_df = pd.merge(combo_df, subs_meta_df, how='left', on='user_uid')

In [29]:
# For each signup attribute, add a categorical copy (<name>_cat) and its
# integer category codes (<name>_cat_codes). Missing values receive the
# code -1, which is how pandas encodes NaN in .cat.codes.
for _signup_col in ('signup_country_name', 'signup_geo_region', 'signup_platform'):
    _cat_col = _signup_col + '_cat'
    combo_df[_cat_col] = combo_df[_signup_col].astype('category')
    combo_df[_cat_col + '_codes'] = combo_df[_cat_col].cat.codes

In [30]:
# Write one lookup table per signup attribute: original label, its integer
# code, and how many rows (non-null user_uid) carry that pair.
# The variable names below do not match their contents (they hold country,
# geo-region and platform lookups respectively); they are kept unchanged.
# Rows whose label is missing are left out by groupby, so the -1 code does
# not appear in these files.

# Country name <-> code.
payment_provider_lookup_df = (
    combo_df
    .groupby(by=['signup_country_name', 'signup_country_name_cat_codes'])['user_uid']
    .count()
    .reset_index(name='volume')
)
payment_provider_lookup_df.to_csv('~/capstone_data/lookup_country_name.csv', index=False)

# Geo region <-> code.
payment_ux_df = (
    combo_df
    .groupby(by=['signup_geo_region', 'signup_geo_region_cat_codes'])['user_uid']
    .count()
    .reset_index(name='volume')
)
payment_ux_df.to_csv('~/capstone_data/lookup_geo_region.csv', index=False)

# Signup platform <-> code.
trial_length_df = (
    combo_df
    .groupby(by=['signup_platform', 'signup_platform_cat_codes'])['user_uid']
    .count()
    .reset_index(name='volume')
)
trial_length_df.to_csv('~/capstone_data/lookup_platform.csv', index=False)

In [31]:
# Drop the raw signup labels and their categorical copies; only the integer
# *_cat_codes columns stay in the frame.
combo_df = combo_df.drop(columns=[
    'signup_country_name',
    'signup_geo_region',
    'signup_platform',
    'signup_country_name_cat',
    'signup_geo_region_cat',
    'signup_platform_cat',
])

In [32]:
# Export the combined frame (starts + video views + in-trial activity
# aggregates + signup category codes) as CSV, without the row index.
combo_df.to_csv('~/capstone_data/skillshare_combined.csv', index=False)