In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [2]:
class_info = pd.read_csv("../data/skillshare_2022_classes.csv", index_col=0)
print(class_info.columns)
class_info.head()

Index(['class_id', 'class_title', 'create_time', 'description', 'takeaway',
       'category', 'subcategory', 'status', 'is_featured', 'is_premium_only',
       'publish_time', 'publish_date', 'teacher_uid', 'teacher_name',
       'is_top_teacher', 'teacher_create_time', 'teacher_account_age',
       'is_original', 'class_type', 'quality_grade', 'quality_grade_letter',
       'is_quality_class', 'grade_note', 'n_reviews', 'review_avg', 'level_id',
       'level', 'has_been_staff_pick', 'n_times_staff_pick',
       'first_staff_pick_time', 'last_staff_pick_time', 'has_project',
       'project_title', 'sku', 'class_detail_page_link',
       'total_video_duration', 'n_lessons', 'trailer_duration',
       'first_lesson_duration', 'class_display_name', 'image_huge', 'language',
       'language_grade_notes_1', 'language_grade_notes_2',
       'language_grade_notes_3', 'language_grade_notes_all',
       'has_non_english_content', 'non_english_language_1', 'class_origin',
       'class_origi

  class_info = pd.read_csv("../data/skillshare_2022_classes.csv", index_col=0)


Unnamed: 0,class_id,class_title,create_time,description,takeaway,category,subcategory,status,is_featured,is_premium_only,...,language_grade_notes_all,has_non_english_content,non_english_language_1,class_origin,class_origin_raw,published_class_number,teacher_first_publish_time,is_last_published_class,is_first_original_class,is_first_quality_class
0,23577,A Crash Course in Photographic Composition,2013-01-22 17:41:39,<p>This class is for anybody who wants to crea...,,Creative,Photography,published,True,True,...,,False,,Other,No Source,1,2013-01-22 17:41:39,True,False,True
1,51370,Drawing Time: Illustrator Techniques,2013-01-31 23:54:50,"<p><img src=""https://static.skillshare.com/upl...",,Creative,Illustration,published,True,True,...,,False,,Other,No Source,1,2013-01-31 23:54:50,True,False,False
2,55284,Perfect Southern Fried Chicken + Buttermilk Bi...,2013-02-01 23:50:57,<p><strong>Make The Best Southern Fried Chicke...,,Lifestyle,Culinary,published,True,True,...,,False,,Other,No Source,1,2013-02-01 23:50:57,True,False,False
3,65056,EasyDSLR: Master your Digital Camera in less t...,2013-02-05 00:16:27,<p>So you finally got that DSLR camera you've ...,,,Photography,published,True,True,...,,False,,Other,No Source,1,2013-02-05 00:16:27,True,False,False
4,100968,Pattern Design: Creating Repeat Patterns From ...,2013-03-30 04:02:06,<p>Ever wanted to create your own hand-drawn p...,,Creative,Graphic Design,published,True,True,...,,False,,Other,No Source,1,2013-03-30 04:02:06,True,False,False


In [3]:
# Remove HTML tags from descriptions
# i.e. anything between angled brackets
class_info.description = class_info.description.str\
    .replace(r'<[^<>]*>', '', regex=True)

class_info.description

0        This class is for anybody who wants to create ...
1        \nAI Comic Art will cover all concepts relativ...
2        Make The Best Southern Fried Chicken of Your L...
3        So you finally got that DSLR camera you've bee...
4        Ever wanted to create your own hand-drawn patt...
                               ...                        
44470    In this course I teach you how to make a pillo...
44471    Have you covered the basics of piano, but want...
44472    If you want to become an illustrator there’s n...
44473    Welcome Future Event Planners. \nWe Ensure tha...
44474    Are you looking for learning easy flowers usin...
Name: description, Length: 44475, dtype: object

In [4]:
# Just use 'create time' year
class_info.create_time = pd.to_datetime(class_info.create_time)
class_info['created_year'] = class_info.create_time.dt.year
class_info.created_year

0        2013
1        2013
2        2013
3        2013
4        2013
         ... 
44470    2022
44471    2022
44472    2022
44473    2022
44474    2022
Name: created_year, Length: 44475, dtype: int64

In [5]:
class_info = pd.merge(
    class_info[["class_id", "description", "is_top_teacher", "created_year"]],
    pd.get_dummies(class_info.category),
    left_index=True,
    right_index=True
)
class_info

Unnamed: 0,class_id,description,is_top_teacher,created_year,Business,Creative,Lifestyle,Technology
0,23577,This class is for anybody who wants to create ...,False,2013,0,1,0,0
1,51370,\nAI Comic Art will cover all concepts relativ...,False,2013,0,1,0,0
2,55284,Make The Best Southern Fried Chicken of Your L...,False,2013,0,0,1,0
3,65056,So you finally got that DSLR camera you've bee...,False,2013,0,0,0,0
4,100968,Ever wanted to create your own hand-drawn patt...,False,2013,0,1,0,0
...,...,...,...,...,...,...,...,...
44470,582681,In this course I teach you how to make a pillo...,False,2022,0,0,1,0
44471,583753,"Have you covered the basics of piano, but want...",False,2022,0,1,0,0
44472,583862,If you want to become an illustrator there’s n...,False,2022,0,1,0,0
44473,567663,Welcome Future Event Planners. \nWe Ensure tha...,False,2022,1,0,0,0


In [6]:
# Vectorize description text
vectorizer = TfidfVectorizer(stop_words="english")
vectorized_text = vectorizer.fit_transform(class_info.description)
vectorized_text.shape

(44475, 100220)

In [7]:
class_info

Unnamed: 0,class_id,description,is_top_teacher,created_year,Business,Creative,Lifestyle,Technology
0,23577,This class is for anybody who wants to create ...,False,2013,0,1,0,0
1,51370,\nAI Comic Art will cover all concepts relativ...,False,2013,0,1,0,0
2,55284,Make The Best Southern Fried Chicken of Your L...,False,2013,0,0,1,0
3,65056,So you finally got that DSLR camera you've bee...,False,2013,0,0,0,0
4,100968,Ever wanted to create your own hand-drawn patt...,False,2013,0,1,0,0
...,...,...,...,...,...,...,...,...
44470,582681,In this course I teach you how to make a pillo...,False,2022,0,0,1,0
44471,583753,"Have you covered the basics of piano, but want...",False,2022,0,1,0,0
44472,583862,If you want to become an illustrator there’s n...,False,2022,0,1,0,0
44473,567663,Welcome Future Event Planners. \nWe Ensure tha...,False,2022,1,0,0,0


In [8]:
# Non negative matrix factorization
nmf = NMF(n_components=20)
nmf.fit(vectorized_text)
nmf.components_.shape

(20, 100220)

In [9]:
feature_names = vectorizer.get_feature_names_out()
for topic_index, topic in enumerate(nmf.components_):
    feature_indexes = np.argsort(topic)[::-1]
    print("Topic", topic_index + 1, end=": ")
    print(feature_names[feature_indexes[:10]])

Topic 1: ['class' 'll' 'learn' 'create' 'make' 'use' 'using' 'different' 'need'
 'simple']
Topic 2: ['watercolor' 'painting' 'paint' 'wet' 'color' 'paper' 'brush' 'colors'
 'techniques' 'paintings']
Topic 3: ['lunch' 'graphic' 'design' 'adobe' 'class' 'illustrator' 'amp' '10' 'acr'
 'patterns']
Topic 4: ['drawing' 'draw' 'character' 'pencil' 'drawings' 'shapes' 'portrait'
 'characters' 'pencils' 'face']
Topic 5: ['animation' 'effects' 'motion' 'animate' 'animations' 'character'
 'graphics' 'adobe' 'animated' 'animating']
Topic 6: ['course' 'learn' 'll' 'learning' 'students' 'knowledge' 'courses' 'end'
 'start' 'make']
Topic 7: ['website' 'wordpress' 'web' 'css' 'html' 'javascript' 'page' 'websites'
 'code' 'build']
Topic 8: ['writing' 'write' 'story' 'book' 'writer' 'character' 'writers'
 'creative' 'novel' 'characters']
Topic 9: ['photoshop' 'adobe' 'images' 'photo' 'photos' 'image' 'color' 'effect'
 'create' 'digital']
Topic 10: ['video' 'editing' 'videos' 'youtube' 'pro' 'premiere' 

In [10]:
# This step takes awhile
# Transform from each description via LDA to a topic (top topic)
class_info["my_topic"] = class_info.description.apply(lambda text: np.argmax(nmf.transform(vectorizer.transform([text]))))
class_info["my_topic"]

0         0
1        19
2        17
3        17
4        19
         ..
44470    16
44471    14
44472     3
44473     5
44474     1
Name: my_topic, Length: 44475, dtype: int64

In [5]:
starts = pd.read_csv("skillshare_2022_starts_views_combined.csv", index_col=0)
class_w_starts = pd.merge(class_info[["class_id", "my_topi"]], starts, left_on="class_id", right_on="class_id", how="outer")
class_w_starts

Unnamed: 0,uid,class_id,lesson_id,lesson_number,is_trailer,video_duration,view_date,sum,id,user_uid,...,is_direct_to_paid,trial_length_offer,had_trial,sub_utm_source,sub_utm_campaign,sub_utm_medium,sub_utm_term,sub_utm_channel,referral_source,eligible_trial_number
0,1128075.0,121998.0,1533011.0,6.0,False,407.0,2022-01-07,333.0,8815609.0,1128075.0,...,False,One Month,True,,,,,Missing,,6.0
1,1128075.0,121998.0,1533013.0,7.0,False,93.0,2022-01-07,92.0,8815609.0,1128075.0,...,False,One Month,True,,,,,Missing,,6.0
2,1128075.0,240747.0,2841973.0,0.0,True,73.0,2022-01-07,74.0,8815609.0,1128075.0,...,False,One Month,True,,,,,Missing,,6.0
3,1128075.0,240747.0,2841972.0,1.0,False,90.0,2022-01-07,90.0,8815609.0,1128075.0,...,False,One Month,True,,,,,Missing,,6.0
4,1128075.0,240747.0,2841992.0,2.0,False,319.0,2022-01-07,320.0,8815609.0,1128075.0,...,False,One Month,True,,,,,Missing,,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20122360,,,,,,,NaT,,9454516.0,26283105.0,...,False,One Month,True,,,,,Organic Search,https://www.google.com/,1.0
20122361,,,,,,,NaT,,9454622.0,26283254.0,...,False,Other,False,,,,,Organic Search,https://www.google.com/,
20122362,,,,,,,NaT,,9454815.0,26283973.0,...,False,One Week,True,,,,,Organic Search,https://www.google.com/,1.0
20122363,,,,,,,NaT,,9454818.0,26266500.0,...,False,One Week,True,(direct),,(none),,Direct,,1.0


In [6]:
account_and_views_info = account_and_views_info[account_and_views_info.trial_length_offer.isin(["One Month", "One Week"])]
account_and_views_info

Unnamed: 0,uid,class_id,lesson_id,lesson_number,is_trailer,video_duration,view_date,sum,id,user_uid,...,is_direct_to_paid,trial_length_offer,had_trial,sub_utm_source,sub_utm_campaign,sub_utm_medium,sub_utm_term,sub_utm_channel,referral_source,eligible_trial_number
0,1128075.0,121998.0,1533011.0,6.0,False,407.0,2022-01-07,333.0,8815609.0,1128075.0,...,False,One Month,True,,,,,Missing,,6.0
1,1128075.0,121998.0,1533013.0,7.0,False,93.0,2022-01-07,92.0,8815609.0,1128075.0,...,False,One Month,True,,,,,Missing,,6.0
2,1128075.0,240747.0,2841973.0,0.0,True,73.0,2022-01-07,74.0,8815609.0,1128075.0,...,False,One Month,True,,,,,Missing,,6.0
3,1128075.0,240747.0,2841972.0,1.0,False,90.0,2022-01-07,90.0,8815609.0,1128075.0,...,False,One Month,True,,,,,Missing,,6.0
4,1128075.0,240747.0,2841992.0,2.0,False,319.0,2022-01-07,320.0,8815609.0,1128075.0,...,False,One Month,True,,,,,Missing,,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20122359,,,,,,,NaT,,9454467.0,26283012.0,...,False,One Month,True,youtube,2022-05-01-pjiggles,paid-influencer-video,,Youtube Influencer,,1.0
20122360,,,,,,,NaT,,9454516.0,26283105.0,...,False,One Month,True,,,,,Organic Search,https://www.google.com/,1.0
20122362,,,,,,,NaT,,9454815.0,26283973.0,...,False,One Week,True,,,,,Organic Search,https://www.google.com/,1.0
20122363,,,,,,,NaT,,9454818.0,26266500.0,...,False,One Week,True,(direct),,(none),,Direct,,1.0


In [7]:
account_and_views_info.view_date = pd.to_datetime(account_and_views_info.view_date)
account_and_views_info.create_time = pd.to_datetime(account_and_views_info.create_time)

account_and_views_info["day_of_trial"] = account_and_views_info.view_date - account_and_views_info.create_time
account_and_views_info.dropna(subset=["day_of_trial"], inplace=True)
account_and_views_info["day_of_trial"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  account_and_views_info.view_date = pd.to_datetime(account_and_views_info.view_date)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  account_and_views_info.create_time = pd.to_datetime(account_and_views_info.create_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  account_and_views_info["day_of_tr

0            2 days 20:36:33
1            2 days 20:36:33
2            2 days 20:36:33
3            2 days 20:36:33
4            2 days 20:36:33
                  ...       
19977675    40 days 15:27:17
19977676    42 days 05:21:33
19977677   -1 days +05:52:51
19977678    42 days 05:21:33
19977679   -1 days +05:52:51
Name: day_of_trial, Length: 12639366, dtype: timedelta64[ns]