In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [2]:
class_info = pd.read_csv("../data/skillshare_2022_classes.csv", index_col=0)
print(class_info.columns)
class_info.head()

Index(['class_id', 'class_title', 'create_time', 'description', 'takeaway',
       'category', 'subcategory', 'status', 'is_featured', 'is_premium_only',
       'publish_time', 'publish_date', 'teacher_uid', 'teacher_name',
       'is_top_teacher', 'teacher_create_time', 'teacher_account_age',
       'is_original', 'class_type', 'quality_grade', 'quality_grade_letter',
       'is_quality_class', 'grade_note', 'n_reviews', 'review_avg', 'level_id',
       'level', 'has_been_staff_pick', 'n_times_staff_pick',
       'first_staff_pick_time', 'last_staff_pick_time', 'has_project',
       'project_title', 'sku', 'class_detail_page_link',
       'total_video_duration', 'n_lessons', 'trailer_duration',
       'first_lesson_duration', 'class_display_name', 'image_huge', 'language',
       'language_grade_notes_1', 'language_grade_notes_2',
       'language_grade_notes_3', 'language_grade_notes_all',
       'has_non_english_content', 'non_english_language_1', 'class_origin',
       'class_origi

  class_info = pd.read_csv("../data/skillshare_2022_classes.csv", index_col=0)


Unnamed: 0,class_id,class_title,create_time,description,takeaway,category,subcategory,status,is_featured,is_premium_only,...,language_grade_notes_all,has_non_english_content,non_english_language_1,class_origin,class_origin_raw,published_class_number,teacher_first_publish_time,is_last_published_class,is_first_original_class,is_first_quality_class
0,23577,A Crash Course in Photographic Composition,2013-01-22 17:41:39,<p>This class is for anybody who wants to crea...,,Creative,Photography,published,True,True,...,,False,,Other,No Source,1,2013-01-22 17:41:39,True,False,True
1,51370,Drawing Time: Illustrator Techniques,2013-01-31 23:54:50,"<p><img src=""https://static.skillshare.com/upl...",,Creative,Illustration,published,True,True,...,,False,,Other,No Source,1,2013-01-31 23:54:50,True,False,False
2,55284,Perfect Southern Fried Chicken + Buttermilk Bi...,2013-02-01 23:50:57,<p><strong>Make The Best Southern Fried Chicke...,,Lifestyle,Culinary,published,True,True,...,,False,,Other,No Source,1,2013-02-01 23:50:57,True,False,False
3,65056,EasyDSLR: Master your Digital Camera in less t...,2013-02-05 00:16:27,<p>So you finally got that DSLR camera you've ...,,,Photography,published,True,True,...,,False,,Other,No Source,1,2013-02-05 00:16:27,True,False,False
4,100968,Pattern Design: Creating Repeat Patterns From ...,2013-03-30 04:02:06,<p>Ever wanted to create your own hand-drawn p...,,Creative,Graphic Design,published,True,True,...,,False,,Other,No Source,1,2013-03-30 04:02:06,True,False,False


In [3]:
# Remove HTML tags from descriptions
# i.e. anything between angled brackets
class_info.description = class_info.description.str\
    .replace(r'<[^<>]*>', '', regex=True)

class_info.description

0        This class is for anybody who wants to create ...
1        \nAI Comic Art will cover all concepts relativ...
2        Make The Best Southern Fried Chicken of Your L...
3        So you finally got that DSLR camera you've bee...
4        Ever wanted to create your own hand-drawn patt...
                               ...                        
44470    In this course I teach you how to make a pillo...
44471    Have you covered the basics of piano, but want...
44472    If you want to become an illustrator there’s n...
44473    Welcome Future Event Planners. \nWe Ensure tha...
44474    Are you looking for learning easy flowers usin...
Name: description, Length: 44475, dtype: object

In [4]:
# Just use 'create time' year
class_info.create_time = pd.to_datetime(class_info.create_time)
class_info['created_year'] = class_info.create_time.dt.year
class_info.created_year

0        2013
1        2013
2        2013
3        2013
4        2013
         ... 
44470    2022
44471    2022
44472    2022
44473    2022
44474    2022
Name: created_year, Length: 44475, dtype: int64

In [5]:
class_info = pd.merge(
    class_info[["class_id", "description", "is_top_teacher", "created_year"]],
    pd.get_dummies(class_info.category),
    left_index=True,
    right_index=True
)
class_info

Unnamed: 0,class_id,description,is_top_teacher,created_year,Business,Creative,Lifestyle,Technology
0,23577,This class is for anybody who wants to create ...,False,2013,0,1,0,0
1,51370,\nAI Comic Art will cover all concepts relativ...,False,2013,0,1,0,0
2,55284,Make The Best Southern Fried Chicken of Your L...,False,2013,0,0,1,0
3,65056,So you finally got that DSLR camera you've bee...,False,2013,0,0,0,0
4,100968,Ever wanted to create your own hand-drawn patt...,False,2013,0,1,0,0
...,...,...,...,...,...,...,...,...
44470,582681,In this course I teach you how to make a pillo...,False,2022,0,0,1,0
44471,583753,"Have you covered the basics of piano, but want...",False,2022,0,1,0,0
44472,583862,If you want to become an illustrator there’s n...,False,2022,0,1,0,0
44473,567663,Welcome Future Event Planners. \nWe Ensure tha...,False,2022,1,0,0,0


In [6]:
# Vectorize description text
vectorizer = TfidfVectorizer(stop_words="english")
vectorized_text = vectorizer.fit_transform(class_info.description)
vectorized_text.shape

(44475, 100220)

In [7]:
class_info

Unnamed: 0,class_id,description,is_top_teacher,created_year,Business,Creative,Lifestyle,Technology
0,23577,This class is for anybody who wants to create ...,False,2013,0,1,0,0
1,51370,\nAI Comic Art will cover all concepts relativ...,False,2013,0,1,0,0
2,55284,Make The Best Southern Fried Chicken of Your L...,False,2013,0,0,1,0
3,65056,So you finally got that DSLR camera you've bee...,False,2013,0,0,0,0
4,100968,Ever wanted to create your own hand-drawn patt...,False,2013,0,1,0,0
...,...,...,...,...,...,...,...,...
44470,582681,In this course I teach you how to make a pillo...,False,2022,0,0,1,0
44471,583753,"Have you covered the basics of piano, but want...",False,2022,0,1,0,0
44472,583862,If you want to become an illustrator there’s n...,False,2022,0,1,0,0
44473,567663,Welcome Future Event Planners. \nWe Ensure tha...,False,2022,1,0,0,0


In [8]:
# Non negative matrix factorization
nmf = NMF(n_components=20)
nmf.fit(vectorized_text)
nmf.components_.shape

(20, 100220)

In [9]:
feature_names = vectorizer.get_feature_names_out()
for topic_index, topic in enumerate(nmf.components_):
    feature_indexes = np.argsort(topic)[::-1]
    print("Topic", topic_index + 1, end=": ")
    print(feature_names[feature_indexes[:10]])

Topic 1: ['class' 'll' 'make' 'learn' 'create' 'use' 'need' 'work' 'different'
 'way']
Topic 2: ['watercolor' 'painting' 'paint' 'wet' 'color' 'paper' 'brush'
 'techniques' 'colors' 'paintings']
Topic 3: ['lunch' 'graphic' 'design' 'adobe' 'class' 'illustrator' 'amp' '10' 'acr'
 'patterns']
Topic 4: ['drawing' 'draw' 'pencil' 'drawings' 'shapes' 'portrait' 'pencils'
 'paper' 'line' 'sketching']
Topic 5: ['animation' 'effects' 'motion' 'animate' 'animations' 'graphics' 'adobe'
 'create' 'text' 'animated']
Topic 6: ['course' 'time' 'life' 'learn' 'want' 'help' 'people' 'know' 'make'
 'skills']
Topic 7: ['website' 'wordpress' 'web' 'css' 'html' 'javascript' 'page' 'websites'
 'code' 'seo']
Topic 8: ['character' 'characters' 'illustration' 'animation' 'll' 'story'
 'cartoon' 'rigging' 'process' 'body']
Topic 9: ['photoshop' 'adobe' 'images' 'photo' 'photos' 'image' 'color' 'create'
 'effect' 'digital']
Topic 10: ['video' 'editing' 'videos' 'youtube' 'pro' 'premiere' 'edit' 'audio'
 'camera

In [10]:
# This step takes awhile
# Transform from each description via LDA to a topic (top topic)
class_info["my_topic"] = class_info.description.apply(lambda text: np.argmax(nmf.transform(vectorizer.transform([text]))))
class_info = class_info.set_index("class_id")
class_info["my_topic"]

KeyboardInterrupt: 

In [None]:
one_hot_clusters = pd.get_dummies(class_info["my_topic"])
one_hot_clusters.reset_index(inplace=True)
one_hot_clusters

In [None]:
starts = pd.read_csv("../data/skillshare_2022_starts_views_combined.csv", index_col=0)
starts_w_class_clusters = pd.merge(one_hot_clusters, starts, left_on="class_id", right_on="class_id", how="outer")
starts_w_class_clusters