In [1]:
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.patches import Polygon

## Dataset General Information: ##

In [3]:
Course_info = pd.read_csv('./raw/Course_info.csv')
Course_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209734 entries, 0 to 209733
Data columns (total 20 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  209734 non-null  float64
 1   title               209734 non-null  object 
 2   is_paid             209734 non-null  bool   
 3   price               209734 non-null  float64
 4   headline            209703 non-null  object 
 5   num_subscribers     209734 non-null  float64
 6   avg_rating          209734 non-null  float64
 7   num_reviews         209734 non-null  float64
 8   num_comments        209734 non-null  float64
 9   num_lectures        209734 non-null  float64
 10  content_length_min  209734 non-null  float64
 11  published_time      209734 non-null  object 
 12  last_update_date    209597 non-null  object 
 13  category            209734 non-null  object 
 14  subcategory         209734 non-null  object 
 15  topic               208776 non-nul

In [None]:
Course_info.nunique()

## Udemy Courses information and visialisization: ##

In [None]:
plt.figure(figsize = (6, 6))
plt.pie(Course_info.groupby(['is_paid']).size(),
        labels = ['Free', 'Paid'], 
        autopct = '%1.1f%%',
        startangle = 90, colors = ['#facd75','#fd6767'],
        textprops = {'size': 16},
        wedgeprops= {"edgecolor":"white",
                     'linewidth': 1,
                     'antialiased': True})
plt.axis('equal')
plt.title('Course types:\n',loc='left', fontdict={'fontsize': 18})
plt.show()

In [None]:
plt.rcParams.update({'font.size': 12, 'axes.axisbelow': True})
plt.figure(figsize = (10, 6))
plt.hist(Course_info[Course_info['is_paid']==True]['price'],bins=50,range=(0,1000), rwidth=0.85,
         color='#fd6767')
plt.grid(axis='y', color ='Grey',
        linestyle ='-.', linewidth = 0.1)
plt.xticks(range(0,1001,100))
plt.margins(0.01)
plt.xlabel("Price ($)", labelpad=10)
plt.ylabel("Count")
plt.title('Distribution of course prices:\n',loc='center', fontdict={'fontsize': 16})
plt.show()

In [None]:
print("The number of courses without any subscribers: ", end='')
print(len(Course_info[Course_info['num_subscribers']==0]['num_subscribers']))
print("The number of courses with more than one million subscribers: ", end='')
print(len(Course_info[Course_info['num_subscribers']>1e6]['course_url']))

In [None]:
plt.figure(figsize = (10, 6))
plt.hist(Course_info[(Course_info['num_subscribers']>0) & (Course_info['num_subscribers']<1e6)]['num_subscribers'],bins=89, rwidth=0.85, 
         color='#1b9e77')
plt.grid(axis='y', color ='Grey', which='both',
        linestyle ='-.', linewidth = 0.1)
plt.xticks(range(0,1000001,100000))
plt.margins(0.01)
plt.xlabel("Subscribers", labelpad=10)
plt.ylabel("Count")
plt.yscale('log')
plt.title('Distribution of subscribers:\n',loc='center', fontdict={'fontsize': 16})
plt.show()

In [None]:
fig, (ax1, ax2) = plt.subplots(2,1,figsize = (10, 10))
N, bins, patches = ax1.hist(Course_info['content_length_min'], rwidth=0.85, bins=45,
         color='#1a3f5f')
ax1.grid(axis='y', color ='Grey',
        linestyle ='-.', linewidth = 0.1, which='both',
        alpha = 0.6)
ax1.margins(0.01)
patches[0].set_facecolor('#ed553b')
patches[1].set_facecolor('#ed553b')
plt.sca(ax1)
plt.yscale('log')
plt.ylabel("Count")
ax2.hist(Course_info[Course_info['content_length_min']<1000]['content_length_min'], rwidth=0.85, bins=100,
         color='#ed553b')
ax2.grid(axis='y', color ='Grey',
        linestyle ='-.', linewidth = 0.1,
        alpha = 0.6)
ax2.margins(0.01)
plt.sca(ax2)
plt.xlabel("Content length (min)", labelpad=10)
plt.ylabel("Count")
plt.xticks(range(0,1001,100))
plt.subplots_adjust(hspace=0.1)
plt.suptitle('Distribution of course content length:',fontsize= 16)
plt.show()

In [None]:
a_df = Course_info.groupby(['category']).size().sort_values(ascending=True)
fig, ax = plt.subplots(figsize=(8, 6)) 
ax.barh(a_df.index, a_df, color='#1a3f5f')
ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: '{:,.0f}'.format(x/1000) + 'K'))
plt.margins(0.01)
plt.xlabel("The number of course", labelpad=10)
plt.grid(axis='x', color ='Grey',
        linestyle ='-.', linewidth = 0.1)
plt.suptitle('The number of courses for each category:',fontsize= 16)
plt.show()

In [None]:
b_df = Course_info.groupby(['category']).sum(numeric_only=True).sort_values(by='num_subscribers',ascending=True)
fig, ax = plt.subplots(figsize=(8, 6)) 
ax.barh(b_df.index, b_df['num_subscribers'], color='#ed553b')
ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: '{:,.0f}'.format(x/1000000) + 'M' if x!=0 else 0))
plt.margins(0.01)
plt.xlabel("The number of subscriber", labelpad=10)
plt.grid(axis='x', color ='Grey',
        linestyle ='-.', linewidth = 0.1)
plt.suptitle('The number of subscribers for each category:',fontsize= 16)
plt.show()

In [None]:
e_df = Course_info.groupby(['topic'], as_index=False).size().sort_values(by='size',ascending=False)
print('Top 10 topics by number of courses: \n')
print(e_df[:10].to_markdown(tablefmt="rounded_outline", index=False))

In [None]:
g_df = Course_info.groupby(['topic'], as_index=False).sum(numeric_only=True).sort_values(by='num_subscribers',ascending=False)
print('Top 10 topics by number of subscribers: \n')
print(g_df[:10][['topic', 'num_subscribers']].to_markdown(tablefmt="rounded_outline", index=False,floatfmt=',.0f'))

In [None]:
Course_info['published_QDate'] = pd.PeriodIndex(Course_info.published_time, freq='Q')
c_df = Course_info.groupby(['published_QDate'], as_index=False).size().sort_values(by='published_QDate')
c_df['published_QDate'] = c_df['published_QDate'].apply(str)
fig, ax = plt.subplots(figsize=(10, 6))
plt.plot(c_df.index[:-1], c_df['size'][:-1], color='#eb4124', linewidth=1.75);
plt.margins(x=0, y=0.012)
plt.ylabel("The number of courses", labelpad=5, fontsize=12)
plt.xlabel("Publish date", labelpad=10, fontsize=12)
plt.xticks(c_df.index[:-1])
plt.xticks(rotation = 90)
# plt.xticks(fontsize=11)
# plt.yticks(fontsize=11)
ax.set_xticklabels(list(c_df['published_QDate'][:-1]))
plt.grid(color ='Grey',
        linestyle ='-.', linewidth = 0.1)


verts = [(c_df.index[39], -1000), (c_df.index[39], c_df['size'][39]+10000), 
         (c_df.index[40], c_df['size'][40]+10000),(c_df.index[40], -1000)]
poly = Polygon(verts, facecolor ='yellow',
               edgecolor ='1.0', alpha = 0.5)
ax.add_patch(poly)

plt.title('The number of courses versus publishing date:\n',loc='center', fontdict={'fontsize': 16})
plt.show()

## Comments analysis: ##

In [None]:
Comments = pd.read_csv('./raw/Comments.csv')
Comments.info()

In [None]:
Comments['year'] = Comments['date'].apply(lambda x: x[0:4])
Comments['month'] = Comments['date'].apply(lambda x: x[5:4])
d_df = Comments.groupby(['year'], as_index=False).size().sort_values(by='year')
fig, ax = plt.subplots(figsize=(8, 6)) 
plt.bar(d_df.year, d_df['size'], color='#1a3f5f')
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: '{:,.1f}'.format(x/1000000) + 'M' if x!=0 else 0))
plt.margins(x=0.01)
plt.xlabel("year", labelpad=10)
plt.ylabel("The Number of comment", labelpad=10)
plt.grid(axis='y', color ='Grey',
        linestyle ='-.', linewidth = 0.1)
plt.title('The number of comments per year:\n',loc='center', fontdict={'fontsize': 16})
plt.show()

In [None]:
plt.figure(figsize = (10, 6))
plt.hist(Comments['comment'].apply(lambda x: len(str(x))), rwidth=0.85, bins=44, range=(0,22000),
         color='#1b9e77')
plt.grid(axis='y', color ='Grey', which='both',
        linestyle ='-.', linewidth = 0.1)
plt.margins(0.01)
plt.xlabel("length of comments", labelpad=10)
plt.ylabel("Count")
plt.yscale('log')
plt.title('Distribution of comment length:\n',loc='center', fontdict={'fontsize': 16})
plt.show()

## Gross sales analysis: ##

In [None]:
Course_info['earned'] = Course_info['price']*Course_info['num_subscribers']
print("Total gross sales of Udemy: ", end='')
print(round(Course_info.earned.sum()/1e9,2),'billion US Dollar')

In [None]:
f_df = Course_info.groupby(['instructor_url'], as_index=False).sum(numeric_only=True)
f1_df = f_df.sort_values(by='earned',ascending=False).join(Course_info[['instructor_url', 'instructor_name']].set_index('instructor_url'), on='instructor_url').drop_duplicates()
print('Top 10 instructor by most gross sales ($): \n')
print(f1_df[0:10][['instructor_name', 'instructor_url', 'earned']].to_markdown(tablefmt="rounded_outline", index=False,floatfmt=',.0f'))

In [None]:
Comments.head()

In [None]:
Course_info.head()

# Bin the dataset on num_lecures, price

In [4]:
def load_pickle(file_name):
    with open(f'{file_name}', 'rb') as handle:
        return pickle.load(handle)
    
def save_pickle(file, file_name):
    with open(f'{file_name}', 'wb') as handle:
        pickle.dump(file, handle, protocol=pickle.HIGHEST_PROTOCOL)

category_dict = load_pickle('./processed/category.pkl')
category_dict

[{'id': 0, 'name': 'IT & Software'},
 {'id': 1, 'name': 'Development'},
 {'id': 2, 'name': 'Teaching & Academics'},
 {'id': 3, 'name': 'Business'},
 {'id': 4, 'name': 'Personal Development'},
 {'id': 5, 'name': 'Design'},
 {'id': 6, 'name': 'Health & Fitness'},
 {'id': 7, 'name': 'Lifestyle'},
 {'id': 8, 'name': 'Finance & Accounting'},
 {'id': 9, 'name': 'Marketing'},
 {'id': 10, 'name': 'Music'},
 {'id': 11, 'name': 'Office Productivity'},
 {'id': 12, 'name': 'Photography & Video'}]

In [17]:
course_info = Course_info[Course_info['language'] == 'English']

course_info = course_info.drop(['is_paid', 'instructor_url', 'published_time', 'num_reviews', 'num_comments', 'last_update_date', 'language'], axis=1)

def convert2dist(interval_list):
    closeness_dict = {}
    # Function to calculate closeness between two intervals
    def calculate_closeness(index_a, index_b):
        distance = np.abs(index_a - index_b)
        # Using Gaussian-like exponential decay for smoothness
        sigma = 2  # Adjust sigma as needed to control the spread of the bell curve
        closeness = np.exp(-(distance**2) / (2 * sigma**2))
        return closeness

    # Create a new dictionary where each key will have an array representing the closeness to each interval
    for index_a, interval_a in enumerate(interval_list):
        closeness_array = []
        for index_b, interval_b in enumerate(interval_list):
            closeness = calculate_closeness(index_a, index_b)
            closeness_array.append(closeness)
        str_interval_a = str(interval_a)
        closeness_dict[str_interval_a] = closeness_array
    return closeness_dict

def quantile_binning(column_name, num_bins=10):
    # Equal-frequency binning (Quantile binning)
    bins_series = pd.qcut(course_info[column_name], q=num_bins, duplicates='drop')
    unique_bins_sorted = bins_series.unique().sort_values()
    # series values are strings
    bins_series = bins_series.astype(str)
    dist = convert2dist(unique_bins_sorted)
    pickle_array = [{bin: dist[str(bin)]} for index, bin in enumerate(bins_series.sort_values().unique())]
    save_pickle(pickle_array, f'./processed/bins2{column_name}.pkl')
    new_bins_series = bins_series.map(lambda x: dist[str(x)])
    course_info[f"{column_name}_dist"] = new_bins_series
    course_info.drop(column_name, axis=1, inplace=True)
    
quantile_binning('price', num_bins=10)
quantile_binning('num_lectures', num_bins=10)
quantile_binning('content_length_min', num_bins=10)
course_info

Unnamed: 0,id,title,headline,num_subscribers,avg_rating,category,subcategory,topic,course_url,instructor_name,price_dist,num_lectures_dist,content_length_min_dist
0,4715.0,Online Vegan Vegetarian Cooking School,Learn to cook delicious vegan recipes. Filmed ...,2231.0,3.75,Lifestyle,Food & Beverage,Vegan Cooking,/course/vegan-vegetarian-cooking-school/,Angela Poch,"[0.6065306597126334, 0.8824969025845955, 1.0, ...","[0.002187491118182885, 0.011108996538242306, 0...","[4.006529739295107e-05, 0.00033546262790251185..."
1,1769.0,The Lean Startup Talk at Stanford E-Corner,Debunking Myths of Entrepreneurship A startup ...,26474.0,4.50,Business,Entrepreneurship,Lean Startup,/course/the-lean-startup-debunking-myths-of-en...,Eric Ries,"[1.0, 0.8824969025845955, 0.6065306597126334, ...","[0.8824969025845955, 1.0, 0.8824969025845955, ...","[0.32465246735834974, 0.6065306597126334, 0.88..."
2,5664.0,"How To Become a Vegan, Vegetarian, or Flexitarian",Get the tools you need for a lifestyle change ...,1713.0,4.40,Lifestyle,Other Lifestyle,Vegan Cooking,/course/see-my-personal-motivation-for-becomin...,Angela Poch,"[0.8824969025845955, 1.0, 0.8824969025845955, ...","[0.32465246735834974, 0.6065306597126334, 0.88...","[0.32465246735834974, 0.6065306597126334, 0.88..."
3,7723.0,How to Train a Puppy,Train your puppy the right way with Dr. Ian Du...,4988.0,4.80,Lifestyle,Pet Care & Training,Pet Training,/course/complete-dunbar-collection/,Ian Dunbar,"[0.002187491118182885, 0.011108996538242306, 0...","[0.002187491118182885, 0.011108996538242306, 0...","[4.006529739295107e-05, 0.00033546262790251185..."
4,8157.0,Web Design from the Ground Up,Learn web design online: Everything you need t...,1266.0,4.75,Design,Web Design,Web Design,/course/web-design-from-the-ground-up/,E Learning Lab,"[0.002187491118182885, 0.011108996538242306, 0...","[0.002187491118182885, 0.011108996538242306, 0...","[4.006529739295107e-05, 0.00033546262790251185..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
209729,4913954.0,Let's Speak Urdu - The Urdu Grammar,Urdu - Become fluent in this beautiful South A...,3.0,0.00,Teaching & Academics,Language Learning,Urdu Language,/course/lets-speak-urdu-the-grammar/,Jawaid Hameed,"[0.8824969025845955, 1.0, 0.8824969025845955, ...","[1.0, 0.8824969025845955, 0.6065306597126334, ...","[0.6065306597126334, 0.8824969025845955, 1.0, ..."
209730,4914146.0,CompTIA Linux+ (XKO-004/005 # 2 Practice Exam ...,Practice Latest exam questions with detailed e...,0.0,0.00,IT & Software,IT Certifications,CompTIA Linux+,/course/comptia-linux-xko-004005-2-practice-ex...,Jean-François d'Halluin,"[0.1353352832366127, 0.32465246735834974, 0.60...","[1.0, 0.8824969025845955, 0.6065306597126334, ...","[1.0, 0.8824969025845955, 0.6065306597126334, ..."
209731,4914002.0,CISSP 4 full exams #1 : All CISSP domains - 12...,Practice Latest exam questions with detailed e...,0.0,0.00,IT & Software,IT Certifications,CISSP - Certified Information Systems Security...,/course/cissp-4-full-exams-1-all-cissp-domains...,Jean-François d'Halluin,"[0.1353352832366127, 0.32465246735834974, 0.60...","[1.0, 0.8824969025845955, 0.6065306597126334, ...","[1.0, 0.8824969025845955, 0.6065306597126334, ..."
209732,4913934.0,JD Edwards EnterpriseOne Fixed Assets Accounti...,Full-length course (Part-3 of 3) on JD Edwards...,0.0,0.00,Finance & Accounting,Other Finance & Accounting,Financial Accounting,/course/jde-fixed-assets-accounting-part-3/,Niranjan Bhatia,"[0.011108996538242306, 0.04393693362340742, 0....","[0.8824969025845955, 1.0, 0.8824969025845955, ...","[0.32465246735834974, 0.6065306597126334, 0.88..."


# Convert to Numerical Data

In [None]:
def process_and_save_category_data(df, column_name, pickle_file):
    data_dict = df.groupby([column_name]).size().sort_values(ascending=False).to_dict()
    # Create an enumerated dictionary with counts
    data_dict = {key: idx for idx, key in enumerate(data_dict)}
    
    # id2name dictionary
    id2name = [{'id': v, 'name': k } for k, v in data_dict.items()]

    # Serialize the dictionary using pickle
    with open(f'./processed/{pickle_file}', 'wb') as handle:
        pickle.dump(id2name, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    return data_dict


# Apply the function to each category and save the mappings as pickle files
category = process_and_save_category_data(course_info, 'category', 'category.pkl')
subcategory = process_and_save_category_data(course_info, 'subcategory', 'subcategory.pkl')
topic = process_and_save_category_data(course_info, 'topic', 'topic.pkl')
instructor = process_and_save_category_data(course_info, 'instructor_name', 'instructor.pkl')

# map the category data to the original dataframe
course_info['category_id'] = course_info['category'].map(category)
course_info['subcategory_id'] = course_info['subcategory'].map(subcategory)
course_info['topic_id'] = course_info['topic'].map(topic)
course_info['instructor_name_id'] = course_info['instructor_name'].map(instructor)

course_info

In [19]:
files = ['./processed/category.pkl', './processed/content_length_min.pkl', './processed/instructor.pkl', './processed/num_lecture.pkl', './processed/num_lectures.pkl', './processed/price.pkl', './processed/subcategory.pkl', './processed/topic.pkl']
# Function to load a pickle file
def load_pickle(file_name):
    with open(f'{file_name}', 'rb') as handle:
        return pickle.load(handle)

# Load all the pickle files
category = load_pickle(files[0])
subcategory = load_pickle(files[6])
load_pickle('./processed/bins2price.pkl')

[{'(-0.001, 6.0]': [1.0,
   0.8824969025845955,
   0.6065306597126334,
   0.32465246735834974,
   0.1353352832366127,
   0.04393693362340742,
   0.011108996538242306,
   0.002187491118182885,
   0.00033546262790251185,
   4.006529739295107e-05]},
 {'(12.0, 16.0]': [0.32465246735834974,
   0.6065306597126334,
   0.8824969025845955,
   1.0,
   0.8824969025845955,
   0.6065306597126334,
   0.32465246735834974,
   0.1353352832366127,
   0.04393693362340742,
   0.011108996538242306]},
 {'(16.0, 21.0]': [0.1353352832366127,
   0.32465246735834974,
   0.6065306597126334,
   0.8824969025845955,
   1.0,
   0.8824969025845955,
   0.6065306597126334,
   0.32465246735834974,
   0.1353352832366127,
   0.04393693362340742]},
 {'(21.0, 27.0]': [0.04393693362340742,
   0.1353352832366127,
   0.32465246735834974,
   0.6065306597126334,
   0.8824969025845955,
   1.0,
   0.8824969025845955,
   0.6065306597126334,
   0.32465246735834974,
   0.1353352832366127]},
 {'(27.0, 35.0]': [0.011108996538242306,
  

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the pre-trained SentenceTransformer model (do this outside the function if possible)
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

def generate_similarity_matrix(name, categories):
    # Extract category names and encode them in a batch
    category_names = [category['name'] for category in categories]
    category_embeddings = model.encode(category_names)

    # Compute the cosine similarity matrix in a vectorized way
    similarity_matrix = cosine_similarity(category_embeddings)
    
    # Save the similarity matrix to a file
    with open(f'./processed/{name}_similarity_matrix.npy', 'wb') as f:
        np.save(f, similarity_matrix)

    return similarity_matrix

# Generate similarity matrices
category_similarity_matrix = generate_similarity_matrix('category', category)
subcategory_similarity_matrix = generate_similarity_matrix('subcategory', subcategory)
topic_similarity_matrix = generate_similarity_matrix('topic', topic)


In [None]:
# category_similarity_matrix, subcategory_similarity_matrix, topic_similarity_matrix
def map_topic(x, topic_similarity_matrix):
    try:
        return topic_similarity_matrix[int(x)]
    except:
        return np.zeros(topic_similarity_matrix.shape[0]) 
    
# course_info['category_dist'] = course_info['category_id'].apply(lambda x: category_similarity_matrix[x])
# course_info['subcategory_dist'] = course_info['subcategory_id'].apply(lambda x: subcategory_similarity_matrix[x])
# course_info['topic_dist'] = course_info['topic_id'].apply(lambda x: map_topic(x, topic_similarity_matrix))

def category_dist(x):
    array = np.zeros(category_similarity_matrix.shape[0])
    array[x] = 1
    return array
    
def subcategory_dist(x):
    array = np.zeros(subcategory_similarity_matrix.shape[0])
    array[x] = 1
    return array


course_info['category_dist'] = course_info['category_id'].apply(category_dist)
course_info['subcategory_dist'] = course_info['subcategory_id'].apply(subcategory_dist)
course_info['topic_dist'] = course_info['topic_id'].apply(lambda x: map_topic(x, topic_similarity_matrix))

course_info

In [None]:
def concatenate_profile(row):
    return np.concatenate([
        row['category_dist'],
        row['subcategory_dist'],
        row['price_dist'],
        row['num_lectures_dist'],
        row['content_length_min_dist']
    ])

# Apply this function across each row
course_info['profile'] = course_info.apply(concatenate_profile, axis=1)

# Create a minified version of the DataFrame
course_info_minified = course_info[['id', 'title', 'profile']]
course_info_minified

In [None]:
course_info_minified.to_pickle('./processed/course_info_minified.pkl')