In [1]:
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.patches import Polygon

# Download the dataset from https://www.kaggle.com/datasets/hossaingh/udemy-courses
## Unzip and place Course_info.csv in the /data/raw folder

In [5]:
# Load the raw Udemy course table. The path is relative to the notebook's
# working directory, so Course_info.csv has to sit in a `raw/` folder there.
Course_info = pd.read_csv('./raw/Course_info.csv')

def load_pickle(file_name):
    """Read a single pickled object back from disk.

    Parameters
    ----------
    file_name
        Location of the pickle file. It is formatted into a string before
        being opened in binary read mode.

    Returns
    -------
    object
        Whatever ``pickle.load`` reconstructs from the file.

    Notes
    -----
    Unpickling can execute arbitrary code, so only point this at files the
    project wrote itself.
    """
    source_path = format(file_name)
    with open(source_path, mode='rb') as stream:
        payload = pickle.load(stream)
    return payload
    
def save_pickle(file, file_name):
    """Pickle ``file`` to ``file_name`` using the highest available protocol.

    Parameters
    ----------
    file
        The Python object to serialize (despite the name, not a file handle).
    file_name
        Destination path. It is formatted into a string and opened in binary
        write mode, so an existing file is overwritten.

    Returns
    -------
    None
    """
    destination = format(file_name)
    with open(destination, mode='wb') as sink:
        writer = pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL)
        writer.dump(file)

# Bin the dataset on num_lectures, price, content_length_min

In [6]:
# Keep the English-language courses only, then discard the columns that are
# not used further down (`language` itself is constant after the filter).
course_info = (
    Course_info
    .loc[Course_info['language'] == 'English']
    .drop(
        columns=[
            'is_paid',
            'instructor_url',
            'published_time',
            'num_reviews',
            'num_comments',
            'last_update_date',
            'language',
        ]
    )
)

def convert2dist(interval_list, sigma=2):
    """Map every interval to a Gaussian "closeness" vector over all intervals.

    Closeness depends only on the *positions* of two intervals inside
    ``interval_list``::

        closeness(i, j) = exp(-(i - j)**2 / (2 * sigma**2))

    so an interval scores 1.0 against itself and the score decays smoothly
    for intervals further away in the sequence.

    Parameters
    ----------
    interval_list : iterable
        Ordered bins. Each element only needs a meaningful ``str()``; that
        string becomes the key of the returned dictionary.
    sigma : float, default 2
        Spread of the bell curve. Larger values make distant bins look more
        alike. The default reproduces the previously hard-coded value.

    Returns
    -------
    dict
        ``{str(interval): [closeness to bin 0, closeness to bin 1, ...]}``.
        Each value is a plain list of NumPy floats with one entry per
        interval. If two intervals share the same string form, the later one
        overwrites the earlier one.

    Raises
    ------
    ValueError
        If ``sigma`` is not strictly positive.
    """
    if not sigma > 0:
        raise ValueError(f"sigma must be strictly positive, got {sigma!r}")

    # Materialize once so generators and pandas containers can be measured
    # and iterated again.
    intervals = list(interval_list)

    # Signed position offsets between every pair of bins, as an (n, n) grid.
    positions = np.arange(len(intervals))
    offsets = positions[:, None] - positions[None, :]

    # Gaussian-like exponential decay for smoothness.
    closeness = np.exp(-(offsets ** 2) / (2 * sigma ** 2))

    return {
        str(interval): list(row)
        for interval, row in zip(intervals, closeness)
    }

def quantile_binning(column_name, num_bins=10):
    """Replace a numeric column of the global ``course_info`` by bin-closeness vectors.

    The column is cut into equal-frequency bins. Every row then receives the
    closeness vector (see ``convert2dist``) of the bin it falls into, stored
    in a new ``"<column_name>_dist"`` column, and the original column is
    dropped. The bin -> vector table is also pickled to
    ``./processed/bins2<column_name>.pkl`` as a list of single-entry dicts.

    Parameters
    ----------
    column_name : str
        Name of the numeric column of ``course_info`` to bin.
    num_bins : int, default 10
        Number of quantiles requested from ``pd.qcut``. Duplicate bin edges
        are dropped, so fewer bins may actually be produced.

    Returns
    -------
    None
        ``course_info`` is modified in place.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``course_info``. Because the
        source column is dropped at the end, this is what happens when the
        function is run a second time for the same column.
    """
    if column_name not in course_info.columns:
        raise KeyError(
            f"{column_name!r} is not a column of course_info; it is dropped "
            "after binning, so it may already have been processed"
        )

    # Equal-frequency binning (quantile binning).
    binned = pd.qcut(course_info[column_name], q=num_bins, duplicates='drop')

    # Closeness is positional, so the bins are handed over in interval order.
    ordered_bins = binned.unique().sort_values()
    dist = convert2dist(ordered_bins)

    # From here on bins are identified by their string label, which is also
    # the key type used by `dist`.
    bin_labels = binned.astype(str)

    # NOTE(review): sorting happens on the string labels, so the saved list is
    # in lexicographic label order, which can differ from numeric bin order.
    # Kept as-is because readers of the pickle may rely on it -- confirm.
    pickle_array = [
        {label: dist[label]} for label in bin_labels.sort_values().unique()
    ]
    save_pickle(pickle_array, f'./processed/bins2{column_name}.pkl')

    course_info[f"{column_name}_dist"] = bin_labels.map(lambda label: dist[label])

    # The frame is a module-level global that is only read (never rebound)
    # here, so the source column has to be removed in place.
    course_info.drop(column_name, axis=1, inplace=True)
    
# Swap each numeric feature for its bin-closeness vector, ten quantile bins
# apiece: price first, then lecture count, then content length.
for numeric_column in ('price', 'num_lectures', 'content_length_min'):
    quantile_binning(numeric_column, num_bins=10)
course_info

Unnamed: 0,id,title,headline,num_subscribers,avg_rating,category,subcategory,topic,course_url,instructor_name,price_dist,num_lectures_dist,content_length_min_dist
0,4715.0,Online Vegan Vegetarian Cooking School,Learn to cook delicious vegan recipes. Filmed ...,2231.0,3.75,Lifestyle,Food & Beverage,Vegan Cooking,/course/vegan-vegetarian-cooking-school/,Angela Poch,"[0.6065306597126334, 0.8824969025845955, 1.0, ...","[0.002187491118182885, 0.011108996538242306, 0...","[4.006529739295107e-05, 0.00033546262790251185..."
1,1769.0,The Lean Startup Talk at Stanford E-Corner,Debunking Myths of Entrepreneurship A startup ...,26474.0,4.50,Business,Entrepreneurship,Lean Startup,/course/the-lean-startup-debunking-myths-of-en...,Eric Ries,"[1.0, 0.8824969025845955, 0.6065306597126334, ...","[0.8824969025845955, 1.0, 0.8824969025845955, ...","[0.32465246735834974, 0.6065306597126334, 0.88..."
2,5664.0,"How To Become a Vegan, Vegetarian, or Flexitarian",Get the tools you need for a lifestyle change ...,1713.0,4.40,Lifestyle,Other Lifestyle,Vegan Cooking,/course/see-my-personal-motivation-for-becomin...,Angela Poch,"[0.8824969025845955, 1.0, 0.8824969025845955, ...","[0.32465246735834974, 0.6065306597126334, 0.88...","[0.32465246735834974, 0.6065306597126334, 0.88..."
3,7723.0,How to Train a Puppy,Train your puppy the right way with Dr. Ian Du...,4988.0,4.80,Lifestyle,Pet Care & Training,Pet Training,/course/complete-dunbar-collection/,Ian Dunbar,"[0.002187491118182885, 0.011108996538242306, 0...","[0.002187491118182885, 0.011108996538242306, 0...","[4.006529739295107e-05, 0.00033546262790251185..."
4,8157.0,Web Design from the Ground Up,Learn web design online: Everything you need t...,1266.0,4.75,Design,Web Design,Web Design,/course/web-design-from-the-ground-up/,E Learning Lab,"[0.002187491118182885, 0.011108996538242306, 0...","[0.002187491118182885, 0.011108996538242306, 0...","[4.006529739295107e-05, 0.00033546262790251185..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
209729,4913954.0,Let's Speak Urdu - The Urdu Grammar,Urdu - Become fluent in this beautiful South A...,3.0,0.00,Teaching & Academics,Language Learning,Urdu Language,/course/lets-speak-urdu-the-grammar/,Jawaid Hameed,"[0.8824969025845955, 1.0, 0.8824969025845955, ...","[1.0, 0.8824969025845955, 0.6065306597126334, ...","[0.6065306597126334, 0.8824969025845955, 1.0, ..."
209730,4914146.0,CompTIA Linux+ (XKO-004/005 # 2 Practice Exam ...,Practice Latest exam questions with detailed e...,0.0,0.00,IT & Software,IT Certifications,CompTIA Linux+,/course/comptia-linux-xko-004005-2-practice-ex...,Jean-François d'Halluin,"[0.1353352832366127, 0.32465246735834974, 0.60...","[1.0, 0.8824969025845955, 0.6065306597126334, ...","[1.0, 0.8824969025845955, 0.6065306597126334, ..."
209731,4914002.0,CISSP 4 full exams #1 : All CISSP domains - 12...,Practice Latest exam questions with detailed e...,0.0,0.00,IT & Software,IT Certifications,CISSP - Certified Information Systems Security...,/course/cissp-4-full-exams-1-all-cissp-domains...,Jean-François d'Halluin,"[0.1353352832366127, 0.32465246735834974, 0.60...","[1.0, 0.8824969025845955, 0.6065306597126334, ...","[1.0, 0.8824969025845955, 0.6065306597126334, ..."
209732,4913934.0,JD Edwards EnterpriseOne Fixed Assets Accounti...,Full-length course (Part-3 of 3) on JD Edwards...,0.0,0.00,Finance & Accounting,Other Finance & Accounting,Financial Accounting,/course/jde-fixed-assets-accounting-part-3/,Niranjan Bhatia,"[0.011108996538242306, 0.04393693362340742, 0....","[0.8824969025845955, 1.0, 0.8824969025845955, ...","[0.32465246735834974, 0.6065306597126334, 0.88..."


# Convert to Numerical Data

In [15]:
def process_and_save_category_data(df, column_name, pickle_file):
    """Give every value of a categorical column an integer id, most frequent first.

    Values are ranked by how many rows carry them (descending); the rank is
    the id, so the most common value gets 0. The counts themselves are not
    kept. Values that ``groupby`` leaves out (missing values, under its
    defaults) receive no id.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column.
    column_name : str
        Categorical column to encode.
    pickle_file : str
        File name, relative to ``./processed/``, for the id -> name records.

    Returns
    -------
    dict
        ``{value: id}`` for every grouped value of the column.

    Side effects
    ------------
    Writes ``[{'id': id, 'name': value}, ...]`` (in id order) to
    ``./processed/<pickle_file>`` with the highest pickle protocol.
    """
    frequencies = df.groupby([column_name]).size().sort_values(ascending=False)

    # Position in the frequency-sorted listing becomes the id.
    ranked_values = list(frequencies.to_dict())
    data_dict = dict(zip(ranked_values, range(len(ranked_values))))

    # Reverse listing (id -> name) for consumers that only know the id.
    id2name = []
    for value, assigned_id in data_dict.items():
        id2name.append({'id': assigned_id, 'name': value})

    target = f'./processed/{pickle_file}'
    with open(target, 'wb') as sink:
        pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL).dump(id2name)

    return data_dict


# Build a value -> id lookup for every categorical column. Each call also
# pickles the matching id -> name records under ./processed/.
category = process_and_save_category_data(course_info, 'category', 'category.pkl')
subcategory = process_and_save_category_data(course_info, 'subcategory', 'subcategory.pkl')
topic = process_and_save_category_data(course_info, 'topic', 'topic.pkl')
instructor = process_and_save_category_data(course_info, 'instructor_name', 'instructor.pkl')

# Attach the ids to the frame as `<column>_id`; values without an entry in
# the lookup come out as NaN.
for source_column, id_lookup in (
    ('category', category),
    ('subcategory', subcategory),
    ('topic', topic),
    ('instructor_name', instructor),
):
    course_info[f'{source_column}_id'] = course_info[source_column].map(id_lookup)

course_info

[{'id': 0, 'name': 'IT & Software'}, {'id': 1, 'name': 'Development'}, {'id': 2, 'name': 'Teaching & Academics'}, {'id': 3, 'name': 'Business'}, {'id': 4, 'name': 'Personal Development'}, {'id': 5, 'name': 'Design'}, {'id': 6, 'name': 'Health & Fitness'}, {'id': 7, 'name': 'Lifestyle'}, {'id': 8, 'name': 'Finance & Accounting'}, {'id': 9, 'name': 'Marketing'}, {'id': 10, 'name': 'Music'}, {'id': 11, 'name': 'Office Productivity'}, {'id': 12, 'name': 'Photography & Video'}]
[{'id': 0, 'name': 'IT Certifications'}, {'id': 1, 'name': 'Other IT & Software'}, {'id': 2, 'name': 'Web Development'}, {'id': 3, 'name': 'Language Learning'}, {'id': 4, 'name': 'Programming Languages'}, {'id': 5, 'name': 'Personal Transformation'}, {'id': 6, 'name': 'Network & Security'}, {'id': 7, 'name': 'Entrepreneurship'}, {'id': 8, 'name': 'Engineering'}, {'id': 9, 'name': 'Arts & Crafts'}, {'id': 10, 'name': 'Investing & Trading'}, {'id': 11, 'name': 'Microsoft'}, {'id': 12, 'name': 'Career Development'}, {'i

Unnamed: 0,id,title,headline,num_subscribers,avg_rating,category,subcategory,topic,course_url,instructor_name,price_dist,num_lectures_dist,content_length_min_dist,category_id,subcategory_id,topic_id,instructor_name_id
0,4715.0,Online Vegan Vegetarian Cooking School,Learn to cook delicious vegan recipes. Filmed ...,2231.0,3.75,Lifestyle,Food & Beverage,Vegan Cooking,/course/vegan-vegetarian-cooking-school/,Angela Poch,"[0.6065306597126334, 0.8824969025845955, 1.0, ...","[0.002187491118182885, 0.011108996538242306, 0...","[4.006529739295107e-05, 0.00033546262790251185...",7,42,426.0,2328.0
1,1769.0,The Lean Startup Talk at Stanford E-Corner,Debunking Myths of Entrepreneurship A startup ...,26474.0,4.50,Business,Entrepreneurship,Lean Startup,/course/the-lean-startup-debunking-myths-of-en...,Eric Ries,"[1.0, 0.8824969025845955, 0.6065306597126334, ...","[0.8824969025845955, 1.0, 0.8824969025845955, ...","[0.32465246735834974, 0.6065306597126334, 0.88...",3,7,1522.0,16406.0
2,5664.0,"How To Become a Vegan, Vegetarian, or Flexitarian",Get the tools you need for a lifestyle change ...,1713.0,4.40,Lifestyle,Other Lifestyle,Vegan Cooking,/course/see-my-personal-motivation-for-becomin...,Angela Poch,"[0.8824969025845955, 1.0, 0.8824969025845955, ...","[0.32465246735834974, 0.6065306597126334, 0.88...","[0.32465246735834974, 0.6065306597126334, 0.88...",7,71,426.0,2328.0
3,7723.0,How to Train a Puppy,Train your puppy the right way with Dr. Ian Du...,4988.0,4.80,Lifestyle,Pet Care & Training,Pet Training,/course/complete-dunbar-collection/,Ian Dunbar,"[0.002187491118182885, 0.011108996538242306, 0...","[0.002187491118182885, 0.011108996538242306, 0...","[4.006529739295107e-05, 0.00033546262790251185...",7,108,2321.0,1353.0
4,8157.0,Web Design from the Ground Up,Learn web design online: Everything you need t...,1266.0,4.75,Design,Web Design,Web Design,/course/web-design-from-the-ground-up/,E Learning Lab,"[0.002187491118182885, 0.011108996538242306, 0...","[0.002187491118182885, 0.011108996538242306, 0...","[4.006529739295107e-05, 0.00033546262790251185...",5,59,104.0,4369.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209729,4913954.0,Let's Speak Urdu - The Urdu Grammar,Urdu - Become fluent in this beautiful South A...,3.0,0.00,Teaching & Academics,Language Learning,Urdu Language,/course/lets-speak-urdu-the-grammar/,Jawaid Hameed,"[0.8824969025845955, 1.0, 0.8824969025845955, ...","[1.0, 0.8824969025845955, 0.6065306597126334, ...","[0.6065306597126334, 0.8824969025845955, 1.0, ...",2,3,1768.0,7316.0
209730,4914146.0,CompTIA Linux+ (XKO-004/005 # 2 Practice Exam ...,Practice Latest exam questions with detailed e...,0.0,0.00,IT & Software,IT Certifications,CompTIA Linux+,/course/comptia-linux-xko-004005-2-practice-ex...,Jean-François d'Halluin,"[0.1353352832366127, 0.32465246735834974, 0.60...","[1.0, 0.8824969025845955, 0.6065306597126334, ...","[1.0, 0.8824969025845955, 0.6065306597126334, ...",0,0,857.0,761.0
209731,4914002.0,CISSP 4 full exams #1 : All CISSP domains - 12...,Practice Latest exam questions with detailed e...,0.0,0.00,IT & Software,IT Certifications,CISSP - Certified Information Systems Security...,/course/cissp-4-full-exams-1-all-cissp-domains...,Jean-François d'Halluin,"[0.1353352832366127, 0.32465246735834974, 0.60...","[1.0, 0.8824969025845955, 0.6065306597126334, ...","[1.0, 0.8824969025845955, 0.6065306597126334, ...",0,0,233.0,761.0
209732,4913934.0,JD Edwards EnterpriseOne Fixed Assets Accounti...,Full-length course (Part-3 of 3) on JD Edwards...,0.0,0.00,Finance & Accounting,Other Finance & Accounting,Financial Accounting,/course/jde-fixed-assets-accounting-part-3/,Niranjan Bhatia,"[0.011108996538242306, 0.04393693362340742, 0....","[0.8824969025845955, 1.0, 0.8824969025845955, ...","[0.32465246735834974, 0.6065306597126334, 0.88...",8,101,203.0,957.0


{'IT & Software': 0,
 'Development': 1,
 'Teaching & Academics': 2,
 'Business': 3,
 'Personal Development': 4,
 'Design': 5,
 'Health & Fitness': 6,
 'Lifestyle': 7,
 'Finance & Accounting': 8,
 'Marketing': 9,
 'Music': 10,
 'Office Productivity': 11,
 'Photography & Video': 12}

In [16]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the pre-trained SentenceTransformer model once, at module level.
# generate_similarity_matrix() reads `model` as a global, so the weights are
# not reloaded on every call.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

def generate_similarity_matrix(name, categories):
    """Embed category names and persist their pairwise cosine similarities.

    Parameters
    ----------
    name : str
        Prefix of the output file ``./processed/<name>_similarity_matrix.npy``.
    categories : iterable of dict
        Records that each carry a ``'name'`` key; the names are embedded in
        the order given.

    Returns
    -------
    The matrix returned by ``cosine_similarity`` for the embeddings; row and
    column ``i`` belong to the i-th record of ``categories``.

    Side effects
    ------------
    Saves the matrix with ``np.save`` to the path described above.
    """
    labels = []
    for record in categories:
        labels.append(record['name'])

    # One batched call to the module-level sentence-transformer `model`.
    vectors = model.encode(labels)

    # All-pairs cosine similarity, computed in one vectorized call.
    similarity_matrix = cosine_similarity(vectors)

    with open(f'./processed/{name}_similarity_matrix.npy', 'wb') as sink:
        np.save(sink, similarity_matrix)

    return similarity_matrix

# Reload the id -> name record lists that process_and_save_category_data()
# pickled under ./processed/. instructor.pkl is not reloaded: no similarity
# matrix is built for instructors here.
is2category = load_pickle('./processed/category.pkl')
is2subcategory = load_pickle('./processed/subcategory.pkl')
is2topic = load_pickle('./processed/topic.pkl')

# Generate similarity matrices
# (each call also saves its matrix to ./processed/<name>_similarity_matrix.npy)
category_similarity_matrix = generate_similarity_matrix('category', is2category)
subcategory_similarity_matrix = generate_similarity_matrix('subcategory', is2subcategory)
topic_similarity_matrix = generate_similarity_matrix('topic', is2topic)


In [17]:
# category_similarity_matrix, subcategory_similarity_matrix, topic_similarity_matrix
def map_topic(x, topic_similarity_matrix):
    """Return the similarity row for topic id ``x``, or zeros if ``x`` is unusable.

    Parameters
    ----------
    x
        Topic id. It is converted with ``int(x)``, so float ids such as
        ``426.0`` are accepted.
    topic_similarity_matrix : numpy.ndarray
        Matrix whose ``int(x)``-th row is returned.

    Returns
    -------
    numpy.ndarray
        ``topic_similarity_matrix[int(x)]`` when the id can be converted and
        indexes an existing row; otherwise a zero vector with
        ``topic_similarity_matrix.shape[0]`` entries.
    """
    try:
        return topic_similarity_matrix[int(x)]
    except (TypeError, ValueError, OverflowError, IndexError):
        # Only "this id has no row" situations fall back to zeros:
        #   TypeError     - None or another non-numeric object
        #   ValueError    - NaN (missing topic) or a non-numeric string
        #   OverflowError - +/- infinity
        #   IndexError    - id outside the matrix
        # Anything else (including KeyboardInterrupt) is allowed to propagate
        # instead of being silently turned into a zero vector.
        return np.zeros(topic_similarity_matrix.shape[0])
    
# Disabled alternative, kept for reference: use the row of each similarity matrix
# for category and subcategory instead of the one-hot vectors defined below.
# course_info['category_dist'] = course_info['category_id'].apply(lambda x: category_similarity_matrix[x])
# course_info['subcategory_dist'] = course_info['subcategory_id'].apply(lambda x: subcategory_similarity_matrix[x])
# course_info['topic_dist'] = course_info['topic_id'].apply(lambda x: map_topic(x, topic_similarity_matrix))

def _one_hot(index, size):
    """Return a float vector of length ``size`` that is 1 at ``index`` and 0 elsewhere."""
    vector = np.zeros(size)
    vector[index] = 1
    return vector


def category_dist(x):
    """One-hot encode category id ``x``.

    The vector has one slot per row of the module-level
    ``category_similarity_matrix``; only the matrix's size is used, not its
    values.

    Parameters
    ----------
    x : int
        Category id, used directly as the index of the hot slot.

    Returns
    -------
    numpy.ndarray
        Float vector with a single 1.
    """
    return _one_hot(x, category_similarity_matrix.shape[0])


def subcategory_dist(x):
    """One-hot encode subcategory id ``x``.

    The vector has one slot per row of the module-level
    ``subcategory_similarity_matrix``; only the matrix's size is used, not
    its values.

    Parameters
    ----------
    x : int
        Subcategory id, used directly as the index of the hot slot.

    Returns
    -------
    numpy.ndarray
        Float vector with a single 1.
    """
    return _one_hot(x, subcategory_similarity_matrix.shape[0])


# Category and subcategory ids become one-hot vectors. The topic id becomes
# its row of the topic similarity matrix, with map_topic() substituting a
# zero vector when the id cannot be used as a row index.
course_info['category_dist'] = course_info['category_id'].apply(category_dist)
course_info['subcategory_dist'] = course_info['subcategory_id'].apply(subcategory_dist)
course_info['topic_dist'] = course_info['topic_id'].apply(lambda x: map_topic(x, topic_similarity_matrix))

course_info

Unnamed: 0,id,title,headline,num_subscribers,avg_rating,category,subcategory,topic,course_url,instructor_name,price_dist,num_lectures_dist,content_length_min_dist,category_id,subcategory_id,topic_id,instructor_name_id,category_dist,subcategory_dist,topic_dist
0,4715.0,Online Vegan Vegetarian Cooking School,Learn to cook delicious vegan recipes. Filmed ...,2231.0,3.75,Lifestyle,Food & Beverage,Vegan Cooking,/course/vegan-vegetarian-cooking-school/,Angela Poch,"[0.6065306597126334, 0.8824969025845955, 1.0, ...","[0.002187491118182885, 0.011108996538242306, 0...","[4.006529739295107e-05, 0.00033546262790251185...",7,42,426.0,2328.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.025601188, -0.03020808, 0.007689017, 0.0639..."
1,1769.0,The Lean Startup Talk at Stanford E-Corner,Debunking Myths of Entrepreneurship A startup ...,26474.0,4.50,Business,Entrepreneurship,Lean Startup,/course/the-lean-startup-debunking-myths-of-en...,Eric Ries,"[1.0, 0.8824969025845955, 0.6065306597126334, ...","[0.8824969025845955, 1.0, 0.8824969025845955, ...","[0.32465246735834974, 0.6065306597126334, 0.88...",3,7,1522.0,16406.0,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.07359637, 0.14380836, 0.11632879, 0.1602298..."
2,5664.0,"How To Become a Vegan, Vegetarian, or Flexitarian",Get the tools you need for a lifestyle change ...,1713.0,4.40,Lifestyle,Other Lifestyle,Vegan Cooking,/course/see-my-personal-motivation-for-becomin...,Angela Poch,"[0.8824969025845955, 1.0, 0.8824969025845955, ...","[0.32465246735834974, 0.6065306597126334, 0.88...","[0.32465246735834974, 0.6065306597126334, 0.88...",7,71,426.0,2328.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.025601188, -0.03020808, 0.007689017, 0.0639..."
3,7723.0,How to Train a Puppy,Train your puppy the right way with Dr. Ian Du...,4988.0,4.80,Lifestyle,Pet Care & Training,Pet Training,/course/complete-dunbar-collection/,Ian Dunbar,"[0.002187491118182885, 0.011108996538242306, 0...","[0.002187491118182885, 0.011108996538242306, 0...","[4.006529739295107e-05, 0.00033546262790251185...",7,108,2321.0,1353.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.27874702, 0.21694547, 0.048422933, 0.113733..."
4,8157.0,Web Design from the Ground Up,Learn web design online: Everything you need t...,1266.0,4.75,Design,Web Design,Web Design,/course/web-design-from-the-ground-up/,E Learning Lab,"[0.002187491118182885, 0.011108996538242306, 0...","[0.002187491118182885, 0.011108996538242306, 0...","[4.006529739295107e-05, 0.00033546262790251185...",5,59,104.0,4369.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.33328012, 0.22095649, 0.25241905, 0.5220012..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209729,4913954.0,Let's Speak Urdu - The Urdu Grammar,Urdu - Become fluent in this beautiful South A...,3.0,0.00,Teaching & Academics,Language Learning,Urdu Language,/course/lets-speak-urdu-the-grammar/,Jawaid Hameed,"[0.8824969025845955, 1.0, 0.8824969025845955, ...","[1.0, 0.8824969025845955, 0.6065306597126334, ...","[0.6065306597126334, 0.8824969025845955, 1.0, ...",2,3,1768.0,7316.0,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.3135087, 0.0916836, 0.14131688, 0.20290011,..."
209730,4914146.0,CompTIA Linux+ (XKO-004/005 # 2 Practice Exam ...,Practice Latest exam questions with detailed e...,0.0,0.00,IT & Software,IT Certifications,CompTIA Linux+,/course/comptia-linux-xko-004005-2-practice-ex...,Jean-François d'Halluin,"[0.1353352832366127, 0.32465246735834974, 0.60...","[1.0, 0.8824969025845955, 0.6065306597126334, ...","[1.0, 0.8824969025845955, 0.6065306597126334, ...",0,0,857.0,761.0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.095693454, 0.20492217, 0.04716213, 0.197652..."
209731,4914002.0,CISSP 4 full exams #1 : All CISSP domains - 12...,Practice Latest exam questions with detailed e...,0.0,0.00,IT & Software,IT Certifications,CISSP - Certified Information Systems Security...,/course/cissp-4-full-exams-1-all-cissp-domains...,Jean-François d'Halluin,"[0.1353352832366127, 0.32465246735834974, 0.60...","[1.0, 0.8824969025845955, 0.6065306597126334, ...","[1.0, 0.8824969025845955, 0.6065306597126334, ...",0,0,233.0,761.0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.18413027, 0.51831377, 0.16663486, 0.1225767..."
209732,4913934.0,JD Edwards EnterpriseOne Fixed Assets Accounti...,Full-length course (Part-3 of 3) on JD Edwards...,0.0,0.00,Finance & Accounting,Other Finance & Accounting,Financial Accounting,/course/jde-fixed-assets-accounting-part-3/,Niranjan Bhatia,"[0.011108996538242306, 0.04393693362340742, 0....","[0.8824969025845955, 1.0, 0.8824969025845955, ...","[0.32465246735834974, 0.6065306597126334, 0.88...",8,101,203.0,957.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.20753694, 0.19434406, 0.42266774, 0.1982400..."


In [18]:
def concatenate_profile(row):
    """Flatten one course's feature vectors into a single profile array.

    The pieces are joined in this fixed order: ``category_dist``,
    ``subcategory_dist``, ``price_dist``, ``num_lectures_dist``,
    ``content_length_min_dist``. ``topic_dist`` is not part of the profile.

    Parameters
    ----------
    row
        Mapping-like object (for example a DataFrame row) that yields a 1-D
        sequence for each of the five keys above.

    Returns
    -------
    numpy.ndarray
        The one-dimensional concatenation of the five vectors.
    """
    feature_columns = (
        'category_dist',
        'subcategory_dist',
        'price_dist',
        'num_lectures_dist',
        'content_length_min_dist',
    )
    pieces = [row[column] for column in feature_columns]
    return np.concatenate(pieces)

# Apply this function across each row (axis=1 passes one row at a time), storing
# the flattened feature vector of every course in a new `profile` column.
course_info['profile'] = course_info.apply(concatenate_profile, axis=1)

# Create a minified version of the DataFrame: only the course id, its title
# and the profile vector are kept.
course_info_minified = course_info[['id', 'title', 'profile']]
course_info_minified

Unnamed: 0,id,title,profile
0,4715.0,Online Vegan Vegetarian Cooking School,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
1,1769.0,The Lean Startup Talk at Stanford E-Corner,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,5664.0,"How To Become a Vegan, Vegetarian, or Flexitarian","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
3,7723.0,How to Train a Puppy,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
4,8157.0,Web Design from the Ground Up,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
...,...,...,...
209729,4913954.0,Let's Speak Urdu - The Urdu Grammar,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
209730,4914146.0,CompTIA Linux+ (XKO-004/005 # 2 Practice Exam ...,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
209731,4914002.0,CISSP 4 full exams #1 : All CISSP domains - 12...,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
209732,4913934.0,JD Edwards EnterpriseOne Fixed Assets Accounti...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."


In [19]:
# Persist the minified frame (id, title, profile) next to the other artifacts
# in ./processed/.
course_info_minified.to_pickle('./processed/course_info_minified.pkl')