In [None]:
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import pandas as pd
import numpy as np
import pickle

# Load the raw course catalogue from CSV; the `course_info` frame used by the
# later cells is derived from this `Course_info` frame.
Course_info = pd.read_csv('./raw/Course_info.csv')

In [None]:
def load_pickle(file_name):
    """Return the object stored in the pickle file at ``file_name``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    you trust (e.g. ones written by ``save_pickle`` in this notebook).
    """
    source_path = str(file_name)
    with open(source_path, 'rb') as pickle_stream:
        payload = pickle.load(pickle_stream)
    return payload
    
def save_pickle(file, file_name):
    """Pickle the object ``file`` to the path ``file_name``.

    The target is opened in binary write mode (an existing file is replaced)
    and the object is serialised with the highest available pickle protocol.
    """
    target_path = str(file_name)
    with open(target_path, 'wb') as pickle_stream:
        pickle.dump(file, pickle_stream, protocol=pickle.HIGHEST_PROTOCOL)

# Load the previously saved category lookup and display it.
# NOTE(review): './processed/category.pkl' is also (re)written further down by
# process_and_save_category_data(..., 'category.pkl'), so on a fresh run this
# load needs the file to exist already — TODO confirm the intended run order.
category_dict = load_pickle('./processed/category.pkl')
category_dict

In [None]:
# Keep only the English-language courses, then drop the columns listed below
# (including `language`, which is constant after the filter).
dropped_columns = [
    'is_paid',
    'instructor_url',
    'published_time',
    'num_reviews',
    'num_comments',
    'last_update_date',
    'language',
]
is_english = Course_info['language'] == 'English'
course_info = Course_info[is_english].drop(dropped_columns, axis=1)

def convert2dist(interval_list, sigma=2):
    """Map each interval to a vector of closeness scores against all intervals.

    Closeness is based only on the *positions* of the intervals in
    ``interval_list`` (not on their numeric bounds): for positions ``a`` and
    ``b`` it is ``exp(-(a - b)**2 / (2 * sigma**2))``, a Gaussian-shaped decay.
    Each vector is then min-max scaled so that the interval itself scores 1
    and the farthest interval scores 0.

    Args:
        interval_list: ordered iterable of intervals (any objects); ``str()``
            of each element is used as the dictionary key.
        sigma: spread of the bell curve, must be > 0. Larger values make
            neighbouring intervals look more alike. Defaults to 2.

    Returns:
        dict mapping ``str(interval)`` to a 1-D float array with one entry per
        interval. An empty input gives an empty dict.

    Raises:
        ValueError: if ``sigma`` is not strictly positive.
    """
    if sigma <= 0:
        raise ValueError("sigma must be strictly positive")

    labels = [str(interval) for interval in interval_list]
    positions = np.arange(len(labels))

    closeness_dict = {}
    for position, label in zip(positions, labels):
        # Gaussian-like decay of the positional distance to every interval.
        closeness = np.exp(-((positions - position) ** 2) / (2 * sigma ** 2))
        lowest = closeness.min()
        spread = closeness.max() - lowest
        if spread > 0:
            closeness = (closeness - lowest) / spread
        else:
            # Only one interval: min == max, so min-max scaling would be 0/0
            # (NaN). The interval is maximally close to itself, so use 1.
            closeness = np.ones_like(closeness)
        closeness_dict[label] = closeness
    return closeness_dict

def quantile_binning(column_name, num_bins=10):
    """Quantile-bin a column of ``course_info`` and attach closeness vectors.

    The column is cut into (up to) ``num_bins`` equal-frequency bins with
    ``pd.qcut``; duplicate bin edges are dropped, so fewer bins may result.
    Every row then receives the closeness vector of its bin (see
    ``convert2dist``) in a new ``<column_name>_dist`` column. The original
    column is left in place.

    Side effects:
        * adds/overwrites ``course_info['<column_name>_dist']``;
        * pickles a list of ``{bin_label: closeness_vector}`` dicts to
          ``./processed/bins2<column_name>.pkl``.

    Args:
        column_name: name of the numeric column in ``course_info`` to bin.
        num_bins: requested number of quantile bins.
    """
    # Equal-frequency binning (quantile binning).
    bins_series = pd.qcut(course_info[column_name], q=num_bins, duplicates='drop')
    unique_bins_sorted = bins_series.unique().sort_values()
    dist = convert2dist(unique_bins_sorted)

    # Persist the bins in their actual bin order. Sorting the string labels
    # instead would be lexicographic ("(109.99, ...]" before "(19.99, ...]"),
    # so list position would not match the index inside each closeness vector.
    pickle_array = [
        {str(interval): dist[str(interval)]} for interval in unique_bins_sorted
    ]
    save_pickle(pickle_array, f'./processed/bins2{column_name}.pkl')

    # `dist` is keyed by the string form of each bin, so look rows up by label.
    bin_labels = bins_series.astype(str)
    course_info[f"{column_name}_dist"] = bin_labels.map(lambda label: dist[label])
    
# Bin each numeric feature into deciles and attach its `<column>_dist` vectors.
for binned_column in ('price', 'num_lectures', 'content_length_min'):
    quantile_binning(binned_column, num_bins=10)
course_info

# Convert to Numerical Data

In [None]:
def process_and_save_category_data(df, column_name, pickle_file):
    """Assign integer ids to the values of ``column_name`` and pickle the lookup.

    Values are ranked by the number of rows they appear in, most frequent
    first, so the most common value gets id 0. The counts themselves are only
    used for ranking and are not kept.

    Args:
        df: DataFrame containing ``column_name``.
        column_name: column whose distinct values receive ids.
        pickle_file: file name (inside ``./processed/``) for the pickled
            list of ``{'id': ..., 'name': ...}`` records.

    Returns:
        dict mapping each value of the column to its integer id.
    """
    frequency = df.groupby([column_name]).size().sort_values(ascending=False)

    # Rank position in the frequency ordering becomes the id.
    data_dict = {}
    for position, value in enumerate(frequency.to_dict()):
        data_dict[value] = position

    # Same mapping as a list of records, which is the form written to disk.
    id2name = []
    for value, position in data_dict.items():
        id2name.append({'id': position, 'name': value})

    target_path = f'./processed/{pickle_file}'
    with open(target_path, 'wb') as handle:
        pickle.dump(id2name, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return data_dict


# Build a value -> id lookup for each categorical column; each call also
# pickles the lookup (as id/name records) under ./processed/.
category, subcategory, topic, instructor = (
    process_and_save_category_data(course_info, source_column, pickle_name)
    for source_column, pickle_name in (
        ('category', 'category.pkl'),
        ('subcategory', 'subcategory.pkl'),
        ('topic', 'topic.pkl'),
        ('instructor_name', 'instructor.pkl'),
    )
)

# Add the integer id columns next to the original text columns.
for source_column, id_column, lookup in (
    ('category', 'category_id', category),
    ('subcategory', 'subcategory_id', subcategory),
    ('topic', 'topic_id', topic),
    ('instructor_name', 'instructor_name_id', instructor),
):
    course_info[id_column] = course_info[source_column].map(lookup)

course_info

In [None]:
# Pickle paths under ./processed/; the loads below pick entries by position,
# so the order of this list matters.
files = [
    './processed/category.pkl',
    './processed/content_length_min.pkl',
    './processed/instructor.pkl',
    './processed/num_lecture.pkl',
    './processed/num_lectures.pkl',
    './processed/price.pkl',
    './processed/subcategory.pkl',
    './processed/topic.pkl',
]
# Helper that reads one pickle file from disk.
# NOTE(review): this repeats the `load_pickle` defined in an earlier cell;
# the later definition replaces the earlier one when the cells run in order.
def load_pickle(file_name):
    """Return the object stored in the pickle file at ``file_name``.

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    """
    source_path = str(file_name)
    with open(source_path, 'rb') as pickle_stream:
        payload = pickle.load(pickle_stream)
    return payload

# Reload the category, subcategory and topic pickles (entries 0, 6 and 7 of
# `files`). This rebinds `category`, `subcategory` and `topic`: an earlier
# cell used these names for value -> id dicts, whereas the pickles written
# there hold lists of {'id': ..., 'name': ...} records.
category, subcategory, topic = (
    load_pickle(files[position]) for position in (0, 6, 7)
)

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the pre-trained SentenceTransformer model once at module level;
# generate_similarity_matrix below reads this `model` global, so the model is
# not re-created on every call.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

def generate_similarity_matrix(name, categories):
    """Build, save and return a min-max scaled name-similarity matrix.

    The ``'name'`` of every entry in ``categories`` is embedded in one batch
    with the module-level ``model``; pairwise cosine similarities are then
    rescaled to [0, 1] using the global minimum and maximum of the matrix.
    Row/column ``i`` corresponds to ``categories[i]``.

    Args:
        name: label used in the output file name
            ``./processed/<name>_similarity_matrix.npy``.
        categories: sequence of dicts that each have a ``'name'`` key.

    Returns:
        The scaled square similarity matrix (also written to disk with
        ``np.save``).
    """
    # Extract the names and encode them in a single batch.
    category_names = [entry['name'] for entry in categories]
    category_embeddings = model.encode(category_names)

    # Pairwise cosine similarity between all embeddings.
    similarity_matrix = cosine_similarity(category_embeddings)

    lowest = similarity_matrix.min()
    spread = similarity_matrix.max() - lowest
    if spread > 0:
        similarity_matrix = (similarity_matrix - lowest) / spread
    else:
        # All similarities are equal (e.g. a single entry): min-max scaling
        # would divide by zero and yield NaN, so treat everything as fully
        # similar instead.
        similarity_matrix = np.ones_like(similarity_matrix)

    # Save the similarity matrix to a file.
    with open(f'./processed/{name}_similarity_matrix.npy', 'wb') as f:
        np.save(f, similarity_matrix)

    return similarity_matrix

# Generate similarity matrices
category_similarity_matrix = generate_similarity_matrix('category', category)
subcategory_similarity_matrix = generate_similarity_matrix('subcategory', subcategory)
topic_similarity_matrix = generate_similarity_matrix('topic', topic)


In [None]:
# category_similarity_matrix, subcategory_similarity_matrix, topic_similarity_matrix
def map_topic(x, topic_similarity_matrix):
    """Return the similarity-matrix row for topic id ``x``, or zeros if invalid.

    Args:
        x: topic id; anything ``int()`` accepts. Missing ids (NaN / None) and
            ids beyond the matrix are treated as "no topic".
        topic_similarity_matrix: array whose row ``i`` holds the similarities
            of topic ``i`` to every topic.

    Returns:
        ``topic_similarity_matrix[int(x)]`` when the lookup succeeds, otherwise
        an all-zero vector with the same length as a matrix row.
    """
    try:
        return topic_similarity_matrix[int(x)]
    except (ValueError, TypeError, IndexError, OverflowError):
        # ValueError: NaN, TypeError: None, IndexError: id out of range,
        # OverflowError: +/-inf. Anything else (e.g. KeyboardInterrupt) is
        # deliberately allowed to propagate instead of being swallowed.
        return np.zeros(topic_similarity_matrix.shape[-1])
    
# course_info['category_dist'] = course_info['category_id'].apply(lambda x: category_similarity_matrix[x])
# course_info['subcategory_dist'] = course_info['subcategory_id'].apply(lambda x: subcategory_similarity_matrix[x])
# course_info['topic_dist'] = course_info['topic_id'].apply(lambda x: map_topic(x, topic_similarity_matrix))

def category_dist(x):
    """Return a one-hot vector for category id ``x``.

    The vector has one slot per row of the module-level
    ``category_similarity_matrix``; slot ``x`` is 1 and every other slot is 0.
    """
    slot_count = category_similarity_matrix.shape[0]
    one_hot = np.zeros(slot_count)
    one_hot[x] = 1
    return one_hot
    
def subcategory_dist(x):
    """Return a one-hot vector for subcategory id ``x``.

    The vector has one slot per row of the module-level
    ``subcategory_similarity_matrix``; slot ``x`` is 1 and every other slot
    is 0.
    """
    slot_count = subcategory_similarity_matrix.shape[0]
    one_hot = np.zeros(slot_count)
    one_hot[x] = 1
    return one_hot


# Category and subcategory ids become one-hot vectors; topic ids become the
# matching row of the topic similarity matrix (zeros when the id is unusable).
course_info['category_dist'] = course_info['category_id'].apply(category_dist)
course_info['subcategory_dist'] = course_info['subcategory_id'].apply(subcategory_dist)
course_info['topic_dist'] = course_info['topic_id'].apply(
    map_topic, args=(topic_similarity_matrix,)
)

course_info

In [None]:
def concatenate_profile(row):
    """Join a course's per-feature vectors into one flat profile array.

    The pieces are concatenated in a fixed order: category, subcategory,
    price, number of lectures, content length. ``topic_dist`` is not part of
    the profile.

    Args:
        row: mapping (e.g. a DataFrame row) holding the ``*_dist`` arrays.

    Returns:
        1-D numpy array with all the vectors laid end to end.
    """
    profile_fields = (
        'category_dist',
        'subcategory_dist',
        'price_dist',
        'num_lectures_dist',
        'content_length_min_dist',
    )
    return np.concatenate([row[field] for field in profile_fields])

# Build one flat profile vector per course by applying the helper row-wise.
course_info['profile'] = course_info.apply(concatenate_profile, axis=1)

# Slimmed-down frame: identifying/display columns plus the profile vector.
minified_columns = [
    'id',
    'title',
    'topic',
    'avg_rating',
    'course_url',
    'num_lectures',
    'price',
    'profile',
]
course_info_minified = course_info[minified_columns]
course_info_minified

In [None]:
# Persist the slimmed-down course table (including the `profile` vectors) as a
# pickle under ./processed/.
course_info_minified.to_pickle('./processed/course_info_minified_v2.pkl')