Load embeddings and build recommendation function
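Here we load the course catalog DataFrame and its precomputed embeddings, then define the function `recommend_for_keywords`. The function computes the cosine similarity between the embedding of a user query and every course embedding (optionally restricted by platform and level) and returns the top-K most similar courses.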

In [10]:
import os
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Locations of the cleaned course catalog (CSV) and its precomputed embeddings (.npy)
process_dir = "../data/processed"
model_dir = "../outputs/models"
catalog_path = os.path.join(process_dir, "courses_combined_cleaned.csv")
emb_path = os.path.join(model_dir, "course_embeddings.npy")

df = pd.read_csv(catalog_path)
embeddings = np.load(emb_path)  # expected shape (N, D): one row per catalog row

# Later cells pair row i of `embeddings` with row i of `df`, so fail fast here
# if the two files are out of sync instead of silently recommending the wrong courses.
if embeddings.ndim != 2 or embeddings.shape[0] != len(df):
    raise ValueError(
        f"Embeddings shape {embeddings.shape} does not match the catalog: "
        f"expected a 2-D array with {len(df)} rows."
    )

print("Loaded catalog with shape:", df.shape, "and embeddings shape:", embeddings.shape)


Loaded catalog with shape: (1343, 9) and embeddings shape: (1343, 384)


In [3]:
# Load the same embedding model for user queries
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

# Normalize every course embedding to unit length once, up front, so that the
# cosine similarity against a unit-length query reduces to a plain dot product.
# NOTE: despite its name, `emb_norms` holds the *normalized embeddings*, not the norms.
row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
# An all-zero embedding row would otherwise divide by zero and turn into NaN;
# dividing by 1 instead keeps it a zero vector (similarity 0 to any query).
row_norms[row_norms == 0] = 1.0
emb_norms = embeddings / row_norms

In [6]:
def recommend_for_keywords(keywords, top_k=10, filter_platform=None, filter_level=None):
    """
    Recommend the top_k courses most similar to a free-text query.

    Relies on the notebook-level objects `df` (course catalog), `emb_norms`
    (row-normalized course embeddings, row i belonging to row i of `df`) and
    `model` (the sentence-embedding model used to encode the query).

    Parameters
    ----------
    keywords : str
        User query or combined keywords (e.g. "machine learning basics").
        Non-string values are converted with str().
    top_k : int or None
        Maximum number of recommendations to return. None returns every
        course that passes the filters, 0 returns an empty result.
    filter_platform : str or None
        If given (and non-empty), only consider courses whose df['platform']
        equals this value, compared case-insensitively.
    filter_level : str or None
        If given (and non-empty), only consider courses whose df['level']
        equals this value, compared case-insensitively.

    Returns
    -------
    pandas.DataFrame
        Columns: global_id, platform, title, provider, level, description,
        similarity_score. Rows are ordered by descending cosine similarity and
        re-indexed from 0. When the query is blank, no course matches the
        filters, or the query embeds to a zero vector, a message is printed and
        an empty DataFrame with these same columns is returned.

    Raises
    ------
    ValueError
        If top_k is negative, or if `emb_norms` and `df` have different
        numbers of rows.
    """
    result_cols = ['global_id', 'platform', 'title', 'provider', 'level', 'description']
    # Same schema as a normal result, so callers can rely on the columns existing.
    empty_result = pd.DataFrame(columns=result_cols + ['similarity_score'])

    # A negative top_k would silently slice off the *tail* of the ranking.
    if top_k is not None and top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}.")
    if len(emb_norms) != len(df):
        raise ValueError(
            f"Embeddings ({len(emb_norms)} rows) are not aligned with the catalog ({len(df)} rows)."
        )

    user_text = "" if keywords is None else str(keywords)
    if not user_text.strip():
        print("Empty query; provide at least one keyword.")
        return empty_result

    # Positional boolean mask over the catalog rows that pass the filters.
    mask = np.ones(len(df), dtype=bool)
    if filter_platform:
        wanted_platform = str(filter_platform).lower()
        mask &= (df['platform'].astype(str).str.lower() == wanted_platform).to_numpy()
    if filter_level:
        wanted_level = str(filter_level).lower()
        mask &= (df['level'].astype(str).str.lower() == wanted_level).to_numpy()
    if not mask.any():
        print("No courses match the given filters.")
        return empty_result

    # Use row *positions* (not index labels) so the lookup into emb_norms stays
    # correct even if df does not carry a default 0..N-1 index.
    filtered_positions = np.flatnonzero(mask)
    emb_sub = emb_norms[filtered_positions]  # shape (M, D)

    # Encode the query and scale it to unit length.
    user_emb = model.encode(user_text, convert_to_numpy=True)
    user_norm = np.linalg.norm(user_emb)
    if user_norm == 0:
        print("User embedding is zero vector; check input keywords.")
        return empty_result
    user_emb_norm = user_emb / user_norm

    # Both sides are unit length, so the dot product is the cosine similarity.
    sims = emb_sub.dot(user_emb_norm)  # shape (M,)

    # Rank by descending similarity; a stable sort keeps tied courses in catalog order.
    top_idx = np.argsort(-sims, kind="stable")[:top_k]
    top_positions = filtered_positions[top_idx]  # positions in the full catalog

    # assign() returns a new frame, so the catalog itself is never modified.
    results = (
        df.iloc[top_positions][result_cols]
        .reset_index(drop=True)
        .assign(similarity_score=sims[top_idx])
    )
    return results


In [8]:

# Unfiltered usage, for reference (10 courses for "data science beginner"):
#   recommend_for_keywords("data science beginner", top_k=10)

# Filtered example: the 10 best "data science" matches among beginner-level edX courses
recs_beginner_edx = recommend_for_keywords(
    "data science",
    top_k=10,
    filter_platform="edx",
    filter_level="Beginner",
)
display(recs_beginner_edx)

Unnamed: 0,global_id,platform,title,provider,level,description,similarity_score
0,edx_91,edx,Introduction to Data Science,IBM,Beginner,Learn about the world of data science first-ha...,0.650836
1,edx_366,edx,The Data Science Method,IBM,Beginner,"Learn about the methodology, practices and req...",0.597701
2,edx_194,edx,Statistical Thinking for Data Science and Anal...,Columbia University,Beginner,Learn how statistics plays a central role in t...,0.57784
3,edx_643,edx,Enabling Technologies for Data Science and Ana...,Columbia University,Beginner,Discover the relationship between Big Data and...,0.550807
4,edx_99,edx,SQL for Data Science,IBM,Beginner,Learn how to use and apply the powerful langua...,0.54075
5,edx_205,edx,Machine Learning for Data Science and Analytics,Columbia University,Beginner,Learn the principles of machine learning and t...,0.538376
6,edx_286,edx,Introduction to Data Analytics for Managers,The University of Michigan,Beginner,Explore data science and analyze business data...,0.5375
7,edx_278,edx,Data Science: Computational Thinking with Python,"University of California, Berkeley",Beginner,"Learn the basics of computational thinking, an...",0.534974
8,edx_61,edx,Data Science: Capstone,Harvard University,Beginner,Show what you've learned from the Professional...,0.527834
9,edx_193,edx,Data Science Tools,IBM,Beginner,Learn about the most popular data science tool...,0.520418
