In [3]:
import json
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix

# Load every CSV table used by the notebook, in a fixed order, from the
# database folder one level above the notebook.
users, students_profile, content, interactions, recommendation_logs = map(
    pd.read_csv,
    (
        "../database/users.csv",
        "../database/students_profile.csv",
        "../database/content_items.csv",
        "../database/interactions.csv",
        "../database/recommendation_logs.csv",
    ),
)

In [7]:
# Try embeddings
def parse_embedding(x):
    """Convert one raw embedding cell into a float array, or None if unusable.

    Parameters
    ----------
    x : list, numpy.ndarray, str, or a missing value
        A ready-made sequence of numbers, a JSON-encoded list such as
        "[0.1, 0.2]", or a missing marker (None / NaN).

    Returns
    -------
    numpy.ndarray or None
        The embedding as a float array with at least one dimension, or None
        when the value is missing, is not valid JSON, is not numeric, or
        decodes to a bare scalar.
    """
    if isinstance(x, (list, np.ndarray)):
        # Sequences are handled before the missing-value test: pd.isna() on a
        # list/array returns an element-wise array, and using that array in an
        # `if` raises "truth value of an array is ambiguous".
        raw = x
    elif pd.isna(x):
        return None
    else:
        try:
            raw = json.loads(x)
        except (TypeError, ValueError):
            # TypeError: x is not str/bytes; ValueError covers JSONDecodeError.
            return None

    try:
        vec = np.array(raw, dtype=float)
    except (TypeError, ValueError):
        # Non-numeric or ragged content cannot serve as an embedding.
        return None

    # A bare scalar (e.g. the JSON text "3.5") has no length and is not a vector.
    return vec if vec.ndim >= 1 else None

# Parse the stored embeddings when the column exists; otherwise fill the column
# with None so every row counts as missing and the tag fallback below is used.
if 'embedding_vector' in content.columns:
    content['embed'] = content['embedding_vector'].apply(parse_embedding)
else:
    content['embed'] = None

# A DataFrame column is never the literal `None`, so availability has to be
# judged row by row.
embed_missing = content['embed'].apply(lambda v: v is None)

# If no embeddings or many missing (more than 40% of rows), construct a simple
# multi-hot skill-tag vector instead.
if embed_missing.sum() > len(content) * 0.4:
    print("Embeddings unavailable or sparse — building tag-based vectors from skills_tags.")

    if 'skills_tags' in content.columns:
        content['skills_tags'] = content['skills_tags'].fillna("").astype(str)
    else:
        # `content.get('skills_tags', "")` would return a plain str here, which
        # has no `.fillna`; create an explicit empty column instead.
        content['skills_tags'] = ""

    def split_tags(raw):
        """Split one comma-separated tag cell into clean lowercase tags.

        A cell may hold a stringified list such as "['writing', 'grammar']",
        so list brackets and quote characters are stripped from every token.
        """
        cleaned = (tok.strip(" \t\r\n[]'\"").lower() for tok in raw.split(","))
        return [tok for tok in cleaned if tok]

    # build vocabulary of tags
    tag_lists = content['skills_tags'].apply(split_tags)
    all_tags = sorted({t for tags in tag_lists for t in tags})
    tag_to_idx = {t: i for i, t in enumerate(all_tags)}

    def tags_to_vec(tags):
        """Return the multi-hot vector over `all_tags` for one item's tag list."""
        vec = np.zeros(len(all_tags), dtype=float)
        for t in tags:
            vec[tag_to_idx[t]] = 1.0
        return vec

    content['embed_vec'] = tag_lists.apply(tags_to_vec)
else:
    # Rows without an embedding get a zero vector as long as the longest parsed one.
    parsed_embeds = content['embed'].dropna()
    dims = max(len(e) for e in parsed_embeds) if len(parsed_embeds) > 0 else 0
    content['embed_vec'] = content['embed'].apply(lambda e: np.zeros(dims) if e is None else e)


In [8]:
# Item-feature matrix: one row per content item, with NaN/inf replaced by
# finite numbers as a safety net.
X = np.nan_to_num(np.vstack(content['embed_vec'].values))
# Row-normalise before measuring cosine similarity.
Xn = normalize(X)
# Pairwise item-item similarity (content_count x content_count).
content_sim = cosine_similarity(Xn)

# Lookup tables between matrix row position and content identifier; the
# DataFrame index stands in when there is no `id` column.
if 'id' in content.columns:
    content_idx_to_id = content['id'].tolist()
else:
    content_idx_to_id = content.index.tolist()
content_id_to_idx = dict(zip(content_idx_to_id, range(len(content_idx_to_id))))


In [11]:
# Use `item_id` as the item key, renaming `content_id` when that is the only
# one of the two present.
has_item_id = 'item_id' in interactions.columns
if not has_item_id and 'content_id' in interactions.columns:
    interactions = interactions.rename(columns={'content_id': 'item_id'})

# Cast both keys to strings so every later lookup uses a single type.
for key_col in ('user_id', 'item_id'):
    interactions[key_col] = interactions[key_col].astype(str)

In [12]:
# Raw signals as Series. A column that is absent falls back to an all-zero
# Series aligned to the frame: the scalar default of `DataFrame.get` cannot be
# used, because `pd.to_numeric` returns a scalar for it and a scalar has no
# `.fillna`.
zero_signal = pd.Series(0.0, index=interactions.index)
time_raw = interactions['time_taken'] if 'time_taken' in interactions.columns else zero_signal
correct_raw = interactions['correct'] if 'correct' in interactions.columns else zero_signal

# Unparseable values are coerced to NaN and then treated as 0.
# NOTE(review): dividing by 60 presumably converts seconds to minutes — confirm
# the unit of `time_taken`.
interactions['time_min'] = pd.to_numeric(time_raw, errors='coerce').fillna(0) / 60.0
interactions['correct'] = pd.to_numeric(correct_raw, errors='coerce').fillna(0).astype(float)

# Scale time by the largest observed value; skip the division when there is no
# positive time at all (also covers an empty frame, whose max is NaN).
longest_time = interactions['time_min'].max()
if longest_time > 0:
    interactions['time_norm'] = interactions['time_min'] / longest_time
else:
    interactions['time_norm'] = 0.0

In [13]:
# Blend weights for the implicit-feedback score; correctness counts more.
alpha = 0.3  # weight on normalised time
beta = 0.7   # weight on correctness
time_part = alpha * interactions['time_norm']
correct_part = beta * interactions['correct']
interactions['implicit_score'] = time_part + correct_part

# Total score per (user, item) pair, flattened back into ordinary columns.
pair_totals = interactions.groupby(['user_id', 'item_id'])['implicit_score'].sum()
ui = pair_totals.reset_index()

In [14]:
# Distinct users and items found in the aggregated interactions.
user_ids = ui['user_id'].unique().tolist()
item_ids = ui['item_id'].unique().tolist()
# The position of an id in its list is its row/column index in the matrix.
user_to_idx = dict(zip(user_ids, range(len(user_ids))))
item_to_idx = dict(zip(item_ids, range(len(item_ids))))

In [15]:
# Sparse user x item score matrix built from (row, column, value) triplets.
rows = ui['user_id'].map(user_to_idx)
cols = ui['item_id'].map(item_to_idx)
data = ui['implicit_score'].to_numpy()
matrix_shape = (len(user_ids), len(item_ids))
ui_matrix = csr_matrix((data, (rows, cols)), shape=matrix_shape)

Truncated SVD factorization of the user–item matrix

In [16]:
# Latent dimension: at most 40, and strictly below the smaller matrix
# dimension; a matrix whose smaller dimension is <= 1 falls back to 1.
smaller_dim = min(ui_matrix.shape)
if smaller_dim > 1:
    n_comp = min(40, smaller_dim - 1)
else:
    n_comp = 1

svd = TruncatedSVD(n_components=n_comp, random_state=42)
user_factors = svd.fit_transform(ui_matrix)  # one row of latent factors per user
item_factors = svd.components_.T             # one row of latent factors per item

In [25]:
# Predicted user x item scores from the latent factors.
pred_cf = user_factors @ item_factors.T

# Min-max rescale of the predictions into [0.6, 1.0) for readability; the
# 1e-9 term guards against division by zero when all predictions are equal.
cf_min = pred_cf.min()
cf_max = pred_cf.max()
pred_cf_scaled = 0.6 + 0.4 * ((pred_cf - cf_min) / (cf_max - cf_min + 1e-9))



In [26]:
# Reverse lookups: matrix position -> original identifier.
idx_to_user = dict(zip(user_to_idx.values(), user_to_idx.keys()))
idx_to_item = dict(zip(item_to_idx.values(), item_to_idx.keys()))

def recommend_items_for_user(user_id, n=5, exclude_seen=True):
    """Return top-N recommended item_ids for a given user_id.

    Parameters
    ----------
    user_id : hashable
        Key into `user_to_idx`. Ids absent from that mapping yield no
        recommendations.
    n : int or None, default 5
        Maximum number of items to return. None returns every candidate;
        zero or a negative value returns an empty list.
    exclude_seen : bool, default True
        When True, items the user already has a row for in `ui` are removed
        from the candidates.

    Returns
    -------
    list
        Item ids ordered from highest to lowest predicted score; empty when
        the user is unknown, nothing was requested, or no candidates remain.
    """
    if user_id not in user_to_idx:
        return []
    # Without this guard a negative n would slice off the tail of the ranking
    # ("all but the last |n|") instead of returning nothing.
    if n is not None and n <= 0:
        return []

    user_scores = pred_cf[user_to_idx[user_id]]

    if exclude_seen:
        # A set makes each membership test O(1) instead of scanning a list.
        seen_items = set(ui.loc[ui['user_id'] == user_id, 'item_id'])
        candidate_indices = [item_to_idx[i] for i in item_ids if i not in seen_items]
    else:
        candidate_indices = list(range(len(item_ids)))

    if not candidate_indices:
        return []

    # argsort is ascending, so reverse it before taking the first n positions.
    top_idx = np.argsort(user_scores[candidate_indices])[::-1][:n]
    return [idx_to_item[candidate_indices[i]] for i in top_idx]

In [27]:
# Demo: five recommendations for the first user in the index.
sample_user = user_ids[0]
top_recs = recommend_items_for_user(user_id=sample_user, n=5)
print(f"Recommended items for {sample_user}: {top_recs}")

Recommended items for user_1: ['content_34', 'content_22', 'content_43', 'content_28', 'content_13']


In [28]:
# Separate copy of the content catalogue for display, with `id` cast to str so
# it matches the string item_id values used by the recommender.
content_df = pd.read_csv("../database/content_items.csv").astype({'id': str})

def show_enriched_recommendations(user_id, n=5):
    """Return a rich recommendation table (title, difficulty, etc.) for a given user.

    Parameters
    ----------
    user_id : hashable
        Key into `user_to_idx`. An unknown user yields an empty table instead
        of raising KeyError, mirroring `recommend_items_for_user`, which
        returns no items for such a user.
    n : int, default 5
        Number of recommendations requested from `recommend_items_for_user`.

    Returns
    -------
    pandas.DataFrame
        Rows of `content_df` for the recommended ids, in recommendation
        order, with columns id, title, type, skills_tags, difficulty,
        predicted_score and url.
    """
    rec_ids = recommend_items_for_user(user_id, n=n)
    recs = content_df[content_df['id'].isin(rec_ids)].copy()

    # keep the same recommendation order (one dict lookup per row)
    rank_of = {cid: rank for rank, cid in enumerate(rec_ids)}
    recs['order'] = recs['id'].map(rank_of)
    recs = recs.sort_values('order').drop(columns=['order'])

    # add predicted CF score for each item; NaN when the user or the item has
    # no position in the prediction matrix
    u_idx = user_to_idx.get(user_id)

    def score_for(cid):
        if u_idx is None or cid not in item_to_idx:
            return np.nan
        return pred_cf[u_idx, item_to_idx[cid]]

    recs['predicted_score'] = recs['id'].map(score_for)

    return recs[['id', 'title', 'type', 'skills_tags', 'difficulty', 'predicted_score', 'url']]

In [29]:

# Demo on a second user. Position 75 is an arbitrary pick; it is clamped to the
# last available position so the cell still runs when fewer than 76 users have
# interactions.
sample_user = user_ids[min(75, len(user_ids) - 1)]
recommendations = show_enriched_recommendations(sample_user, n=5)
print(f"Top Recommendations for {sample_user}:")
display(recommendations)


Top Recommendations for user_167:


Unnamed: 0,id,title,type,skills_tags,difficulty,predicted_score,url
14,content_15,Communication - writing Level 1,video,['writing'],1,0.248971,https://educhat.example.com/communication/writ...
55,content_56,Mathematics - calculus Level 3,exercise,['calculus'],3,0.20734,https://educhat.example.com/mathematics/calcul...
35,content_36,Physics - thermodynamics Level 3,article,['thermodynamics'],3,0.203218,https://educhat.example.com/physics/thermodyna...
19,content_20,Communication - writing Level 2,article,['writing'],2,0.184628,https://educhat.example.com/communication/writ...
0,content_1,Mathematics - geometry Level 3,video,['geometry'],3,0.175709,https://educhat.example.com/mathematics/geomet...
