In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
import random
from tqdm import tqdm
from collections import defaultdict
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout
import itertools
import os
import pickle

# pip install tensorflow

In [5]:
df_ratings = pd.read_csv("ml-latest-small/ratings.csv", sep=",", encoding="ISO-8859-1")
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [6]:
df_movies = pd.read_csv("ml-latest-small/movies.csv", sep=",", encoding="ISO-8859-1")
df_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [7]:
df = df_ratings.merge(df_movies, on='movieId', how='left')
df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),Drama|Horror|Thriller
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),Action|Crime|Thriller
100833,610,168250,5.0,1494273047,Get Out (2017),Horror
100834,610,168252,5.0,1493846352,Logan (2017),Action|Sci-Fi


In [8]:
df['genres_list'] = df['genres'].str.split('|')
df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,genres_list
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]"
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,"[Action, Crime, Thriller]"
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,"[Mystery, Thriller]"
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,"[Crime, Mystery, Thriller]"
...,...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),Drama|Horror|Thriller,"[Drama, Horror, Thriller]"
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),Action|Crime|Thriller,"[Action, Crime, Thriller]"
100833,610,168250,5.0,1494273047,Get Out (2017),Horror,[Horror]
100834,610,168252,5.0,1493846352,Logan (2017),Action|Sci-Fi,"[Action, Sci-Fi]"


In [9]:
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(df['genres_list'])
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_)

genres_df

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0
100832,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
100833,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
100834,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [10]:
df = df.join(genres_df)

df.drop(columns=['genres', 'genres_list'], inplace=True)

df

Unnamed: 0,userId,movieId,rating,timestamp,title,(no genres listed),Action,Adventure,Animation,Children,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,964982703,Toy Story (1995),0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,3,4.0,964981247,Grumpier Old Men (1995),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,6,4.0,964982224,Heat (1995),0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
100833,610,168250,5.0,1494273047,Get Out (2017),0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
100834,610,168252,5.0,1493846352,Logan (2017),0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [11]:
genre_columns = [col for col in df.columns if col not in ['userId', 'movieId', 'rating', 'timestamp', 'title']]
print("Available genres:", genre_columns)

Available genres: ['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


## Non personalised

In [12]:
def get_weighted_recommendations(df, genre=None, m_threshold=100, top_n=10):
    if genre:
        if genre not in df.columns:
            raise ValueError(f"Genre '{genre}' not found in the dataset.")
        df = df[df[genre] == 1]

    # Calculate the global mean rating (C)
    C = df['rating'].mean()

    # Group by movie and compute v (count) and U(j) (mean)
    movie_stats = df.groupby('title').agg(
        v=('rating', 'count'),
        U=('rating', 'mean')
    ).reset_index()

    # Filter for movies with enough votes
    qualified = movie_stats[movie_stats['v'] >= m_threshold].copy()

    # Use m as the threshold
    m = m_threshold

    # Compute WR(j)
    qualified['WR'] = (qualified['v'] / (qualified['v'] + m)) * qualified['U'] + \
                      (m / (qualified['v'] + m)) * C

    # Return top N movies sorted by WR
    return qualified.sort_values('WR', ascending=False).head(top_n)


In [13]:
top_movies = get_weighted_recommendations(df, genre='Comedy', m_threshold=100, top_n=10)
print(top_movies)

                                                  title    v         U  \
2713                                Pulp Fiction (1994)  307  4.197068   
1181                                Forrest Gump (1994)  329  4.164134   
2689                         Princess Bride, The (1987)  142  4.232394   
1095                                       Fargo (1996)  181  4.116022   
2268             Monty Python and the Holy Grail (1975)  136  4.161765   
149   Amelie (Fabuleux destin d'AmÃ©lie Poulain, Le)...  120  4.183333   
280                           Back to the Future (1985)  171  4.038012   
3442                                   Toy Story (1995)  215  3.920930   
1129                                Finding Nemo (2003)  141  3.960993   
3451                               Trainspotting (1996)  102  4.039216   

            WR  
2713  3.997474  
1181  3.982452  
2689  3.882116  
1095  3.855773  
2268  3.832509  
149   3.820328  
280   3.796945  
3442  3.750705  
1129  3.721876  
3451  3.715208 

## Personalized Recommender

In [14]:
user_movie_matrix = df.pivot_table(
    index='userId',
    columns='movieId',
    values='rating',
    aggfunc='mean',
    fill_value=0
)

print(f"Customer-Movie Matrix Shape: {user_movie_matrix.shape}")

user_movie_matrix.head()

Customer-Movie Matrix Shape: (610, 9724)


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
def train_test_split_per_user(matrix, test_size=0.2, seed=42):
    train = matrix.copy()
    test = pd.DataFrame(0.0, index=matrix.index, columns=matrix.columns)
    random.seed(seed)

    for user in matrix.index:
        purchased_items = matrix.loc[user]
        purchased_items = purchased_items[purchased_items > 0].index.tolist()

        if len(purchased_items) < 2:
            continue  # Skip users with too few purchases

        test_items = random.sample(purchased_items, max(1, int(len(purchased_items) * test_size)))

        for item in test_items:
            train.loc[user, item] = 0
            test.loc[user, item] = matrix.loc[user, item]

    return train, test

train_matrix, test_matrix = train_test_split_per_user(user_movie_matrix, test_size=0.2)

ITEM-BASED COLLABORATIVE FILTERING

In [16]:
# Transpose the matrix to have products as rows
item_matrix = user_movie_matrix.T

# Calculate similarity
item_sim = cosine_similarity(item_matrix)

# Convert to DataFrame
sim_item = pd.DataFrame(item_sim, index=item_matrix.index, columns=item_matrix.index)

In [17]:
def recommend_movies_item_based(
    userId,
    train_matrix,
    sim_item,
    dataframe,
    top_n=5,
    filter_genre=None
):
    if userId not in train_matrix.index:
        return f"User {userId} not found."

    user_row = train_matrix.loc[userId]
    watched_movies = user_row[user_row > 0].index.tolist()
    
    if not watched_movies:
        return f"Customer {userId} has no purchases."
    
    scores = defaultdict(float)

    for movie in watched_movies:
        similar_movies = sim_item[movie].drop(index=movie)
        top_similar_movies = similar_movies.nlargest(1000)
        for similar_movie, score in top_similar_movies.items():
            if similar_movie in watched_movies:
                continue
            scores[similar_movie] += score

    ranked_movies = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    recommended_movie_ids = [movie_id for movie_id, _ in ranked_movies[:top_n*3]] # Get more than top_n to filter by genre later

    # Filter by genre
    if filter_genre and filter_genre in dataframe.columns:
        filtered_df = dataframe[(dataframe['movieId'].isin(recommended_movie_ids)) & (dataframe[filter_genre] == 1)]
        recommended_titles = filtered_df['title'].unique().tolist()[:top_n]
    else:
        recommended_titles = dataframe[dataframe['movieId'].isin(recommended_movie_ids)]['title'].unique().tolist()[:top_n]

    return recommended_titles

In [18]:
recommended_item = recommend_movies_item_based(
    userId=42,                          
    train_matrix=train_matrix,         
    sim_item=sim_item,                 
    dataframe=df,                 
    top_n=5,                         
    filter_genre='Comedy'              
)

print("Item-Based Recommendations:", recommended_item)


Item-Based Recommendations: ['Crocodile Dundee (1986)', 'Big (1988)', 'Who Framed Roger Rabbit? (1988)', 'Spaceballs (1987)', "There's Something About Mary (1998)"]


In [None]:
def evaluate_item_based(
    train_matrix,
    test_matrix,
    recommender_fn,
    sim_item,
    dataframe,
    k=5,
    max_users=100,
    filter_genre=None
):
    precisions, recalls = [], []
    users = list(test_matrix.index)[:max_users]

    for user in tqdm(users, desc="Evaluating Item-Based CF"):
        true_items = set(test_matrix.loc[user][test_matrix.loc[user] > 0].index)
        if not true_items:
            continue

        try:
            recommended = recommender_fn(
                userId=user,
                train_matrix=train_matrix,
                sim_item=sim_item,
                dataframe=dataframe,
                top_n=k,
                filter_genre=filter_genre
            )
        except Exception as e:
            print(f"Skipping user {user} due to error: {e}")
            continue

        predicted_ids = dataframe[dataframe['title'].isin(recommended)]['movieId'].unique()
        hits = set(predicted_ids) & true_items

        precisions.append(len(hits) / k)
        recalls.append(len(hits) / len(true_items))

    return sum(precisions) / len(precisions), sum(recalls) / len(recalls)


USER-BASED COLLABORATIVE FILTERING

In [20]:
# Transpose the matrix to have products as rows
item_matrix = user_movie_matrix

# Calculate similarity
item_sim = cosine_similarity(item_matrix)

# Convert to DataFrame
sim_user = pd.DataFrame(item_sim, index=item_matrix.index, columns=item_matrix.index)

In [21]:
def recommend_movies_user_based(
    userId,
    train_matrix,
    sim_user,
    dataframe,
    top_n=5,
    filter_genre=None,
    top_sim_users=1000
):
    if userId not in train_matrix.index:
        return f"User {userId} not found."

    # Get similarity scores for all other users
    similar_users = sim_user.loc[userId].drop(index=userId)
    similar_users = similar_users.nlargest(top_sim_users)

    # Movies already rated by the user
    already_rated = set(train_matrix.loc[userId][train_matrix.loc[userId] > 0].index)

    # Get ratings from similar users only for movies the user hasn't rated
    candidate_ratings = train_matrix.loc[similar_users.index]
    # Zero out columns for movies already rated by the user
    candidate_ratings = candidate_ratings.loc[:, ~candidate_ratings.columns.isin(already_rated)]

    # Compute weighted sum of ratings
    weighted_scores = candidate_ratings.T.dot(similar_users)
    # Sort by score
    ranked_movies = weighted_scores.sort_values(ascending=False).index.tolist()

    # Filter by genre if needed
    if filter_genre and filter_genre in dataframe.columns:
        filtered_df = dataframe[(dataframe['movieId'].isin(ranked_movies)) & (dataframe[filter_genre] == 1)]
        recommended_titles = filtered_df['title'].unique().tolist()[:top_n]
    else:
        recommended_titles = dataframe[dataframe['movieId'].isin(ranked_movies)]['title'].unique().tolist()[:top_n]

    return recommended_titles if recommended_titles else f"No recommendations found for User {userId}."

In [22]:
recommended_user = recommend_movies_user_based(
    userId=42,
    train_matrix=train_matrix,
    sim_user=sim_user,                
    dataframe=df,
    top_n=5,
    filter_genre='Action'
)

print("User-Based Recommendations:", recommended_user)

User-Based Recommendations: ['Heat (1995)', 'From Dusk Till Dawn (1996)', 'Rob Roy (1995)', 'Star Wars: Episode IV - A New Hope (1977)', 'Mask, The (1994)']


In [23]:
def evaluate_user_based(
    train_matrix,
    test_matrix,
    recommender_fn,
    sim_user,
    dataframe,
    k=5,
    max_users=1000,
    filter_genre=None
):
    precisions, recalls = [], []
    users = list(test_matrix.index)[:max_users]

    for user in tqdm(users, desc="Evaluating User-Based CF"):
        true_items = set(test_matrix.loc[user][test_matrix.loc[user] > 0].index)
        if not true_items:
            continue

        try:
            recommended = recommender_fn(
                userId=user,
                train_matrix=train_matrix,
                sim_user=sim_user,
                dataframe=dataframe,
                top_n=k,
                filter_genre=filter_genre
            )
        except Exception as e:
            print(f"Skipping user {user} due to error: {e}")
            continue

        predicted_ids = dataframe[dataframe['title'].isin(recommended)]['movieId'].unique()
        hits = set(predicted_ids) & true_items

        precisions.append(len(hits) / k)
        recalls.append(len(hits) / len(true_items))

    return sum(precisions) / len(precisions), sum(recalls) / len(recalls)


In [24]:
precision_item, recall_item = evaluate_item_based(
    train_matrix, test_matrix, recommend_movies_item_based, sim_item, df,
    k=5, max_users=500, filter_genre=None
)
print(f"Item-Based — Precision@5: {precision_item:.4f}, Recall@5: {recall_item:.4f}")


Evaluating Item-Based CF: 100%|██████████| 500/500 [08:13<00:00,  1.01it/s]

Item-Based — Precision@5: 0.3412, Recall@5: 0.0908





In [25]:
precision_user, recall_user = evaluate_user_based(
    train_matrix, test_matrix, recommend_movies_user_based, sim_user, df,
    k=5, max_users=500, filter_genre=None
)

print(f"User-Based — Precision@5: {precision_user:.4f}, Recall@5: {recall_user:.4f}")

Evaluating User-Based CF: 100%|██████████| 500/500 [00:36<00:00, 13.65it/s]

User-Based — Precision@5: 0.0620, Recall@5: 0.0163





## Content-based Filtering

In [26]:
item_profiles = df.drop_duplicates(subset='movieId').set_index('movieId')[genre_columns]
item_profiles = item_profiles[[col for col in item_profiles.columns if col in ['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']]]

In [27]:
ratings_df = train_matrix.reset_index().melt(
    id_vars='userId',
    var_name='movieId',
    value_name='rating'
)

# Keep only rated entries (non-zero)
ratings_df = ratings_df[ratings_df['rating'] > 0]

In [28]:
def build_user_profile(user_id, ratings_df, item_profiles):
    user_ratings = ratings_df[ratings_df['userId'] == user_id]
    rated_movies = user_ratings.merge(item_profiles, left_on='movieId', right_index=True)

    # Multiply each genre vector by the user's rating
    weighted = rated_movies[item_profiles.columns].multiply(rated_movies['rating'], axis=0)

    # Average (weighted sum / sum of ratings)
    user_profile = weighted.sum(axis=0) / rated_movies['rating'].sum()

    return user_profile

In [29]:
def recommend_movies_content_based(user_id, ratings_df, item_profiles, df, top_n=5, filter_genre=None):
    user_profile = build_user_profile(user_id, ratings_df, item_profiles)
    
    # Compute cosine similarity
    similarities = cosine_similarity(item_profiles, user_profile.values.reshape(1, -1)).flatten()
    similarity_series = pd.Series(similarities, index=item_profiles.index)

    # Remove already rated movies
    rated = ratings_df[ratings_df['userId'] == user_id]['movieId'].tolist()
    similarity_series = similarity_series.drop(index=rated, errors='ignore')

    # Get top N recommendations
    top_movie_ids = similarity_series.sort_values(ascending=False).head(top_n * 3).index.tolist()

    # Apply genre filtering if needed
    if filter_genre and filter_genre in df.columns:
        filtered = df[(df['movieId'].isin(top_movie_ids)) & (df[filter_genre] == 1)]
    else:
        filtered = df[df['movieId'].isin(top_movie_ids)]

    return filtered['title'].unique().tolist()[:top_n]


In [30]:
user_id = 42
user_profile = build_user_profile(user_id, ratings_df, item_profiles)
print(user_profile)

(no genres listed)    0.000000
Action                0.248408
Adventure             0.148089
Animation             0.010350
Children              0.019904
Comedy                0.472930
Crime                 0.176752
Documentary           0.010350
Drama                 0.425159
Fantasy               0.044586
Film-Noir             0.005573
Horror                0.033439
IMAX                  0.003185
Musical               0.019108
Mystery               0.069268
Romance               0.191083
Sci-Fi                0.078822
Thriller              0.259554
War                   0.042994
Western               0.037420
dtype: float64


In [31]:
recommended_titles = recommend_movies_content_based(
    user_id=user_id,
    ratings_df=ratings_df,
    item_profiles=item_profiles,
    df=df,
    top_n=5,
    filter_genre=None
)

print("Content-Based Recommendations:", recommended_titles)

Content-Based Recommendations: ['Bad Boys (1995)', 'Nurse Betty (2000)', 'Hunting Party, The (2007)', 'Last Boy Scout, The (1991)', 'Out of Sight (1998)']


Evaluation

In [32]:
def evaluate_content_based(
    train_df,  # ratings only
    test_matrix,
    recommender_fn,
    item_profiles,
    dataframe,
    k=5,
    max_users=100,
    filter_genre=None
):
    precisions, recalls = [], []
    users = list(test_matrix.index)[:max_users]

    for user in tqdm(users, desc="Evaluating Content-Based CF"):
        true_items = set(test_matrix.loc[user][test_matrix.loc[user] > 0].index)
        if not true_items:
            continue

        try:
            recommended = recommender_fn(
                user_id=user,
                ratings_df=train_df,
                item_profiles=item_profiles,
                df=dataframe,
                top_n=k,
                filter_genre=filter_genre
            )
        except Exception as e:
            print(f"Skipping user {user} due to error: {e}")
            continue

        predicted_ids = dataframe[dataframe['title'].isin(recommended)]['movieId'].unique()
        hits = set(predicted_ids) & true_items

        precisions.append(len(hits) / k)
        recalls.append(len(hits) / len(true_items))

    return sum(precisions) / len(precisions), sum(recalls) / len(recalls)


In [33]:
precision_cb, recall_cb = evaluate_content_based(
    train_df=ratings_df,
    test_matrix=test_matrix,
    recommender_fn=recommend_movies_content_based,
    item_profiles=item_profiles,
    dataframe=df,          
    k=5,
    max_users=500,
    filter_genre=None      
)

print(f"Content-Based CF — Precision@5: {precision_cb:.4f}, Recall@5: {recall_cb:.4f}")

Evaluating Content-Based CF: 100%|██████████| 500/500 [00:11<00:00, 43.22it/s]

Content-Based CF — Precision@5: 0.0228, Recall@5: 0.0050





# Matrix factorization

In [34]:
# Fit SVD on the training matrix
def train_svd_model(train_matrix, n_components=20, random_state=42):
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    svd.fit(train_matrix)
    
    user_factors = svd.transform(train_matrix)  # U × Σ
    item_factors = svd.components_.T            # V

    return svd, user_factors, item_factors


In [35]:
def recommend_svd(
    userId,
    train_matrix,
    svd,
    user_factors,
    item_factors,
    dataframe,
    top_n=5,
    filter_genre=None
):
    if userId not in train_matrix.index:
        return f"User {userId} not found."

    user_idx = train_matrix.index.get_loc(userId)
    user_vector = user_factors[user_idx]

    # Predict scores for all items
    scores = np.dot(item_factors, user_vector)

    # Build score Series with movieIds
    movie_ids = train_matrix.columns
    score_series = pd.Series(scores, index=movie_ids)

    # Remove already-rated movies
    rated_items = train_matrix.loc[userId]
    already_rated = rated_items[rated_items > 0].index
    score_series = score_series.drop(already_rated, errors='ignore')

    # Top N * 3 for genre filtering
    top_movie_ids = score_series.sort_values(ascending=False).head(top_n * 3).index.tolist()

    # Filter by genre
    if filter_genre and filter_genre in dataframe.columns:
        filtered_df = dataframe[
            (dataframe['movieId'].isin(top_movie_ids)) &
            (dataframe[filter_genre] == 1)
        ]
    else:
        filtered_df = dataframe[dataframe['movieId'].isin(top_movie_ids)]

    return filtered_df['title'].unique().tolist()[:top_n]


In [36]:
# Train SVD
svd_model, user_factors, item_factors = train_svd_model(train_matrix, n_components=20)

# Recommend
recommendations = recommend_svd(
    userId=42,
    train_matrix=train_matrix,
    svd=svd_model,
    user_factors=user_factors,
    item_factors=item_factors,
    dataframe=df,
    top_n=5,
    filter_genre=None  # or 'Comedy'
)

print("Matrix Factorization Recommendations:", recommendations)


Matrix Factorization Recommendations: ['Star Wars: Episode IV - A New Hope (1977)', 'Forrest Gump (1994)', 'Jurassic Park (1993)', 'Princess Bride, The (1987)', 'Terminator, The (1984)']


Evaluation

In [37]:
def evaluate_svd(
    train_matrix,
    test_matrix,
    svd,
    user_factors,
    item_factors,
    dataframe,
    k=5,
    max_users=100,
    filter_genre=None
):
    precisions, recalls = [], []
    users = list(test_matrix.index)[:max_users]

    for user in tqdm(users, desc="Evaluating SVD"):
        true_items = set(test_matrix.loc[user][test_matrix.loc[user] > 0].index)
        if not true_items:
            continue

        try:
            recommended = recommend_svd(
                userId=user,
                train_matrix=train_matrix,
                svd=svd,
                user_factors=user_factors,
                item_factors=item_factors,
                dataframe=dataframe,
                top_n=k,
                filter_genre=filter_genre
            )
        except Exception as e:
            print(f"Skipping user {user} due to error: {e}")
            continue

        predicted_ids = dataframe[dataframe['title'].isin(recommended)]['movieId'].unique()
        hits = set(predicted_ids) & true_items

        precisions.append(len(hits) / k)
        recalls.append(len(hits) / len(true_items))

    return sum(precisions) / len(precisions), sum(recalls) / len(recalls)


In [38]:
precision_svd, recall_svd = evaluate_svd(
    train_matrix=train_matrix,
    test_matrix=test_matrix,
    svd=svd_model,
    user_factors=user_factors,
    item_factors=item_factors,
    dataframe=df,          
    k=5,
    max_users=500,
    filter_genre=None        
)

print(f"SVD — Precision@5: {precision_svd:.4f}, Recall@5: {recall_svd:.4f}")


Evaluating SVD: 100%|██████████| 500/500 [00:05<00:00, 89.16it/s]

SVD — Precision@5: 0.2448, Recall@5: 0.0619





## Hybrid Model

In [96]:
# Define a score for each one of the previous personalized models

# For the item-based model
def score_movies_item_based(userId, train_matrix, sim_item):
    if userId not in train_matrix.index:
        return {}

    user_row = train_matrix.loc[userId]
    watched = set(user_row[user_row > 0].index)
    scores = defaultdict(float)

    for movie in watched:
        if movie not in sim_item:
            continue
        similar = sim_item[movie].drop(index=movie, errors='ignore').nlargest(50)
        for sim_movie, score in similar.items():
            if sim_movie not in watched:
                scores[sim_movie] += score

    return scores



# For the SVD model
def score_movies_svd(userId, train_matrix, user_factors, item_factors):
    if userId not in train_matrix.index:
        return {}

    user_idx = train_matrix.index.get_loc(userId)
    user_vector = user_factors[user_idx]
    scores = np.dot(item_factors, user_vector)
    movie_ids = train_matrix.columns
    score_series = pd.Series(scores, index=movie_ids)

    rated = train_matrix.loc[userId]
    return score_series[rated == 0].to_dict()

In [97]:
item_scores = score_movies_item_based(42, train_matrix, sim_item)
svd_scores = score_movies_svd(42, train_matrix, user_factors, item_factors)

# Convert to sorted DataFrames for easy viewing
pd.Series(item_scores).sort_values(ascending=False).head(5)
pd.Series(svd_scores).sort_values(ascending=False).head(5)

2858    3.364467
589     3.272934
356     2.857604
260     2.766909
16      2.410037
dtype: float64

In [98]:
top_items = pd.Series(item_scores).sort_values(ascending=False).head(5).index
df[df['movieId'].isin(top_items)][['movieId', 'title']]

Unnamed: 0,movieId,title
26,480,Jurassic Park (1993)
81,1240,"Terminator, The (1984)"
134,2115,Indiana Jones and the Temple of Doom (1984)
194,2987,Who Framed Roger Rabbit? (1988)
446,2683,Austin Powers: The Spy Who Shagged Me (1999)
...,...,...
99076,2683,Austin Powers: The Spy Who Shagged Me (1999)
99110,2987,Who Framed Roger Rabbit? (1988)
99518,480,Jurassic Park (1993)
99569,480,Jurassic Park (1993)


In [120]:
top_items = pd.Series(svd_scores).sort_values(ascending=False).head(5).index
df[df['movieId'].isin(top_items)][['movieId', 'title']]

Unnamed: 0,movieId,title
15,260,Star Wars: Episode IV - A New Hope (1977)
20,356,Forrest Gump (1994)
184,2858,American Beauty (1999)
318,260,Star Wars: Episode IV - A New Hope (1977)
454,2858,American Beauty (1999)
...,...,...
99536,16,Casino (1995)
99550,260,Star Wars: Episode IV - A New Hope (1977)
99559,356,Forrest Gump (1994)
99573,589,Terminator 2: Judgment Day (1991)


In [None]:
def recommend_hybrid(
    userId,
    train_matrix,
    sim_item,
    user_factors,
    item_factors,
    dataframe,
    top_n=5,
    weights=(0.4, 0.6),     # defined through the grid search ahead
    filter_genre=None
):
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()

    # Score from each model
    item_scores = score_movies_item_based(userId, train_matrix, sim_item)
    svd_scores  = score_movies_svd(userId, train_matrix, user_factors, item_factors)

    all_movie_ids = set(item_scores) | set(svd_scores)
    combined_scores = {}

    for movie_id in all_movie_ids:
        i = item_scores.get(movie_id, 0)
        s = svd_scores.get(movie_id, 0)
        combined_scores[movie_id] = (
            weights[0] * i + weights[1] * s
        )

    # Normalize scores
    if combined_scores:
        score_df = pd.DataFrame.from_dict(combined_scores, orient='index', columns=['score'])
        score_df['score'] = scaler.fit_transform(score_df[['score']])
        top_ids = score_df.sort_values('score', ascending=False).head(top_n * 3).index
    else:
        return f"No recommendations found for User {userId}."

    # Apply genre filter
    if filter_genre and filter_genre in dataframe.columns:
        filtered_df = dataframe[(dataframe['movieId'].isin(top_ids)) & (dataframe[filter_genre] == 1)]
    else:
        filtered_df = dataframe[dataframe['movieId'].isin(top_ids)]

    return filtered_df['title'].unique().tolist()[:top_n]

In [102]:
recommendations = recommend_hybrid(
    userId=42,
    train_matrix=train_matrix,
    sim_item=sim_item,
    user_factors=user_factors,
    item_factors=item_factors,
    dataframe=df,
    top_n=5,
    weights=(0.4, 0.3, 0.3),
    filter_genre=None
)

print("Hybrid Recommendations:", recommendations)


Hybrid Recommendations: ['Star Wars: Episode IV - A New Hope (1977)', 'Forrest Gump (1994)', 'Jurassic Park (1993)', 'Monty Python and the Holy Grail (1975)', 'Terminator, The (1984)']


Evaluation

In [None]:
def evaluate_hybrid(
    train_matrix,
    test_matrix,
    sim_item,
    user_factors,
    item_factors,
    dataframe,
    k=5,
    max_users=100,
    weights=(0.4, 0.6),
    filter_genre=None
):
    precisions, recalls = [], []
    users = list(test_matrix.index)[:max_users]

    for user in tqdm(users, desc="Evaluating Hybrid Recommender"):
        true_items = set(test_matrix.loc[user][test_matrix.loc[user] > 0].index)
        if not true_items:
            continue

        try:
            recommended = recommend_hybrid(
                userId=user,
                train_matrix=train_matrix,
                sim_item=sim_item,
                user_factors=user_factors,
                item_factors=item_factors,
                dataframe=dataframe,
                top_n=k,
                weights=weights,
                filter_genre=filter_genre
            )
        except Exception as e:
            print(f"Skipping user {user} due to error: {e}")
            continue

        predicted_ids = dataframe[dataframe['title'].isin(recommended)]['movieId'].unique()
        hits = set(predicted_ids) & true_items

        precisions.append(len(hits) / k)
        recalls.append(len(hits) / len(true_items))

    avg_precision = sum(precisions) / len(precisions) if precisions else 0
    avg_recall = sum(recalls) / len(recalls) if recalls else 0

    return avg_precision, avg_recall


In [109]:
import itertools

steps = [0.3, 0.4, 0.5, 0.6, 0.7]
combinations = [w for w in itertools.product(steps, repeat=2) if abs(sum(w) - 1.0) < 1e-5]

best_score = 0
best_weights = None

for w in combinations:
    precision, recall = evaluate_hybrid(
        train_matrix=train_matrix,
        test_matrix=test_matrix,
        sim_item=sim_item,
        user_factors=user_factors,
        item_factors=item_factors,
        dataframe=df,
        weights=w,
        k=5,
        max_users=200
    )
    if precision > best_score:
        best_score = precision
        best_weights = w

print("Best Weights:", best_weights)
print(f"Best Precision@5: {best_score:.4f}")


Evaluating Hybrid Recommender: 100%|██████████| 200/200 [00:38<00:00,  5.23it/s]
Evaluating Hybrid Recommender: 100%|██████████| 200/200 [00:37<00:00,  5.30it/s]
Evaluating Hybrid Recommender: 100%|██████████| 200/200 [00:37<00:00,  5.34it/s]
Evaluating Hybrid Recommender: 100%|██████████| 200/200 [00:37<00:00,  5.34it/s]
Evaluating Hybrid Recommender: 100%|██████████| 200/200 [00:37<00:00,  5.37it/s]

Best Weights: (0.4, 0.6)
Best Precision@5: 0.2910





In [113]:
precision_hybrid, recall_hybrid = evaluate_hybrid(
        train_matrix=train_matrix,
        test_matrix=test_matrix,
        sim_item=sim_item,
        user_factors=user_factors,
        item_factors=item_factors,
        dataframe=df,
        weights=[0.4, 0.6],  # Best weights found
        k=5,
        max_users=500
    )

print(f"Hybrid — Precision@5: {precision_hybrid:.4f}, Recall@5: {recall_hybrid:.4f}")

Evaluating Hybrid Recommender: 100%|██████████| 500/500 [01:42<00:00,  4.86it/s]

Hybrid — Precision@5: 0.2832, Recall@5: 0.0762





## Gen AI Model: Neural Network with Keras using TensorFlow

In [None]:
# Encode userId and movieId to integers (needed for embedding layers)
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

ratings_df['user'] = user_encoder.fit_transform(ratings_df['userId'])
ratings_df['movie'] = movie_encoder.fit_transform(ratings_df['movieId'])

num_users = ratings_df['user'].nunique()
num_movies = ratings_df['movie'].nunique()

In [None]:
def build_ncf_model(num_users, num_movies, embedding_size=50):
    user_input = Input(shape=(1,))
    movie_input = Input(shape=(1,))

    user_embedding = Embedding(input_dim=num_users, output_dim=embedding_size)(user_input)
    movie_embedding = Embedding(input_dim=num_movies, output_dim=embedding_size)(movie_input)

    user_vec = Flatten()(user_embedding)
    movie_vec = Flatten()(movie_embedding)

    x = Concatenate()([user_vec, movie_vec])
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.2)(x)
    output = Dense(1, activation='linear')(x)

    model = Model(inputs=[user_input, movie_input], outputs=output)
    model.compile(optimizer='adam', loss='mse')

    return model

In [47]:
# Train the NCF model
model = build_ncf_model(num_users, num_movies)

# Prepare training data
X = [ratings_df['user'], ratings_df['movie']]
y = ratings_df['rating']

model.fit(X, y, batch_size=256, epochs=5, validation_split=0.1)

Epoch 1/5
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 4.9519 - val_loss: 1.1006
Epoch 2/5
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.9041 - val_loss: 1.1242
Epoch 3/5
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.8464 - val_loss: 1.0868
Epoch 4/5
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.8119 - val_loss: 1.0066
Epoch 5/5
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.7883 - val_loss: 1.0736


<keras.src.callbacks.history.History at 0x1d98c51fb00>

In [50]:
def recommend_ncf(user_id, ratings_df, model, user_encoder, movie_encoder, df, top_n=5):
    if user_id not in user_encoder.classes_:
        return f"User {user_id} not found."

    encoded_user = user_encoder.transform([user_id])[0]
    all_movie_ids = df['movieId'].unique()

    # Only use movieIds that the encoder knows
    valid_movie_ids = np.array([mid for mid in all_movie_ids if mid in movie_encoder.classes_])

    # Remove already rated movies
    rated_movie_ids = ratings_df[ratings_df['userId'] == user_id]['movieId'].values
    mask = ~np.isin(valid_movie_ids, rated_movie_ids)
    candidate_movies = valid_movie_ids[mask]

    # Only transform if there are candidates
    if len(candidate_movies) == 0:
        return []

    encoded_candidates = movie_encoder.transform(candidate_movies)

    # Predict ratings
    user_batch = np.full_like(encoded_candidates, encoded_user)
    predictions = model.predict([user_batch, encoded_candidates], verbose=0)

    top_indices = predictions.flatten().argsort()[-top_n:][::-1]
    top_movie_ids = candidate_movies[top_indices]

    return df[df['movieId'].isin(top_movie_ids)]['title'].unique().tolist()

In [51]:
recommendations = recommend_ncf(
    user_id=42,
    ratings_df=ratings_df,
    model=model,
    user_encoder=user_encoder,
    movie_encoder=movie_encoder,
    df=df,
    top_n=5
)

print("Generative AI (NCF) Recommendations:", recommendations)

Generative AI (NCF) Recommendations: ['Jules and Jim (Jules et Jim) (1961)', 'Religulous (2008)', 'Eddie Murphy Delirious (1983)', 'Woman in the Dunes (Suna no onna) (1964)', 'Big Bird Cage, The (1972)']


Evaluation

In [54]:
from tqdm import tqdm
import numpy as np

def evaluate_ncf(
    train_df,           # ratings dataframe with columns: userId, movieId, rating
    test_matrix,        # test user-movie matrix
    model,              # trained NCF model
    user_encoder,       # fitted LabelEncoder for userId
    movie_encoder,      # fitted LabelEncoder for movieId
    df,                 # full movie metadata with movieId and title
    k=5,
    max_users=100
):
    precisions, recalls = [], []
    users = list(test_matrix.index)[:max_users]

    for user_id in tqdm(users, desc="Evaluating NCF"):
        true_items = test_matrix.loc[user_id]
        true_items = set(true_items[true_items > 0].index)

        if not true_items or user_id not in user_encoder.classes_:
            continue

        try:
            encoded_user = user_encoder.transform([user_id])[0]

            all_movie_ids = df['movieId'].unique()
            rated_movies = train_df[train_df['userId'] == user_id]['movieId'].values

            # Filter candidate movies to only those in encoder
            candidate_movies = np.setdiff1d(all_movie_ids, rated_movies)
            candidate_movies = [mid for mid in candidate_movies if mid in movie_encoder.classes_]

            if not candidate_movies:
                continue  # Skip user if no valid candidates

            encoded_candidates = movie_encoder.transform(candidate_movies)
            user_batch = np.full_like(encoded_candidates, encoded_user)

            # Predict scores
            predictions = model.predict([user_batch, encoded_candidates], verbose=0).flatten()

            # Top-k predictions
            top_indices = predictions.argsort()[-k:][::-1]
            top_movie_ids = np.array(candidate_movies)[top_indices]
        except Exception as e:
            print(f"Skipping user {user_id} due to error: {e}")
            continue

        # Evaluate hits
        hits = set(top_movie_ids) & true_items
        precisions.append(len(hits) / k)
        recalls.append(len(hits) / len(true_items))

    avg_precision = sum(precisions) / len(precisions) if precisions else 0
    avg_recall = sum(recalls) / len(recalls) if recalls else 0

    return avg_precision, avg_recall


In [55]:
precision_ncf, recall_ncf = evaluate_ncf(
    train_df=ratings_df,
    test_matrix=test_matrix,
    model=model,
    user_encoder=user_encoder,
    movie_encoder=movie_encoder,
    df=df,
    k=5,
    max_users=500
)

print(f"NCF — Precision@5: {precision_ncf:.4f}, Recall@5: {recall_ncf:.4f}")


Evaluating NCF: 100%|██████████| 500/500 [21:51<00:00,  2.62s/it]

NCF — Precision@5: 0.0020, Recall@5: 0.0001





| Model                      | Precision@5 | Recall@5 |
|----------------------------|-------------|----------|
| Item-Based CF              | 0.3412      | 0.0908   |
| User-Based CF              | 0.0620      | 0.0163   |
| Content-Based Filtering     | 0.0228      | 0.0050   |
| Matrix Factorization (SVD) | 0.2448      | 0.0619   |
| Hybrid                     | 0.2832      | 0.0762   |
| Gen AI (NCF)               | 0.0020      | 0.0001    |

## Exporting 

In [117]:
import os
import pickle

os.makedirs("streamlit_files", exist_ok=True)


# Save required components
to_pickle = {
    "train_matrix": train_matrix,
    "sim_item": sim_item,
    "sim_user": sim_user,
    "user_factors": user_factors,
    "item_factors": item_factors,
    "df": df,
    "genres": df.columns[df.dtypes == int].tolist(),  # genre columns
}

with open("streamlit_files/recommender_assets.pkl", "wb") as f:
    pickle.dump(to_pickle, f)

# Save the hybrid model function separately if needed
with open("streamlit_files/hybrid_model_function.pkl", "wb") as f:
    pickle.dump(recommend_hybrid, f)

# Output check
if os.path.exists("streamlit_files/recommender_assets.pkl") and os.path.exists("streamlit_files/hybrid_model_function.pkl"):
    print("All files were saved successfully!")
else:
    print("Error: Some files were not saved.")


All files were saved successfully!
