<a href="https://colab.research.google.com/github/avirupdevzone/Matrix-Factorization/blob/main/MovieLens20M.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the MovieLens 20M dataset through kagglehub.
# Keep this cell: the path variable assigned below is read by the later cells
# that list the dataset files and load the CSV tables.
# NOTE: this notebook environment differs from Kaggle's Python environment,
# so libraries used by the notebook may be missing.
import kagglehub
grouplens_movielens_20m_dataset_path = kagglehub.dataset_download('grouplens/movielens-20m-dataset')

print('Data source import complete.')


Data source import complete.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd
import os

# Walk the downloaded dataset directory and print the full path of every file in it.
for dirname, _, filenames in os.walk(f"{grouplens_movielens_20m_dataset_path}"):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/root/.cache/kagglehub/datasets/grouplens/movielens-20m-dataset/versions/1/genome_scores.csv
/root/.cache/kagglehub/datasets/grouplens/movielens-20m-dataset/versions/1/tag.csv
/root/.cache/kagglehub/datasets/grouplens/movielens-20m-dataset/versions/1/rating.csv
/root/.cache/kagglehub/datasets/grouplens/movielens-20m-dataset/versions/1/link.csv
/root/.cache/kagglehub/datasets/grouplens/movielens-20m-dataset/versions/1/movie.csv
/root/.cache/kagglehub/datasets/grouplens/movielens-20m-dataset/versions/1/genome_tags.csv


In [None]:
# Every table is read from the directory that kagglehub downloaded above.

# Load ratings
ratings = pd.read_csv(f"{grouplens_movielens_20m_dataset_path}/rating.csv")

# Load movies
movies = pd.read_csv(f"{grouplens_movielens_20m_dataset_path}/movie.csv")

# Load tags (For Content based recommendation)
tags = pd.read_csv(f"{grouplens_movielens_20m_dataset_path}/tag.csv")

# Load genome tags & scores (For richer content-based)
genome_scores = pd.read_csv(f"{grouplens_movielens_20m_dataset_path}/genome_scores.csv")
genome_tags = pd.read_csv(f"{grouplens_movielens_20m_dataset_path}/genome_tags.csv")

In [None]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [None]:
# Preview the first rows of the movies table.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Check for missing values
# Prints the per-column count of null entries for each table; nothing is dropped or filled here.
print("\nRatings \n",ratings.isnull().sum())
print("\nMovies \n",movies.isnull().sum())
print("\nTags \n",tags.isnull().sum())


Ratings 
 userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

Movies 
 movieId    0
title      0
genres     0
dtype: int64

Tags 
 userId        0
movieId       0
tag          16
timestamp     0
dtype: int64


In [None]:
# How many ratings per user?
# groupby(...).size() yields one row count per userId.
ratings_per_user = ratings.groupby('userId').size()
print("Min ratings per user:", ratings_per_user.min())
print("Max ratings per user:", ratings_per_user.max())
print("Median ratings per user:", ratings_per_user.median())

# How many ratings per movie?
# Same computation keyed by movieId.
ratings_per_movie = ratings.groupby('movieId').size()
print("Min ratings per movie:", ratings_per_movie.min())
print("Max ratings per movie:", ratings_per_movie.max())
print("Median ratings per movie:", ratings_per_movie.median())

Min ratings per user: 20
Max ratings per user: 9254
Median ratings per user: 68.0
Min ratings per movie: 1
Max ratings per movie: 67310
Median ratings per movie: 18.0


In [None]:
# Shape of datasets before Downsampling
# (rows, columns) of each table as loaded from disk.
print(f"Shape of ratings = {ratings.shape}")
print(f"Shape of movies = {movies.shape}")
print(f"Shape of tags = {tags.shape}")
print(f"Shape of genome_scores = {genome_scores.shape}")
print(f"Shape of genome_tags = {genome_tags.shape}")

Shape of ratings = (20000263, 4)
Shape of movies = (27278, 3)
Shape of tags = (465564, 4)
Shape of genome_scores = (11709768, 3)
Shape of genome_tags = (1128, 2)


In [None]:
import pandas as pd

"""
Loads & reduces the MovieLens 20M dataset:
- Keeps only users with >= min_user_ratings
- Keeps only movies with >= min_movie_ratings
- Subsamples to max_ratings
- Ensures movies/tags/genomes only contain valid movies

Returns: dict of clean DataFrames.

Subsample to your target size
Ensure referential integrity across movies, tags, genome_scores, genome_tags
NO reindexing inside the utility
"""

def filter_movielens_subset(
    ratings, movies, tags,
    genome_scores, genome_tags,
    min_user_ratings=5,
    min_movie_ratings=5,
    max_ratings=50000,
    random_state=42
):
    """
    Reduce already-loaded MovieLens tables to a random subset of ratings.

    Steps, in order:
      1. keep ratings from users with at least ``min_user_ratings`` ratings;
      2. keep ratings of movies with at least ``min_movie_ratings`` ratings
         (counted after step 1);
      3. shuffle and keep at most ``max_ratings`` rows;
      4. restrict ``movies``, ``tags`` and ``genome_scores`` to the movies
         that still appear in the ratings.

    The filters are applied once each (not iterated to a fixed point) and no
    IDs are re-indexed. Progress is printed after each step.

    Args:
        ratings, movies, tags, genome_scores: DataFrames with a ``movieId``
            column (``ratings`` also needs ``userId``).
        genome_tags: returned unchanged.
        min_user_ratings: minimum number of ratings a user must have.
        min_movie_ratings: minimum number of ratings a movie must have.
        max_ratings: maximum number of rating rows kept after shuffling.
        random_state: seed passed to ``DataFrame.sample`` for the shuffle.

    Returns:
        dict with keys ``ratings``, ``movies``, ``tags``, ``genome_scores``
        and ``genome_tags`` holding the reduced DataFrames.
    """
    print(f"Loaded {len(ratings)} total ratings.")

    # Filter users
    user_counts = ratings['userId'].value_counts()
    active_users = user_counts[user_counts >= min_user_ratings].index
    ratings = ratings[ratings['userId'].isin(active_users)]
    print(f"After filtering users: {len(ratings)} ratings, {ratings['userId'].nunique()} users.")

    # Filter movies
    movie_counts = ratings['movieId'].value_counts()
    popular_movies = movie_counts[movie_counts >= min_movie_ratings].index
    ratings = ratings[ratings['movieId'].isin(popular_movies)]
    print(f"After filtering movies: {len(ratings)} ratings, {ratings['movieId'].nunique()} movies.")

    # Subsample: full shuffle, then take the first max_ratings rows
    ratings = ratings.sample(frac=1, random_state=random_state).head(max_ratings)
    print(f"Final subsample: {len(ratings)} ratings.")

    # Only movies that survived the subsample are kept in the side tables
    valid_movies = ratings['movieId'].unique()

    movies = movies[movies['movieId'].isin(valid_movies)].copy()
    tags = tags[tags['movieId'].isin(valid_movies)].copy()
    genome_scores = genome_scores[genome_scores['movieId'].isin(valid_movies)].copy()
    # genome_tags untouched — only has tagId, global

    return {
        "ratings": ratings,
        "movies": movies,
        "tags": tags,
        "genome_scores": genome_scores,
        "genome_tags": genome_tags
    }

"""
Loads & reduces the MovieLens 20M dataset:
- Keeps only users with >= min_user_ratings
- Keeps only movies with >= min_movie_ratings
- Does not subsample rows; the size is bounded by max_users and max_movies instead
- Ensures movies/tags/genomes only contain valid movies

Returns: dict of clean DataFrames.

Subsample to your target size
Ensure referential integrity across movies, tags, genome_scores, genome_tags
NO reindexing inside the utility

Smart subset for denser R:
- Pick top max_users active users
- Pick top max_movies popular movies
- Only keep ratings involving these

Returns dict of clean DataFrames.
"""
def filter_movielens_subset_dense(
    ratings, movies, tags,
    genome_scores, genome_tags,
    min_user_ratings=20,
    min_movie_ratings=50,
    max_users=500,
    max_movies=1000
):
    """
    Reduce already-loaded MovieLens tables to a dense user x movie block.

    The ``max_users`` users with the most ratings (among those with at least
    ``min_user_ratings``) and the ``max_movies`` movies with the most ratings
    (among those with at least ``min_movie_ratings``) are selected. Both
    counts are taken on the unfiltered ``ratings``. Only ratings where both
    the user and the movie were selected are kept, and ``movies``, ``tags``
    and ``genome_scores`` are restricted to the movies that remain.

    No rows are randomly subsampled and no IDs are re-indexed. Progress is
    printed along the way.

    Args:
        ratings, movies, tags, genome_scores: DataFrames with a ``movieId``
            column (``ratings`` also needs ``userId``).
        genome_tags: returned unchanged.
        min_user_ratings: minimum rating count for a user to be eligible.
        min_movie_ratings: minimum rating count for a movie to be eligible.
        max_users: number of most-active eligible users to keep.
        max_movies: number of most-rated eligible movies to keep.

    Returns:
        dict with keys ``ratings``, ``movies``, ``tags``, ``genome_scores``
        and ``genome_tags``. Every filtered table is an independent copy, so
        callers can add columns without touching the inputs.
    """
    print(f"Loaded: {len(ratings)} total ratings.")

    # --- Get top users ---
    # value_counts() is sorted by descending count, so head() takes the most active.
    user_counts = ratings['userId'].value_counts()
    top_users = user_counts[user_counts >= min_user_ratings].head(max_users).index
    print(f"Selected {len(top_users)} top users.")

    # --- Get top movies ---
    movie_counts = ratings['movieId'].value_counts()
    top_movies = movie_counts[movie_counts >= min_movie_ratings].head(max_movies).index
    print(f"Selected {len(top_movies)} top movies.")

    # --- Filter ---
    # .copy() matches the other tables and avoids handing back a slice of the input.
    ratings = ratings[
        ratings['userId'].isin(top_users) &
        ratings['movieId'].isin(top_movies)
    ].copy()

    print(f"After filter: {len(ratings)} ratings")
    print(f"Unique users: {ratings['userId'].nunique()}, Unique movies: {ratings['movieId'].nunique()}")

    # Only movies that still have ratings are kept in the side tables
    valid_movies = ratings['movieId'].unique()

    movies = movies[movies['movieId'].isin(valid_movies)].copy()
    tags = tags[tags['movieId'].isin(valid_movies)].copy()
    genome_scores = genome_scores[genome_scores['movieId'].isin(valid_movies)].copy()

    return {
        "ratings": ratings,
        "movies": movies,
        "tags": tags,
        "genome_scores": genome_scores,
        "genome_tags": genome_tags
    }

# Build the dense subset: most active users x most rated movies.
downsampled_datasets = filter_movielens_subset_dense(
    ratings, movies, tags, genome_scores, genome_tags,
    min_user_ratings=20,
    min_movie_ratings=50,
    max_users=500,
    max_movies=1000,
)

# Rebind the working tables to their downsampled versions.
ratings, movies, tags, genome_scores, genome_tags = (
    downsampled_datasets[table]
    for table in ("ratings", "movies", "tags", "genome_scores", "genome_tags")
)

# Report the (rows, columns) of every table after downsampling.
print(f"Shape of ratings = {ratings.shape}")
print(f"Shape of movies = {movies.shape}")
print(f"Shape of tags = {tags.shape}")
print(f"Shape of genome_scores = {genome_scores.shape}")
print(f"Shape of genome_tags = {genome_tags.shape}")



Loaded: 20000263 total ratings.
Selected 500 top users.
Selected 1000 top movies.
After filter: 340639 ratings
Unique users: 500, Unique movies: 1000
Shape of ratings = (340639, 4)
Shape of movies = (1000, 3)
Shape of tags = (186830, 4)
Shape of genome_scores = (1128000, 3)
Shape of genome_tags = (1128, 2)


In [None]:
# Drop timestamp
# We’re not doing time-aware MF yet (Hence temporal factor not required)
# errors='ignore' keeps the cell idempotent: re-running it after the column
# is already gone is a no-op instead of a KeyError.
ratings = ratings.drop(columns=['timestamp'], errors='ignore')

In [None]:
# Map raw userId / movieId values onto consecutive integers (0..n-1),
# numbered in order of first appearance in the ratings table.
user_ids = ratings['userId'].unique()
movie_ids = ratings['movieId'].unique()

raw_userid_2_user_idx = dict(zip(map(int, user_ids), range(len(user_ids))))
raw_movieid_2_movie_idx = dict(zip(map(int, movie_ids), range(len(movie_ids))))

# Attach the compact indices next to the raw IDs.
ratings['user_idx'] = ratings['userId'].map(raw_userid_2_user_idx)
ratings['movie_idx'] = ratings['movieId'].map(raw_movieid_2_movie_idx)

ratings.head()


Unnamed: 0,userId,movieId,rating,user_idx,movie_idx
19846,156,1,5.0,0,0
19847,156,2,5.0,0,1
19848,156,3,2.0,0,2
19850,156,5,3.0,0,3
19851,156,6,4.0,0,4


(Optional) Filter cold users/items

In [None]:
# Count ratings per (compact) user index and movie index.
user_counts = ratings['user_idx'].value_counts()
movie_counts = ratings['movie_idx'].value_counts()

# Users / movies with at least 20 ratings survive the cold-start filter.
# The filter is applied once; it is not iterated to a fixed point.
keep_users = user_counts[user_counts >= 20].index
keep_movies = movie_counts[movie_counts >= 20].index

# Drop, in place, every rating whose user or movie is too cold.
ratings.drop(
    ratings[
        ~ratings['user_idx'].isin(keep_users) |
        ~ratings['movie_idx'].isin(keep_movies)
    ].index,
    inplace=True
)

# Old compact index -> new compact index (gap-free again after the drop).
rawuserid2index = {id_: idx for idx, id_ in enumerate(ratings['user_idx'].unique())}
rawmovieid2index = {id_: idx for idx, id_ in enumerate(ratings['movie_idx'].unique())}

# Re-compact the index columns with the mappings built just above.
# (Mapping them through the raw-ID dictionaries would look up index values as
# if they were raw IDs and produce NaN / wrong indices.)
ratings['user_idx'] = ratings['user_idx'].map(rawuserid2index)
ratings['movie_idx'] = ratings['movie_idx'].map(rawmovieid2index)

# Refresh the raw-ID -> index dictionaries so that later cells (matrix
# dimensions, recommendation lookups) agree with the re-compacted columns.
raw_userid_2_user_idx = {
    int(raw_id): int(idx)
    for raw_id, idx in ratings[['userId', 'user_idx']].drop_duplicates().itertuples(index=False)
}
raw_movieid_2_movie_idx = {
    int(raw_id): int(idx)
    for raw_id, idx in ratings[['movieId', 'movie_idx']].drop_duplicates().itertuples(index=False)
}


In [None]:
# Peek at the first ten distinct raw user IDs present in the ratings table.
ratings["userId"].unique()[:10]

array([ 156,  741,  768,  775,  903,  982, 1849, 2261, 2397, 3284])

In [None]:
# (rows, columns) of the ratings table at this point in the notebook.
print(f"Shape of ratings after dropping users/movies = {ratings.shape}")

Shape of ratings after dropping users/movies = (340639, 5)


In [None]:
import numpy as np

n_users, n_items = len(raw_userid_2_user_idx), len(raw_movieid_2_movie_idx)

# Dense user x item matrix; NaN marks "not rated".
R = np.full((n_users, n_items), np.nan)

# Scatter all ratings in one vectorised assignment instead of a per-row Python loop.
R[ratings['user_idx'].to_numpy(), ratings['movie_idx'].to_numpy()] = ratings['rating'].to_numpy()

print(R.shape)
# Share of cells that hold no rating.
print(f"Sparsity: {100 * (1 - np.count_nonzero(~np.isnan(R)) / R.size):.2f}%")


(500, 1000)
Sparsity: 31.87%


In [None]:
"""
R : User-Item interaction matrix (NumPy 2D array)
K : Number of latent factors
"""

"""
Vanilla MF with Mini Batch SGD(batch_size=256) + Early stopping

Mini Batch SGD
    - Instead of pure SGD, each epoch processes the data in batches of 256
    - Within each batch, updates are still simple SGD

Early Stopping
    - Keeps track of the best RMSE seen so far
    - If RMSE doesn’t improve for patience epochs, training stops early.
"""
class VanillaMFWithSGD:
    """
    Matrix factorization without bias terms: a rating is modelled as U[u] . V[i].

    Observed ratings are min-max scaled to [0, 1] for training; reported RMSE
    values and ``predict_full_matrix`` are expressed in the original scale.

    Args:
        R: 2-D NumPy array (users x items); NaN marks an unobserved rating.
        K: number of latent factors.
        learning_rate: step size used by every trainer.
        lambda_coeff: L2 regularisation strength.
        n_epochs: maximum number of passes over the observed ratings.
        patience: epochs without RMSE improvement tolerated by
            ``train_minibatch`` before it stops.
        random_state: optional int seed. When given, factor initialisation and
            shuffling use a private ``np.random.RandomState``; when ``None``
            (default) NumPy's global generator is used, as before.
    """

    def __init__(self, R, K=10, learning_rate=0.01, lambda_coeff=0.05, n_epochs=50, patience=5,
                 random_state=None):
        self.R = R
        self.K = K

        self.learning_rate = learning_rate
        self.lambda_coeff = lambda_coeff
        self.n_epochs = n_epochs

        self.n_users, self.n_items = R.shape

        # np.random (module) and RandomState expose the same rand/shuffle API.
        self._rng = np.random if random_state is None else np.random.RandomState(random_state)

        # Store min/max for denormalizing
        self.min_rating = np.nanmin(R)
        self.max_rating = np.nanmax(R)

        self.patience = patience  # Early stopping patience

        # Normalize R for stable training
        self.R_norm = self.normalize(R)

        # Create training samples
        self.X = self.matrix_to_trainingsamples(self.R_norm)

        # Initialize factor matrices uniformly in [0, 1)
        self.U = self._rng.rand(self.n_users, K)
        self.V = self._rng.rand(self.n_items, K)

    def normalize(self, X):
        """Scale the known entries of X to [0, 1]; NaN entries stay NaN.

        If every known entry has the same value the range is zero; the known
        entries are then mapped to 0 instead of dividing by zero.
        """
        lowest = np.nanmin(X)
        span = np.nanmax(X) - lowest
        if span == 0:
            return X - lowest
        return (X - lowest) / span

    def denormalize(self, x):
        """Convert values from the [0, 1] training scale back to the rating scale of R."""
        return (x * (self.max_rating - self.min_rating)) + self.min_rating

    def matrix_to_trainingsamples(self, R):
        """Return the observed entries of R as an (n_observed, 3) float array.

        Each row is [user_index, item_index, rating], listed in row-major
        order. An all-NaN matrix yields an array of shape (0, 3).
        """
        observed = np.argwhere(~np.isnan(R))
        values = R[observed[:, 0], observed[:, 1]]
        return np.column_stack((observed.astype(float), values))

    def _sgd_step(self, u, i, r, lr):
        """Apply one regularised SGD update for the sample (u, i, r) with step size lr."""
        error = r - self.predict_rating(u, i)

        # U[u] is needed unmodified for the V update, so keep a copy.
        U_u_old = self.U[u].copy()

        self.U[u] += lr * (error * self.V[i] - self.lambda_coeff * U_u_old)
        self.V[i] += lr * (error * U_u_old - self.lambda_coeff * self.V[i])

    def train(self):
        """Train U, V with plain per-sample SGD for n_epochs epochs.

        RMSE is printed on the first epoch, every 50th epoch after it, and on
        the last epoch.
        """
        for epoch in range(self.n_epochs):
            self._rng.shuffle(self.X)

            for u, i, r in self.X:
                self._sgd_step(int(u), int(i), r, self.learning_rate)

            # Periodic reporting
            if epoch % 50 == 0 or epoch == self.n_epochs - 1:
                rmse = self.compute_rmse()
                print(f"Epoch {epoch+1}/{self.n_epochs} — RMSE: {rmse:.4f}")

    def train_minibatch(self, batch_size=256):
        """Train in chunks of batch_size samples with early stopping.

        The data is walked batch by batch, but parameters are still updated
        after every single sample (the gradient is not averaged over a batch).
        Training stops once the training RMSE has failed to improve for
        ``self.patience`` consecutive epochs; the factors from the best epoch
        are then restored so the model matches the reported best RMSE.

        Args:
            batch_size: number of samples per chunk (default 256).

        Raises:
            ValueError: if batch_size is not a positive integer.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        best_rmse = float('inf')
        best_factors = None
        no_improve = 0

        for epoch in range(self.n_epochs):
            self._rng.shuffle(self.X)

            for batch_start in range(0, len(self.X), batch_size):
                for u, i, r in self.X[batch_start:batch_start + batch_size]:
                    self._sgd_step(int(u), int(i), r, self.learning_rate)

            # End of epoch: check RMSE
            rmse = self.compute_rmse()
            print(f"Epoch {epoch+1}/{self.n_epochs} — RMSE: {rmse:.4f}")

            # Early stopping logic
            if rmse < best_rmse:
                best_rmse = rmse
                best_factors = (self.U.copy(), self.V.copy())
                no_improve = 0
            else:
                no_improve += 1

            if no_improve >= self.patience:
                print(f"Early stopping triggered at epoch {epoch+1}. Best RMSE: {best_rmse:.4f}")
                if best_factors is not None:
                    self.U, self.V = best_factors
                break

    def train_adam(self, beta1=0.9, beta2=0.999, epsilon=1e-8):
        """Train U, V with per-sample ADAM updates (no biases).

        Moment estimates are kept per factor row. The bias-correction step
        counter ``t`` is global: it counts every sample processed, not the
        number of times a particular row has been updated.

        Args:
            beta1: decay rate of the first-moment estimate.
            beta2: decay rate of the second-moment estimate.
            epsilon: small constant added to the denominator.
        """
        m_U = np.zeros_like(self.U)
        v_U = np.zeros_like(self.U)
        m_V = np.zeros_like(self.V)
        v_V = np.zeros_like(self.V)

        t = 0

        for epoch in range(self.n_epochs):
            self._rng.shuffle(self.X)

            for u, i, r in self.X:
                t += 1
                u, i = int(u), int(i)

                error = r - self.predict_rating(u, i)

                # Both gradients are evaluated before either factor is changed.
                grad_U = -error * self.V[i] + self.lambda_coeff * self.U[u]
                grad_V = -error * self.U[u] + self.lambda_coeff * self.V[i]

                # Update moments for U
                m_U[u] = beta1 * m_U[u] + (1 - beta1) * grad_U
                v_U[u] = beta2 * v_U[u] + (1 - beta2) * (grad_U ** 2)

                m_hat_U = m_U[u] / (1 - beta1 ** t)
                v_hat_U = v_U[u] / (1 - beta2 ** t)

                self.U[u] -= self.learning_rate * m_hat_U / (np.sqrt(v_hat_U) + epsilon)

                # Update moments for V
                m_V[i] = beta1 * m_V[i] + (1 - beta1) * grad_V
                v_V[i] = beta2 * v_V[i] + (1 - beta2) * (grad_V ** 2)

                m_hat_V = m_V[i] / (1 - beta1 ** t)
                v_hat_V = v_V[i] / (1 - beta2 ** t)

                self.V[i] -= self.learning_rate * m_hat_V / (np.sqrt(v_hat_V) + epsilon)

            rmse = self.compute_rmse()
            print(f"Epoch {epoch+1}/{self.n_epochs} — RMSE: {rmse:.4f}")

    def train_with_scheduler(self, decay_factor=0.5, decay_epochs=10):
        """Train with per-sample SGD and a step-decay learning-rate schedule.

        Args:
            decay_factor: multiplier applied to the learning rate at each decay.
            decay_epochs: the rate is decayed every this many epochs
                (never before the first epoch).
        """
        current_lr = self.learning_rate

        for epoch in range(self.n_epochs):
            if epoch > 0 and epoch % decay_epochs == 0:
                current_lr *= decay_factor
                print(f"Step Decay: New LR = {current_lr:.6f}")

            self._rng.shuffle(self.X)

            for u, i, r in self.X:
                self._sgd_step(int(u), int(i), r, current_lr)

            rmse = self.compute_rmse()
            print(f"Epoch {epoch+1}/{self.n_epochs} — RMSE: {rmse:.4f}")

    def compute_rmse(self):
        """Return the RMSE over the training samples, in the original rating scale.

        Returns NaN when there are no training samples.
        """
        if len(self.X) == 0:
            return float('nan')

        users = self.X[:, 0].astype(int)
        items = self.X[:, 1].astype(int)

        # Row-wise dot products U[u] . V[i] for every sample at once.
        predictions = np.sum(self.U[users] * self.V[items], axis=1)
        rmse = np.sqrt(np.mean((self.X[:, 2] - predictions) ** 2))

        # Return in original scale
        return rmse * (self.max_rating - self.min_rating)

    def predict_rating(self, u, i):
        """Predict the normalised ([0, 1] scale) rating of user u for item i."""
        return np.dot(self.U[u], self.V[i])

    def predict_full_matrix(self):
        """Predict the full rating matrix in the original scale."""
        R_hat = np.dot(self.U, self.V.T)
        return self.denormalize(R_hat)


def recommendTopNMoviesForUser(
    mf,  # your trained MF object
    username,  # e.g., "Alice"
    user_name_to_id,  # "Alice" → raw userId
    raw_userid_2_user_idx,  # raw userId → user_idx
    movie_idx_2_raw_movieid,  # movie_idx → raw movieId
    movieId2title,  # raw movieId → title
    N=5 # Top N recommendations
):
    """
    Return up to N movies the user has not rated, best predicted score first.

    Args:
        mf: trained model exposing ``predict_full_matrix()`` and the rating
            matrix ``R`` (NaN = not rated).
        username: key into ``user_name_to_id``.
        user_name_to_id: user name -> raw userId.
        raw_userid_2_user_idx: raw userId -> row index of ``mf.R``.
        movie_idx_2_raw_movieid: column index of ``mf.R`` -> raw movieId.
        movieId2title: raw movieId -> title; missing IDs give "Unknown Title".
        N: maximum number of recommendations; values <= 0 give an empty list.

    Returns:
        list of ``(raw_movie_id, title, predicted_score)`` tuples. Already
        rated movies are never returned, so the list is shorter than N when
        the user has fewer than N unrated movies.

    Raises:
        KeyError: if the user name or raw user ID is unknown.
    """
    # Resolve user name -> raw ID -> matrix row
    raw_user_id = user_name_to_id[username]
    user_idx = raw_userid_2_user_idx[raw_user_id]

    # Predicted scores for this user (copied so the model's output is never mutated)
    user_scores = np.array(mf.predict_full_matrix()[user_idx], dtype=float)

    # Candidates: movies without a known rating and with a usable prediction
    unseen = np.isnan(mf.R[user_idx])
    candidates = np.flatnonzero(unseen & ~np.isnan(user_scores))

    # Clamp N: a slice such as [-0:] would otherwise select every item
    n_top = min(max(int(N), 0), candidates.size)
    if n_top == 0:
        return []

    # Highest predicted score first
    ranked = candidates[np.argsort(user_scores[candidates])[::-1][:n_top]]

    recommendations = []
    for idx in ranked:
        raw_movie_id = movie_idx_2_raw_movieid[int(idx)]
        title = movieId2title.get(raw_movie_id, "Unknown Title")
        recommendations.append( (raw_movie_id, title, user_scores[idx]) )

    return recommendations



In [None]:
"""
This will be needed later for:-
    - Interpreting results
    - Making top-N recommendations human-readable
"""

# Friendly names for a few raw user IDs, used to pick who gets recommendations.
user_name_to_id = {"Alice": 156, "Triparna": 741, "Srishti": 768}

# Create ID mappings for later
# (matrix column index -> raw movieId, and raw movieId -> title)
movie_idx_2_raw_movieid = {idx: id_ for id_, idx in raw_movieid_2_movie_idx.items()}
movieId2title = pd.Series(movies["title"].values, index=movies["movieId"]).to_dict()

# Seed NumPy's global generator so the random factor initialisation and the
# per-epoch shuffles, and therefore the RMSE trace and recommendations, are repeatable.
np.random.seed(42)

vanilla_mf = VanillaMFWithSGD(R, K=10)
vanilla_mf.train_minibatch()

# Recommend:
recommendations = recommendTopNMoviesForUser(
    vanilla_mf,
    username="Alice",
    user_name_to_id=user_name_to_id,
    raw_userid_2_user_idx=raw_userid_2_user_idx,
    movie_idx_2_raw_movieid=movie_idx_2_raw_movieid,
    movieId2title=movieId2title,
    N=5
)

for movieId, title, score in recommendations:
    print(f"MovieID: {movieId} | Title: {title} | Predicted Score: {score:.2f}")


Epoch 1/50 — RMSE: 0.9497
Epoch 2/50 — RMSE: 0.8572
Epoch 3/50 — RMSE: 0.8427
Epoch 4/50 — RMSE: 0.8410
Epoch 5/50 — RMSE: 0.8421
Epoch 6/50 — RMSE: 0.8421
Epoch 7/50 — RMSE: 0.8435
Epoch 8/50 — RMSE: 0.8443
Epoch 9/50 — RMSE: 0.8446
Early stopping triggered at epoch 9. Best RMSE: 0.8410
Raw User ID =  156
User Idx =  0
MovieID: 922 | Title: Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) | Predicted Score: 4.72
MovieID: 260 | Title: Star Wars: Episode IV - A New Hope (1977) | Predicted Score: 4.68
MovieID: 1213 | Title: Goodfellas (1990) | Predicted Score: 4.67
MovieID: 1203 | Title: 12 Angry Men (1957) | Predicted Score: 4.67
MovieID: 2019 | Title: Seven Samurai (Shichinin no samurai) (1954) | Predicted Score: 4.65


In [None]:
import numpy as np
import math as math

"""
R : User-Item interaction matrix
k : Latent factors
"""
class MFWithBias:
    """
    Matrix factorization with global mean, user bias and item bias.

    A rating is modelled as ``mu + b_u[u] + b_i[i] + U[u] . V[i]`` and is
    trained directly on the original rating scale (no normalisation).

    Args:
        R: 2-D NumPy array (users x items); NaN marks an unobserved rating.
        K: number of latent factors.
        n_epochs: maximum number of passes over the observed ratings.
        alpha: learning rate.
        lambda_reg: L2 regularisation strength for biases and factors.
        random_state: optional int seed. When given, factor initialisation and
            shuffling use a private ``np.random.RandomState``; when ``None``
            (default) NumPy's global generator is used, as before.
    """

    def __init__(self, R, K=10, n_epochs=100, alpha=0.01, lambda_reg=0.02, random_state=None):
        self.R = R
        self.K = K

        self.n_epochs = n_epochs
        self.alpha = alpha
        self.lambda_reg = lambda_reg

        self.n_users, self.n_items = R.shape

        # np.random (module) and RandomState expose the same normal/shuffle API.
        self._rng = np.random if random_state is None else np.random.RandomState(random_state)

        # Rating range, required by denormalize()
        self.min_rating = np.nanmin(R)
        self.max_rating = np.nanmax(R)

        # Training data: list of observed triplets
        self.X = self.matrix_to_samples(R)

        # Global mean
        self.mu = np.nanmean(R)

        # Initialize biases
        self.b_u = np.zeros(self.n_users)
        self.b_i = np.zeros(self.n_items)

        # Initialize latent factor matrices with small Gaussian noise
        self.U = self._rng.normal(scale=0.1, size=(self.n_users, K))
        self.V = self._rng.normal(scale=0.1, size=(self.n_items, K))

    def normalize(self, X):
        """Scale the known entries of X to [0, 1]; NaN entries stay NaN.

        If every known entry has the same value the range is zero; the known
        entries are then mapped to 0 instead of dividing by zero.
        """
        lowest = np.nanmin(X)
        span = np.nanmax(X) - lowest
        if span == 0:
            return X - lowest
        return (X - lowest) / span

    def denormalize(self, x):
        """Map values from [0, 1] back onto the rating range of R."""
        return (x * (self.max_rating - self.min_rating)) + self.min_rating

    def matrix_to_samples(self, R):
        """Return the observed entries of R as a list of (user, item, rating) tuples.

        Entries are listed in row-major order; user and item are Python ints.
        """
        return [
            (int(u), int(i), float(R[u, i]))
            for u, i in np.argwhere(~np.isnan(R))
        ]

    def predict_rating(self, u, i):
        """Predict the rating of user u for item i from biases and latent factors."""
        pred = self.mu + self.b_u[u] + self.b_i[i] + np.dot(self.U[u], self.V[i])
        return pred

    def _sgd_step(self, u, i, r, lr):
        """Apply one regularised SGD update for the sample (u, i, r) with step size lr.

        The error is computed once, before any parameter changes, and the
        factor updates both use the pre-update U[u].
        """
        error = r - self.predict_rating(u, i)

        # U[u] is needed unmodified for the V update, so keep a copy.
        U_old = self.U[u].copy()

        self.b_u[u] += lr * (error - self.lambda_reg * self.b_u[u])
        self.b_i[i] += lr * (error - self.lambda_reg * self.b_i[i])

        self.U[u] += lr * (error * self.V[i] - self.lambda_reg * U_old)
        self.V[i] += lr * (error * U_old - self.lambda_reg * self.V[i])

    def train(self):
        """Run plain per-sample SGD for n_epochs epochs, printing the RMSE after each."""
        for epoch in range(self.n_epochs):
            self._rng.shuffle(self.X)
            for u, i, r in self.X:
                self._sgd_step(u, i, r, self.alpha)

            # Monitor training loss
            rmse = self.compute_rmse()
            print(f"Epoch {epoch+1}/{self.n_epochs} — RMSE: {rmse:.4f}")

    def train_minibatch(self, batch_size=256, patience=5, validation_samples=None):
        """Train in chunks of batch_size samples with early stopping.

        The data is walked batch by batch, but parameters are still updated
        after every single sample (the gradient is not averaged over a batch).
        Training stops once the monitored RMSE has failed to improve for
        ``patience`` consecutive epochs; the parameters from the best epoch
        are then restored.

        Args:
            batch_size (int): how many samples per batch.
            patience (int): how many epochs to wait for improvement.
            validation_samples: list of (u, i, r) for validation RMSE. When
                omitted or empty, the training RMSE is monitored instead.

        Raises:
            ValueError: if batch_size is not a positive integer.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # Explicit None/length test: truthiness is ambiguous for NumPy arrays.
        has_validation = validation_samples is not None and len(validation_samples) > 0

        best_val_rmse = float('inf')
        best_state = None
        patience_counter = 0

        for epoch in range(self.n_epochs):

            self._rng.shuffle(self.X)

            for batch_start in range(0, len(self.X), batch_size):
                batch = self.X[batch_start: batch_start + batch_size]

                for u, i, r in batch:
                    self._sgd_step(int(u), int(i), r, self.alpha)

            # Evaluate
            train_rmse = self.compute_rmse()
            val_rmse = self.compute_rmse(samples=validation_samples) if has_validation else train_rmse

            print(f"Epoch {epoch+1}: Train RMSE = {train_rmse:.4f}, Val RMSE = {val_rmse:.4f}")

            # Early stopping check
            if val_rmse < best_val_rmse:
                best_val_rmse = val_rmse
                best_state = (self.b_u.copy(), self.b_i.copy(), self.U.copy(), self.V.copy())
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print(f"Early stopping at epoch {epoch+1}")
                    if best_state is not None:
                        self.b_u, self.b_i, self.U, self.V = best_state
                    break

    def train_adam(self, beta1=0.9, beta2=0.999, epsilon=1e-8):
        """Train biases and factors with per-sample ADAM updates.

        Moment estimates are kept per parameter row. The bias-correction step
        counter ``t`` is global: it counts every sample processed, not the
        number of times a particular row has been updated.

        Args:
            beta1: decay rate of the first-moment estimate.
            beta2: decay rate of the second-moment estimate.
            epsilon: small constant added to the denominator.
        """
        # Moment estimates for U and V
        m_U = np.zeros_like(self.U)
        v_U = np.zeros_like(self.U)

        m_V = np.zeros_like(self.V)
        v_V = np.zeros_like(self.V)

        # Also for biases:
        m_b_u = np.zeros_like(self.b_u)
        v_b_u = np.zeros_like(self.b_u)
        m_b_i = np.zeros_like(self.b_i)
        v_b_i = np.zeros_like(self.b_i)

        t = 0

        for epoch in range(self.n_epochs):
            self._rng.shuffle(self.X)

            for u, i, r in self.X:
                u, i = int(u), int(i)
                t += 1

                pred = self.predict_rating(u, i)
                error = r - pred

                # Gradients (all evaluated before any parameter is changed)
                grad_U = -error * self.V[i] + self.lambda_reg * self.U[u]
                grad_V = -error * self.U[u] + self.lambda_reg * self.V[i]
                grad_b_u = -error + self.lambda_reg * self.b_u[u]
                grad_b_i = -error + self.lambda_reg * self.b_i[i]

                # Update moments for U
                m_U[u] = beta1 * m_U[u] + (1 - beta1) * grad_U
                v_U[u] = beta2 * v_U[u] + (1 - beta2) * (grad_U ** 2)

                m_hat_U = m_U[u] / (1 - beta1 ** t)
                v_hat_U = v_U[u] / (1 - beta2 ** t)

                self.U[u] -= self.alpha * m_hat_U / (np.sqrt(v_hat_U) + epsilon)

                # Update moments for V
                m_V[i] = beta1 * m_V[i] + (1 - beta1) * grad_V
                v_V[i] = beta2 * v_V[i] + (1 - beta2) * (grad_V ** 2)

                m_hat_V = m_V[i] / (1 - beta1 ** t)
                v_hat_V = v_V[i] / (1 - beta2 ** t)

                self.V[i] -= self.alpha * m_hat_V / (np.sqrt(v_hat_V) + epsilon)

                # Biases
                m_b_u[u] = beta1 * m_b_u[u] + (1 - beta1) * grad_b_u
                v_b_u[u] = beta2 * v_b_u[u] + (1 - beta2) * (grad_b_u ** 2)
                m_hat_b_u = m_b_u[u] / (1 - beta1 ** t)
                v_hat_b_u = v_b_u[u] / (1 - beta2 ** t)
                self.b_u[u] -= self.alpha * m_hat_b_u / (np.sqrt(v_hat_b_u) + epsilon)

                m_b_i[i] = beta1 * m_b_i[i] + (1 - beta1) * grad_b_i
                v_b_i[i] = beta2 * v_b_i[i] + (1 - beta2) * (grad_b_i ** 2)
                m_hat_b_i = m_b_i[i] / (1 - beta1 ** t)
                v_hat_b_i = v_b_i[i] / (1 - beta2 ** t)
                self.b_i[i] -= self.alpha * m_hat_b_i / (np.sqrt(v_hat_b_i) + epsilon)

            rmse = self.compute_rmse()
            print(f"Epoch {epoch+1}: RMSE: {rmse:.4f}")

    def train_with_scheduler(self, decay_factor=0.5, decay_epochs=10):
        """Train with per-sample SGD and a step-decay learning-rate schedule.

        Args:
            decay_factor: multiplier applied to the learning rate at each decay.
            decay_epochs: the rate is decayed every this many epochs
                (never before the first epoch).
        """
        current_lr = self.alpha

        for epoch in range(self.n_epochs):
            if epoch > 0 and epoch % decay_epochs == 0:
                current_lr *= decay_factor
                print(f"Step Decay: New LR = {current_lr:.6f}")

            self._rng.shuffle(self.X)

            for u, i, r in self.X:
                self._sgd_step(int(u), int(i), r, current_lr)

            rmse = self.compute_rmse()
            print(f"Epoch {epoch+1}: RMSE: {rmse:.4f}")

    def compute_rmse(self, samples=None):
        """Return the RMSE over (user, item, rating) samples.

        Args:
            samples: sequence of (u, i, r) triples; defaults to the training
                samples ``self.X``.

        Returns:
            float RMSE, or NaN when there are no samples.
        """
        data = self.X if samples is None else samples
        if len(data) == 0:
            return float('nan')

        triples = np.asarray(data, dtype=float)
        users = triples[:, 0].astype(int)
        items = triples[:, 1].astype(int)

        # Same formula as predict_rating, evaluated for all samples at once.
        predictions = (
            self.mu
            + self.b_u[users]
            + self.b_i[items]
            + np.sum(self.U[users] * self.V[items], axis=1)
        )
        return float(np.sqrt(np.mean((triples[:, 2] - predictions) ** 2)))

    def predict_full_matrix(self):
        """Predict the full User-Item matrix with biases."""
        # Broadcasting adds the user bias down rows and the item bias across columns.
        return self.mu + self.b_u[:, None] + self.b_i[None, :] + np.dot(self.U, self.V.T)


def recommendTopNMoviesForUser(
    mf,  # trained MF object exposing predict_full_matrix() and R
    username,  # e.g., "Alice"
    user_name_2_raw_user_id,  # "Alice" → raw userId
    raw_userid_2_user_idx,  # raw userId → user_idx
    movie_idx_2_raw_movieid,  # movie_idx → raw movieId
    movieId2title,  # raw movieId → title
    N=5 # Top N recommendations
):
    """Return up to ``N`` highest-scoring movies the user has not rated yet.

    The user is resolved username → raw userId → row index. Scores come from
    row ``user_idx`` of ``mf.predict_full_matrix()``. Movies whose entry in
    ``mf.R[user_idx]`` is not NaN are treated as already rated and are never
    returned, even when ``N`` exceeds the number of unrated movies.

    Args:
        mf: Trained model providing ``predict_full_matrix()`` and ``R``.
        username: Key into ``user_name_2_raw_user_id``.
        user_name_2_raw_user_id: Mapping username → raw userId.
        raw_userid_2_user_idx: Mapping raw userId → row index.
        movie_idx_2_raw_movieid: Mapping column index → raw movieId.
        movieId2title: Mapping raw movieId → title.
        N: Maximum number of recommendations; ``N <= 0`` yields ``[]``.

    Returns:
        list[tuple]: ``(raw_movie_id, title, score)`` tuples, best score first.
        Titles missing from ``movieId2title`` become ``"Unknown Title"``.

    Raises:
        KeyError: if the username, raw userId or a movie index is unmapped.
    """
    # Resolve the user first so unknown users still fail loudly.
    raw_user_id = user_name_2_raw_user_id[username]
    user_idx = raw_userid_2_user_idx[raw_user_id]

    # Slicing with [-0:] would return *every* item, so handle N <= 0 up front.
    if N <= 0:
        return []

    # Float copy: masking with -inf must not write into the model's own array.
    user_scores = np.array(mf.predict_full_matrix()[user_idx], dtype=float)

    # Block everything the user has already rated.
    already_rated = ~np.isnan(mf.R[user_idx])
    user_scores[already_rated] = -np.inf

    # Never take more items than there are unrated movies; otherwise the
    # blocked (-inf) entries would leak into the result.
    n_take = min(N, int(np.count_nonzero(~already_rated)))
    if n_take == 0:
        return []

    top_idx = np.argsort(user_scores)[-n_take:][::-1]

    recommendations = []
    for idx in top_idx:
        raw_movie_id = movie_idx_2_raw_movieid[int(idx)]
        title = movieId2title.get(raw_movie_id, "Unknown Title")
        recommendations.append((raw_movie_id, title, user_scores[idx]))

    return recommendations

In [None]:
# Display names for the demo users, keyed to their raw userIds.
user_name_2_raw_user_id = dict(Alice=156, Triparna=741, Srishti=768)

# Lookup tables used later when presenting results:
#   movie index → raw movieId (inverse of raw_movieid_2_movie_idx)
#   raw movieId → title
movie_idx_2_raw_movieid = dict(
    zip(raw_movieid_2_movie_idx.values(), raw_movieid_2_movie_idx.keys())
)
movieId2title = movies.set_index("movieId")["title"].to_dict()

# Build the bias-aware factorisation on R with K=10 and train it.
mf_bias = MFWithBias(R, K=10)
mf_bias.train_minibatch()





Epoch 1: Train RMSE = 0.8130, Val RMSE = 0.8130
Epoch 2: Train RMSE = 0.8112, Val RMSE = 0.8112
Epoch 3: Train RMSE = 0.8082, Val RMSE = 0.8082
Epoch 4: Train RMSE = 0.7975, Val RMSE = 0.7975
Epoch 5: Train RMSE = 0.7765, Val RMSE = 0.7765
Epoch 6: Train RMSE = 0.7596, Val RMSE = 0.7596
Epoch 7: Train RMSE = 0.7451, Val RMSE = 0.7451
Epoch 8: Train RMSE = 0.7332, Val RMSE = 0.7332
Epoch 9: Train RMSE = 0.7245, Val RMSE = 0.7245
Epoch 10: Train RMSE = 0.7181, Val RMSE = 0.7181
Epoch 11: Train RMSE = 0.7131, Val RMSE = 0.7131
Epoch 12: Train RMSE = 0.7095, Val RMSE = 0.7095
Epoch 13: Train RMSE = 0.7067, Val RMSE = 0.7067
Epoch 14: Train RMSE = 0.7048, Val RMSE = 0.7048
Epoch 15: Train RMSE = 0.7034, Val RMSE = 0.7034
Epoch 16: Train RMSE = 0.7023, Val RMSE = 0.7023
Epoch 17: Train RMSE = 0.7015, Val RMSE = 0.7015
Epoch 18: Train RMSE = 0.7008, Val RMSE = 0.7008
Epoch 19: Train RMSE = 0.7005, Val RMSE = 0.7005
Epoch 20: Train RMSE = 0.7001, Val RMSE = 0.7001
Epoch 21: Train RMSE = 0.6996

In [None]:
# User bias vector
user_biases = mf_bias.b_u

# Show its shape as the cell result; the recorded output of this cell is a
# shape tuple, which a bare assignment cannot produce.
user_biases.shape


(500,)


In [None]:
import numpy as np

# Use Case 2 — Identify Consistently Harsh or Generous Raters

"""
Identifies harsh and generous raters based on user biases b_u.
Maps them back to raw userIds and usernames if available.
"""

def find_extreme_raters(
    mf,  # trained MFWithBias
    user_idx_2_raw_userid,  # { index → raw userId }
    raw_user_id_2_user_name,  # {  raw userId → username }
    top_percent=5
):
    """Split users into generous and harsh raters according to ``mf.b_u``.

    A user is "generous" when their bias is at or above the
    ``(100 - top_percent)``-th percentile and "harsh" when it is at or below
    the ``top_percent``-th percentile. The generous test is applied first, so
    a user is never placed in both lists. Both thresholds are printed.

    Args:
        mf: Object whose ``b_u`` holds one bias per user index.
        user_idx_2_raw_userid: Mapping user index → raw userId; looked up for
            every index.
        raw_user_id_2_user_name: Mapping raw userId → username; missing ids
            are reported as ``'User NA'``.
        top_percent: Size of each tail, in percent.

    Returns:
        tuple[list[dict], list[dict]]: ``(generous_users, harsh_users)``; each
        dict has keys ``user_idx``, ``raw_user_id``, ``username``, ``bias``.
    """
    all_biases = mf.b_u

    upper_cut = np.percentile(all_biases, 100 - top_percent)
    lower_cut = np.percentile(all_biases, top_percent)

    print(f"Generous threshold: b_u >= {upper_cut:.4f}")
    print(f"Harsh threshold: b_u <= {lower_cut:.4f}")

    generous_users, harsh_users = [], []

    for user_idx, user_bias in enumerate(all_biases):
        raw_id = user_idx_2_raw_userid[user_idx]
        record = dict(
            user_idx=user_idx,
            raw_user_id=raw_id,
            username=raw_user_id_2_user_name.get(raw_id, 'User NA'),
            bias=user_bias,
        )

        # Generous takes precedence when a bias satisfies both conditions.
        if user_bias >= upper_cut:
            generous_users.append(record)
            continue
        if user_bias <= lower_cut:
            harsh_users.append(record)

    return generous_users, harsh_users

# Invert raw userId → user_idx so matrix rows can be traced back to raw userIds.
# NOTE(review): the recorded run of this cell failed with a NameError on
# raw_userid_2_user_idx, so that mapping must be created by an earlier cell.
user_idx_2_raw_userid = dict(
    zip(raw_userid_2_user_idx.values(), raw_userid_2_user_idx.keys())
)

# Likewise invert username → raw userId into raw userId → username.
raw_user_id_2_user_name = {
    raw_id: name for name, raw_id in user_name_2_raw_user_id.items()
}

# Top 5% most generous and most harsh users of the trained model.
generous_users, harsh_users = find_extreme_raters(
    mf_bias,
    user_idx_2_raw_userid,
    raw_user_id_2_user_name,
    top_percent=5,
)

for heading, group in (
    ("\n🌟 Generous Users:", generous_users),
    ("\n🥶 Harsh Users:", harsh_users),
):
    print(heading)
    for user in group:
        print(user)


NameError: name 'raw_userid_2_user_idx' is not defined

In [None]:
# Use Case 3 — Spot Systematically Overrated or Underrated Movies

def find_extreme_movies(
    mf,  # trained MFWithBias
    item_idx_2_raw_movieid,  # { index → raw movieId }
    raw_movie_id_2_title,  # { raw movieId → title }
    top_percent=5
):
    """Split movies into the highest- and lowest-bias groups according to ``mf.b_i``.

    A movie is "overrated" when its bias is at or above the
    ``(100 - top_percent)``-th percentile and "underrated" when it is at or
    below the ``top_percent``-th percentile; the overrated test wins when both
    hold. Both thresholds are printed.

    Args:
        mf: Object whose ``b_i`` holds one bias per item index.
        item_idx_2_raw_movieid: Mapping item index → raw movieId; looked up
            for every index.
        raw_movie_id_2_title: Mapping raw movieId → title; missing ids are
            reported as ``"Title NA"``.
        top_percent: Size of each tail, in percent.

    Returns:
        tuple[list[dict], list[dict]]: ``(overrated_movies, underrated_movies)``;
        each dict has keys ``item_idx``, ``raw_movie_id``, ``title``, ``bias``.
    """
    item_bias = mf.b_i

    upper_cut = np.percentile(item_bias, 100 - top_percent)
    lower_cut = np.percentile(item_bias, top_percent)

    print(f"Overrated threshold: b_i >= {upper_cut:.4f}")
    print(f"Underrated threshold: b_i <= {lower_cut:.4f}")

    def describe(position, value):
        # One record per movie, resolved back to its raw id and title.
        movie_id = item_idx_2_raw_movieid[position]
        return {
            "item_idx": position,
            "raw_movie_id": movie_id,
            "title": raw_movie_id_2_title.get(movie_id, "Title NA"),
            "bias": value,
        }

    records = [describe(position, value) for position, value in enumerate(item_bias)]

    overrated_movies = [rec for rec in records if rec["bias"] >= upper_cut]
    underrated_movies = [
        rec
        for rec in records
        if not (rec["bias"] >= upper_cut) and rec["bias"] <= lower_cut
    ]

    return overrated_movies, underrated_movies

# Lookups needed to present the result: item index → raw movieId, raw movieId → title.
# The forward mapping is raw_movieid_2_movie_idx, the same name the model-training
# cell inverts; `raw_movieid_2_index` does not appear anywhere else in the visible
# notebook. NOTE(review): confirm against the cell that builds the mapping.
item_idx_2_raw_movieid = { v:k for k, v in raw_movieid_2_movie_idx.items() }
raw_movie_id_2_title = pd.Series(movies["title"].values, index=movies["movieId"]).to_dict()

overrated_movies, underrated_movies = find_extreme_movies(
    mf=mf_bias,  # your trained MFWithBias object
    item_idx_2_raw_movieid=item_idx_2_raw_movieid,
    raw_movie_id_2_title=raw_movie_id_2_title,
    top_percent=5  # top 5% overrated & underrated
)

print("\n🔥 Systematically Overrated Movies:")
for movie in overrated_movies:
    print(movie)

print("\n💤 Systematically Underrated Movies:")
for movie in underrated_movies:
    print(movie)


In [None]:
# Use Case 1 — Top-N Personalized Recommendations

# Use Case 2 — Identify Consistently Harsh or Generous Raters

# Use Case 3 — Spot Systematically Overrated or Underrated Movies

# Use Case 4 — Explain Why a Specific Prediction is High or Low

In [None]:
import matplotlib.pyplot as plt

def plot_user_biases(mf, top_percent=5):
    """Show a histogram of ``mf.b_u`` with the two percentile cut-offs marked.

    Args:
        mf: Object whose ``b_u`` holds the user biases.
        top_percent: Percentile used for the lower ("Harsh") line; the upper
            ("Generous") line sits at ``100 - top_percent``.
    """
    user_bias = mf.b_u
    lower_cut, upper_cut = np.percentile(user_bias, [top_percent, 100 - top_percent])

    _, ax = plt.subplots(figsize=(10, 5))
    ax.hist(user_bias, bins=50, color='skyblue', edgecolor='black')
    ax.axvline(lower_cut, color='red', linestyle='--', label=f'Harsh {top_percent}%')
    ax.axvline(upper_cut, color='green', linestyle='--', label=f'Generous {top_percent}%')
    ax.set_title('Distribution of User Biases (b_u)')
    ax.set_xlabel('User Bias')
    ax.set_ylabel('Frequency')
    ax.legend()
    plt.show()

def plot_item_biases(mf, top_percent=5):
    """Show a histogram of ``mf.b_i`` with the two percentile cut-offs marked.

    Args:
        mf: Object whose ``b_i`` holds the item biases.
        top_percent: Percentile used for the lower ("Underrated") line; the
            upper ("Overrated") line sits at ``100 - top_percent``.
    """
    item_bias = mf.b_i
    cutoffs = {
        'Underrated': ('red', np.percentile(item_bias, top_percent)),
        'Overrated': ('green', np.percentile(item_bias, 100 - top_percent)),
    }

    _, ax = plt.subplots(figsize=(10, 5))
    ax.hist(item_bias, bins=50, color='lightcoral', edgecolor='black')
    # Dict order is insertion order, so the lower line is drawn first.
    for tail_name, (line_color, cut) in cutoffs.items():
        ax.axvline(cut, color=line_color, linestyle='--', label=f'{tail_name} {top_percent}%')
    ax.set(
        title='Distribution of Item Biases (b_i)',
        xlabel='Item Bias',
        ylabel='Frequency',
    )
    ax.legend()
    plt.show()


### Prepare side info for Matrix Factorization with Side features

In [None]:
# Per-user rating statistics, all derived from one shared groupby.
rating_by_user = ratings.groupby('userId')['rating']
n_ratings = rating_by_user.count()           # 1. number of ratings per user
mean_rating = rating_by_user.mean()          # 2. average rating per user
rating_var = rating_by_user.var().fillna(0)  # 3. rating variance per user (NaN → 0)

# One row per user; userId comes back as a regular column.
feature_columns = ['n_ratings', 'mean_rating', 'rating_var']
user_features = pd.concat(
    [n_ratings, mean_rating, rating_var], axis=1, keys=feature_columns
).reset_index()

# Rescale each feature column with MinMaxScaler.
# NOTE(review): fit_transform returns a NumPy array, so preview it with
# X_user[:5] rather than .head() (see the recorded AttributeError).
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
user_features_scaled = scaler.fit_transform(user_features[feature_columns])

# Final matrix: rows = users, columns = features
X_user = user_features_scaled  # shape: (n_users, D)


AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [None]:
# Genre preference: multi-hot vector
# Split the pipe-delimited genre string into a list, stored in a separate
# column. Overwriting `genres` would make this cell fail on re-run (the .str
# accessor no longer sees strings) and would break the following cells, which
# still expect the raw 'A|B|C' strings.
movies['genre_list'] = movies['genres'].str.split('|')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]
5,6,Heat (1995),"[Action, Crime, Thriller]"


In [None]:
# Collect every distinct genre label that appears in the catalogue.
all_genres = set()
for gs in movies["genres"]:
    # `genres` may hold the raw 'A|B|C' string or an already-split list
    # (an earlier cell splits it); calling .split on a list would raise.
    all_genres.update(gs.split('|') if isinstance(gs, str) else gs)

print(all_genres)

# Multi Hot Encoding genres


{'Adventure', 'Musical', 'Sci-Fi', 'Film-Noir', 'Action', 'Children', 'Drama', 'Crime', 'Animation', 'Mystery', 'IMAX', 'Thriller', 'Comedy', 'Fantasy', 'Romance', 'Documentary', 'War', 'Horror', 'Western'}


In [None]:
# Multi-hot encode genres. Series itself has no get_dummies; the
# delimiter-aware version lives on the .str accessor and needs 'A|B|C' strings,
# so list-valued entries (if an earlier cell already split them) are re-joined.
genre_strings = movies['genres'].map(
    lambda g: '|'.join(g) if isinstance(g, (list, tuple)) else g
)
genre_dummies = genre_strings.str.get_dummies(sep='|')
genre_dummies

AttributeError: 'Series' object has no attribute 'get_dummies'



## ✅ **Use Case 1: Cold Start — Recommend for a User with Very Few Ratings**

---

### 📌 **Scenario**

Suppose you have a new user, or a user who has only rated *1–2* movies.
Standard MF **can’t generalize** well because there’s not enough data to learn that user’s vector.
**Side features** help by *inferring preferences* from demographics, behavior summaries, or other signals.


When presenting:

* Show the user’s real ratings → only 2.
* Then show the top 5 new recommendations → highlight that these are diverse and relevant.
* Emphasize: *this is impossible with plain MF.*



In [None]:
# Finding cold-start users (row indices into R)
# Per-user count of observed ratings = number of non-NaN cells in each row of R.
user_rating_counts = (~np.isnan(R)).sum(axis=1)  # R is your user-item matrix

# Row indices of users with at most 2 ratings.
cold_user_raw_indices = np.flatnonzero(user_rating_counts <= 2)

print(f"Found {len(cold_user_raw_indices)} cold-start users")
print(f"Raw Indices of Cold Start users = {cold_user_raw_indices}")

In [None]:


from sklearn.neighbors import NearestNeighbors

# Brute-force nearest-neighbour search under the cosine metric;
# n_neighbors=7 is the default neighbour count for later queries.
model = NearestNeighbors(
    metric='cosine',
    algorithm='brute',
    n_neighbors=7,
)

In [None]:
# Fit the nearest-neighbour model on movie_sparse.
# NOTE(review): movie_sparse is not defined in the visible part of the
# notebook — confirm an earlier cell builds it before this one runs.
model.fit(movie_sparse)

In [None]:
# Keep only the movie-level columns. Reassigning with errors='ignore' (instead
# of inplace=True) lets the cell be re-run without a KeyError once the columns
# have already been removed.
df = df.drop(columns=['genres', 'userId', 'rating'], errors='ignore')

In [None]:
# Remove fully duplicated rows from df, modifying the frame in place.
df.drop_duplicates(inplace=True)

In [None]:
# Write df to 'codf.csv' in the current working directory, without the index column.
df.to_csv('codf.csv',index=False)

In [None]:
# Query the fitted model with the row at position 540 of movie_pivot, reshaped
# to a single-sample 2-D array as kneighbors expects.
# NOTE(review): 540 is a hard-coded row position — confirm which movie it is.
distances,suggestions=model.kneighbors(movie_pivot.iloc[540,:].values.reshape(1,-1))

In [None]:
# Display the distances returned by the kneighbors query above.
distances

In [None]:
# Display the neighbour row positions returned by the kneighbors query above.
suggestions

In [None]:
# Work on a copy of df and reduce each title to the text before its first ' ('
# (this drops a trailing "(1995)"-style suffix).
df1 = df.copy()
ti = [title.partition(' (')[0] for title in df1['title']]
df1['title'] = ti

In [None]:


# One line per query row: the movie_pivot index labels of that row's neighbours.
for neighbor_positions in suggestions:
    print(movie_pivot.index[neighbor_positions])


In [None]:
def reco(movie_name):
    """Return the nearest-neighbour recommendations for ``movie_name``.

    Looks the title up in ``df1`` (exact match on the ``title`` column), takes
    the first matching ``movieId``, queries the fitted ``model`` with that row
    of ``movie_pivot`` and returns the ``movie_pivot`` index labels of the
    neighbours found.

    Args:
        movie_name: Title exactly as stored in ``df1['title']``.

    Returns:
        The entries of ``movie_pivot.index`` at the neighbour positions.

    Raises:
        IndexError: if no row of ``df1`` has that title.
    """
    # Use the requested title; it was previously ignored in favour of a
    # hard-coded 'Toy Story'.
    matches = df1.loc[df1['title'] == movie_name, 'movieId']
    if matches.empty:
        raise IndexError(f"No movie titled {movie_name!r} found in df1")
    movie_id = matches.values[0]

    # NOTE(review): movie_id is used as a row *position* in movie_pivot (iloc).
    # That is only right if row positions coincide with movieIds — confirm,
    # otherwise this should be a label-based lookup.
    query = movie_pivot.iloc[movie_id, :].values.reshape(1, -1)
    _, suggestions = model.kneighbors(query)

    # A single query row yields a single row of neighbour positions.
    return movie_pivot.index[suggestions[0]]



In [None]:
# Recommendations for the title "It Conquered the World".
res=reco("It Conquered the World")

In [None]:
# Print each recommended entry on its own line.
for recommended in res:
    print(recommended)