<a href="https://colab.research.google.com/github/avirupdevzone/Matrix-Factorization/blob/main/MovieLens1M.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
# NOTE(review): despite the hint above, keep this cell. The data-loading cell
# builds its users.dat / movies.dat / ratings.dat paths from
# `movielens_1m_dataset_path`, and this is the only place in the visible
# notebook where that name is assigned.
import kagglehub
movielens_1m_dataset_path = kagglehub.dataset_download('odedgolden/movielens-1m-dataset')

print('Data source import complete.')


Data source import complete.


In [None]:
import time
from collections import namedtuple

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler

In [None]:

# All three MovieLens-1M .dat files are read with the same options: fields are
# separated by '::', parsed with the python engine, and decoded as latin-1.
# Column names are supplied explicitly for every table.
_dat_read_options = {'sep': '::', 'engine': 'python', 'encoding': 'latin-1'}

# One row per user (demographic attributes).
users_df = pd.read_csv(
    f"{movielens_1m_dataset_path}/users.dat",
    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
    **_dat_read_options,
)

# One row per movie; 'Genre' is a '|'-separated string.
movies_df = pd.read_csv(
    f"{movielens_1m_dataset_path}/movies.dat",
    header=None,
    names=['MovieID', 'Title', 'Genre'],
    **_dat_read_options,
)

# One row per (user, movie) rating event.
ratings_df = pd.read_csv(
    f"{movielens_1m_dataset_path}/ratings.dat",
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    **_dat_read_options,
)


In [None]:
# Preview the first rows of the users table to sanity-check the parsed columns.
users_df.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [None]:
# Preview the first rows of the movies table; 'Genre' is still a '|'-joined string here.
movies_df.head()

Unnamed: 0,MovieID,Title,Genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Preview the first rows of the ratings table (one row per user/movie rating).
ratings_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [None]:
# Checking shapes of Datasets
# Prints (rows, columns) of each table exactly as loaded, before any filtering.
print(f"\n Shape of users_df = {users_df.shape}")
print(f"\n Shape of movies_df = {movies_df.shape}")
print(f"\n Shape of ratings_df = {ratings_df.shape}")


 Shape of users_df = (6040, 5)

 Shape of movies_df = (3883, 3)

 Shape of ratings_df = (1000209, 4)


In [None]:
# Checking for null values
# isnull().sum() yields, per column, the number of missing cells; a column of
# zeros means the table has no missing values.
print("\n Users \n",users_df.isnull().sum())
print("\n Movies \n",movies_df.isnull().sum())
print("\n Ratings \n",ratings_df.isnull().sum())


 Users 
 UserID        0
Gender        0
Age           0
Occupation    0
Zip-code      0
dtype: int64

 Movies 
 MovieID    0
Title      0
Genre      0
dtype: int64

 Ratings 
 UserID       0
MovieID      0
Rating       0
Timestamp    0
dtype: int64


In [None]:
# How many ratings per user?
# groupby(...).size() counts rating rows per UserID; the min/max/median of those
# counts show how unevenly active the users are.
ratings_per_user = ratings_df.groupby('UserID').size()
print("\nMin ratings per user:", ratings_per_user.min())
print("Max ratings per user:", ratings_per_user.max())
print("Median ratings per user:", ratings_per_user.median())

# How many ratings per movie?
# Same summary per MovieID. Only movies that appear in ratings_df are counted,
# so a movie with no ratings at all does not show up as 0 here.
ratings_per_movie = ratings_df.groupby('MovieID').size()
print("\nMin ratings per movie:", ratings_per_movie.min())
print("Max ratings per movie:", ratings_per_movie.max())
print("Median ratings per movie:", ratings_per_movie.median())


Min ratings per user: 20
Max ratings per user: 2314
Median ratings per user: 96.0

Min ratings per movie: 1
Max ratings per movie: 3428
Median ratings per movie: 123.5


In [None]:
# Shapes before Downsampling
# Baseline sizes to compare against the shapes printed after the downsampling cell.
print(f"\n Shape of users_df = {users_df.shape}")
print(f"\n Shape of movies_df = {movies_df.shape}")
print(f"\n Shape of ratings_df = {ratings_df.shape}")


 Shape of users_df = (6040, 5)

 Shape of movies_df = (3883, 3)

 Shape of ratings_df = (1000209, 4)


In [None]:
def filter_movielens1m_subset_dense(
    ratings, users, movies,
    min_user_ratings=20,
    min_movie_ratings=50,
    max_users=500,
    max_movies=1000,
    verbose=True
):
    """Filter the MovieLens 1M tables down to a dense, mutually consistent subset.

    Users and movies are ranked by how many rows they have in ``ratings``.
    Those below the minimum count are discarded and at most ``max_users`` /
    ``max_movies`` of the most-rated ones are kept. Only ratings whose user
    AND movie both survive are retained, and the ``users`` / ``movies`` tables
    are restricted to the IDs still present in those ratings.

    Parameters
    ----------
    ratings : pd.DataFrame
        Must contain the columns 'UserID' and 'MovieID' (MovieLens also
        provides 'Rating' and 'Timestamp'; they are passed through untouched).
    users : pd.DataFrame
        Must contain the column 'UserID'.
    movies : pd.DataFrame
        Must contain the column 'MovieID'.
    min_user_ratings : int
        Minimum number of ratings (counted on the full ``ratings`` table) a
        user needs in order to be eligible.
    min_movie_ratings : int
        Minimum number of ratings (counted on the full ``ratings`` table) a
        movie needs in order to be eligible.
    max_users : int
        Upper bound on the number of eligible users kept, most-rated first.
    max_movies : int
        Upper bound on the number of eligible movies kept, most-rated first.
    verbose : bool
        When True, print a short progress report.

    Returns
    -------
    dict with:
        - 'ratings': filtered ratings DataFrame
        - 'users': filtered users DataFrame
        - 'movies': filtered movies DataFrame

    All three are copies; the input frames are not modified.

    Notes
    -----
    The two thresholds are evaluated independently on the unfiltered ratings.
    After the joint filter a retained user can therefore have fewer than
    ``min_user_ratings`` ratings *inside the subset* (and likewise for movies).
    """
    if verbose:
        print(f"Loaded: {len(ratings)} total ratings.")

    # --- Get top users ---
    # value_counts() orders by descending count, so head() keeps the most active.
    user_counts = ratings['UserID'].value_counts()
    top_users = user_counts[user_counts >= min_user_ratings].head(max_users).index
    if verbose:
        print(f"Selected {len(top_users)} users with at least {min_user_ratings} ratings.")

    # --- Get top movies ---
    movie_counts = ratings['MovieID'].value_counts()
    top_movies = movie_counts[movie_counts >= min_movie_ratings].head(max_movies).index
    if verbose:
        print(f"Selected {len(top_movies)} movies with at least {min_movie_ratings} ratings.")

    # --- Filter ---
    # A rating survives only if both its user and its movie were selected.
    ratings_filtered = ratings[
        ratings['UserID'].isin(top_users) &
        ratings['MovieID'].isin(top_movies)
    ].copy()

    if verbose:
        print(f"After filter: {len(ratings_filtered)} ratings")
        print(f"Unique users: {ratings_filtered['UserID'].nunique()}, Unique movies: {ratings_filtered['MovieID'].nunique()}")

    # Ensure final users/movies are consistent with filtered ratings
    # (a selected user/movie can lose all of its ratings in the joint filter).
    valid_users = ratings_filtered['UserID'].unique()
    valid_movies = ratings_filtered['MovieID'].unique()

    users_filtered = users[users['UserID'].isin(valid_users)].copy()
    movies_filtered = movies[movies['MovieID'].isin(valid_movies)].copy()

    if verbose:
        print(f"Users retained: {len(users_filtered)}, Movies retained: {len(movies_filtered)}")

    return {
        "ratings": ratings_filtered,
        "users": users_filtered,
        "movies": movies_filtered
    }

# Downsampled Datasets
# Keep at most 300 users (each with >= 30 ratings) and at most 500 movies
# (each with >= 50 ratings), then only the ratings between them.
downsampled_datasets = filter_movielens1m_subset_dense(
    ratings_df, users_df, movies_df,
    min_user_ratings=30, min_movie_ratings=50,
    max_users=300, max_movies=500
)

# Downsampled datasets
# NOTE(review): these assignments rebind the very names that were passed in
# above, so the full tables are no longer reachable afterwards and re-running
# this cell filters the already-filtered frames. Re-run the loading cell first
# if the original data is needed again.
ratings_df = downsampled_datasets["ratings"]
users_df = downsampled_datasets["users"]
movies_df = downsampled_datasets["movies"]

# Shape of Downsampled datasets
print(f"\n Shape of users_df = {users_df.shape}")
print(f"\n Shape of movies_df = {movies_df.shape}")
print(f"\n Shape of ratings_df = {ratings_df.shape}")



Loaded: 1000209 total ratings.
Selected 300 users with at least 30 ratings.
Selected 500 movies with at least 50 ratings.
After filter: 94725 ratings
Unique users: 300, Unique movies: 500
Users retained: 300, Movies retained: 500

 Shape of users_df = (300, 5)

 Shape of movies_df = (500, 3)

 Shape of ratings_df = (94725, 4)


In [None]:
# Drop timestamp
# We’re not doing time-aware MF yet (Hence temporal factor not required)
# errors='ignore' keeps the cell re-runnable: on a second execution the column
# is already gone and a plain drop() would raise KeyError.
ratings_df = ratings_df.drop(columns=['Timestamp'], errors='ignore')

In [None]:
# Map the raw (non-contiguous) MovieLens IDs onto 0..n-1 positions so they can
# be used directly as row / column indices of a dense matrix. Positions follow
# the order of first appearance in ratings_df.
user_ids = ratings_df['UserID'].unique()
movie_ids = ratings_df['MovieID'].unique()

# Forward maps: raw ID -> contiguous index
raw_userid_2_user_idx = dict(zip(map(int, user_ids), range(len(user_ids))))
raw_movieid_2_movie_idx = dict(zip(map(int, movie_ids), range(len(movie_ids))))

# Reverse maps: contiguous index -> raw ID.
# The forward dicts were filled in index order, so enumerating their keys
# reproduces exactly the index each key was given.
user_idx_2_raw_userid = dict(enumerate(raw_userid_2_user_idx))
movie_idx_2_raw_movieid = dict(enumerate(raw_movieid_2_movie_idx))

# movieId2title - rawMovieId -> Title
movieId2title = movies_df.set_index("MovieID")["Title"].to_dict()

# Attach the contiguous indices to every rating row
ratings_df = ratings_df.assign(
    user_idx=ratings_df['UserID'].map(raw_userid_2_user_idx),
    movie_idx=ratings_df['MovieID'].map(raw_movieid_2_movie_idx),
)

ratings_df.head()

Unnamed: 0,UserID,MovieID,Rating,user_idx,movie_idx
6511,48,2987,3,0,0
6515,48,648,3,0,1
6517,48,1320,3,0,2
6519,48,2124,3,0,3
6520,48,1250,5,0,4


In [None]:
# First ten raw UserIDs in order of first appearance, i.e. the users that
# received contiguous indices 0..9 in the reindexing cell.
ratings_df["UserID"].unique()[:10]

array([ 48,  53, 123, 148, 149, 173, 195, 202, 216, 245])

In [None]:
# Shape after downsampling, dropping Timestamp and adding the two index columns.
print(f"Shape of ratings after dropping users/movies = {ratings_df.shape}")

Shape of ratings after dropping users/movies = (94725, 5)


In [None]:
import numpy as np

n_users, n_items = len(raw_userid_2_user_idx), len(raw_movieid_2_movie_idx)

# Dense user x item rating matrix; NaN marks "this user has not rated this movie".
R = np.full((n_users, n_items), np.nan)

# Fill every observed rating in one vectorised assignment instead of a Python
# loop over rows. If a (user, movie) pair occurs more than once, keep the last
# occurrence, which is what a sequential row-by-row fill would leave behind.
_observed = ratings_df.drop_duplicates(subset=['user_idx', 'movie_idx'], keep='last')
R[_observed['user_idx'].to_numpy(), _observed['movie_idx'].to_numpy()] = _observed['Rating'].to_numpy()

print(R.shape)
# Sparsity = share of (user, movie) cells that hold no rating.
print(f"Sparsity: {100 * (1 - np.count_nonzero(~np.isnan(R)) / R.size):.2f}%")


(300, 500)
Sparsity: 36.85%


In [None]:
# Preview the downsampled movies table (the original row labels are kept, so
# the index is no longer contiguous).
movies_df.head()

Unnamed: 0,MovieID,Title,Genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
5,6,Heat (1995),Action|Crime|Thriller
9,10,GoldenEye (1995),Action|Adventure|Thriller
10,11,"American President, The (1995)",Comedy|Drama|Romance


In [None]:
# Multi Hot Encode genres
movies['Genres'] = movies['Genres'].str.split('|')

mlb = MultiLabelBinarizer()
genres_multi_hot = pd.DataFrame(
    mlb.fit_transform(movies['Genres']),
    columns=mlb.classes_,
    index=movies.index
)

# === Extract raw Year ===
movies['Year'] = movies['Title'].str.extract(r'\((\d{4})\)').astype(float)

# === Standardize Year ===
scaler = StandardScaler()
year_scaled = pd.DataFrame(
    scaler.fit_transform(movies[['Year']].fillna(0)),
    columns=['Year_Scaled'],
    index=movies.index
)

# === Bucket into Decade ===
movies['Decade'] = (movies['Year'] // 10) * 10
decade_one_hot = pd.get_dummies(movies['Decade'].fillna(0).astype(int), prefix='Decade')

# === Combine all ===
item_features = pd.concat([
    movies[['MovieID']],   # ID to merge back
    genres_multi_hot,      # multi-hot genres
    year_scaled,           # standardized year
    decade_one_hot         # one-hot decades
], axis=1)

# === Index by MovieID ===
item_features.set_index('MovieID', inplace=True)

print(item_features.head())
print(f"Final shape: {item_features.shape}")

Unnamed: 0,MovieID,Title,Genre
0,1,Toy Story (1995),"[Animation, Children's, Comedy]"
1,2,Jumanji (1995),"[Adventure, Children's, Fantasy]"
5,6,Heat (1995),"[Action, Crime, Thriller]"
9,10,GoldenEye (1995),"[Action, Adventure, Thriller]"
10,11,"American President, The (1995)","[Comedy, Drama, Romance]"


In [None]:
# Preview the downsampled users table before encoding its categorical columns.
users_df.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
47,48,M,25,4,92107
52,53,M,25,0,96931
122,123,M,35,9,67208
147,148,M,50,17,57747
148,149,M,25,1,29205


In [None]:

# Gender, Age and Occupation are all categorical codes, so each one is
# one-hot encoded into its own group of indicator columns.

# Gender -> Gender_<value> columns
gender_one_hot = pd.get_dummies(users_df["Gender"], prefix="Gender")

# Age -> Age_<code> columns.
# In MovieLens 1M, Age is stored as a bucket code rather than a real age:
#    1: Under 18
#   18: 18–24
#   25: 25–34
#   35: 35–44
#   45: 45–49
#   50: 50–55
#   56: 56+
# These are categorical bins, hence one-hot rather than numeric treatment.
age_one_hot = pd.get_dummies(users_df['Age'], prefix='Age')

# Occupation -> Occ_<code> columns.
# Occupation is coded from 0 to 20 and is treated as categorical as well.
occupation_one_hot = pd.get_dummies(users_df['Occupation'], prefix='Occ')

# Put the blocks side by side and index the result by the raw UserID
user_features = pd.concat(
    [users_df[['UserID']], gender_one_hot, age_one_hot, occupation_one_hot],
    axis=1,
).set_index('UserID')

print(user_features.head())
print(f"\nFinal shape: {user_features.shape}")



Unnamed: 0,Gender_F,Gender_M
47,False,True
52,False,True
122,False,True
147,False,True
148,False,True


In [None]:
# === Assuming:
# user_features, item_features are DataFrames indexed by raw IDs
# raw_userid_2_user_idx, raw_movieid_2_movie_idx mappings exist
# user_idx_2_raw_userid, movie_idx_2_raw_movieid reverse mappings exist

# === Ordered raw IDs matching contiguous idx ===
n_users = len(raw_userid_2_user_idx)
n_items = len(raw_movieid_2_movie_idx)

ordered_raw_user_ids = [user_idx_2_raw_userid[i] for i in range(n_users)]
ordered_raw_movie_ids = [movie_idx_2_raw_movieid[j] for j in range(n_items)]

# === Reindex ===
# Row r of each aligned frame now describes the user / movie with contiguous
# index r, i.e. the same entity as row / column r of the rating matrix.
# .loc raises KeyError if any ID is missing from the feature frames.
user_features_aligned = user_features.loc[ordered_raw_user_ids]
item_features_aligned = item_features.loc[ordered_raw_movie_ids]

# === To ndarray ===
# Force a float dtype: a frame mixing int, float and bool columns would
# otherwise convert to an object array, which cannot be added in place to the
# model's float weight matrices.
user_features_matrix = user_features_aligned.to_numpy(dtype=float)
item_features_matrix = item_features_aligned.to_numpy(dtype=float)

# Duplicate IDs in a feature index would silently add rows; fail loudly instead.
assert user_features_matrix.shape[0] == n_users, "user feature rows do not match n_users"
assert item_features_matrix.shape[0] == n_items, "item feature rows do not match n_items"

print(user_features_matrix.shape)  # Should be (n_users, D)
print(item_features_matrix.shape)  # Should be (n_items, F)


In [None]:
import numpy as np
import math

class MFWithSideFeatures:
    """Biased matrix factorisation whose latent factors are informed by side features.

    The predicted rating of user ``u`` for item ``i`` is

        r_hat[u, i] = mu + b_u[u] + b_i[i] + p_u . q_i
        p_u = W_u @ x_u + epsilon_u[u]
        q_i = W_v @ y_i + epsilon_i[i]

    where ``x_u`` / ``y_i`` are the side-feature rows of the user / item,
    ``W_u`` / ``W_v`` project those features into the k-dimensional latent
    space, and ``epsilon_u`` / ``epsilon_i`` are free per-user / per-item
    residual embeddings. All parameters are fitted by stochastic gradient
    descent on the squared error of the observed ratings with L2 shrinkage.

    Parameters
    ----------
    R : array-like, shape (n_users, n_items)
        User-item matrix with ``np.nan`` for missing ratings.
    user_features : array-like, shape (n_users, D)
        Side features, one row per user, in the same order as the rows of R.
    item_features : array-like, shape (n_items, F)
        Side features, one row per item, in the same order as the columns of R.
    k : int
        Latent dimension.
    n_epochs : int
        Maximum number of passes over the observed ratings.
    alpha : float
        Learning rate.
    lambda_reg : float
        L2 regularisation strength applied to all learned parameters.
    random_state : int or None
        Seed for parameter initialisation and sample shuffling. With None
        (the default) NumPy's global random state is used, so a preceding
        ``np.random.seed(...)`` keeps working as before.

    Raises
    ------
    ValueError
        If the inputs are not 2-D or their row counts do not match R.
    """

    def __init__(self, R, user_features, item_features, k=10, n_epochs=50, alpha=0.005, lambda_reg=0.02,
                 random_state=None):
        # Coerce to float arrays up front: bool / object feature matrices (for
        # example from one-hot encoded DataFrames) would otherwise leak their
        # dtype into the gradient arithmetic. A float ndarray R is kept as-is
        # (no copy), so `self.R` still refers to the caller's matrix.
        self.R = np.asarray(R, dtype=float)
        self.X = np.asarray(user_features, dtype=float)  # side features for users
        self.Y = np.asarray(item_features, dtype=float)  # side features for items

        if self.R.ndim != 2 or self.X.ndim != 2 or self.Y.ndim != 2:
            raise ValueError("R, user_features and item_features must all be 2-D")

        self.k = k
        self.n_epochs = n_epochs
        self.alpha = alpha
        self.lambda_reg = lambda_reg

        self.n_users, self.n_items = self.R.shape
        if self.X.shape[0] != self.n_users:
            raise ValueError(
                f"user_features has {self.X.shape[0]} rows but R has {self.n_users} users"
            )
        if self.Y.shape[0] != self.n_items:
            raise ValueError(
                f"item_features has {self.Y.shape[0]} rows but R has {self.n_items} items"
            )
        self.D = self.X.shape[1]
        self.F = self.Y.shape[1]

        # Private generator only when a seed was requested; see _random_source().
        self.random_state = random_state
        self._rng = None if random_state is None else np.random.RandomState(random_state)
        rng = self._random_source()

        # Learnable mappings from side info to latent factors
        self.W_u = rng.normal(scale=0.1, size=(k, self.D))
        self.W_v = rng.normal(scale=0.1, size=(k, self.F))

        # Free residual embeddings
        self.epsilon_u = rng.normal(scale=0.1, size=(self.n_users, k))
        self.epsilon_i = rng.normal(scale=0.1, size=(self.n_items, k))

        # Biases
        self.mu = np.nanmean(self.R)  # global mean of the observed ratings
        self.b_u = np.zeros(self.n_users)
        self.b_i = np.zeros(self.n_items)

    def _random_source(self):
        """Return the seeded generator, or the ``np.random`` module when unseeded."""
        return np.random if self._rng is None else self._rng

    def _matrix_to_samples(self):
        """Return the observed ratings as a list of ``(user, item, rating)`` tuples.

        Entries are listed in row-major order (user by user, items ascending).
        """
        users, items = np.nonzero(~np.isnan(self.R))
        return list(zip(users.tolist(), items.tolist(), self.R[users, items].tolist()))

    def predict_rating(self, u, i):
        """Predict the rating of user index ``u`` for item index ``i``."""
        p_u = self.W_u @ self.X[u] + self.epsilon_u[u]  # (k,)
        q_i = self.W_v @ self.Y[i] + self.epsilon_i[i]  # (k,)
        return p_u @ q_i + self.b_u[u] + self.b_i[i] + self.mu

    def _sgd_step(self, u, i, r):
        """Apply one SGD update for the observed rating ``r`` of user ``u`` on item ``i``.

        Every gradient is evaluated at the parameter values from before the
        step, so the update is simultaneous across all parameter groups.
        """
        x_u = self.X[u]
        y_i = self.Y[i]

        # === Forward ===
        # Copies are required for the residuals: epsilon_u[u] / epsilon_i[i]
        # are updated in place below while their old values are still needed.
        eps_u_old = self.epsilon_u[u].copy()
        eps_i_old = self.epsilon_i[i].copy()
        p_u = self.W_u @ x_u + eps_u_old  # full user factor, (k,)
        q_i = self.W_v @ y_i + eps_i_old  # full item factor, (k,)

        error = r - (p_u @ q_i + self.b_u[u] + self.b_i[i] + self.mu)

        # === Backward ===
        grad_p = error * q_i  # descent direction w.r.t. the user factor
        grad_q = error * p_u  # descent direction w.r.t. the item factor

        # The right-hand sides are fully evaluated before the in-place add, so
        # the regularisation terms see the pre-update W_u / W_v without a copy.
        self.W_u += self.alpha * (np.outer(grad_p, x_u) - self.lambda_reg * self.W_u)
        self.W_v += self.alpha * (np.outer(grad_q, y_i) - self.lambda_reg * self.W_v)

        self.epsilon_u[u] += self.alpha * (grad_p - self.lambda_reg * eps_u_old)
        self.epsilon_i[i] += self.alpha * (grad_q - self.lambda_reg * eps_i_old)

        self.b_u[u] += self.alpha * (error - self.lambda_reg * self.b_u[u])
        self.b_i[i] += self.alpha * (error - self.lambda_reg * self.b_i[i])

    def train(self):
        """Run ``n_epochs`` passes of SGD, printing the training RMSE after each."""
        samples = self._matrix_to_samples()
        rng = self._random_source()

        for epoch in range(self.n_epochs):
            rng.shuffle(samples)
            for u, i, r in samples:
                self._sgd_step(u, i, r)

            rmse = self.compute_rmse()
            print(f"Epoch {epoch+1}: RMSE = {rmse:.4f}")

    def train_minibatch(self, batch_size=256, patience=5, delta=0.001):
        """SGD with early stopping on the training RMSE.

        NOTE(review): despite the name, parameters are updated after every
        single sample, not once per batch. ``batch_size`` only controls how the
        shuffled samples are chunked and does not change the fitted model.
        Switching to accumulated batch gradients would change the effective
        step size, so the per-sample behaviour is kept.

        Parameters
        ----------
        batch_size : int
            Chunk size used to walk over the shuffled samples; must be >= 1.
        patience : int
            Stop after this many consecutive epochs without sufficient improvement.
        delta : float
            Minimum RMSE decrease that counts as an improvement.

        Raises
        ------
        ValueError
            If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        samples = self._matrix_to_samples()
        rng = self._random_source()
        best_rmse = float('inf')
        wait = 0

        for epoch in range(self.n_epochs):
            rng.shuffle(samples)

            for start in range(0, len(samples), batch_size):
                for u, i, r in samples[start:start + batch_size]:
                    self._sgd_step(u, i, r)

            # === End epoch: compute RMSE, check for early stopping ===
            rmse = self.compute_rmse()
            print(f"Epoch {epoch+1}: RMSE = {rmse:.4f}")

            if rmse + delta < best_rmse:
                best_rmse = rmse
                wait = 0
            else:
                wait += 1
                if wait >= patience:
                    print(f"⏹️ Early stopping at epoch {epoch+1} (no improvement in {patience} checks)")
                    break

    def compute_rmse(self):
        """Return the RMSE over the observed (non-NaN) entries of R.

        Returns NaN when R contains no observed rating at all.
        """
        observed = ~np.isnan(self.R)
        if not observed.any():
            return float('nan')
        residuals = self.R[observed] - self.predict_full_matrix()[observed]
        return math.sqrt(float(np.mean(residuals ** 2)))

    def predict_full_matrix(self):
        """Return the (n_users, n_items) matrix of predictions for every pair."""
        P = self.X @ self.W_u.T + self.epsilon_u  # (n_users, k) full user factors
        Q = self.Y @ self.W_v.T + self.epsilon_i  # (n_items, k) full item factors
        return P @ Q.T + self.b_u[:, None] + self.b_i[None, :] + self.mu


In [None]:
def recommendTopNMoviesForUser_MFWithSideFeatures(
    mf,  # your trained MFWithSideFeatures object
    username,
    user_name_to_id,
    raw_userid_2_user_idx,
    movie_idx_2_raw_movieid,
    movieId2title,
    N=5
):
    """Recommend up to ``N`` movies the user has not rated yet, best first.

    Parameters
    ----------
    mf : object
        Trained model exposing ``predict_full_matrix()`` and the rating
        matrix ``R`` (NaN = not rated).
    username : hashable
        Key into ``user_name_to_id``.
    user_name_to_id : dict
        Maps user name -> raw user ID.
    raw_userid_2_user_idx : dict
        Maps raw user ID -> row index of ``mf.R``.
    movie_idx_2_raw_movieid : dict
        Maps column index of ``mf.R`` -> raw movie ID.
    movieId2title : dict
        Maps raw movie ID -> title; missing IDs are shown as "Unknown Title".
    N : int
        Maximum number of recommendations. Values <= 0 yield an empty list.

    Returns
    -------
    list of (raw_movie_id, title, predicted_score)
        Sorted by descending predicted score. Only unseen movies are returned,
        so the list is shorter than ``N`` when the user has fewer than ``N``
        unrated movies.

    Raises
    ------
    KeyError
        If ``username`` or its raw ID is not present in the given mappings.
    """
    # 1. Resolve raw userID
    raw_user_id = user_name_to_id[username]

    # 2. Map to contiguous index
    user_idx = raw_userid_2_user_idx[raw_user_id]

    # Guard: `scores[-N:]` with N == 0 would select *every* item.
    if N <= 0:
        return []

    # 3. Predict all scores (work on a copy so the model's output is not mutated)
    user_scores = np.array(mf.predict_full_matrix()[user_idx], dtype=float)

    # 4. Mask out seen items
    seen = ~np.isnan(mf.R[user_idx])
    user_scores[seen] = -np.inf

    # 5. Top N among unseen items only. Already-rated movies are never
    #    returned, even when fewer than N unseen movies exist.
    ranked = np.argsort(user_scores)[::-1]
    top_N_idx = [int(idx) for idx in ranked if not seen[idx]][:N]

    # 6. Build result
    recommendations = []
    for idx in top_N_idx:
        raw_movie_id = movie_idx_2_raw_movieid[idx]
        title = movieId2title.get(raw_movie_id, "Unknown Title")
        recommendations.append((raw_movie_id, title, user_scores[idx]))

    return recommendations


In [None]:
# Display names attached to three raw UserIDs, plus the reverse lookup.
# NOTE(review): the recommender resolves a name to its raw ID and then looks
# that ID up in raw_userid_2_user_idx, which only contains users that survived
# downsampling. Confirm 156, 741 and 768 are among them, otherwise the lookup
# raises KeyError.
user_name_2_raw_user_id = {"Alice": 156, "Triparna": 741, "Srishti": 768}
raw_user_id_2_user_name = { v: k for k, v in user_name_2_raw_user_id.items() }


In [None]:
import numpy as np
import pandas as pd

# Number of observed (non-NaN) ratings in each row of R, i.e. per user index
num_ratings_per_user = (~np.isnan(R)).sum(axis=1)

# Row indices of users with at most 5 ratings
cold_users_idx = np.flatnonzero(num_ratings_per_user <= 5)

print(f"Found {len(cold_users_idx)} sparse users")

# Translate the row indices back to raw user IDs
cold_raw_userids = [user_idx_2_raw_userid[row] for row in cold_users_idx]

# Attach a display name where one is known, otherwise a placeholder
cold_usernames = [
    raw_user_id_2_user_name.get(raw_id, f"unknown_{raw_id}")
    for raw_id in cold_raw_userids
]

# Collect everything in one table for easier inspection
cold_users_df = pd.DataFrame({
    'user_idx': cold_users_idx,
    'raw_userid': cold_raw_userids,
    'username': cold_usernames,
    'num_ratings': num_ratings_per_user[cold_users_idx],
})

print(f"\nFound {len(cold_users_df)} cold-start users:")
print(cold_users_df.head(10))  # Show first 10 cold users
