In [10]:
import numpy as np
import tensorflow as tf
import pandas as pd

In [11]:
# Raw inputs: movie metadata and user ratings.
movie_df = pd.read_csv("movies.csv")
ratings_df = pd.read_csv("ratings.csv")

# Only the first 10,000 rating rows are used in the rest of the notebook.
ratings_small = ratings_df.head(10000)

# Distinct raw ids that occur in the subset.
user_ids = ratings_small["userId"].unique()
movie_ids = ratings_small["movieId"].unique()

# Raw id -> dense 0-based index, i.e. the id's position in the arrays above.
user_map = dict(zip(user_ids, range(len(user_ids))))
movie_map = dict(zip(movie_ids, range(len(movie_ids))))

nu = len(user_map)   # number of users
nm = len(movie_map)  # number of movies

print(nu, nm)


75 3287


In [12]:
# Y[m, u] holds the rating user u gave movie m (row -> movie, col -> user);
# R[m, u] is 1 where such a rating exists and 0 everywhere else.
Y = np.zeros((len(movie_ids), len(user_ids)))
R = np.zeros((len(movie_ids), len(user_ids)))

# Translate the raw ids of every rating row to matrix indices in one pass
# instead of looping with iterrows(), which builds a Series object per row.
movie_idx = ratings_small.movieId.map(movie_map).to_numpy()
user_idx = ratings_small.userId.map(user_map).to_numpy()

# NOTE(review): assumes each (userId, movieId) pair occurs at most once in
# ratings_small; with repeated pairs the surviving value is not guaranteed
# by NumPy's fancy-index assignment -- confirm against the data.
Y[movie_idx, user_idx] = ratings_small.rating.to_numpy()
R[movie_idx, user_idx] = 1


In [24]:
nf = 100  # number of features learned per movie and per user

# Standard-normal starting values. The training cell further down draws its
# own (scaled) X, W and b before optimising, so these only serve as
# placeholders with the right shapes.
X, W = np.random.randn(nm, nf), np.random.randn(nu, nf)
b = np.zeros((1, nu))


In [25]:
def compute_cost(X, W, b, Y, R, lambda_):
    """
    Regularised squared-error cost over the rated (movie, user) pairs.

    X: (nm, nf) movie features
    W: (nu, nf) user preferences
    b: (1, nu) user bias
    Y: (nm, nu) ratings
    R: (nm, nu) indicator matrix (1 where a rating exists)
    lambda_: regularisation strength applied to X and W (b is not penalised)

    Returns the scalar cost.
    """
    # Residual of every prediction, zeroed wherever no rating exists.
    residual = R * ((X @ W.T + b) - Y)

    squared_error_term = 0.5 * np.sum(residual ** 2)
    regularisation_term = (lambda_ / 2) * (np.sum(X**2) + np.sum(W**2))

    return squared_error_term + regularisation_term


In [26]:
def compute_gradients(X, W, b, Y, R, lambda_):
    """
    Gradients of the regularised squared-error cost.

    Takes the same arguments as compute_cost and returns the tuple
    (X_grad, W_grad, b_grad) with shapes (nm, nf), (nu, nf) and (1, nu).
    """
    # Prediction residuals, masked so unrated pairs contribute nothing.
    residual = R * ((X @ W.T + b) - Y)          # (nm, nu)

    grad_X = residual @ W + lambda_ * X         # (nm, nf)
    grad_W = residual.T @ X + lambda_ * W       # (nu, nf)
    # The bias carries no regularisation term.
    grad_b = np.sum(residual, axis=0, keepdims=True)  # (1, nu)

    return grad_X, grad_W, grad_b


In [27]:
# Per-movie mean rating (Ymean) and mean-centred ratings (Ynorm).
# Movies without any rating keep a mean of 0, and unrated entries of Ynorm
# stay 0.
# NOTE(review): the means are taken over the full Y, i.e. including ratings
# that are later held out for validation, and the training loop visible in
# this notebook fits Y_train rather than Ynorm -- confirm how Ymean is meant
# to be used before relying on it.
Ymean = np.zeros((nm, 1))
Ynorm = np.zeros_like(Y)

rated_mask = R == 1
for i in range(nm):
    rated = rated_mask[i]
    if not rated.any():
        continue
    row_mean = np.mean(Y[i, rated])
    Ymean[i] = row_mean
    Ynorm[i, rated] = Y[i, rated] - row_mean


In [29]:
# Copy original matrices so the full Y / R stay available for evaluation
Y_train = Y.copy()
R_train = R.copy()

# (movie_index, user_index) pair of every existing rating
rated_indices = np.argwhere(R == 1)

# Shuffle the pairs with a dedicated, seeded generator so the same ratings are
# held out on every run (otherwise the validation RMSE is not comparable
# between runs). The global np.random state is left untouched.
SPLIT_SEED = 0
np.random.default_rng(SPLIT_SEED).shuffle(rated_indices)

# Use 10% for validation
val_size = int(0.1 * len(rated_indices))
val_indices = rated_indices[:val_size]

# Remove validation ratings from training data (all held-out pairs at once)
val_movie_idx, val_user_idx = val_indices[:, 0], val_indices[:, 1]
Y_train[val_movie_idx, val_user_idx] = 0
R_train[val_movie_idx, val_user_idx] = 0


In [30]:
alpha = 1e-3       # gradient-descent step size
lambda_ = 0.1      # regularisation strength passed to cost / gradients
iterations = 1000  # number of gradient-descent steps
log_every = 100    # print the training cost every this many iterations

# Reinitialize parameters (important!): small random values for X and W,
# zeros for the user bias.
X = np.random.randn(nm, nf) * 0.1
W = np.random.randn(nu, nf) * 0.1
b = np.zeros((1, nu))

for i in range(iterations):
    # The cost is only needed for the progress line, so it is evaluated on
    # logging iterations only instead of on every step. It is computed from
    # the parameters as they are *before* this iteration's update.
    if i % log_every == 0:
        cost = compute_cost(X, W, b, Y_train, R_train, lambda_)
        print(f"Iteration {i}: training cost = {cost:.2f}")

    Xg, Wg, bg = compute_gradients(X, W, b, Y_train, R_train, lambda_)

    # Plain batch gradient-descent update.
    X -= alpha * Xg
    W -= alpha * Wg
    b -= alpha * bg


Iteration 0: training cost = 63071.62
Iteration 100: training cost = 1614.10
Iteration 200: training cost = 449.70
Iteration 300: training cost = 259.36
Iteration 400: training cost = 220.77
Iteration 500: training cost = 209.34
Iteration 600: training cost = 204.08
Iteration 700: training cost = 200.50
Iteration 800: training cost = 197.46
Iteration 900: training cost = 194.63


In [31]:
def compute_validation_rmse(X, W, b, Y, val_indices):
    """
    Root-mean-squared error of the model on a set of (movie, user) pairs.

    X: (nm, nf) movie features
    W: (nu, nf) user preferences
    b: (1, nu) user bias
    Y: (nm, nu) ratings holding the true value for every pair in val_indices
    val_indices: sequence / array of (movie_index, user_index) pairs

    Returns the RMSE as a float; NaN if val_indices is empty.
    """
    pairs = np.asarray(val_indices, dtype=int).reshape(-1, 2)
    if pairs.shape[0] == 0:
        # Nothing to evaluate: return NaN explicitly rather than letting
        # np.mean emit a "Mean of empty slice" RuntimeWarning.
        return np.float64(np.nan)

    movie_idx, user_idx = pairs[:, 0], pairs[:, 1]

    # Row-wise dot product X[m] . W[u] for all pairs at once, plus user bias.
    preds = np.sum(X[movie_idx] * W[user_idx], axis=1) + b[0, user_idx]
    squared_errors = (preds - Y[movie_idx, user_idx]) ** 2

    return np.sqrt(np.mean(squared_errors))
# Evaluate on the held-out pairs. The full Y is passed (not Y_train) because
# the validation entries were zeroed out in Y_train when the split was made.
val_rmse = compute_validation_rmse(X, W, b, Y, val_indices)
print("Validation RMSE:", val_rmse)


Validation RMSE: 0.9769518522693638


In [36]:
# Persist the learned parameters together with the id arrays that give the
# rows their meaning: row i of X belongs to movie_ids[i], and row j of W /
# column j of b belongs to user_ids[j]. Without them the archive cannot be
# mapped back to movieId / userId values after loading.
# NOTE(review): Ymean is computed from the full Y, and the training loop in
# this notebook fits the raw Y_train (not the mean-centred Ynorm), so the
# predictions made here do not add Ymean back -- confirm before using it.
np.savez(
    "recommender_model.npz",
    X=X,
    W=W,
    b=b,
    Ymean=Ymean,
    movie_ids=movie_ids,
    user_ids=user_ids,
)


In [35]:
# ----------- PICK A USER -----------
user_id = user_ids[0]          # any userId contained in user_ids works here
u = user_map[user_id]          # column index of that user

# ----------- PREDICT RATINGS -----------
# One predicted score per movie for this user.
predictions = X @ W[u] + b[0, u]    # shape: (num_movies,)

# ----------- REMOVE ALREADY RATED MOVIES -----------
# Push movies the user has rated to the bottom of the ranking.
already_rated = R[:, u] == 1
predictions[already_rated] = -1e9

# ----------- PICK TOP 10 MOVIES -----------
# Indices sorted by descending score, truncated to the best ten.
top_indices = np.argsort(predictions)[::-1][:10]

# ----------- MAP BACK TO MOVIE IDs -----------
inv_movie_map = dict(zip(movie_map.values(), movie_map.keys()))
top_movie_ids = [inv_movie_map[idx] for idx in top_indices]

# ----------- SHOW MOVIE TITLES -----------
# Look the ids up in movie_df, keeping the ranking order of top_movie_ids.
candidate_rows = movie_df[movie_df.movieId.isin(top_movie_ids)]
recommended_movies = (
    candidate_rows
    .set_index("movieId")
    .loc[top_movie_ids]
    .reset_index()
)

print("Recommended movies for user:", user_id)
recommended_movies


Recommended movies for user: 1


Unnamed: 0,movieId,title,genres
0,3088,Harvey (1950),Comedy|Fantasy
1,231,Dumb & Dumber (Dumb and Dumber) (1994),Adventure|Comedy
2,953,It's a Wonderful Life (1946),Children|Drama|Fantasy|Romance
3,5119,Saturday Night and Sunday Morning (1960),Drama
4,139385,The Revenant (2015),Adventure|Drama
5,26565,Asterix in Britain (Astérix chez les Bretons) ...,Adventure|Animation|Children|Comedy
6,6093,"Last Unicorn, The (1982)",Animation|Children|Fantasy
7,6979,WarGames (1983),Drama|Sci-Fi|Thriller
8,3034,Robin Hood (1973),Adventure|Animation|Children|Comedy|Musical
9,2028,Saving Private Ryan (1998),Action|Drama|War


In [39]:
import pandas as pd

# Shrink ratings.csv to its first rows.
# WARNING: this cell is destructive -- it reads ratings.csv and then writes
# the truncated frame back to the very same path, so the original full file
# is no longer available afterwards.
RATINGS_PATH = "ratings.csv"
ROWS_TO_KEEP = 10000  # change to keep more or fewer rows

# Load the original large file
df = pd.read_csv(RATINGS_PATH)

# Keep only the leading rows
df_small = df.head(ROWS_TO_KEEP)

# OVERWRITE the same file
df_small.to_csv(RATINGS_PATH, index=False)

print("New ratings.csv size:", df_small.shape)


New ratings.csv size: (10000, 4)
