In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt



In [2]:
# Load the ratings, shuffle with a fixed seed and keep the first 200,000 rows,
# retaining only the (user, movie, rating) columns.
data = load_dataset("nbtpj/movielens-1m-ratings")["train"].shuffle(seed=10).select(range(200000))
movielens_df = pd.DataFrame(data)[["user_id", "movie_id", "user_rating"]]

# Give every raw id a dense 0-based index, numbered in order of first appearance.
user_ids = movielens_df["user_id"].unique()
movie_ids = movielens_df["movie_id"].unique()
user_id_map = dict(zip(user_ids, range(len(user_ids))))
movie_id_map = dict(zip(movie_ids, range(len(movie_ids))))

# Replace the raw ids with their dense indices in a single step.
movielens_df = movielens_df.assign(
    user_id=movielens_df["user_id"].map(user_id_map),
    movie_id=movielens_df["movie_id"].map(movie_id_map),
)

In [None]:
train_data, test_data = train_test_split(movielens_df, test_size=0.2, random_state=10)

# The ids in movielens_df were remapped to dense 0-based indices, and the
# prediction lookup indexes the reconstructed matrix *by position* with those
# ids. The matrix therefore needs exactly one row per user id and one column
# per movie id, including ids that landed only in the test split. A bare pivot
# of train_data drops such ids and shifts every later row/column, silently
# pairing test rows with the wrong user or movie.
n_users = int(movielens_df["user_id"].max()) + 1
n_movies = int(movielens_df["movie_id"].max()) + 1

user_avg_series = train_data.groupby("user_id")["user_rating"].mean()
user_avg_ratings = user_avg_series.to_dict()
global_avg_rating = train_data["user_rating"].mean()

# Per-user fill value: the user's own training mean, or the global training
# mean for users that have no rating in the training split.
row_fill = user_avg_series.reindex(range(n_users), fill_value=global_avg_rating)

ratings_matrix = (
    train_data
    .pivot(index="user_id", columns="movie_id", values="user_rating")
    .reindex(index=range(n_users), columns=range(n_movies))
    # DataFrame.fillna with a Series fills per *column* label, so transpose to
    # put users on the columns, fill each user's gaps, and transpose back.
    .T.fillna(row_fill).T
)
assert ratings_matrix.shape == (n_users, n_movies)

# Centre every user's row on zero; cells that were imputed with the user's
# mean become (numerically) zero and do not pull the factorisation around.
user_rating_mean = ratings_matrix.mean(axis=1)
ratings_matrix_demeaned = ratings_matrix - user_rating_mean.values.reshape(-1, 1)
ratings_matrix_csr = csr_matrix(ratings_matrix_demeaned.values)

# Truncated SVD with 200 factors; sigma is expanded to a diagonal matrix so it
# can be used directly in the matrix product that rebuilds the ratings.
U, sigma, Vt = svds(ratings_matrix_csr, k=200)
sigma = np.diag(sigma)

In [None]:
# Rebuild the dense rating estimates from the truncated SVD factors and add
# each user's mean back onto their row.
per_user_offset = user_rating_mean.to_numpy()[:, np.newaxis]
all_user_predicted_ratings = (U @ sigma) @ Vt + per_user_offset
# Average over every reconstructed cell; used as the fallback prediction.
user_mean = all_user_predicted_ratings.mean()

def predict_rating_svd(user_id: int, movie_id: int) -> float:
    """Look up the reconstructed rating for a (user, movie) pair.

    The ids are used directly as the row and column positions of the
    module-level ``all_user_predicted_ratings`` array.

    Args:
        user_id: Row position of the user in ``all_user_predicted_ratings``.
        movie_id: Column position of the movie in ``all_user_predicted_ratings``.

    Returns:
        The matrix entry at ``[user_id, movie_id]`` when both positions are
        inside the matrix, otherwise the module-level ``user_mean`` fallback.
    """
    n_users, n_movies = all_user_predicted_ratings.shape[:2]
    # Check the lower bound as well: a negative id would otherwise wrap around
    # (NumPy negative indexing) and return some other user's/movie's value.
    if 0 <= user_id < n_users and 0 <= movie_id < n_movies:
        return all_user_predicted_ratings[user_id, movie_id]
    return user_mean

predictions: list[float] = []
true_ratings: list[float] = []

# Walk the held-out rows column-wise; each prediction is rounded with the
# built-in round() before it is stored and scored.
held_out = zip(test_data["user_id"], test_data["movie_id"], test_data["user_rating"])
for raw_user, raw_movie, actual in tqdm(held_out, total=len(test_data)):
    estimate = predict_rating_svd(int(raw_user), int(raw_movie))
    predictions.append(round(estimate))
    true_ratings.append(actual)

# Root-mean-squared error between the rounded predictions and the true ratings.
residuals = np.asarray(predictions) - np.asarray(true_ratings)
rmse = np.sqrt(np.mean(residuals ** 2))
print(f"RMSE: {rmse}")

In [None]:
# Overlay the distributions of predicted and actual ratings.
# Both histograms must share the same bin edges: with a bare `bins=5`,
# matplotlib derives the edges from each series' own min/max, so the bars of
# the two series would cover different intervals whenever their ranges differ.
# The edges below are centred on the integer rating values spanned by either
# series, one bin per value.
rating_lo = int(min(min(predictions), min(true_ratings)))
rating_hi = int(max(max(predictions), max(true_ratings)))
bin_edges = np.arange(rating_lo - 0.5, rating_hi + 1.5, 1.0)

fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(predictions, bins=bin_edges, alpha=0.5, label="Predicted", color="#fc1c49")
ax.hist(true_ratings, bins=bin_edges, alpha=0.5, label="Actual", color="#00a67d")
ax.set_xticks(range(rating_lo, rating_hi + 1))
ax.set_title("Predicted vs. Actual Rating Distribution")
ax.set_xlabel("Rating")
ax.set_ylabel("Frequency")
ax.legend()
plt.show()