In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt



In [13]:
# Take a reproducible 200,000-row sample of the "train" split (shuffle seed 10).
data = (
    load_dataset("nbtpj/movielens-1m-ratings")["train"]
    .shuffle(seed=10)
    .select(range(200000))
)
# Keep only the three columns used by the rest of the notebook.
raw_ratings = pd.DataFrame(data)[["user_id", "movie_id", "user_rating"]]

# Re-index raw ids to dense 0..n-1 positions (order of first appearance) so
# they can be used directly as row indices into the feature matrices.
user_ids = raw_ratings["user_id"].unique()
movie_ids = raw_ratings["movie_id"].unique()
user_id_map = {raw_id: position for position, raw_id in enumerate(user_ids)}
movie_id_map = {raw_id: position for position, raw_id in enumerate(movie_ids)}

movielens_df = raw_ratings.assign(
    user_id=raw_ratings["user_id"].map(user_id_map),
    movie_id=raw_ratings["movie_id"].map(movie_id_map),
)

# 80/20 train/test split with a fixed random_state for repeatability.
train_data, test_data = train_test_split(movielens_df, test_size=0.2, random_state=10)

In [14]:
# Matrix sizes: one feature row per distinct user / movie id.
num_users = len(user_ids)
num_movies = len(movie_ids)
num_features = 10  # length of each user / movie feature vector

# Draw the initial factors from a seeded generator so that Restart & Run All
# reproduces the same training run (seed matches the shuffle / split seed).
rng = np.random.default_rng(10)
user_features = rng.normal(0, .1, (num_users, num_features))
movie_features = rng.normal(0, .1, (num_movies, num_features))

learning_rate = 0.01   # step size applied to every per-rating update
regularization = 0.1   # weight of the shrinkage term in every update
epochs = 20            # number of full passes over the training rows
epoch_rmse: list[float] = []  # training RMSE recorded after each epoch


def predict_rating(user_id: int, movie_id: int) -> float:
    """Return the dot product of one user's and one movie's feature rows.

    Reads the module-level ``user_features`` and ``movie_features`` arrays,
    so the result reflects whatever state those arrays are in when called.

    Args:
        user_id: Row index into ``user_features``.
        movie_id: Row index into ``movie_features``.

    Returns:
        The dot product of the two selected rows.
    """
    user_vector = user_features[user_id]
    movie_vector = movie_features[movie_id]
    return np.dot(user_vector, movie_vector)


# Extract the training columns once as NumPy arrays; iterating them avoids the
# per-row Series construction that DataFrame.iterrows() performs on every step.
train_user_idx = train_data["user_id"].to_numpy(dtype=np.int64)
train_movie_idx = train_data["movie_id"].to_numpy(dtype=np.int64)
train_ratings = train_data["user_rating"].to_numpy()

for epoch in tqdm(range(epochs)):
    squared_errors: list[float] = []

    # One SGD step per training rating, in the row order of train_data.
    for user_id, movie_id, rating in zip(train_user_idx, train_movie_idx, train_ratings):
        prediction = predict_rating(user_id, movie_id)
        error = rating - prediction
        squared_errors.append(error**2)

        # Compute both steps from the *pre-update* vectors before applying
        # either one; otherwise the movie step would be taken against the
        # already-modified user vector.
        user_vector = user_features[user_id]
        movie_vector = movie_features[movie_id]
        user_step = learning_rate * (error * movie_vector - regularization * user_vector)
        movie_step = learning_rate * (error * user_vector - regularization * movie_vector)

        user_features[user_id] += user_step
        movie_features[movie_id] += movie_step

    # Training RMSE for this epoch, measured on the predictions made during it.
    epoch_rmse.append(np.sqrt(np.mean(squared_errors)))

100%|██████████| 20/20 [01:09<00:00,  3.45s/it]


In [None]:
# Training RMSE per epoch (x axis is 1-based epoch number).
fig, ax = plt.subplots(figsize=(10, 5))
epoch_numbers = range(1, epochs + 1)
ax.plot(epoch_numbers, epoch_rmse, linewidth=2, color="#fc1c49")
ax.set_title("Epoch vs. RMSE")
ax.set_xlabel("Epoch")
ax.set_ylabel("RMSE")
ax.grid(True)
plt.show()

In [None]:
# Rounded, scale-limited predictions and the matching observed ratings for
# every row of the held-out test set.
predictions: list[int] = []
true_ratings: list[float] = []

# Bounds of the rating scale as observed in the training data; raw dot
# products are unbounded and would otherwise round to out-of-scale values.
rating_floor = train_data["user_rating"].min()
rating_ceiling = train_data["user_rating"].max()

# Iterate column arrays instead of DataFrame.iterrows() (same row order).
test_rows = zip(
    test_data["user_id"].to_numpy(dtype=np.int64),
    test_data["movie_id"].to_numpy(dtype=np.int64),
    test_data["user_rating"].to_numpy(),
)

for user_id, movie_id, true_rating in tqdm(test_rows, total=test_data.shape[0]):
    predicted_rating = predict_rating(user_id, movie_id)
    # Clip to the observed scale first, then round to the nearest whole rating.
    bounded_rating = float(np.clip(predicted_rating, rating_floor, rating_ceiling))
    predictions.append(round(bounded_rating))
    true_ratings.append(true_rating)

In [None]:
# Overlay the predicted and actual rating distributions on *shared* bins.
# Passing bins=5 to each call separately lets matplotlib derive the edges from
# each series' own range, so the two sets of bars would not line up.
plotted_values = [*predictions, *true_ratings]
if plotted_values:
    # Unit-width bins centred on whole numbers, spanning both series.
    # NOTE(review): assumes ratings are whole-number valued — confirm.
    lowest = float(min(plotted_values))
    highest = float(max(plotted_values))
    bin_edges = np.arange(lowest - 0.5, highest + 1.5, 1.0)
else:
    # Nothing to plot: fall back to the original bin count.
    bin_edges = 5

plt.figure(figsize=(10, 5))
plt.hist(predictions, bins=bin_edges, alpha=0.5, label="Predicted", color="#fc1c49")
plt.hist(true_ratings, bins=bin_edges, alpha=0.5, label="Actual", color="#00a67d")
plt.title("Predicted vs. Actual Rating Distribution")
plt.xlabel("Rating")
plt.ylabel("Frequency")
plt.legend()
plt.show()