In [1]:
#import
import pandas as pd
import numpy as np
from pathlib import Path
import sys

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

# Add the project's src/ folder to sys.path so the local modules below can be imported
sys.path.append(str(Path().resolve().parent / "src"))

from data_preprocessing import load_data
from collaborative_filtering import (
    predict_rating,
    predict_rating_fast,
    predict_mean_rating,
    predict_random_rating,
    evaluate_model
    )

In [2]:
# Load the raw files from the data/ folder located under the parent of the
# current working directory.
project_root = Path().resolve().parent
data_dir = project_root.joinpath("data")

# The file locations are handed to load_data as plain strings, by keyword.
data_files = {
    "movies_path": str(data_dir.joinpath("u.item")),
    "ratings_path": str(data_dir.joinpath("u.data")),
}
movie_genres, ratings, user_movie_matrix, movies = load_data(**data_files)

Taux de remplissage (density) : 0.0635
Sparsité (sparsity) : 0.9365


In [3]:
# Hold out 20% of the rating rows for testing.
# The split is made over individual ratings, not over users, so a given user's
# ratings can land on both sides of the split.
split_options = {"test_size": 0.2, "random_state": 42}  # fixed seed keeps the split repeatable
train_df, test_df = train_test_split(ratings, **split_options)

print(f"Train: {len(train_df)} notes | Test: {len(test_df)} notes")

Train: 80000 notes | Test: 20000 notes


In [4]:
# User x title rating matrix built from the training ratings only.
# (user, title) pairs without a training rating stay NaN; repeated pairs are
# averaged, which is pivot_table's default aggregation.
train_matrix = pd.pivot_table(
    train_df,
    values="rating",
    index="user_id",
    columns="title",
)

In [5]:
# Distinct users on each side of the split
train_users = set(train_df["user_id"])
test_users = set(test_df["user_id"])

# Distinct movie titles on each side of the split
train_movies = set(train_df["title"])
test_movies = set(test_df["title"])

# Count the test-side users / movies that never appear in the training data
print("🔍 Utilisateurs dans le test non vus en train :", len(test_users.difference(train_users)))
print("🔍 Films dans le test non vus en train :", len(test_movies.difference(train_movies)))

🔍 Utilisateurs dans le test non vus en train : 0
🔍 Films dans le test non vus en train : 29


In [6]:
# Score predict_rating on a fixed random sample of 500 held-out ratings.
sample_test: pd.DataFrame = test_df.sample(n=500, random_state=42)

# Keep only the rows whose user AND movie both occur in the training data.
# The filter runs after sampling, so fewer than 500 rows may remain.
known_pair_mask = (
    sample_test["user_id"].isin(train_users)
    & sample_test["title"].isin(train_movies)
)
sample_test = sample_test.loc[known_pair_mask]

y_true = []
y_pred = []

# NOTE(review): the saved output of this cell shows NumPy warnings raised from
# correlation code (cov / stddev division) — presumably triggered inside
# predict_rating; confirm and handle them there.
for _, row in sample_test.iterrows():
    user_id, movie_title, true_rating = row["user_id"], row["title"], row["rating"]

    predicted_rating = predict_rating(user_id, movie_title, train_matrix)

    # Rows with a NaN prediction are left out of both lists so they stay aligned.
    if np.isnan(predicted_rating):
        continue

    y_true.append(true_rating)
    y_pred.append(predicted_rating)

print(f"{len(y_pred)} prédictions réalisées sur {len(sample_test)} exemples.")

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]


500 prédictions réalisées sur 500 exemples.


In [7]:
# Error metrics computed on the y_true / y_pred lists
mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

print(f"📈 RMSE : {rmse:.4f}", f"📉 MAE  : {mae:.4f}", sep="\n")

📈 RMSE : 1.8414
📉 MAE  : 1.5309


In [10]:
# Each model is wrapped in a (user_id, title, matrix) adapter, the call shape
# passed to evaluate_model, and scored on the full test set against train_matrix.


def _mean_baseline(user_id, title, matrix):
    """Adapter for predict_mean_rating, which only takes the title and the matrix."""
    # NOTE(review): the printed label says "Moyenne globale" (global mean) while the
    # helper receives the title — confirm which mean it actually computes.
    return predict_mean_rating(title, matrix)


rmse_mean, mae_mean = evaluate_model(_mean_baseline, test_df, train_matrix)
print(f"🎯 Moyenne globale – RMSE : {rmse_mean:.4f}, MAE : {mae_mean:.4f}")


def _random_baseline(user_id, title, matrix):
    """Adapter for predict_random_rating, which takes no arguments."""
    # NOTE(review): no seed is set anywhere in this notebook for this baseline, so
    # its scores may differ between runs — confirm inside predict_random_rating.
    return predict_random_rating()


rmse_rand, mae_rand = evaluate_model(_random_baseline, test_df, train_matrix)
print(f"🎲 Aléatoire – RMSE : {rmse_rand:.4f}, MAE : {mae_rand:.4f}")


# Pearson correlation between movies (the columns of train_matrix); a pair of
# movies needs at least 5 common observations, otherwise its entry is NaN.
item_similarity = train_matrix.corr(method="pearson", min_periods=5)


def _item_item_cf(user_id, title, matrix):
    """Adapter for predict_rating_fast using the precomputed item_similarity table."""
    return predict_rating_fast(user_id, title, matrix, item_similarity)


rmse_item, mae_item = evaluate_model(_item_item_cf, test_df, train_matrix)
print(f"🤝 Collaboratif (item-item) –  RMSE : {rmse_item:.4f}, MAE : {mae_item:.4f}")

🎯 Moyenne globale – RMSE : 1.0195, MAE : 0.8114
🎲 Aléatoire – RMSE : 1.6931, MAE : 1.3838
🤝 Collaboratif (item-item) –  RMSE : 1.7653, MAE : 1.4550


In [11]:
# Summary table: one (label, RMSE, MAE) record per evaluated model,
# transposed into the three columns of the frame.
summary_rows = [
    ("Collaboratif (item-item)", rmse_item, mae_item),
    ("Moyenne globale", rmse_mean, mae_mean),
    ("Aléatoire", rmse_rand, mae_rand),
]
model_labels, rmse_values, mae_values = (list(column) for column in zip(*summary_rows))

results_df = pd.DataFrame({
    "Modèle": model_labels,
    "RMSE": rmse_values,
    "MAE": mae_values,
})
print(results_df)

                     Modèle      RMSE       MAE
0  Collaboratif (item-item)  1.765345  1.454952
1           Moyenne globale  1.019510  0.811353
2                 Aléatoire  1.693068  1.383805
