In [1]:
import sys
sys.path.append('../')
import numpy as np
import pymongo
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from scipy.sparse import csr_matrix
import os
import mlflow

import warnings
warnings.filterwarnings("ignore")

# Configuration initiale de MLflow
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("Movielens_NMF_Model")

# Connexion à MongoDB
client = pymongo.MongoClient('localhost:27017')
db = client['movie']
movies = db['movie_json']
users = db['user_json']



In [2]:
movies = pd.DataFrame(movies.find({}, {"_id": 1, "title": 1}))

# Récupérer les données de la base de données
data = list(users.find({}, {"movies.movieid": 1, "_id": 1, "movies.rating": 1, "movies.timestamp": 1}))

# Convertir les données en DataFrame
users = pd.json_normalize(data)

# "Dérouler" la liste movies
users = users.explode('movies')

# Convertir chaque élément de la liste en une colonne distincte
users_tmp = users['movies'].apply(pd.Series)
users = pd.concat([users, users_tmp], axis=1).drop('movies', axis=1)

print('taille de movies :', len(movies))
print('taille de users :', len(users))
users.head()

taille de movies : 6040
taille de users : 1000209


Unnamed: 0,_id,movieid,rating,timestamp
0,6040,573,4,956704056
0,6040,589,4,956704996
0,6040,1,3,957717358
0,6040,2068,4,997453982
0,6040,592,2,956716016


In [3]:
merged_df = movies.merge(users, left_on='_id', right_on='movieid')

# drop "movieid" column
merged_df = merged_df.drop(columns=['movieid'])

# rename "_id_x" column to "movieid"
merged_df = merged_df.rename(columns={"_id_x": "movieid"})

# rename "_id_y" column to "user_id"
merged_df = merged_df.rename(columns={"_id_y": "user_id"})

# Sort merged_df by timestamp
merged_df = merged_df.sort_values(by=['timestamp'])

# Reset index
merged_df = merged_df.reset_index(drop=True)

# Drop the first entire line because of the odd size of the dataset
merged_df = merged_df.drop(merged_df.index[0])

print('Taille de merged_df :', len(merged_df))
merged_df.tail(10)

Taille de merged_df : 1000208


Unnamed: 0,movieid,title,user_id,rating,timestamp
1000199,3098,"Natural, The (1984)",5948,4,1046437932
1000200,3267,"Mariachi, El (1992)",5312,4,1046444711
1000201,2453,"Boy Who Could Fly, The (1986)",4958,4,1046454260
1000202,2043,Darby O'Gill and the Little People (1959),4958,1,1046454282
1000203,3489,Hook (1991),4958,4,1046454320
1000204,2399,Santa Claus: The Movie (1985),4958,1,1046454338
1000205,1407,Scream (1996),4958,5,1046454443
1000206,2634,"Mummy, The (1959)",4958,3,1046454548
1000207,3264,Buffy the Vampire Slayer (1992),4958,4,1046454548
1000208,1924,Plan 9 from Outer Space (1958),4958,4,1046454590


In [4]:
# Number of ratings per movie
movies_counts = merged_df['movieid'].value_counts()
print(movies_counts.describe())

print('\n')

# Number of ratings per user
cusers_counts = merged_df['user_id'].value_counts()
print(cusers_counts.describe())

# Define the thresholds under which we drop the movies
movies_threshold = 33
users_threshold = 44

# Drop movies with less than 33 ratings
merged_df = merged_df[merged_df['movieid'].isin(movies_counts[movies_counts > movies_threshold].index)]

# Drop users with less than 44 ratings
merged_df = merged_df[merged_df['user_id'].isin(cusers_counts[cusers_counts > users_threshold].index)]

print('\n')
print('Nouvelle taille de merged_df :', len(merged_df))

count    3706.000000
mean      269.888829
std       384.046465
min         1.000000
25%        33.000000
50%       123.500000
75%       350.000000
max      3428.000000
Name: count, dtype: float64


count    6040.000000
mean      165.597351
std       192.746879
min        20.000000
25%        44.000000
50%        96.000000
75%       208.000000
max      2314.000000
Name: count, dtype: float64


Nouvelle taille de merged_df : 942215


In [5]:
# Split merged_df into train (80%) and test (20%) dataframes
train_size = int(0.8 * len(merged_df))
df_train = merged_df[:train_size]
df_test = merged_df[train_size:]

print(len(df_train))
print(len(df_test))


753772
188443


In [6]:
# Drop every line of df_test that contains a movieid that is not in df_train
films_df_train = df_train['movieid'].unique()
df_test = df_test[df_test['movieid'].isin(films_df_train)]

# Drop every line of df_test that contains a user_id that is not in df_train
users_df_train = df_train['user_id'].unique()
df_test = df_test[df_test['user_id'].isin(users_df_train)]


print(len(df_train))
print(len(df_test))

753772
100760


In [7]:
# Pivot train dataframe to get a matrix of users and their ratings for movies
ratings_train = df_train.pivot(index='user_id', columns='movieid', values='rating')

ratings_train

movieid,1,2,3,4,5,6,7,8,9,10,...,3937,3943,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
621,,,,,,,,,,,...,,,,,,,,,,
622,,,,,,,,,,,...,,,,,,,,,,
623,5.0,,,,,,,,,,...,,,,,,,,,,4.0
624,3.0,3.0,,,,,,,,,...,,,,,,,,,,
625,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,4.0,,1.0,2.0,1.0,,3.0,,,,...,,,,,,,,,,
6036,,,,2.0,,3.0,,,,,...,,,,,,,,,,
6037,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,,,,,,,,,


In [8]:
# Fill NaN values with 
ratings_train = ratings_train.fillna(0)

# Drop lines with only zeros
ratings_train = ratings_train[ratings_train.sum(axis=1) > 0]

# Sparse ratings train dataframe
ratings_train_sparse = ratings_train.astype(pd.SparseDtype("float", 0))

ratings_train_sparse

movieid,1,2,3,4,5,6,7,8,9,10,...,3937,3943,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
621,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
622,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
623,5.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4.0
624,3.0,3.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
625,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,4.0,0,1.0,2.0,1.0,0,3.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6036,0,0,0,2.0,0,3.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6037,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6039,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
n_components = 10
max_iter = 100
nmf = NMF(n_components=n_components, max_iter=max_iter)

# Fit the model to the user-item train matrix
U_train = nmf.fit_transform(ratings_train_sparse)  # User matrix train
M = nmf.components_  # Item matrix

pred_matrix = np.dot(U_train, M)
pred_matrix

array([[8.51365631e-01, 3.88185221e-01, 1.67896249e-01, ...,
        1.41989653e-02, 3.31142785e-02, 2.77764862e-01],
       [1.17929023e+00, 0.00000000e+00, 0.00000000e+00, ...,
        4.91257154e-03, 6.26733198e-03, 7.13903482e-02],
       [2.22904505e+00, 1.19689784e-01, 4.36127980e-02, ...,
        4.09830734e-03, 9.67381824e-03, 1.45441791e-01],
       ...,
       [2.10633930e+00, 1.13642521e-01, 0.00000000e+00, ...,
        2.62545816e-02, 3.21919691e-03, 9.31391376e-02],
       [1.41054116e+00, 6.47976921e-01, 3.08682132e-01, ...,
        1.98512001e-02, 1.82493106e-03, 1.81511400e-02],
       [1.79050195e+00, 9.65448271e-02, 5.76658868e-02, ...,
        2.78130451e-02, 3.78788215e-02, 2.86386177e-01]])

In [10]:
# "unpivot" the matrix returned to get
pred_df = pd.DataFrame(pred_matrix, columns=ratings_train.columns, index=ratings_train.index)

pred_df

movieid,1,2,3,4,5,6,7,8,9,10,...,3937,3943,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
621,0.851366,0.388185,0.167896,0.000000,0.024278,0.121328,0.000000,0.012006,0.020530,0.000000,...,0.274714,0.063760,0.018511,0.078348,0.013206,1.004728,0.146631,0.014199,0.033114,0.277765
622,1.179290,0.000000,0.000000,0.029329,0.000000,0.544785,0.036722,0.000000,0.000000,0.000000,...,0.004108,0.008768,0.000000,0.002865,0.016389,0.059169,0.020960,0.004913,0.006267,0.071390
623,2.229045,0.119690,0.043613,0.000000,0.006307,0.791876,0.000000,0.000000,0.003322,0.131556,...,0.066388,0.021479,0.006341,0.008458,0.002345,0.340951,0.046506,0.004098,0.009674,0.145442
624,2.150757,0.267703,0.102812,0.017859,0.076376,1.197831,0.109658,0.006643,0.049797,0.776964,...,0.113180,0.005167,0.002736,0.003061,0.007239,0.095133,0.020156,0.000311,0.001311,0.073073
625,0.155986,0.019697,0.013823,0.108690,0.008892,0.270078,0.147299,0.003555,0.000000,0.003147,...,0.001619,0.007067,0.000988,0.000810,0.000026,0.119901,0.034146,0.000338,0.021745,0.134792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,1.244132,0.728379,0.282242,0.261062,0.230793,0.843987,0.534150,0.054713,0.084887,0.755689,...,0.106350,0.015341,0.009970,0.018343,0.001401,0.292415,0.085477,0.000013,0.036680,0.256324
6036,2.250104,1.160296,0.395649,0.522778,0.245457,1.366659,0.951454,0.089160,0.046119,0.702095,...,0.419477,0.108879,0.029358,0.081266,0.107562,1.206576,0.349865,0.036737,0.113824,0.792298
6037,2.106339,0.113643,0.000000,0.000670,0.000000,0.811394,0.000839,0.000555,0.000949,0.071440,...,0.071046,0.036305,0.001276,0.017960,0.091002,0.079550,0.081370,0.026255,0.003219,0.093139
6039,1.410541,0.647977,0.308682,0.135369,0.266602,0.005472,0.530303,0.114248,0.000000,0.112334,...,0.023504,0.027325,0.016913,0.012598,0.069560,0.035266,0.053748,0.019851,0.001825,0.018151


In [11]:
# Stack the dataframe
pred_df = pred_df.stack().reset_index()
pred_df.columns = ['user_id', 'movieid', 'user_movie_position'] # Rename columns

# Merge the train and test dataframes with the predictions dataframe
train_pred_df = pd.merge(df_train, pred_df, on=['user_id', 'movieid'])
test_pred_df = pd.merge(df_test, pred_df, on=['user_id', 'movieid'])

test_pred_df

Unnamed: 0,movieid,title,user_id,rating,timestamp,user_movie_position
0,1330,April Fool's Day (1986),621,1,975798397,0.514336
1,1974,Friday the 13th (1980),621,2,975798397,1.429253
2,1128,"Fog, The (1980)",621,1,975798418,0.957992
3,2944,"Dirty Dozen, The (1967)",1268,3,975798431,1.446336
4,2787,Cat's Eye (1985),621,3,975798448,0.779395
...,...,...,...,...,...,...
100755,2399,Santa Claus: The Movie (1985),4958,1,1046454338,0.240852
100756,1407,Scream (1996),4958,5,1046454443,0.168757
100757,2634,"Mummy, The (1959)",4958,3,1046454548,0.022726
100758,3264,Buffy the Vampire Slayer (1992),4958,4,1046454548,0.338075


In [12]:
# Calculate the MSE
mse_train = mean_squared_error(train_pred_df['rating'], train_pred_df['user_movie_position'])
mse_test = mean_squared_error(test_pred_df['rating'], test_pred_df['user_movie_position'])

pred_df

Unnamed: 0,user_id,movieid,user_movie_position
0,621,1,0.851366
1,621,2,0.388185
2,621,3,0.167896
3,621,4,0.000000
4,621,5,0.024278
...,...,...,...
11209655,6040,3948,0.224972
11209656,6040,3949,0.136111
11209657,6040,3950,0.027813
11209658,6040,3951,0.037879


In [13]:
os.environ["GIT_PYTHON_REFRESH"] = "quiet"

In [14]:
with mlflow.start_run():
    
    # Log du modèle
    mlflow.sklearn.log_model(nmf, "nmf_model", registered_model_name="NMF_Model")
    # Configuration des paramètres du modèle
    n_components = int(input("Entrez le nombre de composants NMF (ex: 20): "))
    max_iter = int(input("Entrez le nombre maximal d'itérations (ex: 5000): "))
    mlflow.log_param("n_components", n_components)
    mlflow.log_param("max_iter", max_iter)
    mlflow.log_metric("Training MSE", mse_train)
    mlflow.log_metric("Test MSE", mse_test)
    

Registered model 'NMF_Model' already exists. Creating a new version of this model...
2024/02/15 15:56:16 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: NMF_Model, version 10
Created version '10' of model 'NMF_Model'.


In [15]:
# sort test_pred_df by user_movie_position for every user ans reset index
test_pred_df = test_pred_df.sort_values(by=['user_id', 'user_movie_position'], ascending=[True, False]).reset_index(drop=True)

# Create a dataframe with the top 10 ratings for each user
top_10_df = test_pred_df.groupby('user_id').head(10)

# Pour chaque groupe, calculer le coefficient de corrélation de Spearman et l'ajouter au dataframe des résultats
grouped = top_10_df.groupby('user_id')
results = pd.DataFrame(columns=['user_id', 'spearman_corr'])
for user, group in grouped:
    spearman_corr = group['rating'].corr(group['user_movie_position'], method='spearman')
    results.loc[len(results)] = [user, spearman_corr]

# Afficher les résultats
results.describe()


Unnamed: 0,user_id,spearman_corr
count,1084.0,1032.0
mean,2929.020295,0.123675
std,1563.145132,0.387211
min,621.0,-1.0
25%,1560.75,-0.110664
50%,2788.5,0.133306
75%,4190.5,0.406181
max,6040.0,1.0


In [16]:

df_sorted = df.sort_values(by='timestamp')
# Diviser les données en ensembles d'entraînement et de test
train_size = int(0.8 * len(merged_df))
df_train = merged_df[:train_size]
df_test = merged_df[train_size:]

# Diviser l'ensemble de test en 98% et 2%
test_size_2_percent = int(0.02 * len(df_test))
df_test, df_test_2_percent = df_test[:-test_size_2_percent], df_test[-test_size_2_percent:]

# Afficher la taille des ensembles de données
print("Taille de l'ensemble d'entraînement :", len(df_train))
print("Taille de l'ensemble de test (98%) :", len(df_test))
print("Taille de l'ensemble de test (2%) pour les tests :", len(df_test_2_percent))
print(df_train.head())
print(df_test.head())

NameError: name 'df' is not defined

In [None]:
user_list = []
movieid_list = []
rating_list = []
timestamp_list = []

# Parcourir les données et extraire les informations nécessaires
for entry in data:
    user_id = entry['_id']
    for movie in entry['movies']:
        user_list.append(user_id)
        movieid_list.append(movie['movieid'])
        rating_list.append(movie['rating'])
        timestamp_list.append(movie['timestamp'])

# Créer le DataFrame
df = pd.DataFrame({
    'user': user_list,
    'movieid': movieid_list,
    'rating': rating_list,
    'timestamp': timestamp_list
})

# Afficher les premières lignes du DataFrame
print(df.head())

In [None]:
# movies for each df
movies_train = set(df_train['movieid'])
movies_test = set(df_test['movieid'])

# Find movies communs
movies_common = movies_train.intersection(movies_test)

# Convert in list (option)
movies_common_list = list(movies_common)
print("Movies common to both df :", len(movies_common))

# Number of movies in train and not in test
movies_train_not_in_test = movies_train - movies_test
num_movies_train_not_in_test = len(movies_train_not_in_test)

# Number of movies in test and not in train
movies_train_not_in_train = movies_test - movies_train
num_movies_test_not_in_train = len(movies_train_not_in_train)

print("Movies in train but not in test :", num_movies_train_not_in_test)
print("Movies in test but not in train :", num_movies_test_not_in_train)