In [9]:
import pandas as pd
import pickle
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from surprise import Reader, Dataset
from surprise import SVD
from surprise.model_selection import cross_validate

In [10]:
# Content-Based filtering
df_content_tags = pd.read_csv("../data/interim/movies_tags.csv")
df_content_tags.dropna(subset=['all_tags'], inplace=True)
tfidf = TfidfVectorizer()
matrice_tfidf = tfidf.fit_transform(df_content_tags['all_tags'])
sim_cosinus = cosine_similarity(matrice_tfidf, matrice_tfidf)
sim_euclidienne = 1 / (1 + euclidean_distances(matrice_tfidf))
indices = pd.Series(range(0,len(df_content_tags)), index=df_content_tags.title)

def recommandations_content(titre, cos_sim, num_recommendations=10):
    idx = indices[titre]
    scores_similarite = list(enumerate(cos_sim[idx]))
    scores_similarite = sorted(scores_similarite, key=lambda x: x[1], reverse=True)
    top_similair = scores_similarite[1:num_recommendations+1]
    res = [(indices.index[idx], score) for idx, score in top_similair]
    res = pd.DataFrame(res)
    res = res.rename(columns={0: 'title', 1: 'score'})
    return res

In [11]:
print(matrice_tfidf.shape)
print(recommandations_content("Toy Story (1995)", sim_cosinus))

(27053, 7877)
                                               title     score
0                                 Toy Story 2 (1999)  0.574920
1                              Monsters, Inc. (2001)  0.401519
2                                        Cars (2006)  0.357911
3     Adventures of Ichabod and Mr. Toad, The (1949)  0.322854
4                                   Pinocchio (1940)  0.313041
5                                Finding Nemo (2003)  0.303094
6  101 Dalmatians (One Hundred and One Dalmatians...  0.295764
7                         Toy Story of Terror (2013)  0.295070
8                        Beauty and the Beast (1991)  0.288640
9                                       Shrek (2001)  0.286178


In [12]:
# Collaborative filtering
df_ratings = pd.read_csv("../data/raw/ratings.csv")
df_ratings = df_ratings.drop("timestamp", axis=1)
df_movies = pd.read_csv("../data/raw/movies.csv")
reader = Reader(rating_scale=(0, 5))
df_surprise = Dataset.load_from_df(df_ratings, reader=reader)

svd = SVD()
cross_validate(svd, df_surprise,measures=['RMSE', 'MAE'], cv=5, verbose=True )


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7860  0.7862  0.7864  0.7862  0.7864  0.7862  0.0001  
MAE (testset)     0.5980  0.5980  0.5983  0.5983  0.5984  0.5982  0.0002  
Fit time          94.06   119.61  102.93  118.68  100.83  107.22  10.17   
Test time         31.55   28.25   27.57   26.71   30.63   28.94   1.85    


{'test_rmse': array([0.78599116, 0.78619962, 0.78637094, 0.78623411, 0.78638188]),
 'test_mae': array([0.59802472, 0.5980028 , 0.59832207, 0.59826599, 0.59838414]),
 'fit_time': (94.06464791297913,
  119.60788702964783,
  102.93337392807007,
  118.68454623222351,
  100.83284306526184),
 'test_time': (31.551662921905518,
  28.252826929092407,
  27.565049171447754,
  26.70999503135681,
  30.634342908859253)}

In [13]:
train_set = df_surprise.build_full_trainset()

In [14]:
def recommandations_collab(user_id, num_recommendations=10):
    # Générer l'anti-test set pour l'utilisateur cible
    anti_testset = []
    targetUser = train_set.to_inner_uid(user_id)
    moyenne = train_set.global_mean
    user_note = train_set.ur[targetUser]
    user_livre = [item for (item, _) in user_note]

    for livre in train_set.all_items():
        if livre not in user_livre:
            anti_testset.append((user_id, train_set.to_raw_iid(livre), moyenne))

    # Effectuer les prédictions
    predictionsSVD = svd.test(anti_testset)
    predictionsSVD = pd.DataFrame(predictionsSVD)

    # Mapper les IDs de films aux titres
    movieId_title_map = df_movies.set_index('movieId')['title'].to_dict()
    predictionsSVD['title'] = predictionsSVD['iid'].map(movieId_title_map)

    # Renommer les colonnes et trier par note estimée
    predictionsSVD = predictionsSVD.rename(columns={'uid': 'userId', 'est': 'note'})
    predictionsSVD = predictionsSVD[['userId', 'title', 'note']]
    predictionsSVD.sort_values('note', ascending=False, inplace=True)

    # Retourner les top N recommandations
    return predictionsSVD.head(num_recommendations)

In [15]:
user_id = 1000
num_recommendations = 20
top_recommendations_collab = recommandations_collab(user_id, num_recommendations)
print(top_recommendations_collab)

       userId                                              title      note
2942     1000                        Boondock Saints, The (2000)  4.992041
562      1000                                   Gladiator (2000)  4.920841
1809     1000                      Dark Knight Rises, The (2012)  4.920187
12       1000                   Shawshank Redemption, The (1994)  4.906998
172      1000                                  Braveheart (1995)  4.832492
3719     1000                                   Gladiator (1992)  4.825395
2599     1000                                Intouchables (2011)  4.812233
1725     1000                               Notebook, The (2004)  4.795047
1317     1000                           Good Will Hunting (1997)  4.794791
306      1000                          American History X (1998)  4.773540
359      1000                                Forrest Gump (1994)  4.771473
1232     1000                               Love Actually (2003)  4.769917
4        1000            

In [339]:
with open("../models/svd_model.pkl", "wb") as f:
    pickle.dump(svd, f)

In [222]:
def generer_nouvelles_recommandations(recommandations, nb_recommandations=10):
    # Convertir les recommandations en liste
    recommandations_liste = recommandations.values.tolist()
    # Mélanger aléatoirement les recommandations
    random.shuffle(recommandations_liste)
    # Sélectionner un sous-ensemble de recommandations
    nouvelles_recommandations = recommandations_liste[:nb_recommandations]
    # Convertir la liste en DataFrame
    nouvelles_recommandations_df = pd.DataFrame(nouvelles_recommandations, columns=recommandations.columns)
    return nouvelles_recommandations_df

# Afficher les recommandations initiales
top_recommendations_collab = recommandations_collab(user_id, num_recommendations)
afficher_recommandations(top_recommendations_collab)

# Recueillir le retour utilisateur
satisfaction = input("Êtes-vous satisfait de ces recommandations ? (Oui/Non): ")

if satisfaction.lower() == "non":
    # Générer de nouvelles recommandations sans réentraîner le modèle
    nouvelles_recommandations = generer_nouvelles_recommandations(top_recommendations_collab)
    # Afficher les nouvelles recommandations
    afficher_recommandations(nouvelles_recommandations)


       userId                                              title      note
238      1000                              Godfather, The (1972)  4.926987
260      1000                     Godfather: Part II, The (1974)  4.913250
562      1000                                   Gladiator (2000)  4.882262
1809     1000                      Dark Knight Rises, The (2012)  4.862237
172      1000                                  Braveheart (1995)  4.840091
1796     1000  Harry Potter and the Deathly Hallows: Part 1 (...  4.825624
10458    1000                               Frozen Planet (2011)  4.820727
12       1000                   Shawshank Redemption, The (1994)  4.818289
9        1000          Star Wars: Episode IV - A New Hope (1977)  4.809208
2612     1000                                Interstellar (2014)  4.800105


Êtes-vous satisfait de ces recommandations ? (Oui/Non):  non


   userId                                              title      note
0    1000                              Godfather, The (1972)  4.926987
1    1000  Harry Potter and the Deathly Hallows: Part 1 (...  4.825624
2    1000                                   Gladiator (2000)  4.882262
3    1000                               Frozen Planet (2011)  4.820727
4    1000          Star Wars: Episode IV - A New Hope (1977)  4.809208
5    1000                                Interstellar (2014)  4.800105
6    1000                                  Braveheart (1995)  4.840091
7    1000                      Dark Knight Rises, The (2012)  4.862237
8    1000                     Godfather: Part II, The (1974)  4.913250
9    1000                   Shawshank Redemption, The (1994)  4.818289


In [82]:
from sklearn.preprocessing import MinMaxScaler
def recommandations_hybride(user_id, titre, num_recommendations=10, alpha=0.8, n=100):

    scaler = MinMaxScaler()
    # Obtenir les recommandations basées sur le contenu
    rec_content = recommandations_content(titre, sim_euclidienne, num_recommendations*n)
    rec_content = rec_content.set_index('title')
    rec_content = rec_content.rename(columns={'score': 'score_content'})
    rec_content['score_content'] = scaler.fit_transform(rec_content[['score_content']])
    print("Recommandations basées sur le contenu pour '{}':\n{}".format(titre, rec_content.head(10)))

    # Obtenir les recommandations basées sur le filtrage collaboratif
    rec_collab = recommandations_collab(user_id, num_recommendations*n)
    rec_collab = rec_collab.set_index('title')
    rec_collab = rec_collab.rename(columns={'note': 'score_collab'})
    rec_collab['score_collab'] = scaler.fit_transform(rec_collab[['score_collab']])
    print("Recommandations collaboratives pour l'utilisateur {}:\n{}".format(user_id, rec_collab.head(10)))
    
    # Fusionner les scores
    rec_combined = rec_content.join(rec_collab, how='outer').fillna(0)
    rec_combined['score'] = (alpha * rec_combined['score_content']) +((1 - alpha) * rec_combined['score_collab'])

    # Trier et retourner les recommandations
    rec_combined = rec_combined.sort_values('score', ascending=False)
    rec_combined = rec_combined[['score_content', 'score_collab', 'score']].reset_index()
    return rec_combined.head(num_recommendations)

In [83]:
user_id = 1000
titre = "Braveheart (1995)"
num_recommendations = 10
recommandations_final = recommandations_hybride(user_id, titre, num_recommendations, alpha=0.8)
print("Recommandations hybrides pour '{}':\n{}".format(titre, recommandations_final))


Recommandations basées sur le contenu pour 'Braveheart (1995)':
                            score_content
title                                    
Spartacus (1960)                 1.000000
Lawrence of Arabia (1962)        0.905894
Patton (1970)                    0.850188
Black Hawk Down (2001)           0.787831
Saving Private Ryan (1998)       0.764393
Rob Roy (1995)                   0.738802
Gladiator (2000)                 0.697220
Doctor Zhivago (1965)            0.690153
Dances with Wolves (1990)        0.670720
Last Emperor, The (1987)         0.631900
Recommandations collaboratives pour l'utilisateur 1000:
                                  userId  score_collab
title                                                 
Boondock Saints, The (2000)         1000      1.000000
Gladiator (2000)                    1000      0.904196
Dark Knight Rises, The (2012)       1000      0.903316
Shawshank Redemption, The (1994)    1000      0.885570
Braveheart (1995)                   1000      

In [84]:
user_id = 1000
titre = "Toy Story (1995)"
num_recommendations = 10
recommandations_final = recommandations_hybride(user_id, titre, num_recommendations, alpha=0.8)
print("Recommandations hybrides pour '{}':\n{}".format(titre, recommandations_final))


Recommandations basées sur le contenu pour 'Toy Story (1995)':
                                                    score_content
title                                                            
Toy Story 2 (1999)                                       1.000000
Monsters, Inc. (2001)                                    0.529893
Cars (2006)                                              0.433471
Adventures of Ichabod and Mr. Toad, The (1949)           0.360728
Pinocchio (1940)                                         0.341064
Finding Nemo (2003)                                      0.321429
101 Dalmatians (One Hundred and One Dalmatians)...       0.307143
Toy Story of Terror (2013)                               0.305799
Beauty and the Beast (1991)                              0.293408
Shrek (2001)                                             0.288695
Recommandations collaboratives pour l'utilisateur 1000:
                                  userId  score_collab
title                             