In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from surprise import Reader, Dataset
from surprise import SVD
from surprise.model_selection import cross_validate

In [2]:
# Content-Based filtering
df_content_tags = pd.read_csv("../data/interim/movies_tags.csv")
tfidf = TfidfVectorizer()
matrice_tfidf = tfidf.fit_transform(df_content_tags['tags'])
sim_cosinus = cosine_similarity(matrice_tfidf, matrice_tfidf)
sim_euclidienne = 1 / (1 + euclidean_distances(matrice_tfidf))
indices = pd.Series(range(0,len(df_content_tags)), index=df_content_tags.title)

def recommandations_content(titre, cos_sim, num_recommendations=10):
    idx = indices[titre]
    scores_similarite = list(enumerate(cos_sim[idx]))
    scores_similarite = sorted(scores_similarite, key=lambda x: x[1], reverse=True)
    top_similair = scores_similarite[1:num_recommendations+1]
    res = [(indices.index[idx], score) for idx, score in top_similair]
    res = pd.DataFrame(res)
    res = res.rename(columns={0: 'title', 1: 'score'})
    return res

In [3]:
print(matrice_tfidf.shape)
print(recommandations_content("Toy Story (1995)", sim_cosinus))

(27278, 23964)
                               title     score
0                 Toy Story 2 (1999)  0.579080
1                Finding Nemo (2003)  0.350542
2                   Pinocchio (1940)  0.324236
3               Bug's Life, A (1998)  0.320946
4                        Cars (2006)  0.305336
5        Beauty and the Beast (1991)  0.298094
6  Toy Story That Time Forgot (2014)  0.291482
7              Monsters, Inc. (2001)  0.278960
8                 Ratatouille (2007)  0.276444
9              Lion King, The (1994)  0.276129


In [4]:
# Collaborative filtering
df_ratings = pd.read_csv("../data/raw/ratings.csv")
df_ratings = df_ratings.drop("timestamp", axis=1)
df_movies = pd.read_csv("../data/raw/movies.csv")
reader = Reader(rating_scale=(0, 5))
df_surprise = Dataset.load_from_df(df_ratings, reader=reader)

svd = SVD()
cross_validate(svd, df_surprise,measures=['RMSE', 'MAE'], cv=5, verbose=True )


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7862  0.7863  0.7858  0.7860  0.7858  0.7860  0.0002  
MAE (testset)     0.5981  0.5986  0.5979  0.5983  0.5981  0.5982  0.0002  
Fit time          103.16  132.09  112.49  142.39  119.36  121.90  13.93   
Test time         37.19   39.39   35.78   43.92   32.24   37.71   3.88    


{'test_rmse': array([0.78622502, 0.78634878, 0.78575128, 0.78600815, 0.78582066]),
 'test_mae': array([0.59807191, 0.59859678, 0.59793272, 0.59832722, 0.59805088]),
 'fit_time': (103.1645679473877,
  132.0932011604309,
  112.49283027648926,
  142.39242315292358,
  119.36145806312561),
 'test_time': (37.18884301185608,
  39.39433407783508,
  35.783851861953735,
  43.92043590545654,
  32.243289947509766)}

In [8]:
from surprise.model_selection import GridSearchCV

param_grid = {'n_factors': [100,150],
              'n_epochs': [20,25,30],
              'lr_all':[0.005,0.01,0.1],
              'reg_all':[0.02,0.05,0.1]}
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse','mae'], cv=3)
grid_search.fit(df_surprise)     


KeyboardInterrupt



In [9]:
print(grid_search.best_score['rmse'])

print(grid_search.best_score['mae'])

print(grid_search.best_params['rmse'])

tunedSVD = grid_search.best_estimator['rmse']

cross_validate(tunedSVD, df_surprise, measures=['RMSE', 'MAE'], cv=5, verbose=True)

AttributeError: 'GridSearchCV' object has no attribute 'best_score'

In [5]:
train_set = df_surprise.build_full_trainset()

In [6]:
def recommandations_collab(user_id, num_recommendations=10):
    # Générer l'anti-test set pour l'utilisateur cible
    anti_testset = []
    targetUser = train_set.to_inner_uid(user_id)
    moyenne = train_set.global_mean
    user_note = train_set.ur[targetUser]
    user_livre = [item for (item, _) in user_note]

    for livre in train_set.all_items():
        if livre not in user_livre:
            anti_testset.append((user_id, train_set.to_raw_iid(livre), moyenne))

    # Effectuer les prédictions
    predictionsSVD = svd.test(anti_testset)
    predictionsSVD = pd.DataFrame(predictionsSVD)

    # Mapper les IDs de films aux titres
    movieId_title_map = df_movies.set_index('movieId')['title'].to_dict()
    predictionsSVD['title'] = predictionsSVD['iid'].map(movieId_title_map)

    # Renommer les colonnes et trier par note estimée
    predictionsSVD = predictionsSVD.rename(columns={'uid': 'userId', 'est': 'note'})
    predictionsSVD = predictionsSVD[['userId', 'title', 'note']]
    predictionsSVD.sort_values('note', ascending=False, inplace=True)

    # Retourner les top N recommandations
    return predictionsSVD.head(num_recommendations)

In [7]:
user_id = 1000
num_recommendations = 20
top_recommendations_collab = recommandations_collab(user_id, num_recommendations)
print(top_recommendations_collab)

       userId                                              title      note
1809     1000                      Dark Knight Rises, The (2012)  4.957928
172      1000                                  Braveheart (1995)  4.865203
934      1000                                         300 (2007)  4.799455
2612     1000                                Interstellar (2014)  4.797906
562      1000                                   Gladiator (2000)  4.783908
10457    1000                                 Bleak House (2005)  4.749111
12       1000                   Shawshank Redemption, The (1994)  4.735469
5654     1000     Batman: The Dark Knight Returns, Part 2 (2013)  4.730704
180      1000  Star Wars: Episode VI - Return of the Jedi (1983)  4.725811
12770    1000          Doctor Who: The Time of the Doctor (2013)  4.714095
1796     1000  Harry Potter and the Deathly Hallows: Part 1 (...  4.704260
3719     1000                                   Gladiator (1992)  4.695376
1862     1000            

In [339]:
import pickle

with open("../models/svd_model.pkl", "wb") as f:
    pickle.dump(svd, f)

In [222]:
import random

def generer_nouvelles_recommandations(recommandations, nb_recommandations=10):
    # Convertir les recommandations en liste
    recommandations_liste = recommandations.values.tolist()
    # Mélanger aléatoirement les recommandations
    random.shuffle(recommandations_liste)
    # Sélectionner un sous-ensemble de recommandations
    nouvelles_recommandations = recommandations_liste[:nb_recommandations]
    # Convertir la liste en DataFrame
    nouvelles_recommandations_df = pd.DataFrame(nouvelles_recommandations, columns=recommandations.columns)
    return nouvelles_recommandations_df

# Afficher les recommandations initiales
top_recommendations_collab = recommandations_collab(user_id, num_recommendations)
afficher_recommandations(top_recommendations_collab)

# Recueillir le retour utilisateur
satisfaction = input("Êtes-vous satisfait de ces recommandations ? (Oui/Non): ")

if satisfaction.lower() == "non":
    # Générer de nouvelles recommandations sans réentraîner le modèle
    nouvelles_recommandations = generer_nouvelles_recommandations(top_recommendations_collab)
    # Afficher les nouvelles recommandations
    afficher_recommandations(nouvelles_recommandations)


       userId                                              title      note
238      1000                              Godfather, The (1972)  4.926987
260      1000                     Godfather: Part II, The (1974)  4.913250
562      1000                                   Gladiator (2000)  4.882262
1809     1000                      Dark Knight Rises, The (2012)  4.862237
172      1000                                  Braveheart (1995)  4.840091
1796     1000  Harry Potter and the Deathly Hallows: Part 1 (...  4.825624
10458    1000                               Frozen Planet (2011)  4.820727
12       1000                   Shawshank Redemption, The (1994)  4.818289
9        1000          Star Wars: Episode IV - A New Hope (1977)  4.809208
2612     1000                                Interstellar (2014)  4.800105


Êtes-vous satisfait de ces recommandations ? (Oui/Non):  non


   userId                                              title      note
0    1000                              Godfather, The (1972)  4.926987
1    1000  Harry Potter and the Deathly Hallows: Part 1 (...  4.825624
2    1000                                   Gladiator (2000)  4.882262
3    1000                               Frozen Planet (2011)  4.820727
4    1000          Star Wars: Episode IV - A New Hope (1977)  4.809208
5    1000                                Interstellar (2014)  4.800105
6    1000                                  Braveheart (1995)  4.840091
7    1000                      Dark Knight Rises, The (2012)  4.862237
8    1000                     Godfather: Part II, The (1974)  4.913250
9    1000                   Shawshank Redemption, The (1994)  4.818289


In [353]:
def recommandations_hybride(user_id, titre, num_recommendations=10, alpha=0.8, n=100):
    # Obtenir les recommandations basées sur le contenu
    rec_content = recommandations_content(titre, sim_euclidienne, num_recommendations*n)
    rec_content = rec_content.set_index('title')
    rec_content = rec_content.rename(columns={'score': 'score_content'})
    print("Recommandations basées sur le contenu pour '{}':\n{}".format(titre, rec_content.head(10)))

    # Obtenir les recommandations basées sur le filtrage collaboratif
    rec_collab = recommandations_collab(user_id, num_recommendations*n)
    rec_collab = rec_collab.set_index('title')
    rec_collab = rec_collab.rename(columns={'note': 'score_collab'})
    rec_collab['score_collab'] = rec_collab['score_collab'] / 5.0
    print("Recommandations collaboratives pour l'utilisateur {}:\n{}".format(user_id, rec_collab.head(10)))

    # Fusionner les scores
    rec_combined = rec_content.join(rec_collab, how='outer').fillna(0)
    rec_combined['score'] = (alpha * rec_combined['score_content']) +((1 - alpha) * rec_combined['score_collab'])

    # Trier et retourner les recommandations
    rec_combined = rec_combined.sort_values('score', ascending=False)
    rec_combined = rec_combined[['score_content', 'score_collab', 'score']].reset_index()
    return rec_combined.head(num_recommendations)

In [350]:
user_id = 1000
titre = "Braveheart (1995)"
num_recommendations = 10
recommandations_final = recommandations_hybride(user_id, titre, num_recommendations, alpha=0.8)
print("Recommandations hybrides pour '{}':\n{}".format(titre, recommandations_final))


Recommandations basées sur le contenu pour 'Braveheart (1995)':
                                                    score_content
title                                                            
Dances with Wolves (1990)                                0.467537
Ben-Hur (1959)                                           0.464971
Schindler's List (1993)                                  0.464550
Lawrence of Arabia (1962)                                0.463550
Attack Force Z (a.k.a. The Z Men) (Z-tzu te kun...       0.463007
Saving Private Ryan (1998)                               0.462406
Bounty, The (1984)                                       0.460290
Platoon (1986)                                           0.458120
River, The (1984)                                        0.457828
Gone with the Wind (1939)                                0.456969
Recommandations collaboratives pour l'utilisateur 1000:
                                                    userId  score_collab
title          

In [351]:
user_id = 1000
titre = "Innocence (2014)"
num_recommendations = 10
recommandations_final = recommandations_hybride(user_id, titre, num_recommendations, alpha=0.8)
print("Recommandations hybrides pour '{}':\n{}".format(titre, recommandations_final))


Recommandations basées sur le contenu pour 'Innocence (2014)':
                                                    score_content
title                                                            
Whom the Gods Wish to Destroy (Nibelungen, Teil...       0.639589
Dragonworld (1994)                                       0.639589
Verbo (2011)                                             0.639589
Snowmageddon (2011)                                      0.639589
Boy and the Pirates, The (1960)                          0.639589
Neverland (2011)                                         0.639589
Sleeping Beauty (2014)                                   0.639589
Aladdin (1986)                                           0.639589
Merlin's Apprentice (2006)                               0.639589
The Lost Continent (1968)                                0.639589
Recommandations collaboratives pour l'utilisateur 1000:
                                                    userId  score_collab
title           

In [354]:
titre = "Toy Story (1995)"
recommandations = recommandations_hybride(user_id, titre, num_recommendations, alpha=0.8)
print("Recommandations hybrides pour '{}':\n{}".format(titre, recommandations))

Recommandations basées sur le contenu pour 'Toy Story (1995)':
                                   score_content
title                                           
Toy Story 2 (1999)                      0.521507
Finding Nemo (2003)                     0.467355
Pinocchio (1940)                        0.462417
Bug's Life, A (1998)                    0.461813
Cars (2006)                             0.458990
Beauty and the Beast (1991)             0.457702
Toy Story That Time Forgot (2014)       0.456539
Monsters, Inc. (2001)                   0.454366
Ratatouille (2007)                      0.453935
Lion King, The (1994)                   0.453881
Recommandations collaboratives pour l'utilisateur 1000:
                                                    userId  score_collab
title                                                                   
Godfather, The (1972)                                 1000      0.985397
Godfather: Part II, The (1974)                        1000      0.982650
G