In [152]:
import pandas as pd
import pickle
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from surprise import Reader, Dataset
from surprise import SVD
from surprise.model_selection import cross_validate

In [150]:
# Content-Based filtering
df_content_tags = pd.read_csv("../data/interim/movies_tags.csv")
df_content_tags.dropna(subset=['all_tags'], inplace=True)
tfidf = TfidfVectorizer()
matrice_tfidf = tfidf.fit_transform(df_content_tags['all_tags'])
sim_cosinus = cosine_similarity(matrice_tfidf, matrice_tfidf)
sim_euclidienne = 1 / (1 + euclidean_distances(matrice_tfidf))
indices = pd.Series(range(0,len(df_content_tags)), index=df_content_tags.title)

def recommandations_content(titre, cos_sim, num_recommendations=10):
    idx = indices[titre]
    scores_similarite = list(enumerate(cos_sim[idx]))
    scores_similarite = sorted(scores_similarite, key=lambda x: x[1], reverse=True)
    top_similair = scores_similarite[1:num_recommendations+1]
    res = [(indices.index[idx], score) for idx, score in top_similair]
    res = pd.DataFrame(res)
    res = res.rename(columns={0: 'title', 1: 'score'})
    return res

In [153]:
print(matrice_tfidf.shape)
print(recommandations_content("Toy Story (1995)", sim_cosinus))

(27053, 7877)
                                               title     score
0                                 Toy Story 2 (1999)  0.574920
1                              Monsters, Inc. (2001)  0.401519
2                                        Cars (2006)  0.357911
3     Adventures of Ichabod and Mr. Toad, The (1949)  0.322854
4                                   Pinocchio (1940)  0.313041
5                                Finding Nemo (2003)  0.303094
6  101 Dalmatians (One Hundred and One Dalmatians...  0.295764
7                         Toy Story of Terror (2013)  0.295070
8                        Beauty and the Beast (1991)  0.288640
9                                       Shrek (2001)  0.286178


In [154]:
# Collaborative filtering
df_ratings = pd.read_csv("../data/raw/ratings.csv")
df_ratings = df_ratings.drop("timestamp", axis=1)
df_movies = pd.read_csv("../data/raw/movies.csv")
reader = Reader(rating_scale=(0, 5))
df_surprise = Dataset.load_from_df(df_ratings, reader=reader)

svd = SVD()
cross_validate(svd, df_surprise,measures=['RMSE', 'MAE'], cv=5, verbose=True )


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7858  0.7866  0.7867  0.7856  0.7857  0.7861  0.0005  
MAE (testset)     0.5979  0.5985  0.5984  0.5977  0.5979  0.5981  0.0003  
Fit time          129.50  113.54  128.29  116.93  137.88  125.23  8.87    
Test time         130.24  93.50   92.29   101.22  144.64  112.38  21.19   


{'test_rmse': array([0.78579304, 0.78657816, 0.7866648 , 0.7856002 , 0.78571269]),
 'test_mae': array([0.59791224, 0.59849625, 0.59841682, 0.59771227, 0.59786458]),
 'fit_time': (129.49917697906494,
  113.54407215118408,
  128.2946548461914,
  116.93467712402344,
  137.88201713562012),
 'test_time': (130.24111986160278,
  93.49926328659058,
  92.2891640663147,
  101.21671915054321,
  144.64489603042603)}

In [155]:
train_set = df_surprise.build_full_trainset()
svd.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x138748260>

In [175]:
def recommandations_collab(user_id, num_recommendations=10, start_index=0):
    # Générer l'anti-test set pour l'utilisateur cible
    anti_testset = []
    targetUser = train_set.to_inner_uid(user_id)
    moyenne = train_set.global_mean
    user_note = train_set.ur[targetUser]
    user_livre = [item for (item, _) in user_note]

    for livre in train_set.all_items():
        if livre not in user_livre:
            anti_testset.append((user_id, train_set.to_raw_iid(livre), moyenne))

    # Effectuer les prédictions
    predictionsSVD = svd.test(anti_testset)
    predictionsSVD = pd.DataFrame(predictionsSVD)

    # Mapper les IDs de films aux titres
    df_movies = pd.read_csv("../data/raw/movies.csv")
    movieId_title_map = df_movies.set_index('movieId')['title'].to_dict()
    predictionsSVD['title'] = predictionsSVD['iid'].map(movieId_title_map)

    # Renommer les colonnes et trier par note estimée
    predictionsSVD = predictionsSVD.rename(columns={'uid': 'userId', 'est': 'note'})
    predictionsSVD = predictionsSVD[['userId', 'title', 'note']]
    predictionsSVD.sort_values('note', ascending=False, inplace=True)

    # Retourner les top N recommandations
    return predictionsSVD.iloc[start_index:start_index + num_recommendations]

In [134]:
with open("../models/svd_model.pkl", "rb") as filehandler:
        svd_model = pickle.load(filehandler)


In [176]:
user_id = 1000
num_recommendations = 20
top_recommendations_collab = recommandations_collab(user_id, num_recommendations)
print(top_recommendations_collab)

       userId                                              title      note
12       1000                   Shawshank Redemption, The (1994)  4.992657
10458    1000                               Frozen Planet (2011)  4.839372
172      1000                                  Braveheart (1995)  4.821065
562      1000                                   Gladiator (2000)  4.809198
1809     1000                      Dark Knight Rises, The (2012)  4.793414
359      1000                                Forrest Gump (1994)  4.753796
9        1000          Star Wars: Episode IV - A New Hope (1977)  4.731386
640      1000                            Schindler's List (1993)  4.716473
3719     1000                                   Gladiator (1992)  4.713671
1813     1000                     Star Trek Into Darkness (2013)  4.707517
306      1000                          American History X (1998)  4.705204
30       1000  Star Wars: Episode V - The Empire Strikes Back...  4.696736
2955     1000            

In [160]:
from sklearn.preprocessing import MinMaxScaler
def recommandations_hybride(user_id, titre, num_recommendations=10, alpha=0.8, n=1000):

    scaler = MinMaxScaler()
    # Obtenir les recommandations basées sur le contenu
    rec_content = recommandations_content(titre, sim_euclidienne, num_recommendations*n)
    rec_content = rec_content.set_index('title')
    rec_content = rec_content.rename(columns={'score': 'score_content'})
    rec_content['score_content'] = scaler.fit_transform(rec_content[['score_content']])
    print("Recommandations basées sur le contenu pour '{}':\n{}".format(titre, rec_content.head(10)))

    # Obtenir les recommandations basées sur le filtrage collaboratif
    rec_collab = collab_reco(user_id, num_recommendations*n)
    rec_collab = rec_collab.set_index('title')
    rec_collab = rec_collab.rename(columns={'note': 'score_collab'})
    rec_collab['score_collab'] = scaler.fit_transform(rec_collab[['score_collab']])
    print("Recommandations collaboratives pour l'utilisateur {}:\n{}".format(user_id, rec_collab.head(10)))
    
    # Fusionner les scores
    rec_combined = rec_content.join(rec_collab, how='outer').fillna(0)
    rec_combined['score'] = (alpha * rec_combined['score_content']) +((1 - alpha) * rec_combined['score_collab'])

    # Trier et retourner les recommandations
    rec_combined = rec_combined.sort_values('score', ascending=False)
    rec_combined = rec_combined[['score_content', 'score_collab', 'score']].reset_index()
    return rec_combined.head(num_recommendations)

In [161]:
user_id = 1000
titre = "Braveheart (1995)"
num_recommendations = 10
recommandations_final = recommandations_hybride(user_id, titre, num_recommendations, alpha=0.8)
print("Recommandations hybrides pour '{}':\n{}".format(titre, recommandations_final))


Recommandations basées sur le contenu pour 'Braveheart (1995)':
                            score_content
title                                    
Spartacus (1960)                 1.000000
Lawrence of Arabia (1962)        0.926684
Patton (1970)                    0.883285
Black Hawk Down (2001)           0.834704
Saving Private Ryan (1998)       0.816444
Rob Roy (1995)                   0.796507
Gladiator (2000)                 0.764112
Doctor Zhivago (1965)            0.758606
Dances with Wolves (1990)        0.743466
Last Emperor, The (1987)         0.713222
Recommandations collaboratives pour l'utilisateur 1000:
                                                    userId  score_collab
title                                                                   
Interstellar (2014)                                   1000      1.000000
Dark Knight Rises, The (2012)                         1000      0.986494
Shawshank Redemption, The (1994)                      1000      0.980251
Boondock Sa

In [162]:
user_id = 1000
titre = "Toy Story (1995)"
num_recommendations = 10
recommandations_final = recommandations_hybride(user_id, titre, num_recommendations, alpha=0.8)
print("Recommandations hybrides pour '{}':\n{}".format(titre, recommandations_final))


Recommandations basées sur le contenu pour 'Toy Story (1995)':
                                                    score_content
title                                                            
Toy Story 2 (1999)                                       1.000000
Monsters, Inc. (2001)                                    0.585150
Cars (2006)                                              0.500061
Adventures of Ichabod and Mr. Toad, The (1949)           0.435868
Pinocchio (1940)                                         0.418515
Finding Nemo (2003)                                      0.401188
101 Dalmatians (One Hundred and One Dalmatians)...       0.388581
Toy Story of Terror (2013)                               0.387395
Beauty and the Beast (1991)                              0.376461
Shrek (2001)                                             0.372302
Recommandations collaboratives pour l'utilisateur 1000:
                                                    userId  score_collab
title           

In [179]:
def generer_nouvelles_recommandations(top_recommendations_collab):

    start_index = len(top_recommendations_collab)
    nouvelles_recommandations = recommandations_collab(user_id, num_recommendations, start_index)
    
    return nouvelles_recommandations

user_id = 1000
num_recommendations = 10
top_recommendations_collab = recommandations_collab(user_id, num_recommendations)
print(top_recommendations_collab)
satisfaction = input("Êtes-vous satisfait de ces recommandations ? (Oui/Non): ")

if satisfaction.lower() == "non":
    nouvelles_recommandations = generer_nouvelles_recommandations(top_recommendations_collab)
    print(nouvelles_recommandations)


       userId                                      title      note
12       1000           Shawshank Redemption, The (1994)  4.992657
10458    1000                       Frozen Planet (2011)  4.839372
172      1000                          Braveheart (1995)  4.821065
562      1000                           Gladiator (2000)  4.809198
1809     1000              Dark Knight Rises, The (2012)  4.793414
359      1000                        Forrest Gump (1994)  4.753796
9        1000  Star Wars: Episode IV - A New Hope (1977)  4.731386
640      1000                    Schindler's List (1993)  4.716473
3719     1000                           Gladiator (1992)  4.713671
1813     1000             Star Trek Into Darkness (2013)  4.707517


Êtes-vous satisfait de ces recommandations ? (Oui/Non):  non


       userId                                              title      note
306      1000                          American History X (1998)  4.705204
30       1000  Star Wars: Episode V - The Empire Strikes Back...  4.696736
2955     1000                        Battlestar Galactica (2003)  4.692232
3        1000                        Seven (a.k.a. Se7en) (1995)  4.681164
4        1000                         Usual Suspects, The (1995)  4.679545
2599     1000                                Intouchables (2011)  4.678799
1807     1000                               Avengers, The (2012)  4.677890
2621     1000                          The Imitation Game (2014)  4.673673
12758    1000                                     The War (2007)  4.659965
2942     1000                        Boondock Saints, The (2000)  4.654606


In [192]:
from surprise.model_selection import GridSearchCV

param_grid = {'n_factors': [50, 100, 150],
              'n_epochs': [20, 30, 40],
              'lr_all': [0.005, 0.01, 0.015],
              'reg_all': [0.02, 0.04, 0.06]}
svd = SVD()
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
  

In [None]:
grid_search.fit(df_surprise)    