In [26]:
# 📦 Chargement des librairies
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# 📂 Chargement des données
ratings = pd.read_csv("../data/u.data", sep="\t", names=["user_id", "movie_id", "rating", "timestamp"])
movies = pd.read_csv("../data/u.item", sep="|", encoding="latin-1", header=None,
                     names=["movie_id", "title", "release_date", "video_release", "imdb_url", 
                            "unknown", "Action", "Adventure", "Animation", "Children", "Comedy", "Crime", 
                            "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", 
                            "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"])

# 📌 On récupère les colonnes de genres (de 5 à la fin)
genre_columns = movies.columns[5:]

# 🧠 Vecteurs de genres : chaque film devient un vecteur de 0 et 1 (pour chaque genre)
movie_genres = movies[["movie_id", "title"] + list(genre_columns)].copy()

In [None]:
# Moyenne des notes et nombre de votes par film
movie_stats = ratings.groupby("movie_id").agg(
    avg_rating=("rating", "mean"),
    count=("rating", "count")
)

# On garde que les films ayant au moins 50 notes
popular_movie_stats = movie_stats[movie_stats["count"] >= 50]

# On ajoute les titres
popular_movie_stats = popular_movie_stats.merge(movies[["movie_id", "title"]], on="movie_id")

# Top 10 des films les mieux notés
print(popular_movie_stats.sort_values("avg_rating", ascending=False).head(10))

     movie_id  avg_rating  count  \
321       408    4.491071    112   
273       318    4.466443    298   
140       169    4.466102    118   
376       483    4.456790    243   
91        114    4.447761     67   
51         64    4.445230    283   
448       603    4.387560    209   
10         12    4.385768    267   
38         50    4.358491    583   
149       178    4.344000    125   

                                                 title  
321                              Close Shave, A (1995)  
273                            Schindler's List (1993)  
140                         Wrong Trousers, The (1993)  
376                                  Casablanca (1942)  
91   Wallace & Gromit: The Best of Aardman Animatio...  
51                    Shawshank Redemption, The (1994)  
448                                 Rear Window (1954)  
10                          Usual Suspects, The (1995)  
38                                    Star Wars (1977)  
149                              

In [29]:
# 🔢 Nombre de notes par film
ratings_count = ratings.groupby("movie_id").size().sort_values(ascending=False)

# 🧠 Merge avec le nom des films
popular_movies = pd.merge(ratings_count.reset_index(name='count'), movies[["movie_id", "title"]], on="movie_id")

# 🎬 Top 10 des films les plus notés
print(popular_movies.head(10))

   movie_id  count                          title
0        50    583               Star Wars (1977)
1       258    509                 Contact (1997)
2       100    508                   Fargo (1996)
3       181    507      Return of the Jedi (1983)
4       294    485               Liar Liar (1997)
5       286    481    English Patient, The (1996)
6       288    478                  Scream (1996)
7         1    452               Toy Story (1995)
8       300    431           Air Force One (1997)
9       121    429  Independence Day (ID4) (1996)


In [30]:
# 🎯 Fonction pour recommander des films similaires
def recommend_similar_movies(film_title, top_n=5):
    # On récupère la ligne du film demandé
    selected = movie_genres[movie_genres["title"] == film_title]
    
    if selected.empty:
        print("Film non trouvé.")
        return []
    
    # On calcule la similarité cosinus avec tous les autres films
    similarities = cosine_similarity(
        selected[genre_columns],
        movie_genres[genre_columns]
    )

    # On ajoute les scores de similarité
    movie_genres["similarity"] = similarities[0]

    # On trie par similarité (exclut le film lui-même)
    similar_movies = movie_genres[movie_genres["title"] != film_title]
    return similar_movies.sort_values(by="similarity", ascending=False)[["title", "similarity"]].head(top_n)

In [36]:
#recommend_similar_movies("Star Wars (1977)")

movie_genres.head()

Unnamed: 0,movie_id,title,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,similarity
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0.0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0.516398
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0.0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.258199
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0.0
