In [1]:
# 📦 Imports
import pandas as pd

from pathlib import Path
import sys

# Make the project's src/ folder importable from this notebook.
# The membership check keeps sys.path free of duplicate entries when this
# cell is executed more than once in the same kernel.
SRC_DIR = str(Path().resolve().parent / "src")
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from content_based_reco import (
    get_nlp_content_based_recommendations,
    merge_movies_overviews, 
    load_content_based_data
)

In [2]:
# Load the movies DataFrame from the u.item file in the data folder
movies_file = "../data/u.item"
df_movies = load_content_based_data(movies_file)

../data/u.item


In [3]:
# Fetch each movie's overview from The Movie Database (TMDB) API, cache the results in a CSV file, and keep them in overviews_df

import requests
import time

import os
import warnings

# Read the TMDB API key from the environment instead of hard-coding it, so the
# secret is never saved with the notebook. Set it before starting Jupyter,
# e.g. `export TMDB_API_KEY=<your key>`.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY", "")
if not TMDB_API_KEY:
    # Only warn: the key is needed solely when overviews still have to be
    # fetched, so a run that is fully served by the CSV cache can proceed.
    warnings.warn(
        "TMDB_API_KEY environment variable is not set; "
        "requests to TMDB will be sent without a valid key."
    )

def get_movie_overview(title, year=None, timeout=10):
    """Search TMDB for a movie and return the overview of the first hit.

    Args:
        title: Text sent as the ``query`` parameter of the TMDB movie search.
        year: Optional release year; when truthy it is sent as the ``year``
            search parameter.
        timeout: Seconds to wait for TMDB before abandoning the request.

    Returns:
        The ``overview`` field of the first search result, or ``None`` when
        the request fails, TMDB answers with a non-200 status, the response
        body is not valid JSON, or the search returns no results.
    """
    base_url = "https://api.themoviedb.org/3/search/movie"
    params = {
        "api_key": TMDB_API_KEY,
        "query": title,
        "language": "en-US"
    }
    if year:
        params["year"] = year

    try:
        # requests has no default timeout: without one a stalled connection
        # would block the calling loop indefinitely.
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException:
        # Same best-effort contract as a non-200 answer: report "no overview".
        return None
    if response.status_code != 200:
        return None

    try:
        payload = response.json()
    except ValueError:
        # Body was not valid JSON.
        return None

    results = payload.get("results")
    if not results:
        return None

    return results[0].get("overview")

# Movies to process (only the id and title columns are needed)
sample_movies = df_movies[["movie_id", "title"]]

# Path of the on-disk overview cache: <parent of the working directory>/data
project_root = Path().resolve().parent
overview_file = project_root / "data" / "movie_overviews_sample.csv"

# Reuse the overviews saved by a previous run when the cache file exists
if overview_file.exists():
    overviews_df = pd.read_csv(overview_file)
else:
    overviews_df = pd.DataFrame(columns=["movie_id", "title", "overview"])

# Skip every title that already has a row in the cache
titles_done = set(overviews_df["title"])
to_fetch = sample_movies[~sample_movies["title"].isin(titles_done)]

# Fetch the missing overviews. The save lives in a `finally` block so that the
# rows collected before an exception or a manual interrupt are still written
# to disk instead of being lost with the rest of the loop.
new_rows = []
try:
    for movie in to_fetch.itertuples(index=False):
        title = movie.title
        overview = get_movie_overview(title)
        if overview is None:
            print(f"❌ Aucun résumé trouvé pour : {title}")
        # The row is recorded even without an overview, so the title is
        # treated as already processed on the next run.
        new_rows.append({
            "movie_id": movie.movie_id,
            "title": title,
            "overview": overview
        })
        time.sleep(0.2)  # throttle calls to avoid hammering the API
finally:
    if new_rows:
        fetched_df = pd.DataFrame(new_rows)
        if overviews_df.empty:
            # Concatenating onto an empty frame is deprecated in recent
            # pandas releases; start directly from the fetched rows.
            overviews_df = fetched_df
        else:
            overviews_df = pd.concat([overviews_df, fetched_df], ignore_index=True)
        # Make sure the target folder exists before writing the cache
        overview_file.parent.mkdir(parents=True, exist_ok=True)
        overviews_df.to_csv(overview_file, index=False)
        print(f"Ajout de {len(new_rows)} nouveaux résumés sauvegardés dans {overview_file}")

if not new_rows:
    print("Aucun nouveau résumé à ajouter.")

Aucun nouveau résumé à ajouter.


In [4]:
# Merge the two datasets on the movie title
df = merge_movies_overviews(
    df_movies,
    overviews_df,
)

Index(['movie_id_x', 'title', 'genres', 'movie_id_y', 'overview'], dtype='object')


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Turn the text_features column into TF-IDF vectors (English stop words removed)
text_corpus = df["text_features"]
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(text_corpus)

# Pairwise cosine similarity between all rows; with a single argument
# scikit-learn compares the matrix against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

print(f"Matrice de similarité shape : {cosine_sim.shape}")

Matrice de similarité shape : (1704, 1704)


In [6]:
# Show the recommendations returned for one example title
query_title = "Toy Story (1995)"
get_nlp_content_based_recommendations(query_title, cosine_sim, df)

Unnamed: 0,title,Score de similarité
0,"Pyromaniac's Love Story, A (1995)",0.35
1,"Story of Xinghua, The (1993)",0.3
2,Now and Then (1995),0.28
3,"To Have, or Not (1995)",0.28
4,"Philadelphia Story, The (1940)",0.28
5,FairyTale: A True Story (1997),0.26
6,"NeverEnding Story III, The (1994)",0.26
7,Entertaining Angels: The Dorothy Day Story (1996),0.2
8,Police Story 4: Project S (Chao ji ji hua) (1993),0.14
9,"Wife, The (1995)",0.12
