In [92]:
# !pip install requests pandas scikit-learn umap-learn plotly numpy

In [93]:
import requests
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import umap.umap_ as umap
import plotly.express as px

In [None]:
# Ta clé API TMDb ici
API_KEY = "Clé_api"
BASE_URL = "https://api.themoviedb.org/3"

In [95]:

def fetch_movies(endpoint, params):
    url = f"{BASE_URL}/{endpoint}"
    params["api_key"] = API_KEY
    response = requests.get(url, params=params)
    response.raise_for_status()
    return response.json()

def get_popular_movies(pages=20, min_vote_count=1000):
    all_movies = []
    for page in range(1, pages + 1):
        data = fetch_movies("discover/movie", {
            "language": "fr-FR",
            "sort_by": "vote_average.desc",
            "vote_count.gte": min_vote_count,
            "page": page
        })
        all_movies.extend(data["results"])
    return all_movies

def get_recent_movies(pages=20):
    from datetime import datetime
    year = datetime.now().year
    all_movies = []
    for page in range(1, pages+1):
        data = fetch_movies("movie/popular", {
            "language": "fr-FR",
            "sort_by": "release_date.desc",
            "primary_release_year": year,
            "page": page
        })
        all_movies.extend(data["results"])
    return all_movies

def extract_movie_features(movies):
    # Récupère les colonnes utiles
    df = pd.DataFrame(movies)
    # df = df[["id","title","overview","release_date","vote_average"]]
    df["overview"] = df["overview"].fillna("")
    df["release_date"] = pd.to_datetime(df["release_date"], errors="coerce")
    return df

In [96]:
# Récupération des données
popular_movies = get_popular_movies()
recent_movies = get_recent_movies()

df_popular = extract_movie_features(popular_movies)
df_recent = extract_movie_features(recent_movies)

# Ajout d'une colonne pour la source
df_popular["category"] = "popular"
df_recent["category"] = "recent"

In [97]:
df_popular

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count,category
0,False,/zfbjgQE1uSd9wiPTX4VzsLi0rGG.jpg,"[18, 80]",278,en,The Shawshank Redemption,"En 1947, Andy Dufresne, un jeune banquier, est...",26.6187,/t30GjttOdb5At1sYy8b3TOwFgWV.jpg,1994-09-23,Les Évadés,False,8.712,28502,popular
1,False,/tmU7GeKVybMWFButWEGl2M4GeiP.jpg,"[18, 80]",238,en,The Godfather,La Seconde Guerre mondiale vient de s'achever....,26.4481,/k3uIbYtiuK8pwbCcbma29nTqmgG.jpg,1972-03-14,Le Parrain,False,8.688,21584,popular
2,False,/kGzFbGhp99zva6oZODW5atUtnqi.jpg,"[18, 80]",240,en,The Godfather Part II,"Depuis la mort de Don Vito Corleone, son fils ...",14.7361,/jUjglfsuWdTfs7XURF3Jqf8oTJ7.jpg,1974-12-20,"Le Parrain, 2e partie",False,8.571,13040,popular
3,False,/zb6fM1CX41D9rF9hdgclu0peUmy.jpg,"[18, 36, 10752]",424,en,Schindler's List,Évocation des années de guerre d’Oskar Schindl...,12.8044,/fLRbv1fGQD0OCPWkFy7PI2sESLj.jpg,1993-12-15,La Liste de Schindler,False,8.565,16516,popular
4,False,/bxgTSUenZDHNFerQ1whRKplrMKF.jpg,[18],389,en,12 Angry Men,Un jeune homme d'origine modeste est accusé du...,15.7631,/fFXrCl7nBFFaQU3IgTlinvk6vTi.jpg,1957-04-10,Douze Hommes en colère,False,8.500,9225,popular
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,False,/tcZi0LQc5iu2u6i5aOWyj9NBBvl.jpg,"[18, 10749]",80,en,Before Sunset,"Neuf ans auparavant, Jesse et Céline se sont r...",3.5591,/gqPkhjDOSaYcpRwedOtrc5JmZbp.jpg,2004-06-16,Before Sunset,False,7.800,3513,popular
396,False,/9AbessEz2YKjmPtYKATDeYRW3Px.jpg,"[16, 35, 10751]",531,en,The Wrong Trousers,"Pour payer ses factures, Wallace, l'inventeur,...",1.6923,/il1nrKu1ujuUhTK0GtCZXi5gCcF.jpg,1993-12-17,Wallace & Gromit : Un mauvais pantalon,False,7.805,1071,popular
397,False,/620hnMVLu6RSZW6a5rwO8gqpt0t.jpg,"[16, 35, 14, 12, 10751]",508943,en,Luca,"Un jeune garçon, Luca, vit un été inoubliable,...",10.9993,/bADnZZZdzQtajVwJ8MVWYlQ6Iq2.jpg,2021-06-17,Luca,False,7.803,8571,popular
398,False,/h1vBJ0uN4QCbBb5WVKxmtfTrHoY.jpg,[37],11697,en,The Man Who Shot Liberty Valance,"Stoddard, un jeune avocat, vient de s'installe...",8.6184,/6jTZ83zj8Uu2qVflOLmqbwWlu7v.jpg,1962-04-13,L'Homme qui tua Liberty Valance,False,7.804,1203,popular


In [98]:
df_recent

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count,category
0,False,/sItIskd5xpiE64bBWYwZintkGf3.jpg,"[28, 53, 80]",541671,en,Ballerina,"Enfant, Eve Macarro a assisté impuissante au m...",658.8665,/e7zUVzux574daVsOlbcvmqEieyn.jpg,2025-06-04,Ballerina,False,7.158,519,recent
1,False,/rthMuZfFv4fqEU4JVbgSW9wQ8rs.jpg,"[28, 878, 12]",986056,en,Thunderbolts*,"Yelena Belova, Bucky Barnes, Red Guardian, Le ...",625.6935,/3OhI4smjsUNHPzEHN0IUNn1hfSP.jpg,2025-04-30,Thunderbolts*,False,7.470,1404,recent
2,False,/uIpJPDNFoeX0TVml9smPrs9KUVx.jpg,"[27, 9648]",574475,en,Final Destination Bloodlines,"Stefani, 18 ans, fait d’affreux cauchemars. Da...",402.5293,/4uI8C2zcfLWRhZDBgd0oTlZjV9j.jpg,2025-05-14,Destination finale : Bloodlines,False,7.206,1462,recent
3,False,/x58Gk2ZGU5AEBp25MQe2nhZhd5z.jpg,"[28, 14]",846422,en,The Old Guard 2,Andy et son équipe de guerriers immortels retr...,558.4141,/8iJmsUdYvyNNc4C9ukNM3D6plep.jpg,2025-07-01,The Old Guard 2,False,6.000,87,recent
4,False,/7Zx3wDG5bBtcfk8lcnCWDOLM4Y4.jpg,"[10751, 878, 35, 12]",552524,en,Lilo & Stitch,"Sur la planète Turo, le professeur Jumba compa...",302.6558,/71IjwRa88OJMYJBntId7nn0eFHy.jpg,2025-05-17,Lilo & Stitch,False,7.114,774,recent
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,False,/g4j7H4yDoCR90X1VV2IEvAZ1LAP.jpg,"[18, 10749]",707610,ko,가슴 큰 울 엄마,,14.3792,/zarhMAQRWKjjsNBR1rviIXZ5xtt.jpg,2020-05-06,가슴 큰 울 엄마,False,6.250,12,recent
396,False,/uzIGtyS6bbnJzGsPL93WCF1FWm8.jpg,"[12, 28, 14]",1865,en,Pirates of the Caribbean: On Stranger Tides,"Dans cette histoire pleine d’action, où vérité...",14.0061,/5JjjjGg24IGRXIQtaZkPU59acjV.jpg,2011-05-15,Pirates des Caraïbes : La Fontaine de jouvence,False,6.560,14360,recent
397,False,/f6nBStUsAEbn8i1vjMwqeaENxv9.jpg,[10749],1442532,tl,Malagkit,,12.7047,/cEJe5DfzXb0hqOsMzJxHzRjYLWq.jpg,2025-03-11,Malagkit,False,8.000,6,recent
398,False,/4PAtIaArgPMpKe6fkXw051SYR88.jpg,[18],50270,it,Paprika,"En 1958, Mimma, une jeune femme plantureuse jo...",12.7160,/odTCrXktfjTKhsgukKzepvIfdGC.jpg,1991-02-13,Paprika,False,7.100,537,recent


In [99]:
df_all = pd.concat([df_popular, df_recent]).drop_duplicates(subset="id").reset_index(drop=True)

In [100]:
# Vectorisation TF-IDF sur les résumés
vectorizer = TfidfVectorizer(max_features=500, stop_words="english")
X_tfidf = vectorizer.fit_transform(df_all["overview"])

In [101]:
# Réduction dimensionnelle avec UMAP
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
embedding = reducer.fit_transform(X_tfidf.toarray())

df_all["x"] = embedding[:,0]
df_all["y"] = embedding[:,1]


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [102]:
# Clustering KMeans (nombre de clusters à ajuster)
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df_all["cluster"] = kmeans.fit_predict(embedding)

In [103]:
fig = px.scatter(df_all, x="x", y="y",
                 color="cluster",
                 symbol="category",
                 hover_data=["title","release_date","vote_average","category"],
                 title="Clustering films populaires et récents TMDb")

In [104]:
fig.show()

In [105]:
import numpy as np

def find_closest_movie(df, input_title):
    # Cherche le film dans le dataset
    if input_title not in df['title'].values:
        print(f"Le film '{input_title}' n'a pas été trouvé dans le dataset.")
        return None
    
    # df_recent = df[df['category'] == 'recent'].copy()
    
    # Coordonnées du film input
    input_point = df.loc[df['title'] == input_title, ['x', 'y']].values[0]
    
    # Calcul des distances euclidiennes vers tous les points
    df['distance'] = np.linalg.norm(df[['x', 'y']].values - input_point, axis=1)
    
    # Exclure le film lui-même (distance = 0)
    df_filtered = df[df['title'] != input_title]
    
    # Trouver le film avec la distance minimale
    closest_movie = df_filtered.loc[df_filtered['distance'].idxmin()]
    
    return closest_movie[['title', 'distance', 'cluster']]

In [108]:
input_title = "La Liste de Schindler"  # Exemple de film à rechercher
closest = find_closest_movie(df_all, input_title)

if closest is not None:
    print(f"Le film le plus proche de '{input_title}' est '{closest['title']}' dans le cluster {closest['cluster']} à une distance de {closest['distance']:.3f}")

Le film le plus proche de 'La Liste de Schindler' est 'Tu ne tueras point' dans le cluster 3 à une distance de 0.068
