In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

In [2]:
# Location of the CSV on the Hugging Face Hub (read through the hf:// filesystem).
# NOTE(review): the file name contains two consecutive spaces — presumably this
# matches the file as published; do not normalise it without checking.
DATASET_URL = "hf://datasets/HenryWaltson/TMDB-IMDB-Movies-Dataset/TMDB  IMDB Movies Dataset.csv"
df = pd.read_csv(DATASET_URL)

df.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,genres,production_companies,production_countries,spoken_languages,keywords,directors,writers,averageRating,numVotes,cast
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc...",Christopher Nolan,Christopher Nolan,8.8,2770171,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W..."
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,...",Christopher Nolan,"Jonathan Nolan, Christopher Nolan",8.7,2456592,"Matthew McConaughey, Anne Hathaway, Michael Ca..."
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,...,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f...",Christopher Nolan,"Jonathan Nolan, Christopher Nolan, David S. Go...",9.1,3116358,"Christian Bale, Heath Ledger, Aaron Eckhart, M..."
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ...",James Cameron,James Cameron,7.9,1472421,"Sam Worthington, Zoe Saldaña, Sigourney Weaver..."
4,24428,The Avengers,7.71,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,...,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com...",Joss Whedon,"Joss Whedon, Zak Penn",8.0,1539740,"Robert Downey Jr., Chris Evans, Mark Ruffalo, ..."


## Exploration du Dataset ##

The dataset comes from [HuggingFace](https://huggingface.co/datasets/HenryWaltson/TMDB-IMDB-Movies-Dataset).
It combines movie data from TMDB and IMDB.

In [3]:
# Report the size of the raw table (rows, columns).
row_count, column_count = df.shape
print(f"The dataset contains {row_count} rows and {column_count} columns")

The dataset contains 434803 rows and 29 columns


In [4]:
# Print the per-column summary (non-null counts and dtypes) of the raw frame.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 434803 entries, 0 to 434802
Data columns (total 29 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    434803 non-null  int64  
 1   title                 434803 non-null  str    
 2   vote_average          434803 non-null  float64
 3   vote_count            434803 non-null  int64  
 4   status                434803 non-null  str    
 5   release_date          414261 non-null  str    
 6   revenue               434803 non-null  int64  
 7   runtime               434803 non-null  int64  
 8   adult                 434803 non-null  bool   
 9   backdrop_path         184870 non-null  str    
 10  budget                434803 non-null  int64  
 11  homepage              54526 non-null   str    
 12  tconst                434803 non-null  str    
 13  original_language     434803 non-null  str    
 14  original_title        434803 non-null  str    
 15  overview   

### Nettoyage des données

In [5]:
# Remove fully duplicated rows; the first occurrence of each row is kept
# (the pandas default, spelled out here).
df = df.drop_duplicates(keep='first')

In [6]:
# Remove columns that are not needed for the rest of the analysis.
# errors='ignore' keeps this cell re-runnable: `df` is overwritten here, so on a
# second execution the columns are already gone and a plain drop() would raise
# KeyError.
unused_columns = ['backdrop_path', 'keywords', 'homepage', 'tconst', 'overview', 'poster_path', 'tagline']
df = df.drop(columns=unused_columns, errors='ignore')

In [7]:
# Keep only rows with a non-zero budget, a non-zero revenue and a known release date.
# NOTE(review): 0 presumably encodes "unknown" for budget/revenue — confirm against the dataset card.
has_budget = df['budget'] != 0
has_revenue = df['revenue'] != 0
has_release_date = df['release_date'].notna()
df = df[has_budget & has_revenue & has_release_date]

### Feature Engineering

In [8]:
# Feature 1: combined rating — each source's average is weighted by its own vote count.
tmdb_points = df['vote_average'] * df['vote_count']
imdb_points = df['averageRating'] * df['numVotes']
total_votes = df['vote_count'] + df['numVotes']
# NOTE(review): a row where both vote counts are 0 would give 0/0 = NaN here — confirm none remain.
combined_rating = (tmdb_points + imdb_points) / total_votes

df['rating'] = combined_rating
df['total_votes'] = total_votes

In [9]:
# The raw vote columns have been folded into `rating` and `total_votes`, so drop them.
# errors='ignore' keeps this cell re-runnable: `df` is overwritten here, so on a
# second execution the columns are already gone and a plain drop() would raise
# KeyError.
raw_vote_columns = ['vote_count', 'numVotes', 'vote_average', 'averageRating']
df = df.drop(columns=raw_vote_columns, errors='ignore')
df.head()

Unnamed: 0,id,title,status,release_date,revenue,runtime,adult,budget,original_language,original_title,popularity,genres,production_companies,production_countries,spoken_languages,directors,writers,cast,rating,total_votes
0,27205,Inception,Released,2010-07-15,825532764,148,False,160000000,en,Inception,83.952,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili",Christopher Nolan,Christopher Nolan,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W...",8.794638,2804666
1,157336,Interstellar,Released,2014-11-05,701729206,169,False,165000000,en,Interstellar,140.241,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,Christopher Nolan,"Jonathan Nolan, Christopher Nolan","Matthew McConaughey, Anne Hathaway, Michael Ca...",8.696297,2489163
2,155,The Dark Knight,Released,2008-07-16,1004558444,152,False,185000000,en,The Dark Knight,130.643,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin",Christopher Nolan,"Jonathan Nolan, Christopher Nolan, David S. Go...","Christian Bale, Heath Ledger, Aaron Eckhart, M...",9.094279,3146977
3,19995,Avatar,Released,2009-12-15,2923706026,162,False,237000000,en,Avatar,79.932,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish",James Cameron,James Cameron,"Sam Worthington, Zoe Saldaña, Sigourney Weaver...",7.89351,1502236
4,24428,The Avengers,Released,2012-04-25,1518815515,143,False,220000000,en,The Avengers,98.082,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian",Joss Whedon,"Joss Whedon, Zak Penn","Robert Downey Jr., Chris Evans, Mark Ruffalo, ...",7.994609,1568906


In [10]:
# Parse the release dates, then keep only the calendar year as a feature.
release_dates = pd.to_datetime(df['release_date'])
df['release_year'] = release_dates.dt.year

In [11]:
# Columns used as model inputs. The order matters: it becomes the column order
# of the feature matrix built from this list.
features = [
    'rating',        # vote-weighted TMDB + IMDB rating (engineered above)
    'total_votes',   # vote_count + numVotes (engineered above)
    'popularity',    # TMDB score of current buzz
    'runtime',
    'release_year',
    'budget',
    'revenue',
]

### Focus on the Label : Genres

In [12]:
# List every distinct genre token found in the comma-separated `genres` column.
# Split on the bare comma and strip whitespace afterwards — the same parsing the
# label-extraction cell uses — so an entry written without a space after the
# comma is not reported as a spurious extra genre.
genre_tokens = df['genres'].dropna().str.split(',').explode().str.strip()
unique_genres = genre_tokens.unique()
number_of_genre = len(unique_genres)

print(f"There is {number_of_genre} different genres, that are :", sorted(unique_genres))

There is 19 different genres, that are : ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western']


In [13]:
# Count films whose `genres` value holds exactly one genre, i.e. contains no comma.
# na=True makes missing values count as "has a comma", so they are excluded here.
# regex=False matches the comma literally; `~` replaces the `== False` comparison.
has_several_genres = df['genres'].str.contains(',', na=True, regex=False)
single_genre_count = int((~has_several_genres).sum())
print(f"Number of rows with only one unique genre: {single_genre_count}")

Number of rows with only one unique genre: 1812


On va donc prendre aussi les films avec plusieurs genres, mais en ne gardant que le premier genre qui apparaît.

In [14]:
# Discard films that have no genre information at all.
df_clean = df.dropna(subset=['genres']).copy()
print(f"Films avec genres : {len(df_clean)} / {len(df)}")

# The label is the first genre listed for each film.
first_listed = df_clean['genres'].str.split(',').str.get(0)
df_clean['genre'] = first_listed.str.strip()

# Frequency of each primary genre.
genre_counts = df_clean['genre'].value_counts()
print("\n Distribution des genres :")
print(genre_counts.head(20))

print(f"\n Total de genres uniques : {df_clean['genre'].nunique()}")

Films avec genres : 10098 / 10182

 Distribution des genres :
genre
Drama              2410
Comedy             2180
Action             1467
Horror              674
Adventure           576
Crime               454
Thriller            441
Animation           328
Romance             307
Science Fiction     229
Fantasy             216
Family              203
Documentary         138
Mystery             127
Music               108
War                 100
Western              73
History              65
TV Movie              2
Name: count, dtype: int64

 Total de genres uniques : 19


In [15]:
# Keep only the 5 most frequent primary genres: value_counts() orders genres
# from most to least frequent, so head(5) selects the top 5.
top_5_genres = df_clean['genre'].value_counts().head(5).index.tolist()
df_final = df_clean[df_clean['genre'].isin(top_5_genres)].copy()

print(f"Films gardés : {len(df_final)}")
print(f"Genres : {top_5_genres}")
print("\n Distribution finale :")
print(df_final['genre'].value_counts())

Films gardés : 7307
Genres : ['Drama', 'Comedy', 'Action', 'Horror', 'Adventure']

 Distribution finale :
genre
Drama        2410
Comedy       2180
Action       1467
Horror        674
Adventure     576
Name: count, dtype: int64


In [16]:
# Feature matrix (columns listed in `features`) and target label (primary genre).
X, y = df_final[features], df_final['genre']

In [17]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB

# 1. Stratified 80/20 split: stratifying on the label keeps the (imbalanced)
#    genre proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 2. Scaling: the RobustScaler is fitted on the training part only and then
#    applied unchanged to both parts.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Training
model = GaussianNB().fit(X_train_scaled, y_train)

# 4. Evaluation on the held-out part
y_pred = model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      Action       0.36      0.07      0.12       294
   Adventure       0.30      0.22      0.25       115
      Comedy       0.37      0.58      0.45       436
       Drama       0.49      0.65      0.56       482
      Horror       0.60      0.02      0.04       135

    accuracy                           0.42      1462
   macro avg       0.42      0.31      0.28      1462
weighted avg       0.42      0.42      0.37      1462



In [20]:
# NOTE(review): plt is not used in this cell — presumably needed by later cells.
import matplotlib.pyplot as plt

# Dump the fitted GaussianNB parameters, one section at a time.
title_rule = "=" * 60
section_rule = "-" * 40

print(title_rule)
print("PARAMÈTRES DU MODÈLE GAUSSIEN NAIVE BAYES")
print(title_rule)

# 1. Class priors P(class)
print("\n1. PROBABILITÉS A PRIORI P(classe):")
print(section_rule)
for class_name, prior in zip(model.classes_, model.class_prior_):
    print(f"   {class_name:12s}: {prior:.4f} ({prior*100:.2f}%)")

# 2. Per-class means of each feature (one row per class, one column per feature)
print("\n2. MOYENNES (μ) PAR FEATURE ET CLASSE:")
print(section_rule)
theta_df = pd.DataFrame(model.theta_, index=model.classes_, columns=features)
print(theta_df.round(3))

# 3. Per-class variances of each feature (same layout as the means)
print("\n3. VARIANCES (σ²) PAR FEATURE ET CLASSE:")
print(section_rule)
var_df = pd.DataFrame(model.var_, index=model.classes_, columns=features)
print(var_df.round(3))

PARAMÈTRES DU MODÈLE GAUSSIEN NAIVE BAYES

1. PROBABILITÉS A PRIORI P(classe):
----------------------------------------
   Action      : 0.2007 (20.07%)
   Adventure   : 0.0789 (7.89%)
   Comedy      : 0.2984 (29.84%)
   Drama       : 0.3299 (32.99%)
   Horror      : 0.0922 (9.22%)

2. MOYENNES (μ) PAR FEATURE ET CLASSE:
----------------------------------------
           rating  total_votes  popularity  runtime  release_year  budget  \
Action     -0.208        1.167       1.466    0.301        -0.095   1.279   
Adventure  -0.036        1.645       0.898    0.237        -0.445   1.795   
Comedy     -0.168        0.433       0.070   -0.129        -0.343   0.261   
Drama       0.236        0.658       0.023    0.316        -0.352   0.183   
Horror     -0.404        0.628       1.172   -0.402        -0.190   0.038   

           revenue  
Action       1.834  
Adventure    2.818  
Comedy       0.585  
Drama        0.397  
Horror       0.492  

3. VARIANCES (σ²) PAR FEATURE ET CLASSE:
-----