In [2]:
import numpy as np
import pandas as pd

# Location of the merged TMDB/IMDB CSV on the Hugging Face Hub (read through the hf:// filesystem).
DATASET_URL = "hf://datasets/HenryWaltson/TMDB-IMDB-Movies-Dataset/TMDB  IMDB Movies Dataset.csv"

df = pd.read_csv(DATASET_URL)

# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,genres,production_companies,production_countries,spoken_languages,keywords,directors,writers,averageRating,numVotes,cast
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc...",Christopher Nolan,Christopher Nolan,8.8,2770171,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W..."
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,...",Christopher Nolan,"Jonathan Nolan, Christopher Nolan",8.7,2456592,"Matthew McConaughey, Anne Hathaway, Michael Ca..."
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,...,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f...",Christopher Nolan,"Jonathan Nolan, Christopher Nolan, David S. Go...",9.1,3116358,"Christian Bale, Heath Ledger, Aaron Eckhart, M..."
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ...",James Cameron,James Cameron,7.9,1472421,"Sam Worthington, Zoe Saldaña, Sigourney Weaver..."
4,24428,The Avengers,7.71,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,...,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com...",Joss Whedon,"Joss Whedon, Zak Penn",8.0,1539740,"Robert Downey Jr., Chris Evans, Mark Ruffalo, ..."


## Exploration du Dataset ##

The dataset was found on [Hugging Face](https://huggingface.co/datasets/HenryWaltson/TMDB-IMDB-Movies-Dataset).
It combines movie data from TMDB and IMDb.

In [18]:
# Report the size of the frame: rows first, then columns.
n_rows, n_cols = df.shape
print(f"The dataset contains {n_rows} rows and {n_cols} columns")

The dataset contains 434803 rows and 29 columns


In [23]:
# Summarise the frame: one line per column with its non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434803 entries, 0 to 434802
Data columns (total 29 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    434803 non-null  int64  
 1   title                 434803 non-null  object 
 2   vote_average          434803 non-null  float64
 3   vote_count            434803 non-null  int64  
 4   status                434803 non-null  object 
 5   release_date          414261 non-null  object 
 6   revenue               434803 non-null  int64  
 7   runtime               434803 non-null  int64  
 8   adult                 434803 non-null  bool   
 9   backdrop_path         184870 non-null  object 
 10  budget                434803 non-null  int64  
 11  homepage              54526 non-null   object 
 12  tconst                434803 non-null  object 
 13  original_language     434803 non-null  object 
 14  original_title        434803 non-null  object 
 15  

### Nettoyage des données

In [24]:
# Remove rows that are exact duplicates across all columns (pandas keeps the first occurrence by default).
df = df.drop_duplicates()

### Feature Engineering

In [29]:
# Feature 1: combined, vote-weighted rating.
# Each platform's average is weighted by its own vote count, so the source with
# more votes (per the original note, usually IMDB) has the larger influence.
total_votes = df['vote_count'] + df['numVotes']
weighted_rating_sum = df['vote_average'] * df['vote_count'] + df['averageRating'] * df['numVotes']

# Rows with no votes on either platform evaluate to 0/0 and come out as NaN:
# no meaningful rating can be derived for them.
combined_rating = weighted_rating_sum / total_votes

df['rating'] = combined_rating

# Feature 2: total number of votes on a log scale, to dampen the effect of outliers.
# log1p(x) = log(1 + x) stays finite when a movie has zero votes.
# Requires numpy to be imported as `np` in the notebook's import cell.
total_vote_count = np.log1p(total_votes)

df['total_votes'] = total_vote_count

### Focus on the Genres

In [19]:
# Break each comma-separated genre string into one row per genre, then keep the distinct labels.
genre_labels = df['genres'].dropna().str.split(', ').explode()
unique_genres = genre_labels.unique()
number_of_genre = len(unique_genres)

print(f"There is {number_of_genre} different genres, that are :", sorted(unique_genres))

There is 19 different genres, that are : ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western']


In [11]:
# A comma in the genre string means several genres are listed.
# na=True flags rows with a missing genre as well, so they are left out of the count.
lists_several_genres = df['genres'].str.contains(',', na=True)
single_genre_count = len(df.loc[~lists_several_genres])
print(f"Number of rows with only one unique genre: {single_genre_count}")

Number of rows with only one unique genre: 187825


In [13]:
# Show the movies tagged with exactly one genre; rows with a missing genre are
# flagged by na=True and therefore excluded.
is_multi_genre = df['genres'].str.contains(',', na=True)
df.loc[~is_multi_genre]

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,genres,production_companies,production_countries,spoken_languages,keywords,directors,writers,averageRating,numVotes,cast
7,550,Fight Club,8.438,27238,Released,1999-10-15,100853753,139,False,/hZkgoQYus5vegHoetLkCJzb17zJ.jpg,...,Drama,"Regency Enterprises, Fox 2000 Pictures, Taurus...",United States of America,English,"dual identity, rage and hate, based on novel o...",David Fincher,"Chuck Palahniuk, Jim Uhls",8.8,2555900,"Edward Norton, Brad Pitt, Helena Bonham Carter..."
95,18785,The Hangover,7.312,15812,Released,2009-06-02,469310836,100,False,/2o0PKGmnSgCGkzoSePNAqse8Ure.jpg,...,Comedy,"Legendary Pictures, Green Hat Films, Warner Br...","Germany, United States of America",English,"blackjack, stag night, lost weekend, chapel, h...",Todd Phillips,"Jon Lucas, Scott Moore",7.7,915313,"Bradley Cooper, Ed Helms, Zach Galifianakis, J..."
216,14,American Beauty,8.000,11260,Released,1999-09-15,356296601,122,False,/DztBnZaqmla2sGUW9s8AyOmskT.jpg,...,Drama,"Jinks/Cohen Company, DreamWorks Pictures",United States of America,English,"estate agent, adultery, coming out, first time...",Sam Mendes,Alan Ball,8.3,1270120,"Kevin Spacey, Annette Bening, Thora Birch, Wes..."
218,37799,The Social Network,7.358,11190,Released,2010-10-01,224920315,121,False,/2BNKxbq4muNcwTjSDNCYnvr1dM8.jpg,...,Drama,"Columbia Pictures, Relativity Media, Scott Rud...",United States of America,English,"hacker, based on novel or book, boston, massac...",David Fincher,"Aaron Sorkin, Ben Mezrich",7.8,809374,"Jesse Eisenberg, Andrew Garfield, Armie Hammer..."
221,489,Good Will Hunting,8.149,11111,Released,1997-12-05,225933435,127,False,/bpV8wn48s82au37QyUJ51S7X2Vf.jpg,...,Drama,"Miramax, Lawrence Bender Productions, Be Gentl...",United States of America,English,"boston, massachusetts, professor, baseball, ma...",Gus Van Sant,"Matt Damon, Ben Affleck",8.3,1187897,"Matt Damon, Robin Williams, Ben Affleck, Stell..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
434781,836515,Наши знакомые,0.000,0,Released,1969-06-02,0,0,False,,...,Drama,Gorky Film Studios,Soviet Union,,,Ilya Gurin,"Yuri German, Ilya Gurin",6.5,20,"Natalya Tenyakova, Kirill Lavrov, Ivan Lapikov..."
434784,836522,Они живут рядом,0.000,0,Released,1968-05-06,0,0,False,,...,Drama,Mosfilm,Soviet Union,,,Grigoriy Roshal,,6.1,15,"Aleksandr Borisov, Evgeniy Evstigneev, Viktori..."
434786,836534,The Picnic,0.000,0,Released,1998-10-03,0,17,False,,...,Drama,,South Korea,Korean,suicide by gas,Song Il-gon,Song Il-gon,6.3,76,"Son Byung-ho, Choi Ji-yeon"
434790,836378,Guns N Roses: Live Rarities,0.000,0,Released,2007-08-17,0,37,False,,...,Documentary,Shoreline Entertainment,,English,concert film,,,6.3,8,"Axl Rose, Slash, Izzy Stradlin, Duff McKagan, ..."
