# Working with the TMDB + IMDB dataset
- [Source (Kaggle: TMDB movies, daily updates)](https://www.kaggle.com/datasets/alanvourch/tmdb-movies-daily-updates)

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import re

In [2]:
import sys

# Make the project's helper module importable from ../utils.
# The entry is relative, so it is resolved against the kernel's current
# working directory. Guard the append so that re-running this cell does not
# keep adding duplicate entries to sys.path.
utils_dir = '../utils'
if utils_dir not in sys.path:
    sys.path.append(utils_dir)

import functions

In [3]:
# Load the raw TMDB + IMDB export into a DataFrame (path is relative to the
# notebook's working directory).
tmdb_df = pd.read_csv(
    filepath_or_buffer='../data/local/raw/TMDB_all_movies.csv',
)

In [4]:
# Print a structural overview of the raw frame via the project helper; the
# captured output reports the shape, row/column counts and per-column dtypes.
functions.show_basic_info(tmdb_df)


DataFrame Shape: (1028026, 28)
Number of Rows: 1028026
Number of Columns: 28

Data Types of Columns:
id                           int64
title                       object
vote_average               float64
vote_count                 float64
status                      object
release_date                object
revenue                    float64
runtime                    float64
budget                     float64
imdb_id                     object
original_language           object
original_title              object
overview                    object
popularity                 float64
tagline                     object
genres                      object
production_companies        object
production_countries        object
spoken_languages            object
cast                        object
director                    object
director_of_photography     object
writers                     object
producers                   object
music_composer              object
imdb_rating            

In [5]:
# Optional: uncomment to run the project helper `show_column_summary` on the frame.
# NOTE(review): this call is disabled; confirm whether it is still needed,
# otherwise delete the cell.
# functions.show_column_summary(tmdb_df)

Columns to drop:
- cast
- director_of_photography
- music_composer
- poster_path
- writers
- tagline

In [6]:
# Drop the columns that are not needed for this analysis.
#
# The result is reassigned instead of using `inplace=True`, and
# `errors='ignore'` skips labels that are already absent. Together this makes
# the cell idempotent: re-running it after the columns have been removed no
# longer raises a KeyError.
# Caveat: `errors='ignore'` also silently skips a misspelled label, so keep
# this list in sync with the dataset's column names.
columns_to_drop = [
    'cast',
    'director_of_photography',
    'music_composer',
    'poster_path',
    'writers',
    'tagline',
]
tmdb_df = tmdb_df.drop(columns=columns_to_drop, errors='ignore')
tmdb_df.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,...,overview,popularity,genres,production_companies,production_countries,spoken_languages,director,producers,imdb_rating,imdb_votes
0,2,Ariel,7.1,335.0,Released,1988-10-21,0.0,73.0,0.0,tt0094675,...,After the coal mine he works at closes and his...,11.915,"Comedy, Drama, Romance, Crime",Villealfa Filmproductions,Finland,suomi,Aki Kaurismäki,Aki Kaurismäki,7.4,8812.0
1,3,Shadows in Paradise,7.3,369.0,Released,1986-10-17,0.0,74.0,0.0,tt0092149,...,"Nikander, a rubbish collector and would-be ent...",16.287,"Comedy, Drama, Romance",Villealfa Filmproductions,Finland,"suomi, English, svenska",Aki Kaurismäki,Mika Kaurismäki,7.5,7587.0
2,5,Four Rooms,5.8,2628.0,Released,1995-12-09,4257354.0,98.0,4000000.0,tt0113101,...,It's Ted the Bellhop's first night on the job....,21.312,Comedy,"Miramax, A Band Apart",United States of America,English,"Quentin Tarantino, Robert Rodriguez, Alexandre...","Quentin Tarantino, Alexandre Rockwell, Lawrenc...",6.7,112798.0
3,6,Judgment Night,6.5,331.0,Released,1993-10-15,12136938.0,109.0,21000000.0,tt0107286,...,"Four young friends, while taking a shortcut en...",8.924,"Action, Crime, Thriller","Largo Entertainment, JVC, Universal Pictures",United States of America,English,Stephen Hopkins,"Gene Levy, Lloyd Segan, Marilyn Vance",6.6,19361.0
4,8,Life in Loops (A Megacities RMX),7.5,27.0,Released,2006-01-01,0.0,80.0,42000.0,tt0825671,...,Timo Novotny labels his new project an experim...,3.203,Documentary,inLoops,Austria,"English, हिन्दी, 日本語, Pусский, Español",Timo Novotny,"Ulrich Gehmacher, Timo Novotny",8.2,284.0


In [7]:
# Run the project's duplicate check on the frame; the helper prints its finding.
# NOTE(review): the helper's definition is not visible here. Presumably it
# tests for fully identical rows; confirm in the `functions` module.
functions.check_for_duplicates(tmdb_df)


No duplicate rows found in the DataFrame.


Remove all rows where 'status' is not 'Released' (rows with a missing status are removed as well)

In [8]:
# Inspect the distinct values of the 'status' column before filtering on it.
status_values = tmdb_df['status'].unique()
print(status_values)

['Released' 'Rumored' 'Post Production' 'Canceled' 'Planned'
 'In Production' nan]


In [None]:
# Keep only the rows whose status is exactly 'Released' and report how many
# rows were discarded. The equality test is False for a missing status, so
# those rows are discarded too.
initial_rows = tmdb_df.shape[0]
released_mask = tmdb_df['status'].eq('Released')
tmdb_df = tmdb_df.loc[released_mask]
final_rows = tmdb_df.shape[0]
removed_rows = initial_rows - final_rows
print(f'Number of rows removed: {removed_rows}')

Number of rows removed: 16758


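#### 'release_date' column
- Convert to datetime (unparseable dates become `NaT`)
- Extract the year only, into a new `release_year` column
- Convert the year to a nullable integer (`Int64`) so rows with a missing date stay missing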

In [10]:
# Work on an independent (deep) copy so that column edits made on `df` do not
# touch `tmdb_df`.
df = tmdb_df.copy(deep=True)

In [None]:
# Parse the release dates; values that cannot be parsed are coerced to NaT.
parsed_release_dates = pd.to_datetime(df['release_date'], errors='coerce')
df['release_date'] = parsed_release_dates

# Store the calendar year as a nullable integer so rows without a valid date
# remain missing.
df['release_year'] = parsed_release_dates.dt.year.astype('Int64')

print(df[['release_date', 'release_year']].head())

  release_date  release_year
0   1988-10-21          1988
1   1986-10-17          1986
2   1995-12-09          1995
3   1993-10-15          1993
4   2006-01-01          2006
