In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"
rating_url="https://datasets.imdbws.com/title.ratings.tsv.gz"

In [3]:
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)

In [4]:
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)

In [5]:
rating = pd.read_csv(rating_url, sep='\t', low_memory=False)

In [6]:
basics.replace({'\\N':np.nan}, inplace = True)
akas.replace({'\\N':np.nan}, inplace = True)
rating.replace({'\\N':np.nan}, inplace = True)

In [7]:
#Dropping nan in runtime and genres
basics.dropna(subset=['runtimeMinutes','genres'],inplace=True)

In [8]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [9]:
#Filter that keeps only US
akas = akas[akas['region'] == 'US']

In [10]:
#Keeping titles with only Movie
movie_type =basics['titleType'].str.contains('movie', case = False)

In [11]:
basics = basics[movie_type]

In [12]:
#Filter movies released 2000-2021
basics['startYear'] = basics ['startYear'].astype(float)

In [13]:
filter_year = (basics["startYear"] >= 2000) & (basics["startYear"] <= 2022)
basics = basics.loc[filter_year, :]

In [14]:
#Exclude documentary movies
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [15]:
keepers =basics['tconst'].isin(akas['titleId'])
keepers

34793       True
61095       True
67643       True
77937      False
86773       True
           ...  
9402724    False
9402762    False
9402807     True
9402891    False
9402964    False
Name: tconst, Length: 168403, dtype: bool

In [16]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34793,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61095,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
67643,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86773,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93909,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama
...,...,...,...,...,...,...,...,...,...
9402178,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019.0,,74,Drama
9402574,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019.0,,97,"Comedy,Drama,Fantasy"
9402714,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
9402723,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"


In [17]:
keepers_rating = rating['tconst'].isin(akas['titleId'])
keepers_rating

0           True
1           True
2          False
3          False
4           True
           ...  
1251371    False
1251372     True
1251373    False
1251374    False
1251375    False
Name: tconst, Length: 1251376, dtype: bool

In [23]:
rating = rating [keepers_rating]

In [24]:
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)
rating.to_csv("Data/title_rating.csv.gz",compression='gzip',index=False)

In [25]:
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [26]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92669 entries, 0 to 92668
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          92669 non-null  object 
 1   titleType       92669 non-null  object 
 2   primaryTitle    92669 non-null  object 
 3   originalTitle   92669 non-null  object 
 4   isAdult         92669 non-null  int64  
 5   startYear       92669 non-null  float64
 6   endYear         0 non-null      float64
 7   runtimeMinutes  92669 non-null  int64  
 8   genres          92669 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 6.4+ MB


In [27]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1378691 entries, 5 to 33983319
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1378691 non-null  object
 1   ordering         1378691 non-null  int64 
 2   title            1378691 non-null  object
 3   region           1378691 non-null  object
 4   language         3739 non-null     object
 5   types            966951 non-null   object
 6   attributes       45374 non-null    object
 7   isOriginalTitle  1377316 non-null  object
dtypes: int64(1), object(7)
memory usage: 94.7+ MB


In [28]:
rating.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 476888 entries, 0 to 1251372
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         476888 non-null  object 
 1   averageRating  476888 non-null  float64
 2   numVotes       476888 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 14.6+ MB
