In [1]:
import os

import numpy as np
import pandas as pd

# Importing Data

In [2]:
# Download the IMDb "title.basics" dump (gzip-compressed, tab-separated) into a DataFrame.
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
basics = pd.read_csv(
    filepath_or_buffer=basics_url,
    sep='\t',
    low_memory=False,
)

In [3]:
# Download the IMDb "title.ratings" dump (gzip-compressed, tab-separated) into a DataFrame.
rating_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
ratings = pd.read_csv(
    filepath_or_buffer=rating_url,
    sep='\t',
    low_memory=False,
)

In [4]:
# Download the IMDb "title.akas" dump (gzip-compressed, tab-separated) into a DataFrame.
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
akas = pd.read_csv(
    filepath_or_buffer=akas_url,
    sep='\t',
    low_memory=False,
)

# AKAs

In [5]:
# Summarize the raw AKAs table: number of rows, column dtypes and memory footprint.
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35843078 entries, 0 to 35843077
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.1+ GB


In [6]:
# The raw dump marks missing values with the literal string \N;
# convert every such cell into a real NaN, then display the table.
akas = akas.replace(to_replace={'\\N': np.nan})
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0
...,...,...,...,...,...,...,...,...
35843073,tt9916852,5,Episódio #3.20,PT,pt,,,0
35843074,tt9916852,6,Episodio #3.20,IT,it,,,0
35843075,tt9916852,7,एपिसोड #3.20,IN,hi,,,0
35843076,tt9916856,1,The Wind,DE,,imdbDisplay,,0


In [7]:
# Keep only the alternate-title rows whose region code is exactly "US".
us = akas['region'].eq('US')
akas = akas.loc[us]
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0
...,...,...,...,...,...,...,...,...
35842604,tt9916560,1,March of Dimes Presents: Once Upon a Dime,US,,imdbDisplay,,0
35842674,tt9916620,1,The Copeland Case,US,,imdbDisplay,,0
35842763,tt9916702,1,Loving London: The Playground,US,,,,0
35842806,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0


# Basics

In [8]:
# Summarize the raw basics table: number of rows, column dtypes and memory footprint.
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9837728 entries, 0 to 9837727
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 675.5+ MB


In [9]:
# The raw dump marks missing values with the literal string \N;
# convert every such cell into a real NaN, then display the table.
basics = basics.replace(to_replace={'\\N': np.nan})
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
9837723,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,,,"Action,Drama,Family"
9837724,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,,,"Action,Drama,Family"
9837725,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,,,"Action,Drama,Family"
9837726,tt9916856,short,The Wind,The Wind,0,2015,,27,Short


In [10]:
# Discard every title that is missing either its runtime or its genre list.
basics = basics.dropna(
    axis=0,
    how='any',
    subset=["runtimeMinutes", "genres"],
)
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
9837678,tt9916754,movie,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,0,2013,,49,Documentary
9837684,tt9916766,tvEpisode,Episode #10.15,Episode #10.15,0,2019,,43,"Family,Game-Show,Reality-TV"
9837719,tt9916840,tvEpisode,Horrid Henry's Comic Caper,Horrid Henry's Comic Caper,0,2014,,11,"Adventure,Animation,Comedy"
9837726,tt9916856,short,The Wind,The Wind,0,2015,,27,Short


In [11]:
# Keep only the rows whose titleType is exactly "movie".
is_movie = basics['titleType'].eq('movie')
basics = basics.loc[is_movie]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"
...,...,...,...,...,...,...,...,...,...
9837494,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"
9837578,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019,,123,Drama
9837619,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,,57,Documentary
9837646,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,,100,Documentary


In [12]:
# Keep titles whose startYear falls in 2000-2021 (both ends inclusive).
# startYear is still stored as text here, so it is compared numerically rather
# than relying on lexicographic string ordering (under which a malformed value
# such as '20' would pass a "> '1999' and < '2022'" test). Values that are
# missing or not parseable become NaN and fail the range check. The column
# itself is left untouched.
start_year = pd.to_numeric(basics['startYear'], errors='coerce')
basics = basics[start_year.between(2000, 2021)]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13082,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021,,94,Documentary
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
9837494,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"
9837578,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019,,123,Drama
9837619,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,,57,Documentary
9837646,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,,100,Documentary


In [13]:
# endYear records the year a TV show ended; now that the table holds only
# movies, the column is not needed, so remove it.
basics = basics.drop('endYear', axis='columns')
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
13082,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021,94,Documentary
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...
9837494,tt9916362,movie,Coven,Akelarre,0,2020,92,"Drama,History"
9837578,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019,123,Drama
9837619,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,57,Documentary
9837646,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,100,Documentary


In [14]:
# Exclude every title whose genre string mentions "documentary" (case-insensitive).
# na=False treats a missing genre as "not a documentary", so the mask is always
# boolean and the ~ negation below cannot fail on NaN entries.
docs = basics['genres'].str.contains('documentary', case=False, na=False)
basics = basics[~docs]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,100,"Comedy,Horror,Sci-Fi"
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,126,Drama
...,...,...,...,...,...,...,...,...
9837401,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,51,Drama
9837410,tt9916190,movie,Safeguard,Safeguard,0,2020,95,"Action,Adventure,Thriller"
9837449,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,84,Thriller
9837494,tt9916362,movie,Coven,Akelarre,0,2020,92,"Drama,History"


In [15]:
# Boolean mask: True for each movie whose tconst appears among the titleId
# values of the (already filtered) AKAs table.
keepers = basics['tconst'].isin(values=akas['titleId'])
keepers

34803       True
61116       True
67669       True
86801       True
93938       True
           ...  
9837401     True
9837410     True
9837449    False
9837494     True
9837578    False
Name: tconst, Length: 138446, dtype: bool

In [16]:
# Apply the mask built from the AKAs table, then show the schema summary
# followed by a preview of the remaining rows.
basics = basics.loc[keepers]
basics.info()
basics

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81710 entries, 34803 to 9837494
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          81710 non-null  object
 1   titleType       81710 non-null  object
 2   primaryTitle    81710 non-null  object
 3   originalTitle   81710 non-null  object
 4   isAdult         81710 non-null  object
 5   startYear       81710 non-null  object
 6   runtimeMinutes  81710 non-null  object
 7   genres          81710 non-null  object
dtypes: object(8)
memory usage: 5.6+ MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,100,"Comedy,Horror,Sci-Fi"
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,126,Drama
...,...,...,...,...,...,...,...,...
9836866,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,74,Drama
9837261,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,97,"Comedy,Drama,Fantasy"
9837401,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,51,Drama
9837410,tt9916190,movie,Safeguard,Safeguard,0,2020,95,"Action,Adventure,Thriller"


# Ratings

In [17]:
# Inspect the raw ratings table: schema summary first, then a preview of the rows.
ratings.info()
ratings

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1308743 entries, 0 to 1308742
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1308743 non-null  object 
 1   averageRating  1308743 non-null  float64
 2   numVotes       1308743 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.0+ MB


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1969
1,tt0000002,5.8,263
2,tt0000003,6.5,1815
3,tt0000004,5.6,178
4,tt0000005,6.2,2612
...,...,...,...
1308738,tt9916730,8.3,10
1308739,tt9916766,7.0,21
1308740,tt9916778,7.2,36
1308741,tt9916840,8.8,6


In [18]:
# Same missing-value normalization as the other tables: turn literal \N cells into NaN.
# NOTE(review): averageRating and numVotes were parsed as numeric dtypes, so this
# can only affect tconst -- presumably a no-op on this table; confirm.
ratings = ratings.replace(to_replace={'\\N': np.nan})
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1969
1,tt0000002,5.8,263
2,tt0000003,6.5,1815
3,tt0000004,5.6,178
4,tt0000005,6.2,2612
...,...,...,...
1308738,tt9916730,8.3,10
1308739,tt9916766,7.0,21
1308740,tt9916778,7.2,36
1308741,tt9916840,8.8,6


In [19]:
# Boolean mask: True for each rating whose tconst appears among the titleId
# values of the (already filtered) AKAs table.
keepers2 = ratings['tconst'].isin(values=akas['titleId'])
keepers2

0           True
1           True
2          False
3          False
4           True
           ...  
1308738    False
1308739    False
1308740    False
1308741    False
1308742    False
Name: tconst, Length: 1308743, dtype: bool

In [20]:
# Keep only the ratings selected by the mask built from the AKAs table.
ratings = ratings.loc[keepers2]
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1969
1,tt0000002,5.8,263
4,tt0000005,6.2,2612
5,tt0000006,5.1,181
6,tt0000007,5.4,818
...,...,...,...
1308704,tt9916200,8.1,229
1308705,tt9916204,8.1,262
1308712,tt9916348,8.3,18
1308713,tt9916362,6.4,5320


# Exporting and Finalizing the Data

In [21]:
# Make sure the output folder exists (no error if it is already there),
# then list what it currently contains.
# `os` is imported with the other modules in the notebook's first cell.
os.makedirs('Data/', exist_ok=True)
os.listdir("Data/")

['title.akas.tsv.gz', 'title.ratings.tsv.gz', 'title_basics.csv.gz']

In [22]:
# Save the filtered movie table as a gzip-compressed CSV, without the row index.
basics.to_csv(
    path_or_buf="Data/title_basics.csv.gz",
    index=False,
    compression='gzip',
)

In [23]:
# Read the saved movie table back from disk and preview its first five rows.
basics = pd.read_csv(
    filepath_or_buffer="Data/title_basics.csv.gz",
    low_memory=False,
)
basics.head(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,126,Drama


In [24]:
# Save the filtered AKAs table (gzip-compressed, without the row index).
# NOTE(review): the file name ends in .tsv.gz, but the data is written with
# to_csv's default comma separator -- spelled out here so the mismatch is visible.
akas.to_csv(
    path_or_buf="Data/title.akas.tsv.gz",
    sep=',',
    index=False,
    compression='gzip',
)

In [25]:
# Read the saved AKAs table back from disk and preview its first five rows.
# The file is comma-separated despite its .tsv.gz name, hence sep=','.
akas = pd.read_csv(
    filepath_or_buffer="Data/title.akas.tsv.gz",
    sep=',',
    low_memory=False,
)
akas.head(5)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [26]:
# Save the filtered ratings table (gzip-compressed, without the row index).
# NOTE(review): the file name ends in .tsv.gz, but the data is written with
# to_csv's default comma separator -- spelled out here so the mismatch is visible.
ratings.to_csv(
    path_or_buf="Data/title.ratings.tsv.gz",
    sep=',',
    index=False,
    compression='gzip',
)

In [27]:
# Read the saved ratings table back from disk and preview its first five rows.
# The file is comma-separated despite its .tsv.gz name, hence sep=','.
ratings = pd.read_csv(
    filepath_or_buffer="Data/title.ratings.tsv.gz",
    sep=',',
    low_memory=False,
)
ratings.head(5)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1969
1,tt0000002,5.8,263
2,tt0000005,6.2,2612
3,tt0000006,5.1,181
4,tt0000007,5.4,818
