In [1]:
import pandas as pd
import numpy as np

In [2]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"
ratings_url="https://datasets.imdbws.com/title.ratings.tsv.gz"

In [3]:
basics = pd.read_csv(basics_url,sep='\t', low_memory=False)
akas = pd.read_csv(akas_url,sep='\t', low_memory=False)
ratings = pd.read_csv(ratings_url,sep='\t', low_memory=False)

In [4]:
basics = basics.replace({'\\N':np.nan})
akas = akas.replace({'\\N':np.nan})
ratings = ratings.replace({'\\N':np.nan})

In [5]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [6]:
is_movie = basics["titleType"].str.contains("movie", case=False)
basics = basics[is_movie]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
498,tt0000502,movie,Bohemios,Bohemios,0,1905,,100.0,
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70.0,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90.0,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,,,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908,,,Drama


In [7]:
basics.dropna(subset=["runtimeMinutes", "genres"], inplace=True)
basics.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear           6025
endYear           451495
runtimeMinutes         0
genres                 0
dtype: int64

In [8]:
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [9]:
time_filter = (basics["startYear"] > '1999')
basics = basics[time_filter]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34805,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61119,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67672,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77968,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
86806,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"


In [10]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0


In [11]:
akas.dropna(subset=["region"], inplace=True)
akas.isna().sum()

titleId                   0
ordering                  0
title                     2
region                    0
language            4314318
types              27111500
attributes         30359123
isOriginalTitle        2175
dtype: int64

In [12]:
region_filter = akas["region"].str.contains("US", case=False)
akas = akas[region_filter]
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [13]:
keepers =basics['tconst'].isin(akas['titleId'])
keepers

34805       True
61119       True
67672       True
77968      False
86806       True
           ...  
9047101    False
9047139    False
9047184     True
9047268    False
9047341    False
Name: tconst, Length: 163653, dtype: bool

In [14]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34805,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61119,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67672,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86806,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
91077,tt0093119,movie,Grizzly II: Revenge,Grizzly II: The Predator,0,2020,,74,"Horror,Music,Thriller"
...,...,...,...,...,...,...,...,...,...
9046555,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
9046951,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
9047091,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9047100,tt9916190,movie,Safeguard,Safeguard,0,2020,,90,"Action,Adventure,Thriller"


In [15]:
basics.info

<bound method DataFrame.info of             tconst titleType  \
34805    tt0035423     movie   
61119    tt0062336     movie   
67672    tt0069049     movie   
86806    tt0088751     movie   
91077    tt0093119     movie   
...            ...       ...   
9046555  tt9914942     movie   
9046951  tt9915872     movie   
9047091  tt9916170     movie   
9047100  tt9916190     movie   
9047184  tt9916362     movie   

                                              primaryTitle  \
34805                                       Kate & Leopold   
61119    The Tango of the Widower and Its Distorting Mi...   
67672                           The Other Side of the Wind   
86806                                    The Naked Monster   
91077                                  Grizzly II: Revenge   
...                                                    ...   
9046555                             Life Without Sara Amat   
9046951                               The Last White Witch   
9047091                  

In [16]:
akas.info

<bound method DataFrame.info of             titleId  ordering                          title region language  \
5         tt0000001         6                     Carmencita     US      NaN   
14        tt0000002         7         The Clown and His Dogs     US      NaN   
33        tt0000005        10               Blacksmith Scene     US      NaN   
36        tt0000005         1            Blacksmithing Scene     US      NaN   
41        tt0000005         6            Blacksmith Scene #1     US      NaN   
...             ...       ...                            ...    ...      ...   
32520780  tt9916702         1  Loving London: The Playground     US      NaN   
32520818  tt9916720        10                The Demonic Nun     US      NaN   
32520820  tt9916720        12                      The Nun 2     US      NaN   
32520837  tt9916756         1       Pretty Pretty Black Girl     US      NaN   
32520853  tt9916764         1                             38     US      NaN   

       

In [17]:
ratings.info

<bound method DataFrame.info of             tconst  averageRating  numVotes
0        tt0000001            5.7      1891
1        tt0000002            5.9       253
2        tt0000003            6.5      1686
3        tt0000004            5.7       166
4        tt0000005            6.2      2502
...            ...            ...       ...
1252422  tt9916690            6.5         6
1252423  tt9916720            5.2       230
1252424  tt9916730            8.4         6
1252425  tt9916766            6.8        21
1252426  tt9916778            7.2        35

[1252427 rows x 3 columns]>