In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# download locations of the three gzipped TSV files used below:
# title basics, alternate titles (akas) and ratings
basics_url, title_aka, title_rating = (
    "https://datasets.imdbws.com/title.basics.tsv.gz",
    'https://datasets.imdbws.com/title.akas.tsv.gz',
    'https://datasets.imdbws.com/title.ratings.tsv.gz',
)

In [3]:
# load the three tab-separated files, in order, with identical reader options
basics, title, rating = (
    pd.read_csv(source_url, sep = '\t', low_memory = False)
    for source_url in (basics_url, title_aka, title_rating)
)

In [4]:
# keep an untouched copy of each freshly loaded frame so the cleaning steps
# below can be redone without reading the source files again
basics_res = basics.copy()
title_res = title.copy()
# the original repeated the title copy here, leaving rating without a backup
rating_res = rating.copy()

## 2.1 basics dataframe

In [5]:
# summary of the basics frame as loaded: columns, dtypes and memory usage
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9926879 entries, 0 to 9926878
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 681.6+ MB


In [6]:
# find the literal '\N' marker in every column and replace it with np.nan,
# modifying basics in place
basics.replace({'\\N': np.nan}, inplace = True)

# check: number of missing values per column
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1337884
endYear           9818283
runtimeMinutes    6986753
genres             445048
dtype: int64

In [7]:
# remove every row that has no 'runtimeMinutes' value
basics = basics.dropna(subset = ['runtimeMinutes'])

# check: count of missing values left in the column
basics['runtimeMinutes'].isna().sum()

0

In [8]:
# remove every row that has no 'genres' value
basics = basics.dropna(subset = ['genres'])

# check: count of missing values left in the column
basics['genres'].isna().sum()

0

In [9]:
# restrict basics to rows whose 'titleType' is exactly 'movie'
basics = basics.loc[basics['titleType'].eq('movie')]

# check: tally of the remaining 'titleType' values
basics['titleType'].value_counts()

movie    383787
Name: titleType, dtype: int64

In [10]:
# remove every row that has no 'startYear' value
basics = basics.dropna(subset = ['startYear'])

# check: count of missing values left in the column
basics['startYear'].isna().sum()

0

In [11]:
# cast the 'startYear' column to int, leaving the other columns as they are
basics = basics.astype({'startYear': int})

In [12]:
# summary statistics of 'startYear'; look at min and max before
# filtering on a year range
basics['startYear'].describe()

count    377300.000000
mean       1995.820347
std          26.685633
min        1894.000000
25%        1981.000000
50%        2008.000000
75%        2016.000000
max        2029.000000
Name: startYear, dtype: float64

In [13]:
# keep rows with 'startYear' between 2000 and 2022; both endpoints are included
in_year_range = basics['startYear'].between(2000, 2022)
basics = basics[in_year_range]

# check: rows per year, latest year first
basics['startYear'].value_counts().sort_index(ascending = False)

2022    12922
2021    12410
2020    11592
2019    14101
2018    14347
2017    14386
2016    13965
2015    13479
2014    13124
2013    12394
2012    11652
2011    10779
2010    10214
2009     9368
2008     8166
2007     6967
2006     6526
2005     5848
2004     5218
2003     4597
2002     4138
2001     3873
2000     3644
Name: startYear, dtype: int64

In [14]:
# check out 'genres' column: frequency of each distinct genre string
# (a comma-separated combination counts as its own value)
basics['genres'].value_counts()

Documentary                  53391
Drama                        36086
Comedy                       13462
Comedy,Drama                  6460
Horror                        5807
                             ...  
Animation,Biography,Sport        1
Adventure,History,Music          1
Adventure,History,War            1
Adventure,Romance,Sport          1
Crime,Fantasy,Sci-Fi             1
Name: genres, Length: 1172, dtype: int64

In [15]:
# flag rows whose 'genres' text mentions 'documentary' (case-insensitive),
# then keep only the rows that are not flagged
is_documentary = basics['genres'].str.contains('documentary', case = False)
basics = basics.loc[~is_documentary]

# check: remaining genre strings and their frequencies
basics['genres'].value_counts()

Drama                        36086
Comedy                       13462
Comedy,Drama                  6460
Horror                        5807
Drama,Romance                 4319
                             ...  
Family,Musical,Sport             1
Horror,Music,Mystery             1
Comedy,History,Mystery           1
Animation,Biography,Sport        1
Crime,Fantasy,Sci-Fi             1
Name: genres, Length: 955, dtype: int64

In [16]:
# check info: columns, dtypes and memory usage of the title frame
title.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36216823 entries, 0 to 36216822
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.2+ GB


In [17]:
# find the literal '\N' marker in every column and replace it with np.nan,
# modifying title in place
title.replace({'\\N': np.nan}, inplace = True)

# check: number of missing values per column
title.isna().sum()

titleId                   0
ordering                  0
title                     5
region              1893352
language            6634489
types              30634267
attributes         35955766
isOriginalTitle        2078
dtype: int64

In [18]:
# remove every row that has no 'region' value
title = title.dropna(subset = ['region'])

# check: count of missing values left in the column
title['region'].isna().sum()

0

In [19]:
# check values in 'region': number of rows per region code
title['region'].value_counts()

DE    4339896
FR    4335347
JP    4333540
IN    4275418
ES    4255030
       ...   
FM          2
TV          1
PW          1
NR          1
NU          1
Name: region, Length: 247, dtype: int64

In [20]:
# how many title rows carry the region code 'US'
title[title['region'] == 'US'].shape[0]

1444801

In [21]:
# keep only the rows whose region is 'US'
title = title.loc[title['region'].eq('US')]

# check: 'US' should be the only region left
title['region'].value_counts()

US    1444801
Name: region, dtype: int64

In [22]:
# check info: columns, non-null counts and dtypes of the rating frame
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319129 entries, 0 to 1319128
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1319129 non-null  object 
 1   averageRating  1319129 non-null  float64
 2   numVotes       1319129 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.2+ MB


In [23]:
# find the literal '\N' marker in every column and replace it with np.nan,
# modifying rating in place
rating.replace({'\\N': np.nan}, inplace = True)

# check: number of missing values per column
rating.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [26]:
# boolean mask aligned with basics: True where the row's 'tconst' also
# appears as a 'titleId' in title
us_movies = basics['tconst'].isin(title['titleId'])

# check
us_movies

34803       True
42384       True
61115       True
67668       True
86800       True
           ...  
9926552     True
9926561     True
9926600    False
9926645     True
9926729    False
Name: tconst, Length: 147698, dtype: bool

In [27]:
# number of rows currently in basics
basics.shape[0]

147698

In [28]:
# keep only the basics rows selected by the us_movies mask
basics = basics.loc[us_movies]

# check: number of rows kept
basics.shape[0]

86784

In [29]:
# boolean mask aligned with rating: True where the row's 'tconst' also
# appears as a 'titleId' in title
# NOTE: this rebinds us_movies, so the mask previously built for basics is
# no longer available under that name
us_movies = rating['tconst'].isin(title['titleId'])

# check
us_movies

0           True
1           True
2          False
3          False
4           True
           ...  
1319124    False
1319125    False
1319126    False
1319127    False
1319128    False
Name: tconst, Length: 1319129, dtype: bool

In [30]:
# number of rows currently in rating
rating.shape[0]

1319129

In [35]:
# keep only the rating rows selected by the us_movies mask; the result is
# stored under the new name ratings
ratings = rating.loc[us_movies]

# check: number of rows kept
ratings.shape[0]

501086

In [36]:
# check info for basics after filtering: non-null counts and dtypes per column
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86784 entries, 34803 to 9926645
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          86784 non-null  object
 1   titleType       86784 non-null  object
 2   primaryTitle    86784 non-null  object
 3   originalTitle   86784 non-null  object
 4   isAdult         86784 non-null  object
 5   startYear       86784 non-null  int64 
 6   endYear         0 non-null      object
 7   runtimeMinutes  86784 non-null  object
 8   genres          86784 non-null  object
dtypes: int64(1), object(8)
memory usage: 6.6+ MB


In [37]:
# check info for title after filtering: non-null counts and dtypes per column
title.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1444801 entries, 5 to 36216567
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1444801 non-null  object
 1   ordering         1444801 non-null  int64 
 2   title            1444801 non-null  object
 3   region           1444801 non-null  object
 4   language         3951 non-null     object
 5   types            980076 non-null   object
 6   attributes       46755 non-null    object
 7   isOriginalTitle  1443459 non-null  object
dtypes: int64(1), object(7)
memory usage: 99.2+ MB


In [38]:
# check info for ratings after filtering: non-null counts and dtypes per column
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 501086 entries, 0 to 1319104
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         501086 non-null  object 
 1   averageRating  501086 non-null  float64
 2   numVotes       501086 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.3+ MB


In [48]:
# save the basics frame to a gzip-compressed csv file in the Data folder
# create the folder first: to_csv raises if the target directory is missing
os.makedirs('Data', exist_ok = True)
basics.to_csv('Data/title_basics.csv.gz',
              compression = 'gzip',
              index = False)

# check: read the file back
# NOTE: this rebinds basics to the frame parsed from disk, so its index and
# inferred dtypes are those of the reloaded file
basics = pd.read_csv('Data/title_basics.csv.gz',
                    low_memory = False)

basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0043139,movie,Life of a Beijing Policeman,Wo zhe yi bei zi,0,2013,,120,"Drama,History"
2,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"


In [50]:
# save the title frame to a gzip-compressed csv file in the Data folder
# create the folder first: to_csv raises if the target directory is missing
os.makedirs('Data', exist_ok = True)
title.to_csv('Data/titles.csv.gz',
              compression = 'gzip',
              index = False)

# check: read the file back
# NOTE: this rebinds title to the frame parsed from disk, so its index and
# inferred dtypes are those of the reloaded file
title = pd.read_csv('Data/titles.csv.gz',
                    low_memory = False)

title.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [39]:
# save the ratings frame to a gzip-compressed csv file in the Data folder
# create the folder first: to_csv raises if the target directory is missing
os.makedirs('Data', exist_ok = True)
ratings.to_csv('Data/ratings.csv.gz',
              compression = 'gzip',
              index = False)

# check: read the file back
# NOTE: this rebinds ratings to the frame parsed from disk, so its index and
# inferred dtypes are those of the reloaded file
ratings = pd.read_csv('Data/ratings.csv.gz',
                    low_memory = False)

ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1980
1,tt0000002,5.8,265
2,tt0000005,6.2,2622
3,tt0000006,5.1,182
4,tt0000007,5.4,821


In [41]:
# display the basics frame
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
42384,tt0043139,movie,Life of a Beijing Policeman,Wo zhe yi bei zi,0,2013,,120,"Drama,History"
61115,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67668,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86800,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
9926017,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
9926412,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
9926552,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9926561,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


In [42]:
# display the title frame
title

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0
...,...,...,...,...,...,...,...,...
36216349,tt9916560,1,March of Dimes Presents: Once Upon a Dime,US,,imdbDisplay,,0
36216419,tt9916620,1,The Copeland Case,US,,imdbDisplay,,0
36216508,tt9916702,1,Loving London: The Playground,US,,,,0
36216551,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0


In [43]:
# display the ratings frame
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1980
1,tt0000002,5.8,265
2,tt0000005,6.2,2622
3,tt0000006,5.1,182
4,tt0000007,5.4,821
...,...,...,...
501081,tt9916200,8.1,229
501082,tt9916204,8.2,263
501083,tt9916348,8.3,18
501084,tt9916362,6.4,5375
