In [1]:
import pandas as pd
import numpy as np

In [2]:
# Download locations of the three IMDb dataset files used in this notebook.
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"    # title.basics file
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"  # title.ratings file
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"        # title.akas file

In [3]:
# All three files are tab-separated; low_memory=False makes pandas read each file in a
# single pass instead of chunk by chunk.
tsv_options = dict(sep='\t', low_memory=False)

basics = pd.read_csv(basics_url, **tsv_options)
ratings = pd.read_csv(ratings_url, **tsv_options)
akas = pd.read_csv(akas_url, **tsv_options)

# Basics

In [4]:
# Summarise the freshly loaded basics table: column names, dtypes and memory usage.
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9044976 entries, 0 to 9044975
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 621.1+ MB


In [5]:
# List the distinct title types so the value to filter on later is known.
basics['titleType'].unique()

array(['short', 'movie', 'tvEpisode', 'tvSeries', 'tvShort', 'tvMovie',
       'tvMiniSeries', 'tvSpecial', 'video', 'videoGame', 'tvPilot'],
      dtype=object)

In [6]:
# Preview the first rows; note that missing values show up as the literal string \N.
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [7]:
# Missing values arrive as the literal text \N (and, defensively, \\N with a doubled
# backslash). Map both spellings to NaN in one pass so pandas treats them as missing.
missing_markers = {
    '\\N': np.nan,
    '\\\\N': np.nan,
}
basics.replace(missing_markers, inplace=True)

In [8]:
# Discard every title that lacks a runtime, a genre list or a start year.
required_columns = ['runtimeMinutes', 'genres', 'startYear']
basics.dropna(subset=required_columns, inplace=True)

In [15]:
# Remove the endYear column, which is not used in the rest of the notebook.
# errors='ignore' keeps this cell safe to re-run: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
basics = basics.drop(columns=['endYear'], errors='ignore')

In [16]:
# Count the missing values left in each column after the clean-up steps above.
basics.isna().sum()

tconst            0
titleType         0
primaryTitle      0
originalTitle     0
isAdult           0
startYear         0
runtimeMinutes    0
genres            0
dtype: int64

In [10]:
# Keep only the rows whose title type is 'movie'.
movies_only = basics['titleType'].eq('movie')
basics = basics.loc[movies_only]

In [11]:
# Sanity check: after the filter, 'movie' should be the only title type left.
basics['titleType'].unique()

array(['movie'], dtype=object)

In [18]:
# Convert startYear from text to integers so it can be compared numerically.
# `basics` is a boolean-filtered slice at this point, so assigning a column in place
# would raise SettingWithCopyWarning; assign() builds a new frame instead.
basics = basics.assign(startYear=basics['startYear'].astype(str).astype(int))

In [19]:
# Flag titles whose genre string mentions "documentary" (case-insensitive match),
# then keep everything that is not flagged.
is_documentary = basics['genres'].str.contains('documentary', case=False)
basics = basics.loc[~is_documentary]

In [20]:
# Re-inspect the frame after the movie / documentary filters and the startYear cast.
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 272620 entries, 570 to 9044825
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          272620 non-null  object
 1   titleType       272620 non-null  object
 2   primaryTitle    272620 non-null  object
 3   originalTitle   272620 non-null  object
 4   isAdult         272620 non-null  object
 5   startYear       272620 non-null  int32 
 6   runtimeMinutes  272620 non-null  object
 7   genres          272620 non-null  object
dtypes: int32(1), object(7)
memory usage: 17.7+ MB


In [21]:
# Boolean masks bounding startYear to the inclusive range 2000..2022.
year_lower_filter = basics['startYear'].ge(2000)
year_upper_filter = basics['startYear'].le(2022)

In [22]:
# Restrict the movies to the start-year window defined by the two masks.
# The result is bound back to `basics` because the later filtering and saving cells
# operate on `basics`; keeping it only in `df_filtered` would silently leave the year
# restriction out of the saved file.
basics = basics.loc[year_lower_filter & year_upper_filter, :]
df_filtered = basics  # same object, kept so existing uses of `df_filtered` still work
df_filtered

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
34805,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,118,"Comedy,Fantasy,Romance"
61119,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,70,Drama
67672,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,122,Drama
77968,tt0079644,movie,November 1828,November 1828,0,2001,140,"Drama,War"
86806,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...
9044648,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,51,Drama
9044657,tt9916190,movie,Safeguard,Safeguard,0,2020,90,"Action,Adventure,Thriller"
9044696,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,84,Thriller
9044741,tt9916362,movie,Coven,Akelarre,0,2020,92,"Drama,History"


# AKAs

In [None]:
# Summarise the akas table: column names, dtypes and memory usage.
akas.info()

In [None]:
# Preview the first rows of the akas table as loaded.
akas.head()

In [None]:
# Keep only the rows whose region code is 'US', then preview the result.
us_rows = akas['region'].eq('US')
akas = akas.loc[us_rows]
akas.head()

In [None]:
# Turn the literal \N placeholder into real missing values (NaN).
# `akas` is a boolean-filtered slice at this point, so use the returning form of
# replace() and rebind the name; mutating a slice in place can trigger
# SettingWithCopyWarning.
akas = akas.replace({'\\N': np.nan})
akas.head()

In [None]:
# Count missing values per column now that \N has been converted to NaN.
akas.isna().sum()

# Ratings

In [None]:
# Summarise the ratings table: column names, dtypes and memory usage.
ratings.info()

In [None]:
# Preview the first rows of the ratings table.
ratings.head()

In [None]:
# Count missing values per column in ratings.
# NOTE(review): unlike basics and akas, no \N -> NaN replacement is applied to ratings
# in this notebook, so any literal \N strings would not be counted here.
ratings.isna().sum()

# Filtering

In [None]:
# True for each basics row whose tconst also appears as a titleId in akas.
akas_title_ids = akas['titleId']
keepers = basics['tconst'].isin(akas_title_ids)
keepers

In [None]:
# Keep only the rows selected by the `keepers` mask, then display the result.
basics = basics.loc[keepers]
basics

# Saving file

In [None]:
# Create the output folder with os; exist_ok=True means an existing folder is fine.
import os

output_dir = 'Data/'
os.makedirs(output_dir, exist_ok=True)
# List the folder contents to confirm it was created.
os.listdir(output_dir)


In [None]:
## Write the filtered basics frame to a gzip-compressed CSV, without the index column.
basics.to_csv(
    "Data/title_basics.csv.gz",
    index=False,
    compression='gzip',
)

In [None]:
# Read the file back from disk and preview it to confirm the save worked.
basics_path = "Data/title_basics.csv.gz"
basics = pd.read_csv(basics_path, low_memory=False)
basics.head()

In [None]:
## Write the ratings frame to a gzip-compressed CSV, without the index column.
ratings.to_csv(
    "Data/title_ratings.csv.gz",
    index=False,
    compression='gzip',
)

In [None]:
# Read the file back from disk and preview it to confirm the save worked.
ratings_path = "Data/title_ratings.csv.gz"
ratings = pd.read_csv(ratings_path, low_memory=False)
ratings.head()

In [None]:
## Write the US-only akas frame to a gzip-compressed CSV, without the index column.
akas.to_csv(
    "Data/title_akas.csv.gz",
    index=False,
    compression='gzip',
)

In [None]:
# Read the file back from disk and preview it to confirm the save worked.
akas_path = "Data/title_akas.csv.gz"
akas = pd.read_csv(akas_path, low_memory=False)
akas.head()