# IMDB Notebook

Author: Nian Vrey

## Setup

### Imports

In [48]:
# Imports
import pandas as pd
import numpy as np
import os

### Files to DataFrames    

In [49]:
title_akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'
title_basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
title_ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

title_akas = pd.read_csv(title_akas_url, sep='\t', low_memory=False)
title_basics = pd.read_csv(title_basics_url, sep='\t', low_memory=False)
title_ratings = pd.read_csv(title_ratings_url, sep='\t', low_memory=False)

In [50]:
title_akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37517220 entries, 0 to 37517219
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.2+ GB


In [51]:
title_akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [52]:
title_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10246265 entries, 0 to 10246264
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 703.6+ MB


In [53]:
title_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [54]:
title_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1360168 entries, 0 to 1360167
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1360168 non-null  object 
 1   averageRating  1360168 non-null  float64
 2   numVotes       1360168 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 31.1+ MB


In [55]:
title_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2002
1,tt0000002,5.8,269
2,tt0000003,6.5,1893
3,tt0000004,5.5,178
4,tt0000005,6.2,2678


### Data Cleaning

#### AKAs

In [56]:
# Replace "\N" with np.nan
title_akas.replace({'\\N':np.nan}, inplace=True)

In [57]:
# Include only movies that were released in the United States
title_akas = title_akas.loc[title_akas['region'] == 'US']

In [58]:
title_akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


#### Basics

In [59]:
# Replace "\N" with np.nan
title_basics.replace({'\\N':np.nan}, inplace=True)

In [60]:
# Exclude any movie with missing values for genre or runtime
title_basics.dropna(subset=['runtimeMinutes','genres'], inplace=True)

In [61]:
# Include only full-length movies (titleType = "movie").
# Include only fictional movies (not from documentary genre)
# Include only movies that were released 2000 - 2021 (include 2000 and 2021)
# Include only movies that were released in the United States
#   ( If done before AKAs was filtered, use: & (title_basics['tconst'].isin(title_akas['titleId'].loc[title_akas['region'] == 'US'])

title_basics = title_basics[(title_basics['titleType']=='movie') 
                            & (title_basics['startYear'] >= '2000') & (title_basics['startYear'] <= '2021')
                            & ~(title_basics['genres'].str.contains('documentary',case=False))
                            & (title_basics['tconst'].isin(title_akas['titleId']))
]

In [62]:
title_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61111,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67485,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016,,90,Drama
67663,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
80548,tt0082328,movie,Embodiment of Evil,Encarnação do Demônio,0,2008,,94,Horror


#### Ratings

In [63]:
# Replace "\N" with np.nan
title_ratings.replace({'\\N':np.nan}, inplace=True)

In [64]:
# Include only movies that were released in the United States
title_ratings = title_ratings[title_ratings['tconst'].isin(title_akas['titleId'])]

In [65]:
title_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2002
1,tt0000002,5.8,269
4,tt0000005,6.2,2678
5,tt0000006,5.0,182
6,tt0000007,5.4,838


### Saving Data to CSVs

In [66]:
# Create Data Directory if not exist
os.makedirs('Data/', exist_ok=True)

In [67]:
title_akas.to_csv('Data/title_akas.csv.gz', compression='gzip', index=False)
title_basics.to_csv('Data/title_basics.csv.gz', compression='gzip', index=False)
title_ratings.to_csv('Data/title_ratings.csv.gz', compression='gzip', index=False)

#### Read CSVs

In [68]:
title_akas_path = 'Data/title_akas.csv.gz'
title_basics_path = 'Data/title_basics.csv.gz'
title_ratings_path = 'Data/title_ratings.csv.gz'

title_akas = pd.read_csv(title_akas_path)
title_basics = pd.read_csv(title_basics_path)
title_ratings = pd.read_csv(title_ratings_path)

In [69]:
title_akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [70]:
title_akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1472163 entries, 0 to 1472162
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   titleId          1472163 non-null  object 
 1   ordering         1472163 non-null  int64  
 2   title            1472163 non-null  object 
 3   region           1472163 non-null  object 
 4   language         4168 non-null     object 
 5   types            984662 non-null   object 
 6   attributes       47623 non-null    object 
 7   isOriginalTitle  1470822 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 89.9+ MB


In [71]:
title_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
2,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016,,90,Drama
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
4,tt0082328,movie,Embodiment of Evil,Encarnação do Demônio,0,2008,,94,Horror


In [72]:
title_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82143 entries, 0 to 82142
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          82143 non-null  object 
 1   titleType       82143 non-null  object 
 2   primaryTitle    82143 non-null  object 
 3   originalTitle   82143 non-null  object 
 4   isAdult         82143 non-null  int64  
 5   startYear       82143 non-null  int64  
 6   endYear         0 non-null      float64
 7   runtimeMinutes  82143 non-null  int64  
 8   genres          82143 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 5.6+ MB


In [73]:
title_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2002
1,tt0000002,5.8,269
2,tt0000005,6.2,2678
3,tt0000006,5.0,182
4,tt0000007,5.4,838


In [74]:
title_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 512001 entries, 0 to 512000
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         512001 non-null  object 
 1   averageRating  512001 non-null  float64
 2   numVotes       512001 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 11.7+ MB
