In [1]:
import pandas as pd
import numpy as np
import os

# Source URLs of the raw IMDb datasets (gzip-compressed, tab-separated files).
# These constants must stay defined: the loading cells below pass BASICS_URL,
# AKAS_URL and RATINGS_URL straight to pd.read_csv, so leaving them commented
# out raises NameError when the notebook is run top-to-bottom on a fresh kernel.
BASICS_URL = "https://datasets.imdbws.com/title.basics.tsv.gz"
RATINGS_URL = "https://datasets.imdbws.com/title.ratings.tsv.gz"
AKAS_URL = "https://datasets.imdbws.com/title.akas.tsv.gz"

In [2]:
# Load the title.basics table; the file is tab-separated.
basics = pd.read_csv(
    filepath_or_buffer=BASICS_URL,
    delimiter="\t",
    low_memory=False,
)

In [3]:
# Display the freshly loaded basics frame.
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
8890192,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,\N,\N,"Action,Drama,Family"
8890193,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
8890194,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
8890195,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


# Preprocessing

## Basics Preprocessing

### Replacing "\N" with NULL values 

In [4]:
# Turn the literal "\N" placeholder into a real missing value in every column.
basics = basics.replace(to_replace="\\N", value=np.nan)

In [5]:
# Per-column count of missing values, to confirm the "\N" replacement took effect.
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1185489
endYear           8800107
runtimeMinutes    6494822
genres             405190
dtype: int64

### Dropping NULL values in "runtimeMinutes" column

In [6]:
# Keep only the rows that have a runtimeMinutes value.
basics = basics[basics["runtimeMinutes"].notna()]

In [7]:
# Per-column count of missing values; runtimeMinutes should now be 0.
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            0
originalTitle           0
isAdult                 1
startYear           35485
endYear           2350980
runtimeMinutes          0
genres              66548
dtype: int64

### Dropping NULL values in "genres" column

In [8]:
# Keep only the rows that have a genres value.
basics = basics[basics["genres"].notna()]

In [9]:
# Per-column count of missing values; genres should now be 0.
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            0
originalTitle           0
isAdult                 0
startYear           34152
endYear           2285970
runtimeMinutes          0
genres                  0
dtype: int64

In [10]:
# List the column names of the basics frame.
basics.columns

Index(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'startYear', 'endYear', 'runtimeMinutes', 'genres'],
      dtype='object')

### Filtering only movie titles

In [11]:
# Boolean mask: True for rows whose titleType is exactly "movie".
movie_filter = basics["titleType"].eq("movie")

In [12]:
# Display the boolean mask built from titleType.
movie_filter

0          False
1          False
2          False
3          False
4          False
           ...  
8890147     True
8890153    False
8890188    False
8890195    False
8890196    False
Name: titleType, Length: 2328827, dtype: bool

In [16]:
# Restrict basics to the rows flagged by movie_filter, then report its (rows, columns).
basics = basics[movie_filter]
basics.shape

(359389, 9)

### Keeping only movies made in years 2000-2022

In [17]:
# Number of rows that still lack a startYear.
basics["startYear"].isna().sum()

5601

In [44]:
# Keep only the rows that have a startYear value.
basics = basics[basics["startYear"].notna()]

In [19]:
# (rows, columns) of basics at this point.
basics.shape

(353788, 9)

In [45]:
# Years 2000 through 2022 inclusive (the range end, 2023, is exclusive).
my_list = [*range(2000, 2023)]

In [46]:
# Replace every entry of my_list, in place, with its string form.
my_list[:] = [str(year) for year in my_list]

In [22]:
# Keep only rows whose startYear is one of the values in my_list (2000-2022).
basics = basics[basics["startYear"].isin(my_list)]

In [23]:
# Sanity check: count rows per startYear to confirm only the expected years remain.
basics["startYear"].value_counts()

2017    14098
2018    14016
2016    13768
2019    13601
2015    13304
2014    12934
2013    12225
2012    11491
2021    11315
2020    11019
2011    10663
2010    10083
2009     9233
2008     8042
2007     6847
2006     6393
2005     5733
2004     5093
2003     4503
2022     4315
2002     4071
2001     3796
2000     3575
Name: startYear, dtype: int64

### Filtering out documentaries

In [24]:
# Add a "genre" column holding a lower-cased copy of "genres".
# .assign builds a new frame rather than writing a column into `basics`, which
# at this point is the result of earlier boolean filtering; direct column
# assignment on such a slice triggers pandas' SettingWithCopyWarning.
# The vectorised .str.lower() replaces the per-row lambda.
basics = basics.assign(genre=basics["genres"].str.lower())

In [25]:
# Flag rows whose genres mention "documentary" (case-insensitive), then keep all the others.
is_documentary = basics["genres"].str.contains("documentary", case=False)
basics = basics.loc[~is_documentary, :]

In [26]:
# Sanity check: select any remaining rows whose genres mention "documentary";
# an empty result means the filter above worked.
basics[basics["genres"].str.contains("documentary", case=False)]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,genre


## Akas Preprocessing

In [27]:
# Load the title.akas table; the file is tab-separated.
akas = pd.read_csv(
    filepath_or_buffer=AKAS_URL,
    delimiter="\t",
    low_memory=False,
)

In [28]:
# Display the freshly loaded akas frame.
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0
...,...,...,...,...,...,...,...,...
31826323,tt9916852,5,Episódio #3.20,PT,pt,\N,\N,0
31826324,tt9916852,6,Episodio #3.20,IT,it,\N,\N,0
31826325,tt9916852,7,एपिसोड #3.20,IN,hi,\N,\N,0
31826326,tt9916856,1,The Wind,DE,\N,imdbDisplay,\N,0


### Keeping only titles with region "US"

In [29]:
# Boolean mask: True for rows whose region is exactly "US".
united_filter = akas["region"].eq("US")

In [30]:
# Restrict akas to the rows flagged by united_filter.
akas = akas[united_filter]

In [31]:
# Sanity check: "US" should be the only region value left in akas.
akas["region"].value_counts()

US    1317902
Name: region, dtype: int64

### Replacing "\N" with NULLS

In [32]:
# Turn the literal "\N" placeholder into a real missing value in every column.
akas = akas.replace(to_replace="\\N", value=np.nan)

In [33]:
# Per-column count of missing values, to confirm the "\N" replacement took effect.
akas.isna().sum()

titleId                  0
ordering                 0
title                    0
region                   0
language           1314388
types               293860
attributes         1274027
isOriginalTitle       1375
dtype: int64

## Ratings

In [34]:
# Load the title.ratings table; the file is tab-separated.
ratings = pd.read_csv(
    filepath_or_buffer=RATINGS_URL,
    delimiter="\t",
    low_memory=False,
)

In [35]:
# Display the freshly loaded ratings frame.
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1878
1,tt0000002,5.9,248
2,tt0000003,6.5,1650
3,tt0000004,5.8,160
4,tt0000005,6.2,2475
...,...,...,...
1240370,tt9916690,6.5,6
1240371,tt9916720,5.1,209
1240372,tt9916730,8.7,6
1240373,tt9916766,6.7,19


In [47]:
# Per-column count of missing values in ratings (the recorded output shows none).
ratings.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

## Filtering basics data frame to include only movies in akas data frame

In [37]:
# Boolean mask: True where a basics title id (tconst) also appears among the
# titleId values of the US-filtered akas frame.
keeps = basics["tconst"].isin(akas["titleId"])
keeps

34805       True
61119       True
67672       True
77968      False
86806       True
           ...  
8889869     True
8889878     True
8889917    False
8889962     True
8890046    False
Name: tconst, Length: 138842, dtype: bool

In [38]:
# Restrict basics to the titles flagged by the keeps mask and display the result.
basics = basics.loc[keeps]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,genre
34805,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance","comedy,fantasy,romance"
61119,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama,drama
67672,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama,drama
86806,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi","comedy,horror,sci-fi"
91077,tt0093119,movie,Grizzly II: Revenge,Grizzly II: The Predator,0,2020,,74,"Horror,Music,Thriller","horror,music,thriller"
...,...,...,...,...,...,...,...,...,...,...
8889333,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama,drama
8889729,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy","comedy,drama,fantasy"
8889869,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama,drama
8889878,tt9916190,movie,Safeguard,Safeguard,0,2020,,90,"Action,Adventure,Thriller","action,adventure,thriller"


## Saving Files to Repository

In [39]:
# Create the output folder if it does not exist yet (no error when it already does).
os.makedirs("Data/", exist_ok=True)

# Show what the folder currently contains.
os.listdir("Data/")

['.ipynb_checkpoints']

In [40]:
# Write the three frames as gzip-compressed CSVs; index=False leaves the
# row index out of the files.
basics.to_csv("Data/title_basics.csv.gz", compression="gzip", index=False)
akas.to_csv("Data/title_akas.csv.gz", compression="gzip", index=False)
ratings.to_csv("Data/title_ratings.csv.gz", compression="gzip", index=False)

In [41]:
# Read the saved files back in to verify the round trip, then preview basics.
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory=False)
akas = pd.read_csv("Data/title_akas.csv.gz", low_memory=False)
ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,genre
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance","comedy,fantasy,romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama,drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama,drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi","comedy,horror,sci-fi"
4,tt0093119,movie,Grizzly II: Revenge,Grizzly II: The Predator,0,2020,,74,"Horror,Music,Thriller","horror,music,thriller"


In [42]:
# Preview the first rows of the reloaded akas frame.
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [43]:
# Preview the first rows of the reloaded ratings frame.
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1878
1,tt0000002,5.9,248
2,tt0000003,6.5,1650
3,tt0000004,5.8,160
4,tt0000005,6.2,2475
