In [8]:
import requests
from io import BytesIO
import gzip
import pandas as pd

Getting the GZIP containing the dataset file

In [9]:
# URL of the gzip-compressed "title.basics" TSV file published by IMDb
URL_IMDB_DATASET_GZIP = "https://datasets.imdbws.com/title.basics.tsv.gz"

# Download the whole archive into memory. The timeout (seconds) bounds how long
# we wait to connect and between received bytes, so a stalled connection cannot
# hang the cell forever.
gzip_response = requests.get(URL_IMDB_DATASET_GZIP, timeout=60)

# Fail right here on an HTTP error (4xx/5xx); otherwise the error page would be
# handed to the gzip decoder in the next cell and fail with a misleading message.
gzip_response.raise_for_status()

Loading the dataset into a DataFrame

In [10]:
# Decompress the downloaded archive in memory and parse it as a tab-separated file.
# The text encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default, which gzip.open would otherwise use in text mode.
# NOTE(review): read_csv's default quote handling is left unchanged here; if
# titles can contain stray double quotes, rows may be merged — verify row counts.
with gzip.open(BytesIO(gzip_response.content), 'rt', encoding='utf-8') as gzip_file:
    # Build the DataFrame from the decompressed text stream.
    # The literal `\N` marks a missing value in the file and is parsed as NA.
    imdb = pd.read_csv(
        gzip_file,
        sep="\t",
        na_values=r'\N',
        dtype={
            # "tconst" is the title identifier column used by the later cells
            # (the key was previously misspelled "tcost" and matched no column).
            "tconst": object,
            "titleType": object,
            "primaryTitle": object,
            "originalTitle": object,
            "isAdult": object,
            # Nullable integer dtype: keeps years as integers even with NA rows.
            "startYear": 'Int64',
            "endYear": object,
            "runtimeMinutes": object,
            "genres": object,
        },
    )

Filtering and cleaning the dataset

In [11]:
# Clean-up pipeline, applied in order:
#   1. keep only rows whose titleType is "movie"
#   2. discard movies that have no release year
#   3. remove the columns that are not needed downstream
#   4. for each (title, year) pair keep just the first occurrence
imdb = (
    imdb.loc[imdb["titleType"] == "movie"]
    .dropna(subset="startYear")
    .drop(columns=['titleType', 'originalTitle', 'isAdult', 'endYear'])
    .drop_duplicates(subset=['primaryTitle', 'startYear'], keep='first')
)

In [12]:
# Build a long-format (tconst, genre) table: take the id and the comma-separated
# genre string, skip movies without genres, split the string into a list and
# emit one row per list element, with tconst restored as a regular column.
movies_genres = (
    imdb[['tconst', 'genres']]
    .dropna(subset='genres')
    .set_index('tconst')['genres']
    .str.split(',')
    .explode()
    .reset_index()
)

# The genres now live in their own table, so remove the column from the movie table.
imdb.drop(columns='genres', inplace=True)

In [13]:
# Write the cleaned movie table to a tab-separated file (without the row index),
# then show the first five rows as the cell output.
imdb.to_csv(path_or_buf="imdb.tsv", index=False, sep="\t")
imdb.head(5)

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes
8,tt0000009,Miss Jerry,1894,45
144,tt0000147,The Corbett-Fitzsimmons Fight,1897,100
498,tt0000502,Bohemios,1905,100
570,tt0000574,The Story of the Kelly Gang,1906,70
587,tt0000591,The Prodigal Son,1907,90


In [14]:
# Write the movie/genre pairs to a comma-separated file (without the row index),
# then show the first five rows as the cell output.
movies_genres.to_csv(path_or_buf="imdb_movies_genres.csv", index=False, sep=",")
movies_genres.head(5)

Unnamed: 0,tconst,genres
0,tt0000009,Romance
1,tt0000147,Documentary
2,tt0000147,News
3,tt0000147,Sport
4,tt0000574,Action
