In [1]:
import requests
from io import BytesIO
import gzip
import pandas as pd

Getting the GZIP containing the dataset file

In [2]:
# URL of the gzip archive that holds the "title.basics" TSV dataset
URL_IMDB_DATASET_GZIP = "https://datasets.imdbws.com/title.basics.tsv.gz"

# (connect, read) timeouts in seconds, so a stalled connection raises
# instead of blocking the kernel indefinitely
REQUEST_TIMEOUT_SECONDS = (10, 60)

# Download the whole archive into memory
gzip_response = requests.get(URL_IMDB_DATASET_GZIP, timeout=REQUEST_TIMEOUT_SECONDS)

# Stop here on an HTTP error (4xx/5xx); otherwise the error page would be
# passed on as if it were the archive and fail later during decompression
gzip_response.raise_for_status()

Loading the dataset into a DataFrame

In [4]:
# Decompress the downloaded bytes in memory and read them as text.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default (IMDb documents these files as UTF-8).
# NOTE(review): titles may contain unescaped double quotes; if rows look
# merged, consider quoting=csv.QUOTE_NONE — confirm against the data.
with gzip.open(BytesIO(gzip_response.content), 'rt', encoding='utf-8') as gzip_file:
    # Parse the text stream as tab-separated values; the literal \N is
    # treated as a missing value
    imdb = pd.read_csv(
        gzip_file,
        sep="\t",
        na_values=r'\N',
        dtype={
            "tconst": object,
            "titleType": object,
            "primaryTitle": object,
            "originalTitle": object,
            "isAdult": object,
            # Nullable integer dtype, so missing years do not force floats
            "startYear": 'Int64',
            "endYear": object,
            "runtimeMinutes": object,
            "genres": object,
        },
    )

Filtering and cleaning the dataset

In [5]:
# Narrow the frame in a single chain:
#   1. keep only the rows whose titleType is "movie"
#   2. discard rows that have no startYear
#   3. remove the columns that are not needed afterwards
imdb = (
    imdb[imdb["titleType"] == "movie"]
    .dropna(subset=["startYear"])
    .drop(columns=['tconst', 'titleType', 'originalTitle', 'isAdult', 'endYear'])
)

Writing to a .tsv file

In [None]:
# Save the cleaned frame as tab-separated text in the working directory.
# The row index is written too (pandas default), as an unnamed first column.
imdb.to_csv(path_or_buf="imdb.tsv", sep="\t")

In [6]:
# Preview the first five rows of the cleaned frame
imdb.head(n=5)

Unnamed: 0,primaryTitle,startYear,runtimeMinutes,genres
8,Miss Jerry,1894,45,Romance
144,The Corbett-Fitzsimmons Fight,1897,100,"Documentary,News,Sport"
498,Bohemios,1905,100,
570,The Story of the Kelly Gang,1906,70,"Action,Adventure,Biography"
587,The Prodigal Son,1907,90,Drama
