In [1]:
%load_ext lab_black

#### 1) Load some packages that will be needed

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from imdb import IMDb
from matplotlib import pyplot as plt
from scipy.stats import normaltest

%matplotlib inline

#### 2) Load all needed data

In [3]:
# loading IMDB data about movies
titles = pd.read_csv("data/title_basics.tsv", sep="\t", low_memory=False)
ratings = pd.read_csv("data/title_ratings.tsv", sep="\t", low_memory=False)
akas = pd.read_csv("data/title_akas.tsv", sep="\t", low_memory=False)

#### 3) Define functions

In [4]:
def prepare_title_data(titles=titles, akas=akas, after_1990=True):
    """Return clean titles of movies only (tv shows or series removed)"""
    titles = titles[titles.titleType == "movie"]
    titles = titles.replace(r"\\N", np.nan, regex=True)
    titles.startYear = pd.to_numeric(titles.startYear)
    titles.runtimeMinutes = pd.to_numeric(titles.runtimeMinutes)
    if after_1990:
        titles = titles[titles.startYear > 1990]
    titles = titles.merge(
        akas[["titleId", "region", "language"]],
        left_on="tconst",
        right_on="titleId",
        how="left",
    )
    titles = titles[
        [
            "tconst",
            "originalTitle",
            "startYear",
            "runtimeMinutes",
            "genres",
            "region",
            "language",
        ]
    ]
    titles.fillna("no value", inplace=True)
    titles = (
        titles.groupby(
            ["tconst", "originalTitle", "startYear", "runtimeMinutes", "genres"]
        )
        .agg(lambda x: ",".join(set(x)))
        .reset_index()
    )
    return titles

In [5]:
def add_rating_to_movies(titles=titles, ratings=ratings, votes_limit=75):
    """Merge movie ratings with their titles. Possible to reduce the dataset using votes_limit."""
    titles_with_rating = titles.merge(ratings, how="left", on="tconst")
    titles_with_rating.dropna(subset=["averageRating"], inplace=True)
    if votes_limit:
        titles_with_rating = titles_with_rating[
            titles_with_rating.numVotes > votes_limit
        ]
    return titles_with_rating

In [6]:
def get_dummies(data, column):
    """Return movies dataset with dummy genre variables."""
    subset = data[column].str.get_dummies(sep=",")
    data.index = subset.index
    final_df = pd.concat([data, subset], axis=1)
    final_df.drop(columns=[column], inplace=True)
    return final_df

#### 4) Preprocess the data to get the final dataframe

In [7]:
step1 = prepare_title_data()
step2 = add_rating_to_movies(titles=step1)
step3 = get_dummies(step2, "genres")
step4 = get_dummies(step3, "language")
df_final = get_dummies(step4, "region")

In [9]:
df_final[df_final.tconst == "tt0099296"]

Unnamed: 0,tconst,originalTitle,startYear,runtimeMinutes,averageRating,numVotes,Action,Adult,Adventure,Animation,...,XNA,XSA,XWG,XWW,XYU,YE,YUCS,ZA,\N,no value
114,tt0099296,La condanna,1991.0,92,6.1,207.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


#### 5) Save file to csv

In [8]:
df_final.to_csv("data/movies_with_ratings.csv")