In [3]:
import numpy as np
import pandas as pd

In [4]:
# ---------------------------------------------------------------------------
# Build ../dat/raw_data.csv:
#   IMDb title basics (movies only) + IMDb ratings + scraped data + IMDb crew,
#   all joined on the IMDb title id column "tconst".
# ---------------------------------------------------------------------------

# Load the IMDb "title basics" table (tab-separated).
df = pd.read_csv("../dat/imdb_data/title_basics/data.tsv", sep="\t")
print("overall: ", len(df.index))

# Keep only rows whose titleType is "movie" (drops e.g. TV shows).
df = df[df["titleType"] == "movie"]
print("Number of movies:\t", len(df.index))

# Load the IMDb ratings table.
df_reviews = pd.read_csv("../dat/imdb_data/title_ratings/data.tsv", sep="\t")

# Make sure the join key is a plain string on both sides before merging.
df["tconst"] = df["tconst"].astype(str)
df_reviews["tconst"] = df_reviews["tconst"].astype(str)

# Inner merge of movies and ratings: movies without a ratings row are dropped.
df = df.merge(df_reviews, how="inner", on="tconst")
print("Number of movies with rating:\t", len(df.index))

# Free up some memory.
del df_reviews


# Load our scraped data, keeping the first row for every tconst.
df_scrape = pd.read_csv("../dat/tconst_scraped_data.csv")
df_scrape = df_scrape.drop_duplicates(subset=["tconst"], keep="first")
print("Number scraped movies:\t", len(df_scrape))

## Hard-coded corrections

# The basic IMDb data set lists these movies under an old (invalid) tconst.
# Map each stale id to the id that is currently valid for the same movie.
STALE_TCONST_FIXES = {
    "tt11905872": "tt4131756",
    "tt4332782": "tt0246007",
    "tt5072702": "tt4508986",
    "tt6419536": "tt4481310",
}
df["tconst"] = df["tconst"].replace(STALE_TCONST_FIXES)

# Remapping can make two rows share a tconst; keep the first occurrence.
df = df.drop_duplicates(subset=["tconst"], keep="first")

# Movies that are no longer available on IMDb (the page returns a 404 error).
UNAVAILABLE_TCONSTS = [
    "tt7368158", "tt2437136", "tt2584608", "tt6858500",
    "tt7375242", "tt7598832", "tt7718552", "tt7728678",
    "tt7738378", "tt8768374", "tt9828428",
]
# One vectorised mask instead of one full-frame filter per id.
df = df[~df["tconst"].isin(UNAVAILABLE_TCONSTS)]
print("Number of movies after dropping:\t", df.shape[0])

# Manually fill in start years, addressed by row POSITION and column
# position 5 (presumably "startYear" -- TODO confirm against df.columns).
# The first entry is movie "tt1027755", which has no start year in the data
# but was released in 2012.
# NOTE(review): positional patching only hits the intended rows for the exact
# input files this was written against; any change in the upstream data
# shifts the positions silently. Patching by tconst would be more robust,
# but the ids of the remaining rows are not recorded here.
START_YEAR_PATCHES = [
    (147505, "2012"),
    (148639, "2020"),
    (161518, "2019"),
    (161520, "2020"),
    (178919, "2021"),
    (185090, "2021"),
    (254051, "2019"),
    (259152, "2018"),
    (259650, "2018"),
    (271440, "2018"),
    (271532, "2016"),
    (272545, "2019"),
]
for row_pos, start_year in START_YEAR_PATCHES:
    df.iloc[row_pos, 5] = start_year

# df["tconst"] is already str (cast before the ratings merge, and every
# replacement above is a string), so only the scraped frame needs casting.
df_scrape["tconst"] = df_scrape["tconst"].astype(str)

# Snapshot of the frame before the scraped data is merged in.
# NOTE(review): df2 is not used by any cell visible here -- confirm it is
# still needed, since it duplicates the whole frame in memory.
df2 = df.copy()


# Merge the movie frame with the scraped content.
df = df.merge(df_scrape, how="inner", on="tconst")
print("Number of movies after merge:\t", df.shape[0])

# Free up some memory.
del df_scrape

# Load the IMDb crew table.
df_crew = pd.read_csv("../dat/imdb_data/title_crew/data.tsv", sep="\t")
df_crew["tconst"] = df_crew["tconst"].astype(str)

# Inner merge of movies and crew: movies without a crew row are dropped.
df = df.merge(df_crew, how="inner", on="tconst")
print("Number of movies after crew merge:\t", len(df))


# Free up some memory.
del df_crew

# Sort by tconst so the exported file has a stable row order.
df = df.sort_values("tconst")

df.to_csv("../dat/raw_data.csv", index=False)
print("saved dataframe sucesfully!")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


overall:  8598896
Number of movies:	 598851
Number of movies with rating:	 273557


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Number scraped movies:	 273543
Number of movies after dropping:	 273543
Number of movies after merge:	 273543
Number of movies after crew merge:	 273543
saved dataframe sucesfully!


In [5]:
import re
from collections import Counter

# Survey which currency prefixes occur in the scraped "Budget" strings and
# count how many budgets use one of a short list of selected symbols.
budget = df["Budget"].to_numpy()

# First run of characters that are not digits, commas or non-breaking spaces.
# NOTE(review): the braces inside the character class are literal, so "{" and
# "}" are excluded as well -- probably unintended, but kept unchanged here.
CURRENCY_PATTERN = re.compile(r"[^{\d,\xa0}]+")

# Symbols whose budgets are tallied in `counter`.
COUNTED_SYMBOLS = ("$", "£", "€", "CA$", "₹", "A$")

currencies = []
counter = 0

for string in budget:
    # Movies without budget information are skipped.
    if pd.isna(string):
        continue

    # A budget string without any currency token (e.g. digits only) would
    # previously raise IndexError; such rows are skipped instead.
    match = CURRENCY_PATTERN.search(string)
    if match is None:
        continue

    currency = match.group(0)
    currencies.append(currency)

    if currency in COUNTED_SYMBOLS:
        counter += 1

print(Counter(currencies))
print(counter)

# data for ["$", "€", "CA$", "£", "A$", ]

Counter({'$': 32761, '€': 6603, '₹': 2990, '£': 2441, 'CA$': 1968, 'A$': 801, 'R$': 415, 'SEK': 395, 'NOK': 295, 'RUR': 282, 'FIM': 223, 'DKK': 199, 'IRR': 194, 'TRL': 186, 'DEM': 185, 'CN¥': 175, 'FRF': 169, 'MYR': 162, 'CHF': 156, 'EGP': 154, 'MX$': 148, 'HUF': 146, 'ARS': 140, '¥': 119, 'NZ$': 114, 'PLN': 105, 'ZAR': 96, '₩': 87, 'BDT': 87, 'IDR': 85, 'NLG': 84, 'HK$': 80, '₱': 77, 'CZK': 76, 'SGD': 72, 'ESP': 64, 'ITL': 63, 'DOP': 55, 'THB': 52, 'PKR': 51, 'NPR': 44, 'UAH': 44, 'ROL': 40, 'ISK': 40, 'NT$': 34, '₪': 33, 'NGN': 31, 'LKR': 30, 'PTE': 25, 'VEB': 21, 'HRK': 20, 'RON': 18, 'BGL': 18, 'COP': 17, 'CLP': 16, '₫': 14, 'AZM': 14, 'GEL': 13, 'AED': 12, 'BEF': 11, 'MVR': 10, 'MNT': 10, 'LVL': 9, 'MMK': 9, 'LTL': 8, 'EEK': 7, 'MAD': 7, 'AMD': 7, 'GRD': 6, 'SIT': 5, 'YUM': 5, 'TND': 5, 'PEN': 5, 'BHD': 5, 'IEP': 4, 'TTD': 4, 'JOD': 4, 'BYR': 4, 'PYG': 3, 'ATS': 3, 'LUF': 3, 'GTQ': 3, 'ALL': 3, 'KZT': 3, 'BAM': 3, 'MKD': 3, 'SKK': 2, 'CUP': 2, 'NAD': 2, 'LBP': 2, 'SAR': 2, 'BOB': 

In [6]:
from currency_converter import CurrencyConverter
from datetime import date  # datetime works too

# Shared converter instance, built with fallback_on_wrong_date enabled
# (per the library docs this falls back to the closest available date when
# the requested one is outside the rate table -- confirm for the installed version).
c = CurrencyConverter(fallback_on_wrong_date=True)

# Smoke test: 100 USD at the 1999-03-01 rate, converted to EUR
# (EUR is the library's default target, spelled out here for clarity).
c.convert(100, "USD", "EUR", date=date(1999, 3, 1))

91.02494083378846

In [7]:
# Distinct currency tokens collected from the Budget column; np.unique
# returns them sorted, as a NumPy array.
np.unique(currencies)

array(['$', 'A$', 'AED', 'AFA', 'ALL', 'AMD', 'ANG', 'ARS', 'ATS', 'AZM',
       'BAM', 'BDT', 'BEF', 'BGL', 'BHD', 'BND', 'BOB', 'BTN', 'BYR',
       'CA$', 'CDF', 'CHF', 'CLP', 'CN¥', 'COP', 'CRC', 'CUP', 'CZK',
       'DEM', 'DKK', 'DOP', 'DZD', 'EC$', 'EEK', 'EGP', 'ESP', 'ETB',
       'FCFA', 'FIM', 'FRF', 'GEL', 'GHC', 'GRD', 'GTQ', 'HK$', 'HNL',
       'HRK', 'HUF', 'IDR', 'IEP', 'IQD', 'IRR', 'ISK', 'ITL', 'JMD',
       'JOD', 'KES', 'KGS', 'KWD', 'KZT', 'LBP', 'LKR', 'LTL', 'LUF',
       'LVL', 'MAD', 'MKD', 'MMK', 'MNT', 'MTL', 'MVR', 'MX$', 'MYR',
       'NAD', 'NGN', 'NLG', 'NOK', 'NPR', 'NT$', 'NZ$', 'OMR', 'PAB',
       'PEN', 'PKR', 'PLN', 'PTE', 'PYG', 'QAR', 'R$', 'ROL', 'RON',
       'RUR', 'RWF', 'SAR', 'SEK', 'SGD', 'SIT', 'SKK', 'SYP', 'THB',
       'TND', 'TRL', 'TTD', 'UAH', 'UGX', 'VEB', 'XAU', 'YUM', 'ZAR',
       'ZWD', '£', '¥', '₩', '₪', '₫', '€', '₱', '₹'], dtype='<U4')

In [None]:
# Debug aid: print the character "\xa0" (non-breaking space), which the
# currency regex in the budget cell excludes from currency tokens.
print("\xa0")

In [None]:
# ours 797_615_189.873417
# infl 797_701_090.78

In [None]:
# Write the current df to the raw-data CSV without the index column.
# NOTE(review): same path and arguments as the export at the end of the merge
# cell, so running this overwrites that file with whatever df holds now.
df.to_csv("../dat/raw_data.csv", index=False)

In [None]:
# Display the converter's `currencies` attribute (presumably the currency
# codes it can convert -- confirm against the CurrencyConverter docs).
c.currencies

In [None]:
# Show the rows whose startYear is the literal string "\N"
# (presumably the marker for a missing value in the IMDb export).
df.loc[df["startYear"] == "\\N"]