Data from: https://datasets.imdbws.com



In [1]:
import pandas as pd
import re
import requests
import subprocess

Import datasets

In [9]:
def download_dataset(filename):
    url = "https://datasets.imdbws.com/title.{}.tsv.gz".format(filename)
    curl_cmd = "curl -L {} --output title.{}.tsv.gz".format(url, filename)
    status = subprocess.run(curl_cmd.split())
    gunzip_cmd = "gunzip title.{}.tsv.gz -qq".format(filename)
    status = subprocess.run(gunzip_cmd.split())
    dataset = pd.read_csv("title.{}.tsv".format(filename), sep="\t")
    return dataset

In [11]:
ratings = download_dataset("ratings")
print("ratings dataframe created")
titles = download_dataset("basics")
akas = download_dataset("akas")

  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):


Combine the data

In [12]:
df = titles.join(ratings.set_index('tconst'), on='tconst')
df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",5.7,1965.0
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",5.8,263.0
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance",6.5,1807.0
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short",5.6,178.0
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short",6.2,2604.0


Get number of types of data

In [13]:
df["titleType"].value_counts()

tvEpisode       7425225
short            924804
movie            642410
video            273194
tvSeries         241897
tvMovie          141116
tvMiniSeries      48161
tvSpecial         41168
videoGame         34092
tvShort           10057
tvPilot               1
Name: titleType, dtype: int64

Drop entries that have no votes, or no average rating

In [14]:
df = df.dropna(subset=['averageRating', 'numVotes'])

In [15]:
df["titleType"].value_counts()

tvEpisode       636167
movie           290265
short           146547
tvSeries         86731
tvMovie          50330
video            49191
tvMiniSeries     14790
videoGame        14487
tvSpecial        11043
tvShort           2180
Name: titleType, dtype: int64

We only want the alternate titles from the akas dataframe. We can remove the other columns, drop the duplicates and group all the alternate titles in a single list for each id.

In [16]:
akas = akas.drop(["ordering", "region", "language", "types", "attributes", "isOriginalTitle"], axis = 1)
akas = akas.drop_duplicates()
akas = akas.groupby('titleId').agg({'title': lambda x: list(x)})
akas

Unnamed: 0_level_0,title
titleId,Unnamed: 1_level_1
tt0000001,"[Карменсіта, Carmencita, Carmencita - spanyol ..."
tt0000002,"[Le clown et ses chiens, A bohóc és kutyái, De..."
tt0000003,"[Sarmanul Pierrot, Szegény Pierrot, 哀れなピエロ, Бі..."
tt0000004,"[Un bon bock, Ein gutes Glas Bier, Un ţap de b..."
tt0000005,"[Blacksmith Scene, The Blacksmith's Forge, Bla..."
...,...
tt9916846,"[Épisode #3.18, Folge #3.18, エピソード #3.18, Epis..."
tt9916848,"[Épisode #3.17, Folge #3.17, エピソード #3.17, Epis..."
tt9916850,"[Episódio #3.19, एपिसोड #3.19, Épisode #3.19, ..."
tt9916852,"[Episodio #3.20, Épisode #3.20, Folge #3.20, エ..."


Combine the akas dataframe and isolate the video games

In [17]:
df = df.join(akas, on='tconst').rename(columns = {"title": "akas"})
video_games = df[df["titleType"]=="videoGame"]
video_games

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,akas
82552,tt0084376,videoGame,"MysteryDisc: Murder, Anyone?","MysteryDisc: Murder, Anyone?",0,1982,\N,\N,"Adventure,Crime,Mystery",6.1,37.0,"[MysteryDisc: Murder, Anyone?]"
102667,tt0105000,videoGame,Night Trap,Night Trap,0,1992,\N,\N,"Adventure,Horror,Mystery",6.2,377.0,"[Scene of the Crime, Night Trap, ナイト トラップ]"
107391,tt0109865,videoGame,Gabriel Knight: Sins of the Fathers,Gabriel Knight: Sins of the Fathers,0,1993,\N,\N,"Adventure,Drama,Horror",9.1,707.0,[Gabriel Knight: Sins of the Fathers]
107783,tt0110267,videoGame,King's Quest VII: The Princeless Bride,King's Quest VII: The Princeless Bride,0,1994,\N,\N,"Adventure,Fantasy",7.6,206.0,"[King's Quest VII: The Princeless Bride, King'..."
108416,tt0110909,videoGame,Psychic Detective,Psychic Detective,0,1995,\N,\N,"Adventure,Fantasy,Mystery",8.5,50.0,[Psychic Detective]
...,...,...,...,...,...,...,...,...,...,...,...,...
9769171,tt9888864,videoGame,Infernal,Infernal,0,2007,\N,\N,Action,6.0,17.0,"[Infernal, Infernal: Hell's Vengeance, Diaboli..."
9770853,tt9892552,videoGame,Spiral Splatter,Spiral Splatter,0,2017,\N,\N,Action,2.0,10.0,[Spiral Splatter]
9771210,tt9893348,videoGame,Alien: Covenant In Utero VR Experience,Alien: Covenant In Utero VR Experience,0,2017,\N,\N,Sci-Fi,6.2,17.0,[Alien: Covenant In Utero VR Experience]
9771413,tt9893804,videoGame,The Lego Movie 2 Videogame,The Lego Movie 2 Videogame,0,2019,\N,\N,"Adventure,Animation,Family",5.5,156.0,"[The Lego Movie 2 Videogame, The Lego Movie 2,..."


Save the video games file to a csv

In [None]:
video_games.to_csv("imdb_games_db.csv")

# UNUSED/TEST CODE FROM BELOW

In [None]:
with_winnie = titles['primaryTitle'].str.contains('[^a-z]lego[^a-z]|$lego[^a-z]', na=False, flags=re.IGNORECASE, regex=True)
with_movie_type = titles['titleType']=='movie'

titles[with_winnie & with_movie_type]