# Project 3
- Zach Hanson

## Importing Libraries and Data

### Libraries

In [1]:
#pandas, numpy
import pandas as pd
import numpy as np

### Data

In [2]:
# Locations of the gzipped, tab-separated IMDb dumps used below:
# title basics, title ratings, and alternate titles (akas).
basics_url, ratings_url, akas_url = (
    'https://datasets.imdbws.com/title.basics.tsv.gz',
    'https://datasets.imdbws.com/title.ratings.tsv.gz',
    'https://datasets.imdbws.com/title.akas.tsv.gz',
)

In [3]:
# Download the IMDb title-basics table (tab-separated) and preview the first rows.
# This load has to run: later cells use `basics`, which is otherwise undefined
# after a kernel restart.
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [6]:
# Download the IMDb title-ratings table (tab-separated) and preview the first rows.
# This load has to run: later cells use `ratings`, which is otherwise undefined
# after a kernel restart.
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1953
1,tt0000002,5.8,263
2,tt0000003,6.5,1787
3,tt0000004,5.6,179
4,tt0000005,6.2,2589


In [7]:
# Download the IMDb alternate-titles (akas) table (tab-separated) and preview the first rows.
# This load has to run: later cells use `akas`, which is otherwise undefined
# after a kernel restart.
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


## Filtering and Cleaning

### Placeholder values

In [8]:
# The raw tables mark missing values with the literal text "\N";
# swap that placeholder for NaN in all three frames.
placeholder_to_nan = {'\\N': np.nan}
basics = basics.replace(placeholder_to_nan)
ratings = ratings.replace(placeholder_to_nan)
akas = akas.replace(placeholder_to_nan)

### AKAS

In [9]:
# Keep only the alternate-title rows whose region is US.
# `isin` already returns a boolean mask, so it is used directly.
# The list holds the regions to keep (the old name `drop` said the opposite).
regions_to_keep = ['US']
akas = akas[akas['region'].isin(regions_to_keep)]
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


### Title Basics


In [10]:
# Remove every title that has no runtime or no genre listed.
required_columns = ['runtimeMinutes', 'genres']
basics = basics.dropna(subset=required_columns)

In [12]:
# Tally how many rows there are of each title type.
basics.titleType.value_counts()

tvEpisode       1375284
short            592463
movie            377695
video            179011
tvMovie           90921
tvSeries          89355
tvSpecial         17726
tvMiniSeries      16871
tvShort            8671
videoGame           317
Name: titleType, dtype: int64

In [13]:
# Keep only rows whose titleType is "movie", then re-count to confirm.
# `isin` already returns a boolean mask, so it is used directly.
# The list holds the types to keep (the old name `drop` said the opposite).
title_types_to_keep = ['movie']
basics = basics[basics['titleType'].isin(title_types_to_keep)]
basics['titleType'].value_counts()

movie    377695
Name: titleType, dtype: int64

- Dropped all non-movie rows

In [14]:
# Count titles per start year before any year filtering.
basics.startYear.value_counts()

2017    14313
2018    14266
2019    13983
2016    13913
2015    13429
        ...  
1899        1
1904        1
1897        1
1896        1
1894        1
Name: startYear, Length: 129, dtype: int64

In [20]:
# Discard titles with a missing startYear, then count any nulls that remain.
basics = basics.dropna(subset=['startYear'])
basics.startYear.isna().sum()

0

In [26]:
# Cast startYear to int so the year-range filter compares numbers.
# `.assign` returns a new frame rather than writing a column into a frame that
# came from boolean filtering, which avoids pandas' SettingWithCopyWarning.
basics = basics.assign(startYear=basics['startYear'].astype(int))

In [27]:
# Keep titles whose startYear falls in 2000-2022 (both endpoints included).
in_year_range = basics['startYear'].between(2000, 2022)
basics = basics[in_year_range]

In [28]:
# Count titles per start year to check which years remain.
basics.startYear.value_counts()

2017    14313
2018    14266
2019    13983
2016    13913
2015    13429
2014    13051
2013    12350
2022    12269
2021    12167
2012    11605
2020    11454
2011    10747
2010    10181
2009     9325
2008     8128
2007     6940
2006     6486
2005     5801
2004     5181
2003     4567
2002     4116
2001     3846
2000     3630
Name: startYear, dtype: int64

In [29]:
# Tally the genre combinations currently present (documentaries are still included here).
basics.genres.value_counts()

Documentary              52666
Drama                    35838
Comedy                   13414
Comedy,Drama              6440
Horror                    5762
                         ...  
Family,Sci-Fi,Western        1
Adult,Fantasy,Horror         1
Crime,Family,Thriller        1
News,Reality-TV,Sport        1
Crime,Fantasy,Sci-Fi         1
Name: genres, Length: 1189, dtype: int64

In [30]:
# Flag every title whose genre string mentions "documentary" (case-insensitive),
# drop the flagged rows, and re-tally the genres.
is_documentary = basics.genres.str.contains('documentary', case=False)
basics = basics.loc[~is_documentary]
basics.genres.value_counts()

Drama                         35838
Comedy                        13414
Comedy,Drama                   6440
Horror                         5762
Drama,Romance                  4293
                              ...  
Action,Animation,Game-Show        1
Adult,Crime,Mystery               1
Family,Musical,Sport              1
Horror,Music,Mystery              1
Crime,Fantasy,Sci-Fi              1
Name: genres, Length: 968, dtype: int64

- Documentaries seem to have been removed

In [32]:
# Boolean mask: True where the title's tconst appears among the titleIds in akas.
keepers = basics.tconst.isin(akas.titleId)

In [33]:
# Apply the keepers mask to basics and display the result.
basics = basics.loc[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama
...,...,...,...,...,...,...,...,...,...
9637357,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
9637753,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
9637893,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9637902,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


### Ratings

In [35]:
# Boolean mask: True where the rating's tconst appears among the titleIds in akas.
keepers = ratings.tconst.isin(akas.titleId)

In [36]:
# Apply the keepers mask to ratings and display the result.
ratings = ratings.loc[keepers]
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1953
1,tt0000002,5.8,263
4,tt0000005,6.2,2589
5,tt0000006,5.1,177
6,tt0000007,5.4,812
...,...,...,...
1282589,tt9916200,8.2,221
1282590,tt9916204,8.2,253
1282597,tt9916348,8.5,17
1282598,tt9916362,6.4,5191


## Saving Data Frames

In [37]:
import os

# Create the output folder (no error if it already exists), then list
# its contents to confirm that it is there.
data_dir = 'Data/'
os.makedirs(data_dir, exist_ok=True)
os.listdir(data_dir)

[]

- Folder created successfully

In [39]:
# Write basics to a gzip-compressed CSV (without the index),
# then read the file back from disk and preview it.
basics_path = 'Data/title_basics.csv.gz'
basics.to_csv(basics_path, compression='gzip', index=False)
basics = pd.read_csv(basics_path, low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [40]:
# Write ratings to a gzip-compressed CSV (without the index),
# then read the file back from disk and preview it.
ratings_path = 'Data/title_ratings.csv.gz'
ratings.to_csv(ratings_path, compression='gzip', index=False)
ratings = pd.read_csv(ratings_path, low_memory=False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1953
1,tt0000002,5.8,263
2,tt0000005,6.2,2589
3,tt0000006,5.1,177
4,tt0000007,5.4,812


In [42]:
# Write akas to a gzip-compressed CSV (without the index),
# then read the file back from disk and preview it.
akas_path = 'Data/title_akas.csv.gz'
akas.to_csv(akas_path, compression='gzip', index=False)
akas = pd.read_csv(akas_path, low_memory=False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0
