# Project 3 - IMDB Movies

Anjali Prakash


## Load File and Imports

In [65]:
import pandas as pd 
import numpy as np 

In [66]:
# making new folder with os
import os
os.makedirs('Data/',exist_ok=True) # Confirm folder created
os.listdir("Data/")

['title.ratings.tsv',
 'title.akas.tsv',
 'title_basics.csv.gz',
 'title.basics.tsv',
 'title_akas.csv.gz']

In [67]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"

In [68]:
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)

In [69]:
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)

In [70]:
akas = pd.read_csv(akas_url, sep= '\t', low_memory=False)

##  Pre-processing steps

In [73]:
basics.shape

(10100795, 9)

In [74]:
ratings.shape

(1340671, 3)

In [75]:
akas.shape

(36951101, 8)

In [76]:
## elimate regions other than the US
akas_us = akas.loc[akas['region'] == 'US']     

In [77]:
akas_us.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
14,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
36,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
41,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


In [78]:
## replace '\n' values with 'NaN'
akas_us = akas_us.replace({'\\N':np.nan})

In [79]:
akas_us.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [81]:
## only keep films that also exist in akas_us table
keepers =basics['tconst'].isin(akas_us['titleId'])
keepers

0            True
1            True
2           False
3           False
4            True
            ...  
10100790    False
10100791    False
10100792    False
10100793    False
10100794    False
Name: tconst, Length: 10100795, dtype: bool

In [82]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
10100656,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963,\N,58,Family
10100685,tt9916620,movie,The Copeland Case,The Copeland Case,0,\N,\N,\N,Drama
10100723,tt9916702,short,Loving London: The Playground,Loving London: The Playground,0,\N,\N,\N,"Drama,Short"
10100746,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0,2019,\N,\N,Short


In [83]:
## replace null values with 'NaN'
basics = basics.replace({'\\N':np.nan})

In [84]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,,1,"Short,Sport"


In [85]:
## eliminate null values from runtimeMinutes column
basics = basics[basics['runtimeMinutes'].notna()]

In [87]:
## eliminate null values from genres column
basics = basics[basics['genres'].notna()]

In [89]:
## only keep titles that are 'movies'
basics = basics.loc[basics['titleType'] == 'movie']   

In [91]:
## change startYear to a float dtype
basics['startYear'] = basics['startYear'].astype('float')

In [92]:
basics.dtypes

tconst             object
titleType          object
primaryTitle       object
originalTitle      object
isAdult            object
startYear         float64
endYear            object
runtimeMinutes     object
genres             object
dtype: object

In [93]:
## only keep films that started between 2000-2021
basics = basics[(basics['startYear'] >=2000)&(basics['startYear'] <2022)]

In [95]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [97]:
## only include films in ratings table that also exist in 'akas_us' table
keepers =ratings['tconst'].isin(akas_us['titleId'])
keepers

0           True
1           True
2          False
3          False
4           True
           ...  
1340666    False
1340667    False
1340668    False
1340669    False
1340670    False
Name: tconst, Length: 1340671, dtype: bool

In [98]:
ratings = ratings[keepers]
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1990
1,tt0000002,5.8,264
4,tt0000005,6.2,2647
5,tt0000006,5.0,182
6,tt0000007,5.4,829
...,...,...,...
1340632,tt9916200,8.1,233
1340633,tt9916204,8.1,267
1340640,tt9916348,8.3,18
1340641,tt9916362,6.4,5486


In [99]:
ratings = ratings.replace({'\\N':np.nan})

In [100]:
## preview of ratings table
ratings.info

<bound method DataFrame.info of             tconst  averageRating  numVotes
0        tt0000001            5.7      1990
1        tt0000002            5.8       264
4        tt0000005            6.2      2647
5        tt0000006            5.0       182
6        tt0000007            5.4       829
...            ...            ...       ...
1340632  tt9916200            8.1       233
1340633  tt9916204            8.1       267
1340640  tt9916348            8.3        18
1340641  tt9916362            6.4      5486
1340646  tt9916428            3.6        16

[506411 rows x 3 columns]>

In [101]:
## preview of basics table 
basics.info

<bound method DataFrame.info of              tconst titleType  \
34800     tt0035423     movie   
61112     tt0062336     movie   
67486     tt0068865     movie   
67664     tt0069049     movie   
86791     tt0088751     movie   
...             ...       ...   
10099934  tt9914942     movie   
10100328  tt9915872     movie   
10100468  tt9916170     movie   
10100477  tt9916190     movie   
10100561  tt9916362     movie   

                                               primaryTitle  \
34800                                        Kate & Leopold   
61112     The Tango of the Widower and Its Distorting Mi...   
67486                                   Lives of Performers   
67664                            The Other Side of the Wind   
86791                                     The Naked Monster   
...                                                     ...   
10099934                             Life Without Sara Amat   
10100328                               The Last White Witch   
1010

In [102]:
## preview of akas_us table
akas_us.info

<bound method DataFrame.info of             titleId  ordering                                      title  \
5         tt0000001         6                                 Carmencita   
14        tt0000002         7                     The Clown and His Dogs   
33        tt0000005        10                           Blacksmith Scene   
36        tt0000005         1                        Blacksmithing Scene   
41        tt0000005         6                        Blacksmith Scene #1   
...             ...       ...                                        ...   
36950627  tt9916560         1  March of Dimes Presents: Once Upon a Dime   
36950697  tt9916620         1                          The Copeland Case   
36950786  tt9916702         1              Loving London: The Playground   
36950829  tt9916756         1                   Pretty Pretty Black Girl   
36950845  tt9916764         1                                         38   

         region language        types             attri

In [103]:
## Save current dataframe to file.
basics.to_csv("Data/title.basics.csv.gz",compression='gzip',index=False)

In [104]:
# Open saved file and preview again
basics = pd.read_csv("Data/title.basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016.0,,90,Drama
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"


In [105]:
## Save current dataframe to file.
ratings.to_csv("Data/title.ratings.csv.gz",compression='gzip',index=False)

In [106]:
# Open saved file and preview again
ratings = pd.read_csv("Data/title.ratings.csv.gz", low_memory = False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1990
1,tt0000002,5.8,264
2,tt0000005,6.2,2647
3,tt0000006,5.0,182
4,tt0000007,5.4,829


In [107]:
## Save current dataframe to file.
akas_us.to_csv("Data/title.akas.csv.gz",compression='gzip',index=False)

In [108]:
# Open saved file and preview again
akas_us = pd.read_csv("Data/title.akas.csv.gz", low_memory = False)
akas_us.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0
