In [1]:
# Create the Data/ folder; exist_ok=True makes this a no-op if it is already there
import os
os.makedirs(name='Data/', exist_ok=True)

In [10]:
# Confirm the folder was created and the files were added successfully
# by listing the entries currently inside Data/
os.listdir("Data/")

['data.tsv',
 'IMDB Movie Dataset Info.docx',
 'title-akas-us-only.csv',
 'title.basics.tsv.gz',
 'title.ratings.tsv.gz']

In [29]:
import pandas as pd
import numpy as np

In [22]:
# Load the gzipped, tab-separated title basics file from the Data/ folder
basics = pd.read_csv(
    'Data/title.basics.tsv.gz',
    sep='\t',
    low_memory=False,
)

In [23]:
# Load the US-only alternate-titles (akas) CSV from the Data/ folder
akas = pd.read_csv(
    'Data/title-akas-us-only.csv',
    low_memory=False,
)

In [24]:
# Preview the first rows of the basics table
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [25]:
# Preview the first rows of the akas table
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


In [26]:
# Check which region codes occur in the akas table
akas['region'].unique()

array(['US'], dtype=object)

In [27]:
# Restrict basics to US titles: keep a row only when its tconst appears
# among the titleId values of akas (whose region column is 'US' throughout)
filter_us_titles = basics['tconst'].isin(akas['titleId'])
basics = basics.loc[filter_us_titles]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
10016872,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963,\N,58,Family
10016901,tt9916620,movie,The Copeland Case,The Copeland Case,0,\N,\N,\N,Drama
10016939,tt9916702,short,Loving London: The Playground,Loving London: The Playground,0,\N,\N,\N,"Drama,Short"
10016962,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0,2019,\N,\N,Short


In [150]:
# Swap the literal "\N" placeholder strings for real NaN values
df = basics.replace(to_replace={'\\N': np.nan})

In [151]:
# Discard every row that is missing a runtime or a genre
df = df.dropna(axis='index', how='any', subset=['runtimeMinutes', 'genres'])

In [152]:
# Boolean mask: True where titleType is exactly 'movie'
filter_movie = df['titleType'].eq('movie')
# Keep only the rows selected by the mask
df = df.loc[filter_movie]

In [153]:
# Summarize column dtypes and non-null counts after the cleaning steps so far
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 203476 entries, 8 to 10016777
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          203476 non-null  object
 1   titleType       203476 non-null  object
 2   primaryTitle    203476 non-null  object
 3   originalTitle   203476 non-null  object
 4   isAdult         203476 non-null  object
 5   startYear       199907 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  203476 non-null  object
 8   genres          203476 non-null  object
dtypes: object(9)
memory usage: 15.5+ MB


In [154]:
# Verify that only the 'movie' title type remains after filtering
df['titleType'].value_counts()

movie    203476
Name: titleType, dtype: int64

In [155]:
# Cast startYear to float64; float (not int) because the column still holds
# NaN for titles with an unknown start year
df['startYear'] = df['startYear'].astype('float64')

In [156]:
# Confirm the dtype conversion of startYear took effect
df['startYear'].dtype

dtype('float64')

In [157]:
# Boolean mask for start years in the inclusive range 2000-2022
# (rows with a missing startYear evaluate to False and are dropped)
filter_startYear = df['startYear'].between(2000, 2022)
# Apply the mask, discarding everything outside that window
df = df.loc[filter_startYear]

In [158]:
# Flag every movie whose comma-separated genres string mentions "Documentary".
# regex=False treats the pattern as a plain substring (no regex parsing), and
# na=False marks a missing genres value as "not a documentary" so the mask is
# always boolean and can be safely inverted with ~.
filter_documentaries = df['genres'].str.contains('Documentary', regex=False, na=False)
# Keep only the rows that are NOT documentaries
df = df[~filter_documentaries]

In [159]:
# Re-inspect the frame after the year and documentary filters
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86979 entries, 34802 to 10016777
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86979 non-null  object 
 1   titleType       86979 non-null  object 
 2   primaryTitle    86979 non-null  object 
 3   originalTitle   86979 non-null  object 
 4   isAdult         86979 non-null  object 
 5   startYear       86979 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  86979 non-null  object 
 8   genres          86979 non-null  object 
dtypes: float64(1), object(8)
memory usage: 6.6+ MB


In [169]:
# Destination for the filtered movie table inside the Data/ folder
fname_out = "Data/filtered_data.csv"
# Write the table to CSV, leaving out the DataFrame index column
df.to_csv(path_or_buf=fname_out, index=False)

In [161]:
# Read the gzipped, tab-separated title ratings file from the Data/ folder
ratings = pd.read_csv(
    'Data/title.ratings.tsv.gz',
    sep='\t',
    low_memory=False,
)

In [163]:
# Inspect column dtypes and non-null counts of the ratings table
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1331492 entries, 0 to 1331491
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1331492 non-null  object 
 1   averageRating  1331492 non-null  float64
 2   numVotes       1331492 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.5+ MB


In [164]:
# Keep only the ratings whose tconst is present in the filtered movie table df
filter_basics = ratings['tconst'].isin(df['tconst'])
ratings = ratings.loc[filter_basics]
ratings

Unnamed: 0,tconst,averageRating,numVotes
17961,tt0035423,6.4,87153
40764,tt0062336,6.4,175
46645,tt0069049,6.7,7754
63640,tt0088751,5.2,336
69953,tt0096056,5.6,846
...,...,...,...
1331411,tt9914942,6.6,178
1331437,tt9915872,6.4,9
1331450,tt9916170,7.0,7
1331451,tt9916190,3.7,243


In [166]:
# Convert placeholder "\N" values back to true null values.
# NOTE(review): ratings.info() earlier reported every column as fully non-null,
# with float64/int64 dtypes for averageRating/numVotes, so this is presumably a
# no-op unless tconst contains "\N" — confirm before relying on it.
ratings = ratings.replace({'\\N':np.nan})

In [170]:
# Destination for the filtered ratings table inside the Data/ folder
fname_out2 = "Data/filtered_ratings.csv"
# Write the table to CSV, leaving out the DataFrame index column
ratings.to_csv(path_or_buf=fname_out2, index=False)