In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings

# ignore warnings
warnings.filterwarnings('ignore')

In [3]:
akas = pd.read_csv('Data/title-akas-us-only.csv', low_memory=False)

In [4]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


In [5]:
basics = pd.read_csv('Data/title.basics.tsv.gz', sep='\t', low_memory=False)

In [6]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [7]:
# Filter the basics table down to only include the US by using the filter akas dataframe
filter_us_titles = basics['tconst'].isin(akas['titleId'])
basics = basics[filter_us_titles]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
10016872,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963,\N,58,Family
10016901,tt9916620,movie,The Copeland Case,The Copeland Case,0,\N,\N,\N,Drama
10016939,tt9916702,short,Loving London: The Playground,Loving London: The Playground,0,\N,\N,\N,"Drama,Short"
10016962,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0,2019,\N,\N,Short


In [8]:
basics.dropna(subset=['runtimeMinutes', 'genres'], how='any', inplace=True)

In [9]:
filter_documentaries = basics['genres'].str.contains('Documentary')

In [10]:
# Exclude movies in the documentary category.
basics = basics[~filter_documentaries]

In [11]:
# Filter to keep only full-length movies
filtered_movies = basics[basics['titleType'] == 'movie']

In [12]:
# Replace '\\N' values with NaN
basics['startYear'] = basics['startYear'].replace('\\N', np.nan)

# Convert the 'startYear' column to a float data type
basics['startYear'] = basics['startYear'].astype(float)

In [13]:
# Filter to keep movies with start years between 2000 and 2022
filtered = basics[(basics['titleType'] == 'movie') & (basics['startYear'] >= 2000) & (basics['startYear'] <= 2022)]

In [14]:
filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 104636 entries, 34802 to 10016809
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          104636 non-null  object 
 1   titleType       104636 non-null  object 
 2   primaryTitle    104636 non-null  object 
 3   originalTitle   104636 non-null  object 
 4   isAdult         104636 non-null  object 
 5   startYear       104636 non-null  float64
 6   endYear         104636 non-null  object 
 7   runtimeMinutes  104636 non-null  object 
 8   genres          104636 non-null  object 
dtypes: float64(1), object(8)
memory usage: 8.0+ MB


In [15]:
filtered.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,\N,118,"Comedy,Fantasy,Romance"
61114,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,\N,70,Drama
67666,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,\N,122,Drama
86793,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,\N,100,"Comedy,Horror,Sci-Fi"
93930,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,\N,126,Drama


In [16]:
# Save the filtered data to a CSV file in your 'Data' folder
filtered.to_csv('Data/filtered_movies.csv', index=False)

In [17]:
titles = pd.read_csv('Data/title.basics.tsv.gz', sep='\t', low_memory=False)

In [18]:
filter_titles = titles['tconst'].isin(basics['tconst'])

In [19]:
filter_titles = filter_titles.replace({'\\N':np.nan})

In [21]:
filter_titles.info()

<class 'pandas.core.series.Series'>
RangeIndex: 10017011 entries, 0 to 10017010
Series name: tconst
Non-Null Count     Dtype
--------------     -----
10017011 non-null  bool 
dtypes: bool(1)
memory usage: 9.6 MB


In [22]:
# Save 
filter_titles.to_csv('Data/filter_titles.csv', index=False)