# Project 3 Part 1: Loading and Preprocessing

In [8]:
import pandas as pd
import numpy as np

# Load Data

In [59]:
basics_df = pd.read_csv('https://datasets.imdbws.com/title.basics.tsv.gz', sep='\t', low_memory=False)

In [4]:
akas_df = pd.read_csv('https://datasets.imdbws.com/title.akas.tsv.gz', sep='\t', low_memory=False)

In [6]:
ratings_df = pd.read_csv('https://datasets.imdbws.com/title.ratings.tsv.gz', sep='\t', low_memory=False)

# Preprocessing basics_df

In [36]:
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


## Replace '/N'

In [60]:
basics_df.replace({'\\N':np.nan}, inplace=True)

## Eliminate entries null for runtimeMinutes or genre

In [61]:
basics_df.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1229118
endYear           9130315
runtimeMinutes    6753522
genres             427252
dtype: int64

In [62]:
basics_df.dropna(subset=['runtimeMinutes', 'genres'], inplace=True)

## Only keep if titleType==Movie

In [63]:
basics_df['titleType'].unique()

array(['short', 'movie', 'tvEpisode', 'tvSeries', 'tvShort', 'tvMovie',
       'tvMiniSeries', 'video', 'tvSpecial', 'videoGame'], dtype=object)

In [64]:
basics_df = basics_df[basics_df.titleType == 'movie']

## keep startYear 2000-2022

In [65]:
basics_df['startYear'] = pd.to_numeric(basics_df['startYear'])

In [66]:
basics_df['startYear'].dtype

dtype('float64')

In [67]:
year_filter = (basics_df['startYear'] > 1999) & (basics_df['startYear'] < 2023)

In [68]:
basics_df = basics_df[year_filter]

In [69]:
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13079,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021.0,,133,Documentary
34792,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61093,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
66308,tt0067683,movie,Workers '71: Nothing About Us Without Us,Robotnicy 1971 - Nic o nas bez nas,0,2006.0,,47,Documentary
67639,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama


## Eliminate movies that include "Documentary" in genre

In [70]:
is_documentary = basics_df['genres'].str.contains('documentary',case=False)
basics_df = basics_df[~is_documentary]

## Keep only US movies

In [81]:
akas_df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
14,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
36,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
41,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


In [78]:
US_filter = akas_df['region'] == 'US'
akas_df = akas_df[US_filter]

In [79]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keep =basics_df['tconst'].isin(akas_df['titleId'])
basics_df = basics_df[keep]

In [80]:
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34792,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61093,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
67639,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86770,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93906,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [77]:
akas_df.loc[akas_df['region']=='US']

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
14,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
36,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
41,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0
...,...,...,...,...,...,...,...,...
33183184,tt9916702,1,Loving London: The Playground,US,\N,\N,\N,0
33183221,tt9916720,10,The Demonic Nun,US,\N,tv,\N,0
33183223,tt9916720,12,The Nun 2,US,\N,imdbDisplay,\N,0
33183240,tt9916756,1,Pretty Pretty Black Girl,US,\N,imdbDisplay,\N,0


# Preprocesing akas_df

## keep only US movies
- see preprocessing basics_df

## Replace '/N'

In [82]:
akas_df.replace({'\\N':np.nan}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  akas_df.replace({'\\N':np.nan}, inplace=True)


# Preprocessing ratings_df

In [83]:
ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1910
1,tt0000002,5.8,256
2,tt0000003,6.5,1714
3,tt0000004,5.6,169
4,tt0000005,6.2,2527


## Replace '/N'

In [84]:
ratings_df.replace({'\\N':np.nan}, inplace=True)

## Keep only US movies

In [85]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keep =ratings_df['tconst'].isin(akas_df['titleId'])
ratings_df = ratings_df[keep]

# Final info and saving

In [86]:
basics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82481 entries, 34792 to 9226165
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          82481 non-null  object 
 1   titleType       82481 non-null  object 
 2   primaryTitle    82481 non-null  object 
 3   originalTitle   82481 non-null  object 
 4   isAdult         82481 non-null  object 
 5   startYear       82481 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  82481 non-null  object 
 8   genres          82481 non-null  object 
dtypes: float64(1), object(8)
memory usage: 6.3+ MB


In [87]:
akas_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1348981 entries, 5 to 33183256
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1348981 non-null  object
 1   ordering         1348981 non-null  int64 
 2   title            1348981 non-null  object
 3   region           1348981 non-null  object
 4   language         3700 non-null     object
 5   types            963986 non-null   object
 6   attributes       44956 non-null    object
 7   isOriginalTitle  1347606 non-null  object
dtypes: int64(1), object(7)
memory usage: 92.6+ MB


In [88]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 476304 entries, 0 to 1260758
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         476304 non-null  object 
 1   averageRating  476304 non-null  float64
 2   numVotes       476304 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 14.5+ MB


In [92]:
basics_df.to_csv('basics.csv.gz')

In [93]:
akas_df.to_csv('akas.csv.gz')

In [94]:
ratings_df.to_csv('ratings.csv.gz')