In [1]:
import os
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
os.makedirs('Data/', exist_ok=True)

In [2]:
os.listdir("Data/")

['IMDB Movie Dataset Info.docx',
 'title-akas-us-only.csv',
 'title.basics.tsv.gz',
 'title.ratings.tsv.gz']

In [3]:
akas_url = "./Data/title-akas-us-only.csv"
basics_url = "C:/Users/dsdye/Documents/GitHub/prediction-of-movie-success/Data/title.basics.tsv.gz"

akas = pd.read_csv(akas_url, low_memory=False)
basics = pd.read_csv(basics_url, sep="\t", low_memory=False)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


### Preprocessing Title Basics

In [4]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


In [5]:
basics.head() 

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [6]:
akas = akas.replace({'\\N':np.nan})
basics = basics.replace({'\\N':np.nan})

# akas.head()
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


### Keep Only US Movies

In [7]:
filter_us_titles = basics['tconst'].isin(akas['titleId'])
basics = basics[filter_us_titles]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
10016872,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963,,58,Family
10016901,tt9916620,movie,The Copeland Case,The Copeland Case,0,,,,Drama
10016939,tt9916702,short,Loving London: The Playground,Loving London: The Playground,0,,,,"Drama,Short"
10016962,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0,2019,,,Short


### Keep Only Full-Length Movies

In [8]:
basics = basics[basics['titleType'] == 'movie']
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 299550 entries, 8 to 10016901
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          299550 non-null  object
 1   titleType       299550 non-null  object
 2   primaryTitle    299550 non-null  object
 3   originalTitle   299550 non-null  object
 4   isAdult         299550 non-null  object
 5   startYear       240304 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  207381 non-null  object
 8   genres          288855 non-null  object
dtypes: object(9)
memory usage: 22.9+ MB


### Start Year

#### Convert startYear to a float dtype

In [9]:
basics.startYear.describe()

count     240304
unique       135
top         2019
freq        9620
Name: startYear, dtype: object

In [10]:
basics.startYear.describe()

count     240304
unique       135
top         2019
freq        9620
Name: startYear, dtype: object

In [11]:
basics['startYear'] = basics['startYear'].astype('float')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  basics['startYear'] = basics['startYear'].astype('float')


#### Filter to keep movies with startYears that are >=2000 and <=2022

In [12]:
basics.startYear.describe()

count    240304.000000
mean       1995.278064
std          29.479411
min        1894.000000
25%        1981.000000
50%        2009.000000
75%        2017.000000
max        2031.000000
Name: startYear, dtype: float64

In [13]:
# [basics['startYear'] >= 2000]
basics = basics[basics['startYear'] >= 2000]

In [14]:
basics.startYear.describe()

count    150280.000000
mean       2014.121693
std           6.008480
min        2000.000000
25%        2010.000000
50%        2015.000000
75%        2019.000000
max        2031.000000
Name: startYear, dtype: float64

### Eliminate movies that include "Documentary" in genre

In [15]:
basics = basics[~basics['genres'].str.contains('Documentary', na=False)]

### Summary

In [16]:
[print(col, basics[col].info(), '\n\n') for col in basics.columns]

<class 'pandas.core.series.Series'>
Int64Index: 109425 entries, 34802 to 10016809
Series name: tconst
Non-Null Count   Dtype 
--------------   ----- 
109425 non-null  object
dtypes: object(1)
memory usage: 1.7+ MB
tconst None 


<class 'pandas.core.series.Series'>
Int64Index: 109425 entries, 34802 to 10016809
Series name: titleType
Non-Null Count   Dtype 
--------------   ----- 
109425 non-null  object
dtypes: object(1)
memory usage: 1.7+ MB
titleType None 


<class 'pandas.core.series.Series'>
Int64Index: 109425 entries, 34802 to 10016809
Series name: primaryTitle
Non-Null Count   Dtype 
--------------   ----- 
109425 non-null  object
dtypes: object(1)
memory usage: 1.7+ MB
primaryTitle None 


<class 'pandas.core.series.Series'>
Int64Index: 109425 entries, 34802 to 10016809
Series name: originalTitle
Non-Null Count   Dtype 
--------------   ----- 
109425 non-null  object
dtypes: object(1)
memory usage: 1.7+ MB
originalTitle None 


<class 'pandas.core.series.Series'>
Int64Index: 1094

[None, None, None, None, None, None, None, None, None]

In [17]:
basics.head() 

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61114,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67666,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86793,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93930,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama
