## movieStats

### Author:  Sheneka Allen

In [1]:
# import key libraries, tools
import pandas as pd
import pymysql
pymysql.install_as_MySQLdb()
from urllib.parse import quote_plus as urlquote # must have for special char pwd
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists

In [2]:
# connect to PyMysql
username = 'root'
password = 'pwd'

# Create connection string using credentials for special character mysql pwd

connection = f"mysql+pymysql://{username}:{urlquote(password)}@localhost/movieStats"
engine = create_engine(connection)

In [3]:
# Check if database exists, if not, create it
if database_exists(connection) == False: create_database(connection)
else: print('The database already exists.')

The database already exists.


In [4]:
# verify movieStats db was created
database_exists(connection)

True

In [28]:
# load the three (3) official IMDB data for the requested tables
# title.basics.tsv.gz, title.ratings.tsv.gz, title.akas.tsv.gz

df = pd.read_csv('Data/title.basics.tsv.gz', compression='gzip', sep='\t', low_memory=False)

# show basics dataframe
df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9023456 entries, 0 to 9023455
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 619.6+ MB


In [30]:
df.isnull().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            10
dtype: int64

In [31]:
# Exclude any movie with missing values for genres or runtime
# Decided to exclude other 22 rows of nulls, small number 

df.dropna(subset=['genres', 'primaryTitle', 'originalTitle'], inplace=True)
df.isna().sum()

tconst            0
titleType         0
primaryTitle      0
originalTitle     0
isAdult           0
startYear         0
endYear           0
runtimeMinutes    0
genres            0
dtype: int64

In [32]:
# list number of movie categories by titleType
df.groupby(['titleType']).size()

titleType
movie            613641
short            875943
tvEpisode       6784692
tvMiniSeries      43986
tvMovie          136564
tvPilot               2
tvSeries         227083
tvShort           10580
tvSpecial         37428
video            262061
videoGame         31455
dtype: int64

In [34]:
# Include only full-length movies (titleType = "movie").

# Modified code: https://sparkexamples.com/pandas/pandas-delete-rows-based-on-column-value
df.drop(df[df['titleType'] != 'movie'].index, inplace=True)
df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama


In [35]:
# list number of genres categories
df.groupby(['genres']).size()

genres
Action                    14086
Action,Adult                 11
Action,Adult,Adventure        1
Action,Adult,Comedy           5
Action,Adult,Crime           10
                          ...  
Thriller,Western             43
War                        1300
War,Western                  14
Western                    5122
\N                        71578
Length: 1467, dtype: int64

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 613641 entries, 498 to 9023406
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          613641 non-null  object
 1   titleType       613641 non-null  object
 2   primaryTitle    613641 non-null  object
 3   originalTitle   613641 non-null  object
 4   isAdult         613641 non-null  object
 5   startYear       613641 non-null  object
 6   endYear         613641 non-null  object
 7   runtimeMinutes  613641 non-null  object
 8   genres          613641 non-null  object
dtypes: object(9)
memory usage: 62.9+ MB


In [37]:
# Include only fictional movies (not from documentary genre)

df = df[df.genres != 'Documentary']
df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama


In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 526319 entries, 498 to 9023396
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          526319 non-null  object
 1   titleType       526319 non-null  object
 2   primaryTitle    526319 non-null  object
 3   originalTitle   526319 non-null  object
 4   isAdult         526319 non-null  object
 5   startYear       526319 non-null  object
 6   endYear         526319 non-null  object
 7   runtimeMinutes  526319 non-null  object
 8   genres          526319 non-null  object
dtypes: object(9)
memory usage: 40.2+ MB


In [39]:
# list number of endYear values
df.groupby(['endYear']).size()

endYear
\N    526319
dtype: int64

In [48]:
# Include only movies that were released 2000 - 2021 (include 2000 and 2021)

df = df[(df['startYear'] > '1999')]
df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
11636,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0,2019,\N,\N,"Action,Crime"
15179,tt0015414,movie,La tierra de los toros,La tierra de los toros,0,2000,\N,60,\N
34805,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,\N,118,"Comedy,Fantasy,Romance"
61119,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,\N,70,Drama
67672,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,\N,122,Drama


basics dataframe reduced by about 100K rows

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 292582 entries, 11636 to 9023396
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          292582 non-null  object
 1   titleType       292582 non-null  object
 2   primaryTitle    292582 non-null  object
 3   originalTitle   292582 non-null  object
 4   isAdult         292582 non-null  object
 5   startYear       292582 non-null  object
 6   endYear         292582 non-null  object
 7   runtimeMinutes  292582 non-null  object
 8   genres          292582 non-null  object
dtypes: object(9)
memory usage: 22.3+ MB


In [42]:
# show ratings dataframe
df1 = pd.read_csv('Data/title.ratings.tsv.gz', compression='gzip', sep='\t', low_memory=False)
df1.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1892
1,tt0000002,5.9,252
2,tt0000003,6.5,1685
3,tt0000004,5.7,165
4,tt0000005,6.2,2499


In [43]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250153 entries, 0 to 1250152
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1250153 non-null  object 
 1   averageRating  1250153 non-null  float64
 2   numVotes       1250153 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.6+ MB


In [44]:
# show akas dataframe
df2 = pd.read_csv('Data/title.akas.tsv.gz', compression='gzip', sep='\t', low_memory=False)
df2.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [45]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32428306 entries, 0 to 32428305
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 1.9+ GB


In [46]:
# explore syntax that represents United States region
df2.groupby(['region']).size()

region
AD           48
AE        13288
AF          251
AG           20
AI            5
         ...   
ZA         8870
ZM           55
ZRCD          6
ZW          134
\N      1929670
Length: 247, dtype: int64

In [47]:
df2.region.value_counts()

FR    3861757
JP    3860109
DE    3839971
IN    3786969
ES    3784806
       ...   
NR          1
NU          1
PW          1
TV          1
TC          1
Name: region, Length: 247, dtype: int64

In [51]:
## Saving filtered file as csv.gz and immediately loading (to verify)
df.to_csv('Data/title_basics_cleaned.csv.gz',compression='gzip',index=False)
df = pd.read_csv('Data/title_basics_cleaned.csv.gz')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292582 entries, 0 to 292581
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          292582 non-null  object
 1   titleType       292582 non-null  object
 2   primaryTitle    292582 non-null  object
 3   originalTitle   292582 non-null  object
 4   isAdult         292582 non-null  int64 
 5   startYear       292582 non-null  object
 6   endYear         292582 non-null  object
 7   runtimeMinutes  292582 non-null  object
 8   genres          292582 non-null  object
dtypes: int64(1), object(8)
memory usage: 20.1+ MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0,2019,\N,\N,"Action,Crime"
1,tt0015414,movie,La tierra de los toros,La tierra de los toros,0,2000,\N,60,\N
2,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,\N,118,"Comedy,Fantasy,Romance"
3,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,\N,70,Drama
4,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,\N,122,Drama


In [52]:
## Saving filtered file as csv.gz and immediately loading (to verify)
df1.to_csv('Data/title_ratings_cleaned.csv.gz',compression='gzip',index=False)
df1 = pd.read_csv('Data/title_ratings_cleaned.csv.gz')
df1.info()
df1.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250153 entries, 0 to 1250152
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1250153 non-null  object 
 1   averageRating  1250153 non-null  float64
 2   numVotes       1250153 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.6+ MB


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1892
1,tt0000002,5.9,252
2,tt0000003,6.5,1685
3,tt0000004,5.7,165
4,tt0000005,6.2,2499
