## movieStats

### Author:  Sheneka Allen

In [5]:
# import key libraries, tools
import pandas as pd
import pymysql
pymysql.install_as_MySQLdb()
from urllib.parse import quote_plus as urlquote # must have for special char pwd
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists

In [9]:
# connect to PyMysql
username = 'root'
password = 'pwd'

# Create connection string using credentials for special character mysql pwd

connection = f"mysql+pymysql://{username}:{urlquote(password)}@localhost/movieStats"
engine = create_engine(connection)

In [10]:
# Check if database exists, if not, create it
if database_exists(connection) == False: create_database(connection)
else: print('The database already exists.')

The database already exists.


In [11]:
# verify movieStats db was created
database_exists(connection)

True

In [12]:
# load the three (3) official IMDB data for the requested tables
# title.basics.tsv.gz, title.ratings.tsv.gz, title.akas.tsv.gz

df = pd.read_csv('Data/title.basics.tsv.gz', compression='gzip', sep='\t', low_memory=False)

# show basics dataframe
df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [17]:
# clean df, df1, and df2 dataframes
df.isnull().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            10
dtype: int64

In [22]:
df.duplicated().sum()

0

In [28]:
# explore unique values in columns
df.groupby(['primaryTitle']).size()

primaryTitle
!Next?                                               1
!Que ve el Bisbe!                                    1
!Women Art Revolution                                1
#                                                    5
# (Hashtag)                                          1
                                                    ..
Šentilj-Spielfeld - Border Crossing That Once Was    1
Špansko the Continent                                1
Η Ιστορία των Ελληνικών Σιδηροδρόμων                 1
Τhe Improvisation of Petros Mokas                    1
Самая любимая                                        1
Length: 4152346, dtype: int64

In [30]:
df.groupby(['originalTitle']).size()

originalTitle
!Next?                  1
!Que ve el Bisbe!       1
#                       5
# (Hashtag)             1
# 110 Lucha Rules       1
                       ..
öregHarcos              1
öregHarcos II           1
über den Wolken         1
überRICH                1
ülker Dankek Reklami    1
Length: 4172447, dtype: int64

In [31]:
df.groupby(['genres']).size()

genres
Action                     34333
Action,Adult                 164
Action,Adult,Adventure        66
Action,Adult,Animation        68
Action,Adult,Comedy           43
                           ...  
Thriller,Western              46
War                         2729
War,Western                   14
Western                    13795
\N                        411119
Length: 2317, dtype: int64

In [29]:
# how many unique values in columns
df.nunique()

tconst            9023456
titleType              11
primaryTitle      4152346
originalTitle     4172447
isAdult                 9
startYear             151
endYear                98
runtimeMinutes        870
genres               2317
dtype: int64

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9023456 entries, 0 to 9023455
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 619.6+ MB


In [15]:
# show ratings dataframe
df1 = pd.read_csv('Data/title.ratings.tsv.gz', compression='gzip', sep='\t', low_memory=False)
df1.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1892
1,tt0000002,5.9,252
2,tt0000003,6.5,1685
3,tt0000004,5.7,165
4,tt0000005,6.2,2499


In [18]:
df1.isnull().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [23]:
df1.duplicated().sum()

0

In [26]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250153 entries, 0 to 1250152
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1250153 non-null  object 
 1   averageRating  1250153 non-null  float64
 2   numVotes       1250153 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.6+ MB


In [16]:
# show akas dataframe
df2 = pd.read_csv('Data/title.akas.tsv.gz', compression='gzip', sep='\t', low_memory=False)
df2.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


Good no nulls in ratings table!!

In [19]:
df2.isnull().sum()

titleId              0
ordering             0
title                4
region             103
language             0
types                0
attributes           0
isOriginalTitle      0
dtype: int64

In [24]:
df2.duplicated().sum()

0

In [32]:
df2.groupby(['title']).size()

title
!Huff                    1
!Next?                   1
!Women Art Revolution    3
!mpossible               1
#                        8
                        ..
ｗｈｉｔｅ ｒｏｏｍ ～止まれない愛～      1
～極乳犯～ 葉山雅                1
～真昼の夜とふしぎの門～             1
ｴﾛ義母と発情息子 淫らな家族          1
🧠+🧘=❤️                   1
Length: 3986444, dtype: int64

In [33]:
df2.groupby(['region']).size()

region
AD           48
AE        13288
AF          251
AG           20
AI            5
         ...   
ZA         8870
ZM           55
ZRCD          6
ZW          134
\N      1929670
Length: 247, dtype: int64

In [27]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32428306 entries, 0 to 32428305
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 1.9+ GB


In [None]:
# replace all table nulls with 'Missing' string value since all object dtype

# Modified code: https://www.statology.org/fillna-multiple-columns-pandas/ 
df[['primaryTitle', 'originalTitle', 'genres']] = df[['primaryTitle', 'originalTitle', 'genres']].fillna(value='Missing')
df2[['title', 'region']] = df2[['title', 'region']].fillna(value='Missing')

In [None]:
# add data to db

#df.to_sql('basics', engine, if_exists = 'replace')
#df1.to_sql('ratings', engine, if_exists = 'replace')
#df2.to_sql('akas', engine, if_exists = 'replace')

In [None]:
# filter data using customer specifications or requirements
#q = """SELECT * FROM tshirts LIMIT 5;"""
#pd.read_sql(q, engine)

In [None]:
# filter out unnecessary data
#
# exclude any movie with missing values for genre or runtime
# include only full-length movies (titleType = "movie").
# include only fictional movies (not from documentary genre)
# include only movies that were released 2000 - 2021 (include 2000 and 2021)
# include only movies that were released in the United States

# save the filtered tables as gzip-compressed csv files (".csv.gz") 
# in the movieStats repository.

## Saving and immediately loading (to verify)
basics.to_csv('Data/title_basics_cleaned.csv.gz',compression='gzip',index=False)
basics = pd.read_csv('Data/title_basics_cleaned.csv.gz')
basics.info()
basics.head()