# Movies Database
Brittany Lassiter

![png](Data/movies-erd.png)

## Imports/Data

In [1]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import pymysql
pymysql.install_as_MySQLdb()
from urllib.parse import quote_plus as urlquote

In [2]:
import json
with open ("C:/Users/blass/.secret/mysql.json") as f:
    login = json.load(f)
login.keys()

dict_keys(['username', 'password'])

In [3]:
connection = f"mysql+pymysql://{login['username']}:{urlquote(login['password'])}@localhost/Movies"
engine = create_engine(connection)

In [4]:
## Check if database exists, if not, create it
if database_exists(connection):
    print('It exists!')
else:
    create_database(connection)
    print('Database created!')

It exists!


## Show Table

In [5]:
# Preview the names of all tables 
q = '''SHOW TABLES;'''
pd.read_sql(q, engine)

Unnamed: 0,Tables_in_movies
0,genres
1,ratings
2,title_basics
3,title_genres


## Empty Tables

In [6]:
q= '''SELECT * FROM genres'''
pd.read_sql(q, engine)

Unnamed: 0,genres_id,genre_name


In [7]:
q= '''SELECT * FROM ratings'''
pd.read_sql(q, engine)

Unnamed: 0,tconst,average_rating,number_of_votes,genres_genres_id


In [8]:
q= '''SELECT * FROM title_basics'''
pd.read_sql(q, engine)

Unnamed: 0,tconst,primary_title,start_year,runtime


In [9]:
q= '''SELECT * FROM title_genres'''
pd.read_sql(q, engine)

Unnamed: 0,genres_id,title_basics_tconst


## Describe each Table

In [10]:
q = '''DESCRIBE genres;'''
describe = pd.read_sql(q, engine)
describe

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,genres_id,int,NO,PRI,,
1,genre_name,varchar(45),YES,,,


In [11]:
q = '''DESCRIBE ratings;'''
describe = pd.read_sql(q, engine)
describe

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,int,NO,PRI,,
1,average_rating,varchar(45),YES,,,
2,number_of_votes,varchar(45),YES,,,
3,genres_genres_id,int,NO,,,


In [12]:
q = '''DESCRIBE title_basics;'''
describe = pd.read_sql(q, engine)
describe

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,int,NO,PRI,,
1,primary_title,varchar(45),YES,,,
2,start_year,varchar(45),YES,,,
3,runtime,varchar(45),YES,,,


In [13]:
q = '''DESCRIBE title_genres;'''
describe = pd.read_sql(q, engine)
describe

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,genres_id,int,NO,PRI,,
1,title_basics_tconst,int,NO,PRI,,


## Foreign Keys

In [14]:
# Checking the setting for FOREIGN_KEY_CHECKS
q = """SELECT @@FOREIGN_KEY_CHECKS"""
pd.read_sql(q, engine)

Unnamed: 0,@@FOREIGN_KEY_CHECKS
0,1


In [15]:
# Changing the setting for FOREIGN_KEY_CHECKS with the connection
q = """SET @@FOREIGN_KEY_CHECKS=0"""
engine.execute(q)

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x2886e741960>

In [16]:
# Confirm the checks are deactiavated
q = """SELECT @@FOREIGN_KEY_CHECKS"""
pd.read_sql(q,engine)

Unnamed: 0,@@FOREIGN_KEY_CHECKS
0,0


## Preparing Data

In [17]:
from sqlalchemy.types import *

### Title basics

In [18]:
title_basics = pd.read_csv('Data/title_basics_cleaned.csv.gz')
title_basics.info()
title_basics.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86979 entries, 0 to 86978
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86979 non-null  object 
 1   titleType       86979 non-null  object 
 2   primaryTitle    86979 non-null  object 
 3   originalTitle   86979 non-null  object 
 4   isAdult         86979 non-null  int64  
 5   startYear       86979 non-null  float64
 6   endYear         0 non-null      float64
 7   runtimeMinutes  86979 non-null  int64  
 8   genres          86979 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 6.0+ MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [19]:
# Calculate max string lenghts for object columns
size_length = title_basics['tconst'].fillna('').map(len).max()
size_length

10

In [20]:
# Calculate max string lenghts for object columns
size_length = title_basics['titleType'].fillna('').map(len).max()
size_length

5

In [21]:
# Calculate max string lenghts for object columns
size_length = title_basics['primaryTitle'].fillna('').map(len).max()
size_length

242

In [22]:
# Calculate max string lenghts for object columns
size_length = title_basics['originalTitle'].fillna('').map(len).max()
size_length

242

In [23]:
# Create a schema dictionary using SQLAlchemy datatype objects
dtypes_dict = {'tconst': VARCHAR(15),
               'titletype': VARCHAR(8), 
              'primaryTitle': VARCHAR(246),
               'orginalTitle': VARCHAR(246)}

In [24]:
# save to sql with dtypes and index = False
title_basics.to_sql('title_basics', engine, dtype = dtypes_dict, if_exists = 'replace', index = False)

86979

In [25]:
title_basics.to_sql("title_basics",engine,index=False, if_exists='append')

86979

### Ratings

In [26]:
ratings = pd.read_csv('Data/title_ratings_cleaned.csv.gz')
ratings.info()
ratings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71900 entries, 0 to 71899
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         71900 non-null  object 
 1   averageRating  71900 non-null  float64
 2   numVotes       71900 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 1.6+ MB


Unnamed: 0,tconst,averageRating,numVotes
0,tt0035423,6.4,87153
1,tt0062336,6.4,175
2,tt0069049,6.7,7754
3,tt0088751,5.2,336
4,tt0096056,5.6,846


In [27]:
# Calculate max string lenghts for object columns
size_length = ratings['tconst'].fillna('').map(len).max()
size_length

10

In [28]:
# Create a schema dictionary using SQLAlchemy datatype objects
dtypes_dict = {'tconst': VARCHAR(15)}

In [29]:
# save to sql with dtypes and index = False
ratings.to_sql('ratings', engine, dtype = dtypes_dict, if_exists = 'replace', index = False)

71900

In [30]:
ratings.to_sql("ratings",engine,index=False, if_exists='append')

71900

## Final Database

In [31]:
# Final preview the names of all tables 
q = """SHOW TABLES;"""

pd.read_sql(q,engine)

Unnamed: 0,Tables_in_movies
0,genres
1,ratings
2,title_basics
3,title_genres


In [32]:
q = '''DESCRIBE title_basics;'''
pd.read_sql(q, engine)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,varchar(15),YES,,,
1,titleType,text,YES,,,
2,primaryTitle,varchar(246),YES,,,
3,originalTitle,text,YES,,,
4,isAdult,bigint,YES,,,
5,startYear,double,YES,,,
6,endYear,double,YES,,,
7,runtimeMinutes,bigint,YES,,,
8,genres,text,YES,,,


In [33]:
q = '''DESCRIBE ratings;'''
pd.read_sql(q, engine)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,varchar(15),YES,,,
1,averageRating,double,YES,,,
2,numVotes,bigint,YES,,,


## Final View

In [35]:
q = """SELECT * FROM title_basics LIMIT 5;"""
pd.read_sql(q, engine)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [36]:
q = """SELECT * FROM ratings LIMIT 5;"""
pd.read_sql(q, engine)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0035423,6.4,87153
1,tt0062336,6.4,175
2,tt0069049,6.7,7754
3,tt0088751,5.2,336
4,tt0096056,5.6,846
