In [3]:
#Python File for ETL Project

In [4]:
import os

import pandas as pd
from sqlalchemy import create_engine, inspect

In [5]:
# Extract CSVs into DataFrames

In [6]:
# File Legend:

# CSV_1: Netflix Titles
# CSV_2: Metascore Data 


In [7]:
# Extract Step: CSV 1 (Netflix Titles)
# Read the raw Netflix titles file into a DataFrame and preview the first rows.

CSV_1 = "Resources/netflix_titles.csv"
netflix_df = pd.read_csv(filepath_or_buffer=CSV_1)
netflix_df.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [8]:
# Print every column name of the raw Netflix frame, one per line, so the
# columns to keep in the revised DataFrame can be picked out.

for column_name in netflix_df.columns.tolist():
    print(column_name)
    

show_id
type
title
director
cast
country
date_added
release_year
rating
duration
listed_in
description


In [9]:
# Transform Step
# Keep only the columns needed downstream; everything else is left behind
# in netflix_df.

netflix_keep_columns = [
    "type",
    "title",
    "director",
    "country",
    "release_year",
    "listed_in",
    "description",
]
revised_netflix_df = netflix_df[netflix_keep_columns]
revised_netflix_df.head()


Unnamed: 0,type,title,director,country,release_year,listed_in,description
0,TV Show,3%,,Brazil,2020,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,Movie,7:19,Jorge Michel Grau,Mexico,2016,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,Movie,23:59,Gilbert Chan,Singapore,2011,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,Movie,9,Shane Acker,United States,2009,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,Movie,21,Robert Luketic,United States,2008,Dramas,A brilliant group of students become card-coun...


In [10]:
# Transform Step
# Filter out rows whose type is 'TV Show'; every other row is kept.
is_tv_show = revised_netflix_df["type"] == 'TV Show'
drop_netflix_df = revised_netflix_df[~is_tv_show]
drop_netflix_df.head()

Unnamed: 0,type,title,director,country,release_year,listed_in,description
1,Movie,7:19,Jorge Michel Grau,Mexico,2016,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,Movie,23:59,Gilbert Chan,Singapore,2011,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,Movie,9,Shane Acker,United States,2009,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,Movie,21,Robert Luketic,United States,2008,Dramas,A brilliant group of students become card-coun...
6,Movie,122,Yasir Al Yasiri,Egypt,2019,"Horror Movies, International Movies","After an awful accident, a couple admitted to ..."


In [11]:
# Transform Step
# Give the Netflix columns their final names (old name -> new name).
netflix_column_names = {
    "title": "movie_title",
    "country": "produced_in",
    "listed_in": "netflix_genre",
}
renamed_netflix_df = drop_netflix_df.rename(columns=netflix_column_names)
renamed_netflix_df.head()

Unnamed: 0,type,movie_title,director,produced_in,release_year,netflix_genre,description
1,Movie,7:19,Jorge Michel Grau,Mexico,2016,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,Movie,23:59,Gilbert Chan,Singapore,2011,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,Movie,9,Shane Acker,United States,2009,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,Movie,21,Robert Luketic,United States,2008,Dramas,A brilliant group of students become card-coun...
6,Movie,122,Yasir Al Yasiri,Egypt,2019,"Horror Movies, International Movies","After an awful accident, a couple admitted to ..."


In [12]:
# Transform Step
# The 'type' column is no longer needed once TV shows are filtered out,
# so drop it from the frame.
movie_netflix_df = renamed_netflix_df.drop(columns=['type'])
movie_netflix_df.head()

Unnamed: 0,movie_title,director,produced_in,release_year,netflix_genre,description
1,7:19,Jorge Michel Grau,Mexico,2016,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,23:59,Gilbert Chan,Singapore,2011,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,9,Shane Acker,United States,2009,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,21,Robert Luketic,United States,2008,Dramas,A brilliant group of students become card-coun...
6,122,Yasir Al Yasiri,Egypt,2019,"Horror Movies, International Movies","After an awful accident, a couple admitted to ..."


In [13]:
# Extract Step: CSV 2 (Metascore Data)
# Read the raw Metacritic movies file into a DataFrame and preview the first rows.

CSV_2 = "Resources/metacritic_movies.csv"
metascore_df = pd.read_csv(filepath_or_buffer=CSV_2)
metascore_df.head(n=5)

Unnamed: 0,movie_title,release_date,genre,meta_mixed,meta_negative,meta_positive,metascore,user_mixed,user_negative,user_positive,userscore
0,Anatomy of a Murder,1-Jul-59,"Drama,Mystery,Thriller,Crime",0,0,15,95,0,0,3,tbd
1,Bringing Up Baby,18-Feb-38,"Comedy,Romance,Family",0,1,16,91,1,0,2,tbd
2,After Life,12-May-99,"Drama,Fantasy",0,0,19,91,0,2,1,tbd
3,Gavagai,3-Aug-18,Drama,1,0,6,91,0,1,2,tbd
4,The Hustler,25-Sep-61,"Drama,Sport",1,0,17,90,0,0,3,tbd


In [14]:
# Print every column name of the raw Metacritic frame, one per line, so the
# columns to keep in the revised DataFrame can be picked out.
for column_name in metascore_df.columns.tolist():
    print(column_name)

movie_title
release_date
genre
meta_mixed
meta_negative
meta_positive
metascore
user_mixed
user_negative
user_positive
userscore


In [15]:
# Transform Step
# Keep only the title, genre and metascore columns of the Metacritic data.

metascore_keep_columns = ["movie_title", "genre", "metascore"]
revised_metascore_df = metascore_df[metascore_keep_columns]
revised_metascore_df.head()


Unnamed: 0,movie_title,genre,metascore
0,Anatomy of a Murder,"Drama,Mystery,Thriller,Crime",95
1,Bringing Up Baby,"Comedy,Romance,Family",91
2,After Life,"Drama,Fantasy",91
3,Gavagai,Drama,91
4,The Hustler,"Drama,Sport",90


In [16]:
# Transform Step
# Rename 'genre' so it is distinguishable from the Netflix genre column.
metascore_column_names = {"genre": "metacritic_genre"}
renamed_metascore_df = revised_metascore_df.rename(columns=metascore_column_names)
renamed_metascore_df.head()


Unnamed: 0,movie_title,metacritic_genre,metascore
0,Anatomy of a Murder,"Drama,Mystery,Thriller,Crime",95
1,Bringing Up Baby,"Comedy,Romance,Family",91
2,After Life,"Drama,Fantasy",91
3,Gavagai,Drama,91
4,The Hustler,"Drama,Sport",90


In [17]:
# Transform Step
# Bin each metascore into a star rating on a 4-star scale
#
# Score range legend (upper edge of each star band):
# 4 stars - 100
# 3 stars - 75
# 2 stars - 50
# 1 star  - 25
# lowest edge - 0

# Bin edges for the metascore values; each bin is closed on the right,
# e.g. a score of 25 is "1 star" and a score of 26 is "2 stars".
bins = [0, 25, 50, 75, 100]

# One label per bin, lowest band first
group_labels = ["1 star", "2 stars", "3 stars", "4 stars"]

# include_lowest=True also closes the first bin on the left, so a metascore
# of exactly 0 is rated "1 star" instead of falling outside every bin (NaN).
star_rating = pd.cut(
    renamed_metascore_df["metascore"],
    bins,
    labels=group_labels,
    include_lowest=True,
)

# assign() returns a new frame, leaving renamed_metascore_df untouched so this
# cell can be re-run safely and earlier displayed output stays accurate.
bin_df = renamed_metascore_df.assign(star_rating=star_rating)
bin_df.head()

Unnamed: 0,movie_title,metacritic_genre,metascore,star_rating
0,Anatomy of a Murder,"Drama,Mystery,Thriller,Crime",95,4 stars
1,Bringing Up Baby,"Comedy,Romance,Family",91,4 stars
2,After Life,"Drama,Fantasy",91,4 stars
3,Gavagai,Drama,91,4 stars
4,The Hustler,"Drama,Sport",90,4 stars


In [18]:
# Transform Step
# Sort by metascore, highest first.
# sort_values() returns a new, sorted frame rather than sorting in place, so
# its result must be captured; bin_df itself is left in its original order.
movie_metacritic_df = bin_df.sort_values(by='metascore', ascending=False)
movie_metacritic_df.head()

Unnamed: 0,movie_title,metacritic_genre,metascore,star_rating
0,Anatomy of a Murder,"Drama,Mystery,Thriller,Crime",95,4 stars
1,Bringing Up Baby,"Comedy,Romance,Family",91,4 stars
2,After Life,"Drama,Fantasy",91,4 stars
3,Gavagai,Drama,91,4 stars
4,The Hustler,"Drama,Sport",90,4 stars


In [19]:
# Load Step
# Create the connection to the local PostgreSQL database (Group9_ETL).
#
# Credentials are read from the PGUSER / PGPASSWORD environment variables so
# that real values never have to be typed into (and saved with) the notebook.
# When a variable is not set, the placeholder text is used and the connection
# will not work until it is replaced.
# NOTE: creating a 'postgresql://' engine needs the psycopg2 driver; without
# it this cell fails with "ModuleNotFoundError: No module named 'psycopg2'".
# NOTE(review): a password containing characters such as '@', ':' or '/'
# likely needs URL-escaping before being placed in the URL - confirm.
db_user = os.environ.get("PGUSER", "<insert user name>")
db_password = os.environ.get("PGPASSWORD", "<insert password>")
rds_connection_string = f"{db_user}:{db_password}@localhost:5432/Group9_ETL"
engine = create_engine(f'postgresql://{rds_connection_string}')

ModuleNotFoundError: No module named 'psycopg2'

In [None]:
# Load Step
# Check which tables exist in the connected database.
# Uses the inspector API rather than engine.table_names(), which is
# deprecated in newer SQLAlchemy releases.
inspect(engine).get_table_names()

In [None]:
# Load Step
# Use pandas to load the CSV-derived Netflix DataFrame into the database.
# The table name must match the one queried when confirming the load
# ('netflix_table'). if_exists='append' adds rows to an existing table, so
# re-running this cell inserts the same rows again.
movie_netflix_df.to_sql(name='netflix_table', con=engine, if_exists='append', index=False)


In [None]:
# Load Step
# Use pandas to load the CSV-derived Metacritic DataFrame into the database.
# Rows are appended to 'metacritic_table'; the DataFrame index is not written.
movie_metacritic_df.to_sql(
    name='metacritic_table',
    con=engine,
    if_exists='append',
    index=False,
)

In [None]:
# Load Step
# Confirm data is present by querying netflix_table and previewing
# the first rows of the result.
netflix_query = 'select * from netflix_table'
pd.read_sql_query(netflix_query, con=engine).head()

In [None]:
# Load Step
# Confirm data is present by querying metacritic_table and previewing the
# first rows of the result. The table name must match the one the Metacritic
# DataFrame is written to with to_sql ('metacritic_table').
pd.read_sql_query('select * from metacritic_table', con=engine).head()