In [1]:
#Python File for ETL Project

In [2]:
import pandas as pd
from sqlalchemy import create_engine

In [3]:
# Extract CSVs into DataFrames

In [6]:
# File Legend:

# CSV_1: Netflix Titles
# CSV_2: Metascore Data 


In [7]:
# Extract step: read the raw Netflix titles CSV and preview its first rows.

CSV_1 = "Resources/netflix_titles.csv"
netflix_df = pd.read_csv(filepath_or_buffer=CSV_1)
netflix_df.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [8]:
# Print every column name of the raw Netflix frame, one per line, so the
# columns to keep in the revised DataFrame can be picked out.

for column_name in netflix_df.columns.tolist():
    print(column_name)
    

show_id
type
title
director
cast
country
date_added
release_year
rating
duration
listed_in
description


In [9]:
#Columns in final Netflix Table
# type (want to filter just to movies)
# title
# director
# cast (should separate each actor)
# country
# date added
# release year
# rating 
# listed in (genres)
# description

In [10]:
# Revised Netflix DataFrame: keep only the columns that are carried forward.

revised_netflix_df = netflix_df.loc[
    :,
    [
        "type",
        "title",
        "director",
        "country",
        "release_year",
        "listed_in",
        "description",
    ],
]
revised_netflix_df.head(n=5)

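# Columns planned for the final Netflix table
# type (want to filter just to movies)
# title
# director
# country
# date added
# release year
# rating
# listed in (genres)
# description
# NOTE: "date added" and "rating" are listed here but are not part of the
# column selection made for revised_netflix_df - TODO confirm which is intended.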

Unnamed: 0,type,title,director,country,release_year,listed_in,description
0,TV Show,3%,,Brazil,2020,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,Movie,7:19,Jorge Michel Grau,Mexico,2016,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,Movie,23:59,Gilbert Chan,Singapore,2011,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,Movie,9,Shane Acker,United States,2009,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,Movie,21,Robert Luketic,United States,2008,Dramas,A brilliant group of students become card-coun...


In [11]:
# Discard the rows typed as 'TV Show'; every other row is kept.
drop_netflix_df = revised_netflix_df.loc[~revised_netflix_df["type"].eq('TV Show')]
drop_netflix_df.head(n=5)

Unnamed: 0,type,title,director,country,release_year,listed_in,description
1,Movie,7:19,Jorge Michel Grau,Mexico,2016,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,Movie,23:59,Gilbert Chan,Singapore,2011,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,Movie,9,Shane Acker,United States,2009,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,Movie,21,Robert Luketic,United States,2008,Dramas,A brilliant group of students become card-coun...
6,Movie,122,Yasir Al Yasiri,Egypt,2019,"Horror Movies, International Movies","After an awful accident, a couple admitted to ..."


In [12]:
# Give the title, country and listed_in columns more descriptive names.
renamed_netflix_df = drop_netflix_df.rename(
    {
        "title": "movie_title",
        "country": "produced_in",
        "listed_in": "netflix_genre",
    },
    axis="columns",
)
renamed_netflix_df.head(n=5)

Unnamed: 0,type,movie_title,director,produced_in,release_year,netflix_genre,description
1,Movie,7:19,Jorge Michel Grau,Mexico,2016,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,Movie,23:59,Gilbert Chan,Singapore,2011,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,Movie,9,Shane Acker,United States,2009,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,Movie,21,Robert Luketic,United States,2008,Dramas,A brilliant group of students become card-coun...
6,Movie,122,Yasir Al Yasiri,Egypt,2019,"Horror Movies, International Movies","After an awful accident, a couple admitted to ..."


In [13]:
# The type column is no longer needed once TV shows are filtered out, so drop it.
movie_netflix_df = renamed_netflix_df.drop(columns=['type'])
movie_netflix_df.head(n=50)

Unnamed: 0,movie_title,director,produced_in,release_year,netflix_genre,description
1,7:19,Jorge Michel Grau,Mexico,2016,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,23:59,Gilbert Chan,Singapore,2011,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,9,Shane Acker,United States,2009,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,21,Robert Luketic,United States,2008,Dramas,A brilliant group of students become card-coun...
6,122,Yasir Al Yasiri,Egypt,2019,"Horror Movies, International Movies","After an awful accident, a couple admitted to ..."
7,187,Kevin Reynolds,United States,1997,Dramas,After one of his high school students attacks ...
8,706,Shravan Kumar,India,2019,"Horror Movies, International Movies","When a doctor goes missing, his psychiatrist w..."
9,1920,Vikram Bhatt,India,2008,"Horror Movies, International Movies, Thrillers",An architect and his wife move into a castle t...
10,1922,Zak Hilditch,United States,2017,"Dramas, Thrillers",A farmer pens a confession admitting to his wi...
13,2215,Nottapon Boonprakob,Thailand,2018,"Documentaries, International Movies, Sports Mo...",This intimate documentary follows rock star Ar...


In [14]:
# Split produced_in.
# The first comma-separated value in produced_in is treated as the primary
# country the movie was produced in; anything after the first comma is discarded.
produced_df = movie_netflix_df

# Split on the first comma only. `n` and `expand` are passed by keyword because
# pandas 2.0+ accepts only `pat` positionally in Series.str.split.
split_produced_df = produced_df['produced_in'].str.split(',', n=1, expand=True)

# Rename the positional columns produced by expand=True.
renamed_produced_df = split_produced_df.rename(columns={0: "primary_country", 1: "other_countries"})

# Keep only the primary country. Selecting the column (rather than dropping
# 'other_countries') also works when no row contains a comma, in which case
# the split yields a single column and there is nothing to drop.
country_df = renamed_produced_df[["primary_country"]]
country_df.head()


Unnamed: 0,primary_country
1,Mexico
2,Singapore
3,United States
4,United States
6,Egypt


In [15]:
# Collect the distinct values found in the primary_country column.
country_list = country_df.loc[:, 'primary_country'].unique()
country_list

array(['Mexico', 'Singapore', 'United States', 'Egypt', 'India',
       'Thailand', 'Nigeria', 'Norway', 'United Kingdom', 'South Korea',
       'Italy', 'Canada', 'Indonesia', 'Romania', 'Spain', 'Turkey',
       'Iceland', 'South Africa', 'France', 'Portugal', nan, 'Hong Kong',
       'Argentina', 'Germany', 'Denmark', 'Poland', 'Japan', 'Kenya',
       'New Zealand', 'Pakistan', 'Australia', 'China', 'Taiwan',
       'Netherlands', 'Philippines', 'United Arab Emirates', 'Brazil',
       'Iran', 'Israel', 'Uruguay', 'Bulgaria', 'Colombia',
       'Soviet Union', 'Sweden', 'Malaysia', 'Ireland', 'Serbia', 'Peru',
       'Chile', 'Ghana', 'Saudi Arabia', 'Namibia', 'Lebanon', 'Belgium',
       'Vietnam', 'Russia', 'Kuwait', 'Czech Republic', 'Zimbabwe',
       'Hungary', 'Finland', 'Venezuela', 'Cambodia', 'West Germany',
       'Slovenia', 'Switzerland', 'Austria', 'Bangladesh', 'Georgia',
       'Guatemala', 'Jamaica', 'Greece', 'Paraguay', 'Somalia', 'Croatia'],
      dtype=object)

In [16]:
# Classify International Movies 
# might need to be done by merging country_df with movie_netflix_df and then grouping US films... 


In [17]:
# next steps: - JP Can do 
# filter type to just movies 
# remove description column
# rename listed_in to 'Netflix Genre': extract first listed_in value as Netflix Genre
# could filter out international movies, country to United States 

In [18]:
# Split netflix_genre.
# The first comma-separated value in netflix_genre is treated as the movie's
# primary Netflix genre; anything after the first comma is discarded.
genre_df = movie_netflix_df

# Split on the first comma only. `n` and `expand` are passed by keyword because
# pandas 2.0+ accepts only `pat` positionally in Series.str.split.
split_genre_df = genre_df['netflix_genre'].str.split(',', n=1, expand=True)

# Rename the positional columns produced by expand=True.
renamed_genre_df = split_genre_df.rename(columns={0: "primary_genre", 1: "other_genres"})

# Keep only the primary genre. Selecting the column (rather than dropping
# 'other_genres') also works when no row contains a comma.
drop_netflix_genre_df = renamed_genre_df[["primary_genre"]]

# Final column name for the Netflix genre table.
netflix_genre_df = drop_netflix_genre_df.rename(columns={"primary_genre": "listed_in_netflix"})
netflix_genre_df.head()

Unnamed: 0,listed_in_netflix
1,Dramas
2,Horror Movies
3,Action & Adventure
4,Dramas
6,Horror Movies


In [19]:
# Extract step: read the raw Metacritic movies CSV and preview its first rows.

CSV_2 = "Resources/metacritic_movies.csv"
metascore_df = pd.read_csv(filepath_or_buffer=CSV_2)
metascore_df.head(n=5)

Unnamed: 0,movie_title,release_date,genre,meta_mixed,meta_negative,meta_positive,metascore,user_mixed,user_negative,user_positive,userscore
0,Anatomy of a Murder,1-Jul-59,"Drama,Mystery,Thriller,Crime",0,0,15,95,0,0,3,tbd
1,Bringing Up Baby,18-Feb-38,"Comedy,Romance,Family",0,1,16,91,1,0,2,tbd
2,After Life,12-May-99,"Drama,Fantasy",0,0,19,91,0,2,1,tbd
3,Gavagai,3-Aug-18,Drama,1,0,6,91,0,1,2,tbd
4,The Hustler,25-Sep-61,"Drama,Sport",1,0,17,90,0,0,3,tbd


In [20]:
# Print every column name of the raw Metacritic frame, one per line, so the
# columns to keep in the revised DataFrame can be picked out.
for column_name in metascore_df.columns.tolist():
    print(column_name)

movie_title
release_date
genre
meta_mixed
meta_negative
meta_positive
metascore
user_mixed
user_negative
user_positive
userscore


In [21]:
# Columns kept from the Metacritic data:
#   movie_title
#   genre
#   metascore

revised_metascore_df = metascore_df.loc[
    :,
    [
        "movie_title",
        "genre",
        "metascore",
    ],
]
revised_metascore_df.head(n=5)


Unnamed: 0,movie_title,genre,metascore
0,Anatomy of a Murder,"Drama,Mystery,Thriller,Crime",95
1,Bringing Up Baby,"Comedy,Romance,Family",91
2,After Life,"Drama,Fantasy",91
3,Gavagai,Drama,91
4,The Hustler,"Drama,Sport",90


In [22]:
# Rename genre so the column name says which source it comes from.
renamed_metascore_df = revised_metascore_df.rename(
    {"genre": "metacritic_genre"},
    axis="columns",
)
renamed_metascore_df.head(n=5)

Unnamed: 0,movie_title,metacritic_genre,metascore
0,Anatomy of a Murder,"Drama,Mystery,Thriller,Crime",95
1,Bringing Up Baby,"Comedy,Romance,Family",91
2,After Life,"Drama,Fantasy",91
3,Gavagai,Drama,91
4,The Hustler,"Drama,Sport",90


In [24]:
# Split metacritic_genre.
# The first comma-separated value in metacritic_genre is treated as the movie's
# primary Metacritic genre; anything after the first comma is discarded.
metagenre_df = renamed_metascore_df

# Split on the first comma only. `n` and `expand` are passed by keyword because
# pandas 2.0+ accepts only `pat` positionally in Series.str.split.
split_metagenre_df = renamed_metascore_df['metacritic_genre'].str.split(',', n=1, expand=True)

# Rename the positional columns produced by expand=True.
renamed_metagenre_df = split_metagenre_df.rename(columns={0: "primary_genre", 1: "other_genres"})

# Keep only the primary genre. Selecting the column (rather than dropping
# 'other_genres') also works when no row contains a comma.
drop_genre_df = renamed_metagenre_df[["primary_genre"]]

# Final column name for the Metacritic genre table.
metacritic_genre_df = drop_genre_df.rename(columns={"primary_genre": "metacritic_genre"})
metacritic_genre_df.head()

Unnamed: 0,metacritic_genre
0,Drama
1,Comedy
2,Drama
3,Drama
4,Drama


In [26]:
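# TODO: this merge is disabled. As written it cannot work: revised_metascore_df
# has the columns movie_title / genre / metascore, so the key "metacritic_genre"
# does not exist on the left side. Joining metacritic_genre_df back on the row
# index is probably what is wanted - confirm before re-enabling.
# merged_metacritic_df = pd.merge(revised_metascore_df, metacritic_genre_df, on="metacritic_genre")
# merged_metacritic_df.head()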

In [None]:
# Transform Each DataFrame