In [1]:
import datetime
import os
import warnings
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

warnings.filterwarnings("ignore")

# Data Extract

### Import CSV files from resource folder

In [2]:
# Load the full Netflix catalogue and preview the first rows.
netflix_titles_csv = 'Resources/netflix_titles.csv'
netflix_all = pd.read_csv(netflix_titles_csv)
netflix_all.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [3]:
# Row and column counts of the raw Netflix table.
netflix_all.shape

(7787, 12)

In [4]:
# Load the Rotten Tomatoes movie table and preview two rows.
rotten_tomatoes_csv = 'Resources/rotten_tomatoes_movies.csv'
rotten_tomatoes_movies = pd.read_csv(rotten_tomatoes_csv)
rotten_tomatoes_movies.head(2)

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,...,Sony Pictures Classics,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0,44,123,19


In [5]:
# Relative path of the IMDb ratings CSV.
csv_file = "Resources/IMDb ratings.csv"

In [6]:
# Read the IMDb ratings CSV referenced by csv_file into a DataFrame.
ratings_data_df = pd.read_csv(csv_file)

In [7]:
# Preview the first rows of the IMDb ratings table.
ratings_data_df.head()

Unnamed: 0,imdb_title_id,weighted_average_vote,total_votes,mean_vote,median_vote,votes_10,votes_9,votes_8,votes_7,votes_6,...,females_30age_avg_vote,females_30age_votes,females_45age_avg_vote,females_45age_votes,top1000_voters_rating,top1000_voters_votes,us_voters_rating,us_voters_votes,non_us_voters_rating,non_us_voters_votes
0,tt0000009,5.9,154,5.9,6.0,12,4,10,43,28,...,5.7,13.0,4.5,4.0,5.7,34.0,6.4,51.0,6.0,70.0
1,tt0000574,6.1,589,6.3,6.0,57,18,58,137,139,...,6.2,23.0,6.6,14.0,6.4,66.0,6.0,96.0,6.2,331.0
2,tt0001892,5.8,188,6.0,6.0,6,6,17,44,52,...,5.8,4.0,6.8,7.0,5.4,32.0,6.2,31.0,5.9,123.0
3,tt0002101,5.2,446,5.3,5.0,15,8,16,62,98,...,5.5,14.0,6.1,21.0,4.9,57.0,5.5,207.0,4.7,105.0
4,tt0002130,7.0,2237,6.9,7.0,210,225,436,641,344,...,7.3,82.0,7.4,77.0,6.9,139.0,7.0,488.0,7.0,1166.0


# Data Transformation 

Netflix Data File Transformation: 
1. Checked the data types - Changed "date_added" to datetime format
2. Kept only the movies (dropping every other show type) using the `.loc` indexer
3. Renamed, organized, and split the data into two tables: Netflix Movie Basic info and Netflix Movie Date info
4. Checked and handled missing values for each new table: in Netflix Movie Basic, replaced missing "country" with "other" and missing "rating" with "NR"

In [8]:
# Inspect the column dtypes before any conversion is applied.
netflix_all.dtypes

show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object

In [9]:
# Parse the "date_added" strings into datetime values.
netflix_all['date_added'] = pd.to_datetime(netflix_all['date_added'])

In [10]:
# Count the missing entries in every column.
netflix_all.isna().sum()

show_id            0
type               0
title              0
director        2389
cast             718
country          507
date_added        10
release_year       0
rating             7
duration           0
listed_in          0
description        0
dtype: int64

In [11]:
# Keep only the rows whose type is "Movie"; every other show type is dropped.
is_movie = netflix_all['type'] == 'Movie'
netflix_movie = netflix_all.loc[is_movie]
netflix_movie.shape

(5377, 12)

In [12]:
# Rename the key column show_id to netflix_show_id.
netflix_movie = netflix_movie.rename(columns={'show_id':'netflix_show_id'})

In [13]:
# Descriptive attributes of each movie. The explicit .copy() makes this an
# independent frame, so the fill and index steps in the following cells modify
# it directly rather than a selection derived from netflix_movie.
basic_columns = ['netflix_show_id', 'title', 'country', 'rating', 'duration', 'description']
netflix_movie_basic = netflix_movie[basic_columns].copy()
netflix_movie_basic.isnull().sum()

netflix_show_id      0
title                0
country            230
rating               5
duration             0
description          0
dtype: int64

In [14]:
# Frequency of each rating label among the movies.
netflix_movie_basic.rating.value_counts()

TV-MA       1845
TV-14       1272
R            663
TV-PG        505
PG-13        386
PG           247
TV-Y         117
TV-G         111
TV-Y7         95
NR            79
G             39
UR             5
TV-Y7-FV       5
NC-17          3
Name: rating, dtype: int64

In [15]:
# Fill the gaps: missing country -> "other", missing rating -> "NR".
# One DataFrame.fillna call with a per-column mapping returns a new frame, so
# no column is assigned on a selection derived from netflix_movie.
netflix_movie_basic = netflix_movie_basic.fillna({'country': 'other', 'rating': 'NR'})

In [16]:
# Re-count missing values per column after the fill step.
netflix_movie_basic.isnull().sum()

netflix_show_id    0
title              0
country            0
rating             0
duration           0
description        0
dtype: int64

In [17]:
# Promote netflix_show_id to the index, then preview the table.
netflix_movie_basic = netflix_movie_basic.set_index("netflix_show_id")
netflix_movie_basic.head()

Unnamed: 0_level_0,title,country,rating,duration,description
netflix_show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
s2,7:19,Mexico,TV-MA,93 min,After a devastating earthquake hits Mexico Cit...
s3,23:59,Singapore,R,78 min,"When an army recruit is found dead, his fellow..."
s4,9,United States,PG-13,80 min,"In a postapocalyptic world, rag-doll robots hi..."
s5,21,United States,PG-13,123 min,A brilliant group of students become card-coun...
s7,122,Egypt,TV-MA,95 min,"After an awful accident, a couple admitted to ..."


In [18]:
# Date attributes of each movie, still carrying the Netflix id as a column.
date_columns = ['netflix_show_id', 'date_added', 'release_year']
netflix_movie_date = netflix_movie[date_columns]
netflix_movie_date.isna().sum()

netflix_show_id    0
date_added         0
release_year       0
dtype: int64

In [19]:
# Promote netflix_show_id to the index, then preview the table.
netflix_movie_date = netflix_movie_date.set_index("netflix_show_id")
netflix_movie_date.head()

Unnamed: 0_level_0,date_added,release_year
netflix_show_id,Unnamed: 1_level_1,Unnamed: 2_level_1
s2,2016-12-23,2016
s3,2018-12-20,2011
s4,2017-11-16,2009
s5,2020-01-01,2008
s7,2020-06-01,2019


Rotten Tomatoes Movies Data File Transformation:

1. Checked the data types
2. Kept columns for title, rating and counts only
3. Renamed columns for label consistency between datasets
4. Checked missing values, dropped rows missing "tomatometer_rating" or "audience_count", and filled missing "audience_status" with "other"

In [20]:
# Rename the Rotten Tomatoes link and title columns.
rt_column_names = {
    'rotten_tomatoes_link': 'rotten_tomatoes_id',
    'movie_title': 'title',
}
rotten_tomatoes_movies = rotten_tomatoes_movies.rename(columns=rt_column_names)

In [21]:
# Row and column counts of the Rotten Tomatoes table.
rotten_tomatoes_movies.shape

(17712, 22)

In [22]:
# Inspect the column dtypes of the Rotten Tomatoes table.
rotten_tomatoes_movies.dtypes

rotten_tomatoes_id                   object
title                                object
movie_info                           object
critics_consensus                    object
content_rating                       object
genres                               object
directors                            object
authors                              object
actors                               object
original_release_date                object
streaming_release_date               object
runtime                             float64
production_company                   object
tomatometer_status                   object
tomatometer_rating                  float64
tomatometer_count                   float64
audience_status                      object
audience_rating                     float64
audience_count                      float64
tomatometer_top_critics_count         int64
tomatometer_fresh_critics_count       int64
tomatometer_rotten_critics_count      int64
dtype: object

In [23]:
# Count the missing values in every Rotten Tomatoes column.
rotten_tomatoes_movies.isnull().sum()

rotten_tomatoes_id                     0
title                                  0
movie_info                           321
critics_consensus                   8578
content_rating                         0
genres                                19
directors                            194
authors                             1542
actors                               352
original_release_date               1166
streaming_release_date               384
runtime                              314
production_company                   499
tomatometer_status                    44
tomatometer_rating                    44
tomatometer_count                     44
audience_status                      448
audience_rating                      296
audience_count                       297
tomatometer_top_critics_count          0
tomatometer_fresh_critics_count        0
tomatometer_rotten_critics_count       0
dtype: int64

In [24]:
# Drop rows that lack a tomatometer rating or an audience count, then label
# the remaining missing audience_status values as "other". Chaining the two
# steps yields a new frame, so no column is assigned on the result of dropna.
rotten_tomatoes_movies_clean = (
    rotten_tomatoes_movies
    .dropna(subset=['tomatometer_rating', 'audience_count'])
    .fillna({'audience_status': 'other'})
)

In [25]:
# Keep the id, title, status, rating, and count columns, indexed by the Rotten Tomatoes id.
rating_columns = [
    'rotten_tomatoes_id',
    'title',
    'tomatometer_status',
    'tomatometer_rating',
    'tomatometer_count',
    'audience_status',
    'audience_rating',
    'audience_count',
]
rotten_tomatoes_movies_rating = rotten_tomatoes_movies_clean[rating_columns].set_index("rotten_tomatoes_id")
rotten_tomatoes_movies_rating.head(2)

Unnamed: 0_level_0,title,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count
rotten_tomatoes_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
m/0814255,Percy Jackson & the Olympians: The Lightning T...,Rotten,49.0,149.0,Spilled,53.0,254421.0
m/0878835,Please Give,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0


IMDB Data Rating File Transformation:
Removed irrelevant columns and reorganized the votes by gender regardless of age.
The "mean_vote" and "median_vote" columns were also removed due to redundancy. 
The "total_votes" and "weighted_average_vote" columns were switched around for the sake of clarity. 

In [26]:
# Keep the id, the overall vote figures, and the all-ages male/female vote figures.
imdb_rating_columns = [
    'imdb_title_id',
    'total_votes',
    'weighted_average_vote',
    'males_allages_avg_vote',
    'males_allages_votes',
    'females_allages_avg_vote',
    'females_allages_votes',
]
imdb_ratings_data_df = ratings_data_df[imdb_rating_columns].copy()

In [27]:
# Index the IMDb ratings table by the title id.
imdb_ratings_data_df = imdb_ratings_data_df.set_index("imdb_title_id")

In [28]:
### Paths of the CSV files to load
# The data was downloaded from Kaggle as CSV files.

movies_file ="Resources/IMDb movies.csv"
books_file ="Resources/books.csv"

In [29]:
### Load the IMDb movies CSV into a DataFrame
# Later cells select the needed columns, drop duplicates, rename, and set the index.

movies_data_df = pd.read_csv(movies_file)

In [30]:
# List the available column names.
movies_data_df.columns

Index(['imdb_title_id', 'title', 'original_title', 'year', 'date_published',
       'genre', 'duration', 'country', 'language', 'director', 'writer',
       'production_company', 'actors', 'description', 'avg_vote', 'votes',
       'budget', 'usa_gross_income', 'worlwide_gross_income', 'metascore',
       'reviews_from_users', 'reviews_from_critics'],
      dtype='object')

In [31]:
# Take an independent copy of just the id and title columns.
movies_df = movies_data_df.loc[:, ["imdb_title_id", "title"]].copy()
movies_df.head()

Unnamed: 0,imdb_title_id,title
0,tt0000009,Miss Jerry
1,tt0000574,The Story of the Kelly Gang
2,tt0001892,Den sorte drøm
3,tt0002101,Cleopatra
4,tt0002130,L'Inferno


In [32]:
# Non-null count per column before de-duplication.
movies_df.count()

imdb_title_id    85855
title            85855
dtype: int64

In [33]:
# Keep the first row for each distinct title, then recount.
# NOTE(review): distinct films that share a title collapse into one row here;
# confirm this is intended.
movies_df = movies_df.drop_duplicates(subset="title")
movies_df.count()

imdb_title_id    82094
title            82094
dtype: int64

In [34]:
# Apply the final column labels and index the table by id.
movies_df = (
    movies_df
    .rename(columns={"imdb_title_id": "id", "title": "movie_title"})
    .set_index("id")
)
movies_df.head()

Unnamed: 0_level_0,movie_title
id,Unnamed: 1_level_1
tt0000009,Miss Jerry
tt0000574,The Story of the Kelly Gang
tt0001892,Den sorte drøm
tt0002101,Cleopatra
tt0002130,L'Inferno


In [35]:
### Load the books CSV into a DataFrame
# Later cells select the needed columns, drop duplicates, rename, and set the index.

books_data_df = pd.read_csv(books_file)

In [36]:
# List the available column names.
books_data_df.columns

Index(['id', 'book_id', 'best_book_id', 'work_id', 'books_count', 'isbn',
       'isbn13', 'authors', 'original_publication_year', 'original_title',
       'title', 'language_code', 'average_rating', 'ratings_count',
       'work_ratings_count', 'work_text_reviews_count', 'ratings_1',
       'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5', 'image_url',
       'small_image_url'],
      dtype='object')

In [37]:
# Take an independent copy of the id, authors, title, and publication-year columns.
book_columns = ["book_id", "authors", "title", "original_publication_year"]
books_df = books_data_df.loc[:, book_columns].copy()
books_df.head()

Unnamed: 0,book_id,authors,title,original_publication_year
0,2767052,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",2008.0
1,3,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,1997.0
2,41865,Stephenie Meyer,"Twilight (Twilight, #1)",2005.0
3,2657,Harper Lee,To Kill a Mockingbird,1960.0
4,4671,F. Scott Fitzgerald,The Great Gatsby,1925.0


In [38]:
# Keep the first row for each distinct title, then count non-null values per column.
books_df = books_df.drop_duplicates(subset=["title"])
books_df.count()

book_id                      9964
authors                      9964
title                        9964
original_publication_year    9943
dtype: int64

In [39]:
# Replace missing publication years with 0 (a later cell casts this column to int).
books_df = books_df.fillna({"original_publication_year": 0})
books_df.count()

book_id                      9964
authors                      9964
title                        9964
original_publication_year    9964
dtype: int64

In [40]:
# Apply the final column labels and index the table by ID.
book_column_names = {
    "book_id": "ID",
    "authors": "Authors",
    "title": "Book_Title",
    "original_publication_year": "Published_Year",
}
books_df = books_df.rename(columns=book_column_names).set_index("ID")
books_df.head()

Unnamed: 0_level_0,Authors,Book_Title,Published_Year
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2767052,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",2008.0
3,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,1997.0
41865,Stephenie Meyer,"Twilight (Twilight, #1)",2005.0
2657,Harper Lee,To Kill a Mockingbird,1960.0
4671,F. Scott Fitzgerald,The Great Gatsby,1925.0


In [41]:
# Cast the publication year to an integer column.
books_df = books_df.astype({"Published_Year": int})
books_df.head()

Unnamed: 0_level_0,Authors,Book_Title,Published_Year
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2767052,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",2008
3,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,1997
41865,Stephenie Meyer,"Twilight (Twilight, #1)",2005
2657,Harper Lee,To Kill a Mockingbird,1960
4671,F. Scott Fitzgerald,The Great Gatsby,1925


# Load Data to SQL

In [42]:
# Create the database connection.
# Settings are read from the environment so the password is never stored in
# the notebook; the non-secret parts default to the local development setup.
db_user = os.environ.get("PGUSER", "postgres")
db_host = os.environ.get("PGHOST", "localhost")
db_port = os.environ.get("PGPORT", "5432")
db_name = os.environ.get("PGDATABASE", "Netflix_IMDB")
db_password = os.environ.get("PGPASSWORD")
if db_password is None:
    raise RuntimeError("Set the PGPASSWORD environment variable before running this cell.")

# quote_plus escapes characters such as "@" or "/" that would otherwise break the URL.
connection_string = f"{quote_plus(db_user)}:{quote_plus(db_password)}@{db_host}:{db_port}/{db_name}"
engine = create_engine(f'postgresql://{connection_string}')

In [43]:
# Write the movie date table to the database, replacing any existing table of that name.
netflix_movie_date.to_sql(
    name='netflix_movie_date',
    con=engine,
    if_exists='replace',
    index=True,
)

In [44]:
# Write the movie basic-info table to the database, replacing any existing table of that name.
netflix_movie_basic.to_sql(
    name='netflix_movie_basic',
    con=engine,
    if_exists='replace',
    index=True,
)

In [45]:
# Write the Rotten Tomatoes rating table to the database, replacing any existing table of that name.
rotten_tomatoes_movies_rating.to_sql(
    name='rotten_tomatoes_movies_rating',
    con=engine,
    if_exists='replace',
    index=True,
)

In [46]:
# Write the IMDb ratings table to the database, replacing any existing table of that name.
imdb_ratings_data_df.to_sql(
    name='imdb_ratings',
    con=engine,
    if_exists='replace',
    index=True,
)

In [47]:
# Write the IMDb movie titles table to the database, replacing any existing table of that name.
movies_df.to_sql(
    name="imdb_movies",
    con=engine,
    if_exists="replace",
    index=True,
)

In [48]:
# Write the books table to the database, replacing any existing table of that name.
books_df.to_sql(
    name="books",
    con=engine,
    if_exists="replace",
    index=True,
)