In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from unicodedata import normalize

In [2]:
#Read the IMDB_data file
csv_file = "movie_metadata.csv"
IMDB_data_df = pd.read_csv(csv_file)
IMDB_data_df.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [3]:
# Drop the unnecessary columns
IMDB_data_df = IMDB_data_df.loc[:,['movie_title','actor_1_name','genres','budget','gross','country','title_year','imdb_score']]
IMDB_data_df.head()

Unnamed: 0,movie_title,actor_1_name,genres,budget,gross,country,title_year,imdb_score
0,Avatar,CCH Pounder,Action|Adventure|Fantasy|Sci-Fi,237000000.0,760505847.0,USA,2009.0,7.9
1,Pirates of the Caribbean: At World's End,Johnny Depp,Action|Adventure|Fantasy,300000000.0,309404152.0,USA,2007.0,7.1
2,Spectre,Christoph Waltz,Action|Adventure|Thriller,245000000.0,200074175.0,UK,2015.0,6.8
3,The Dark Knight Rises,Tom Hardy,Action|Thriller,250000000.0,448130642.0,USA,2012.0,8.5
4,Star Wars: Episode VII - The Force Awakens ...,Doug Walker,Documentary,,,,,7.1


In [4]:
#Check for any null values
IMDB_data_df.isnull().any()

movie_title     False
actor_1_name     True
genres          False
budget           True
gross            True
country          True
title_year       True
imdb_score      False
dtype: bool

In [5]:
#drop NA's in the Budget and Gross columns
IMDB_data_df.dropna(subset=['budget','gross'], inplace=True)

In [6]:
#Rename the column to match with Streaming Dataframe columns
IMDB_data_df.rename(columns={"movie_title": "Title"},inplace = True)

#IMDB_data has Latin line breaks (special characters); Use normalize function to convert to regular form
IMDB_data_df['Title']=IMDB_data_df['Title'].str.normalize('NFKD')

#Strip any leading spaces in Title column
IMDB_data_df['Title']=IMDB_data_df["Title"].str.strip()

#Drop the duplicate data
IMDB_data_df.drop_duplicates()

Unnamed: 0,Title,actor_1_name,genres,budget,gross,country,title_year,imdb_score
0,Avatar,CCH Pounder,Action|Adventure|Fantasy|Sci-Fi,237000000.0,760505847.0,USA,2009.0,7.9
1,Pirates of the Caribbean: At World's End,Johnny Depp,Action|Adventure|Fantasy,300000000.0,309404152.0,USA,2007.0,7.1
2,Spectre,Christoph Waltz,Action|Adventure|Thriller,245000000.0,200074175.0,UK,2015.0,6.8
3,The Dark Knight Rises,Tom Hardy,Action|Thriller,250000000.0,448130642.0,USA,2012.0,8.5
5,John Carter,Daryl Sabara,Action|Adventure|Sci-Fi,263700000.0,73058679.0,USA,2012.0,6.6
...,...,...,...,...,...,...,...,...
5033,Primer,Shane Carruth,Drama|Sci-Fi|Thriller,7000.0,424760.0,USA,2004.0,7.0
5034,Cavite,Ian Gamazon,Thriller,7000.0,70071.0,Philippines,2005.0,6.3
5035,El Mariachi,Carlos Gallardo,Action|Crime|Drama|Romance|Thriller,7000.0,2040920.0,USA,1992.0,6.9
5037,Newlyweds,Kerry Bishé,Comedy|Drama,9000.0,4584.0,USA,2011.0,6.4


In [7]:
#Read in the Streaming dataset
csv_file = "MoviesOnStreamingPlatforms.csv"
streaming_data_df = pd.read_csv(csv_file)
streaming_data_df.head()

Unnamed: 0.1,Unnamed: 0,ID,Title,Year,Age,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type
0,0,1,The Irishman,2019,18+,98/100,1,0,0,0,0
1,1,2,Dangal,2016,7+,97/100,1,0,0,0,0
2,2,3,David Attenborough: A Life on Our Planet,2020,7+,95/100,1,0,0,0,0
3,3,4,Lagaan: Once Upon a Time in India,2001,7+,94/100,1,0,0,0,0
4,4,5,Roma,2018,18+,94/100,1,0,0,0,0


In [8]:
#Drop the unncessary columns
streaming_data_df = streaming_data_df.loc[:,['Title','Rotten Tomatoes','Netflix','Hulu','Prime Video','Disney+']]
streaming_data_df.head()

Unnamed: 0,Title,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+
0,The Irishman,98/100,1,0,0,0
1,Dangal,97/100,1,0,0,0
2,David Attenborough: A Life on Our Planet,95/100,1,0,0,0
3,Lagaan: Once Upon a Time in India,94/100,1,0,0,0
4,Roma,94/100,1,0,0,0


In [9]:
streaming_data_df.info()

streaming_data_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9515 entries, 0 to 9514
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Title            9515 non-null   object
 1   Rotten Tomatoes  9508 non-null   object
 2   Netflix          9515 non-null   int64 
 3   Hulu             9515 non-null   int64 
 4   Prime Video      9515 non-null   int64 
 5   Disney+          9515 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 446.1+ KB


Unnamed: 0,Title,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+
0,The Irishman,98/100,1,0,0,0
1,Dangal,97/100,1,0,0,0
2,David Attenborough: A Life on Our Planet,95/100,1,0,0,0
3,Lagaan: Once Upon a Time in India,94/100,1,0,0,0
4,Roma,94/100,1,0,0,0
...,...,...,...,...,...,...
9510,Most Wanted Sharks,14/100,0,0,0,1
9511,Doc McStuffins: The Doc Is In,13/100,0,0,0,1
9512,Ultimate Viking Sword,13/100,0,0,0,1
9513,Hunt for the Abominable Snowman,10/100,0,0,0,1


In [10]:
streaming_data_df.isnull().any()

Title              False
Rotten Tomatoes     True
Netflix            False
Hulu               False
Prime Video        False
Disney+            False
dtype: bool

In [11]:
#Drop the NA's 
streaming_data_df.dropna(subset=['Rotten Tomatoes'], inplace=True)

#Drop the duplicates
streaming_data_df.drop_duplicates()

Unnamed: 0,Title,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+
0,The Irishman,98/100,1,0,0,0
1,Dangal,97/100,1,0,0,0
2,David Attenborough: A Life on Our Planet,95/100,1,0,0,0
3,Lagaan: Once Upon a Time in India,94/100,1,0,0,0
4,Roma,94/100,1,0,0,0
...,...,...,...,...,...,...
9510,Most Wanted Sharks,14/100,0,0,0,1
9511,Doc McStuffins: The Doc Is In,13/100,0,0,0,1
9512,Ultimate Viking Sword,13/100,0,0,0,1
9513,Hunt for the Abominable Snowman,10/100,0,0,0,1


In [12]:
#Use InnerJoin on the datasets on the column 'Title' 
merge_df = pd.merge(IMDB_data_df, streaming_data_df, on="Title",how='inner')
merge_df

Unnamed: 0,Title,actor_1_name,genres,budget,gross,country,title_year,imdb_score,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+
0,Avatar,CCH Pounder,Action|Adventure|Fantasy|Sci-Fi,237000000.0,760505847.0,USA,2009.0,7.9,86/100,0,0,0,1
1,Pirates of the Caribbean: At World's End,Johnny Depp,Action|Adventure|Fantasy,300000000.0,309404152.0,USA,2007.0,7.1,81/100,0,0,0,1
2,John Carter,Daryl Sabara,Action|Adventure|Sci-Fi,263700000.0,73058679.0,USA,2012.0,6.6,75/100,0,0,0,1
3,Tangled,Brad Garrett,Adventure|Animation|Comedy|Family|Fantasy|Musi...,260000000.0,200807262.0,USA,2010.0,7.8,83/100,0,0,0,1
4,Avengers: Age of Ultron,Chris Hemsworth,Action|Adventure|Sci-Fi,250000000.0,458991599.0,USA,2015.0,7.5,84/100,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
845,Monty Python and the Holy Grail,Eric Idle,Adventure|Comedy|Fantasy,229575.0,1229197.0,UK,1975.0,8.3,84/100,1,0,0,0
846,Better Luck Tomorrow,Parry Shen,Crime|Drama|Romance,250000.0,3799339.0,USA,2002.0,7.2,64/100,0,0,1,0
847,Like Crazy,Jennifer Lawrence,Drama|Romance,250000.0,3388210.0,USA,2011.0,6.7,69/100,0,1,0,0
848,Middle of Nowhere,Omari Hardwick,Drama,200000.0,78030.0,USA,2012.0,6.5,55/100,1,0,0,0


In [13]:
merge_df1=merge_df.copy()

#Replace the values of the below 4 columns with its column name to identify the platform
for i in merge_df1[['Netflix','Hulu','Prime Video','Disney+']].columns:
    merge_df1[i].replace(1,i,inplace=True)
    merge_df1[i].replace(0,'',inplace=True)

#use function .apply() and .join() to join the 4 columns and assign to new column 'Streaming_Platform'
merge_df1['Streaming_Platform'] = merge_df1[['Netflix','Hulu','Prime Video','Disney+']].apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)
merge_df1

#use lstrip,rstrip,replace to remove the leading, trailing and extra comma's
merge_df1['Streaming_Platform']= merge_df1['Streaming_Platform'].map(lambda x: x.lstrip(',').rstrip(','))
merge_df1["Streaming_Platform"]=merge_df1["Streaming_Platform"].str.replace(",,,",",")
merge_df1["Streaming_Platform"]=merge_df1["Streaming_Platform"].str.replace(",,",",")

merge_df_final = merge_df1.loc[:,['Title','title_year','imdb_score','Rotten Tomatoes','actor_1_name','Streaming_Platform','budget','gross','country','genres']]

#Sort the final dataset in descending order on imdb_score and 'Rotten_Tomatoes'
merge_df_final=merge_df_final.sort_values(by = ['imdb_score','Rotten Tomatoes'], ascending = [False,False])

In [14]:
merge_df_final

Unnamed: 0,Title,title_year,imdb_score,Rotten Tomatoes,actor_1_name,Streaming_Platform,budget,gross,country,genres
31,The Dark Knight,2008.0,9.0,94/100,Christian Bale,Hulu,185000000.0,533316061.0,USA,Action|Crime|Drama|Thriller
227,Fight Club,1999.0,8.8,93/100,Brad Pitt,Prime Video,63000000.0,37023395.0,USA,Drama
119,Terminator 2: Judgment Day,1991.0,8.5,89/100,Joe Morton,Netflix,102000000.0,204843350.0,USA,Action|Sci-Fi
120,Django Unchained,2012.0,8.5,89/100,Leonardo DiCaprio,Netflix,100000000.0,162804648.0,USA,Drama|Western
143,The Departed,2006.0,8.5,89/100,Leonardo DiCaprio,Netflix,90000000.0,132373442.0,USA,Crime|Drama|Thriller
...,...,...,...,...,...,...,...,...,...,...
796,Phat Girlz,2006.0,3.0,46/100,Mo'Nique,Hulu,3000000.0,7059537.0,USA,Comedy
493,BloodRayne,2005.0,2.9,51/100,Meat Loaf,Prime Video,25000000.0,1550000.0,USA,Action|Adventure|Fantasy|Horror
181,Battlefield Earth,2000.0,2.4,51/100,Richard Tyson,Netflix,44000000.0,21471685.0,USA,Action|Adventure|Sci-Fi
279,Gigli,2003.0,2.4,49/100,Todd Giebenhain,Hulu,54000000.0,5660084.0,USA,Comedy|Crime|Romance


In [15]:
#Create SQL Alchemy engine to postgres database "IMDB_db"
protocol = 'postgresql'
username = 'postgres'
password = 'postgres'
host = 'localhost'
port = 5432
database_name = 'IMDB_db'
rds_connection_string = f'{protocol}://{username}:{password}@{host}:{port}/{database_name}'
engine = create_engine(rds_connection_string)

In [16]:
#show the existing table names in the database
engine.table_names()

  engine.table_names()


['IMDB_Streaming_data', 'IMDB_data', 'streaming_data']

In [17]:
#write the data in the dataframe IMDB_data_df to the database table "IMDB_data"
IMDB_data_df.to_sql(name='IMDB_data', con=engine, if_exists='replace', index=False)

891

In [18]:
#write the data in the dataframe streaming_data_df to the database table "streaming_data
streaming_data_df.to_sql(name='streaming_data', con=engine, if_exists='replace', index=False)

508

In [19]:
#Query the IMDB_data table to confirm the above data load
pd.read_sql_query('select * from public."IMDB_data"', con=engine).head()

Unnamed: 0,Title,actor_1_name,genres,budget,gross,country,title_year,imdb_score
0,Avatar,CCH Pounder,Action|Adventure|Fantasy|Sci-Fi,237000000.0,760505847.0,USA,2009.0,7.9
1,Pirates of the Caribbean: At World's End,Johnny Depp,Action|Adventure|Fantasy,300000000.0,309404152.0,USA,2007.0,7.1
2,Spectre,Christoph Waltz,Action|Adventure|Thriller,245000000.0,200074175.0,UK,2015.0,6.8
3,The Dark Knight Rises,Tom Hardy,Action|Thriller,250000000.0,448130642.0,USA,2012.0,8.5
4,John Carter,Daryl Sabara,Action|Adventure|Sci-Fi,263700000.0,73058679.0,USA,2012.0,6.6


In [20]:
#Query the streaming_data table to confirm the above data load
pd.read_sql_query('select * from public."streaming_data"', con=engine).head()

Unnamed: 0,Title,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+
0,The Irishman,98/100,1,0,0,0
1,Dangal,97/100,1,0,0,0
2,David Attenborough: A Life on Our Planet,95/100,1,0,0,0
3,Lagaan: Once Upon a Time in India,94/100,1,0,0,0
4,Roma,94/100,1,0,0,0


In [21]:
#write the data in the dataframe merge_df_final to the database table "IMDB_streaming_data"
merge_df_final.to_sql(name='IMDB_Streaming_data', con=engine, if_exists='replace', index=False)

850

In [22]:
#Query the IMDB_streaming_data table to confirm the above data load
pd.read_sql_query('select * from public."IMDB_Streaming_data"', con=engine)

Unnamed: 0,Title,title_year,imdb_score,Rotten Tomatoes,actor_1_name,Streaming_Platform,budget,gross,country,genres
0,The Dark Knight,2008.0,9.0,94/100,Christian Bale,Hulu,185000000.0,533316061.0,USA,Action|Crime|Drama|Thriller
1,Fight Club,1999.0,8.8,93/100,Brad Pitt,Prime Video,63000000.0,37023395.0,USA,Drama
2,Terminator 2: Judgment Day,1991.0,8.5,89/100,Joe Morton,Netflix,102000000.0,204843350.0,USA,Action|Sci-Fi
3,Django Unchained,2012.0,8.5,89/100,Leonardo DiCaprio,Netflix,100000000.0,162804648.0,USA,Drama|Western
4,The Departed,2006.0,8.5,89/100,Leonardo DiCaprio,Netflix,90000000.0,132373442.0,USA,Crime|Drama|Thriller
...,...,...,...,...,...,...,...,...,...,...
845,Phat Girlz,2006.0,3.0,46/100,Mo'Nique,Hulu,3000000.0,7059537.0,USA,Comedy
846,BloodRayne,2005.0,2.9,51/100,Meat Loaf,Prime Video,25000000.0,1550000.0,USA,Action|Adventure|Fantasy|Horror
847,Battlefield Earth,2000.0,2.4,51/100,Richard Tyson,Netflix,44000000.0,21471685.0,USA,Action|Adventure|Sci-Fi
848,Gigli,2003.0,2.4,49/100,Todd Giebenhain,Hulu,54000000.0,5660084.0,USA,Comedy|Crime|Romance
