# 1. Import Dataset & Packages

## 1.1 Import Packages

In [345]:
import pandas as pd # manipulate dataframe
import numpy as np # array, linear algebra

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Web Scraping
import requests
from bs4 import BeautifulSoup

# Make web scraping 10 times faster
import lxml
import cchardet

from datetime import date # get today's date for most updated data

# Save model for reuse
import pickle

# Create a web application
import streamlit as st

# Hide warning
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Run and display all commands in a cell.
# With ast_node_interactivity set to "all", every bare top-level expression in
# a cell is rendered, not only the last one; cells below that list several
# expressions (e.g. head/info/describe) depend on this setting.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## 1.2 Web Scraping

In [338]:
%%time

# Scrape data from IMDB movie database

# 1st pass requirements: Movie, English language, rating 4+, year 2018+, scrape 1st 3000 movies by popularity
# 2nd pass requirements: same as 1st, add concurrent thread for faster scraping
# Link to scrape: https://www.imdb.com/search/title/?title_type=feature&release_date=2018-01-01,2022-02-17&user_rating=4.0,&languages=en
# Link to go to next page, each page display only 50 movies: https://www.imdb.com/search/title/?title_type=feature&release_date=2019-01-01,2022-02-17&user_rating=4.0,&languages=en&start=51&ref_=adv_nxt

# Create empty array to store information:
movie_name = []
year =[]
runtime =[]
rating =[]
metascore =[]
stars =[]
synopsis =[]
genre= []

# Create condition for scraping:
# Link to get customized header: https://www.whatismybrowser.com/detect/what-http-headers-is-my-browser-sending
my_headers ={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) Gecko/20100101 Firefox/96.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
}

today = date.today().strftime("%Y-%m-%m") # Get today's date to get most relevant data
pages = np.arange(1,70000,250) #array of number of movies to impute to link to get next 250 movies

for page in pages:
    page = requests.get('https://www.imdb.com/search/title/?title_type=feature&release_date=2018-01-01,' +str(today) +
                        '&user_rating=4.0,&languages=en&count=250&start=' +str(page)+'&ref_=adv_nxt')
#    soup = BeautifulSoup(page.text, 'html.parser')
    soup = BeautifulSoup(page.text, 'lxml')
    movies = soup.findAll('div', class_ ='lister-item mode-advanced')

    for movie in movies:
        movie_name.append(movie.h3.a.text) # Get movie name
        year.append(movie.h3.text[-6:-2]) # Get year 
        runtime.append(movie.find('span', class_ ='runtime').text if movie.find('span', class_ ='runtime') else None)  # Get runtime 
        rating.append(movie.find('div', class_ = "inline-block ratings-imdb-rating").text.replace('\n', '') if movie.find('div', class_ ="inline-block ratings-imdb-rating") else None) # Get rating
        metascore.append(movie.find('span', class_ = "metascore").text if movie.find('span', class_ = "metascore") else None) # Get metascore
        synopsis.append(movie.find_all('p', class_ ='text-muted')[1].text.replace('\n','')) # Get synopsis
        genre.append(movie.find('span', class_ ='genre').text.replace('\n','') if movie.find('span', class_ ='genre') else None) # Get genre
        stars.append([x.text for x in d] if (d := movie.select('a[href*="name"]')) else '')

Wall time: 13min 53s


In [339]:
# Assemble the scraped lists into one table; each key becomes a column and
# the dict order fixes the column order.
scraped_columns = {
    "Movie_Name": movie_name,
    "Release_Year": year,
    "Runtime": runtime,
    "Genre": genre,
    "Movie_Rating": rating,
    "Critic_Rating": metascore,
    "Stars": stars,
    "Synopsis": synopsis,
}
movies_df = pd.DataFrame(scraped_columns)

In [340]:
# Write the scraped table to imdb_scraped_movies.csv; index=False leaves the
# row index out of the file.
movies_df.to_csv("imdb_scraped_movies.csv", index=False)

## Summary

In [315]:
# Quick look at the scraped table: first two rows, schema with non-null
# counts, and per-column summary statistics. All three results are rendered
# because ast_node_interactivity was set to "all" earlier.
movies_df.head(2)
movies_df.info()
movies_df.describe()

Unnamed: 0,Movie_Name,Release_Year,Runtime,Genre,Movie_Rating,Critic_Rating,Stars,Synopsis
0,Nightmare Alley,2021,150 min,"Crime, Drama, Thriller",7.2,70,"[Guillermo del Toro, Bradley Cooper, Cate Blan...",An ambitious carny with a talent for manipulat...
1,The Power of the Dog,2021,126 min,"Drama, Romance, Western",6.9,89,"[Jane Campion, Benedict Cumberbatch, Kirsten D...",Charismatic rancher Phil Burbank inspires fear...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Movie_Name     70000 non-null  object
 1   Release_Year   70000 non-null  object
 2   Runtime        69248 non-null  object
 3   Genre          69936 non-null  object
 4   Movie_Rating   70000 non-null  object
 5   Critic_Rating  57888 non-null  object
 6   Stars          70000 non-null  object
 7   Synopsis       70000 non-null  object
dtypes: object(8)
memory usage: 4.3+ MB


Unnamed: 0,Movie_Name,Release_Year,Runtime,Genre,Movie_Rating,Critic_Rating,Stars,Synopsis
count,70000,70000,69248,69936,70000.0,57888,70000,70000
unique,9798,9,160,476,61.0,88,9963,9757
top,The Courier,2021,109 min,Drama,7.3,53,"[David Gordon Green, Jamie Lee Curtis, Judy Gr...",Add a Plot
freq,242,31834,2985,4558,3785.0,2215,242,242


# Data Manipulation

In [341]:
# Convert each row's list of names in 'Stars' to one comma-separated string.
#
# - Lists/tuples are joined with ', '.
# - Values that are already strings are kept unchanged, so re-running this cell
#   is safe (joining a string would insert ', ' between its characters).
# - Missing values become ''.
# Using .apply keeps the result aligned with movies_df's own index instead of
# relying on a freshly built Series with a default index.
def _names_to_text(names):
    """Return a display string for one 'Stars' cell."""
    if isinstance(names, (list, tuple)):
        return ', '.join(map(str, names))
    if pd.isna(names):
        return ''
    return str(names)


movies_df['Stars'] = movies_df['Stars'].apply(_names_to_text)

In [342]:
# Re-check dtypes and non-null counts after the Stars conversion.
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Movie_Name     70000 non-null  object
 1   Release_Year   70000 non-null  object
 2   Runtime        69247 non-null  object
 3   Genre          69936 non-null  object
 4   Movie_Rating   70000 non-null  object
 5   Critic_Rating  57888 non-null  object
 6   Stars          70000 non-null  object
 7   Synopsis       70000 non-null  object
dtypes: object(8)
memory usage: 4.3+ MB


In [343]:
# Show the rows whose Release_Year holds a fragment of the title text instead
# of a 4-digit year (these come from the fixed-position slice used while scraping).
movies_df[movies_df['Release_Year'].isin(['bimi','mmer','ll 2','enda'])]['Release_Year']

3626    bimi
6137    mmer
7584    ll 2
9625    enda
Name: Release_Year, dtype: object

In [304]:
# List the distinct Release_Year values to spot non-year entries.
movies_df['Release_Year'].unique()

array(['2021', '2022', '2020', '2019', '2018', 'bimi', 'mmer', 'll 2',
       'enda'], dtype=object)

In [290]:
# Cast columns to correct datatype.
# errors='coerce' turns unparseable values (e.g. the 'bimi', 'mmer', 'll 2',
# 'enda' title fragments in Release_Year) into NaN instead of raising
# ValueError. The nullable 'Int32' dtype lets Release_Year stay an integer
# column even with missing values.
movies_df['Release_Year'] = pd.to_numeric(movies_df['Release_Year'], errors='coerce').astype('Int32')
movies_df['Movie_Rating'] = pd.to_numeric(movies_df['Movie_Rating'], errors='coerce')
movies_df['Critic_Rating'] = pd.to_numeric(movies_df['Critic_Rating'], errors='coerce')

# Replace missing values with '' in the text columns only. Filling the whole
# frame would put '' into the numeric columns and turn them back into object
# dtype, undoing the casts above.
text_columns = ['Runtime', 'Genre', 'Stars', 'Synopsis']
movies_df[text_columns] = movies_df[text_columns].fillna('')

# Remove the ' min' suffix from Runtime (the column stays a string).
# Replacing the literal suffix is idempotent, unlike slicing off the last four
# characters, which would eat digits if this cell is run twice.
movies_df['Runtime'] = movies_df['Runtime'].str.replace(' min', '', regex=False)

In [None]:
# Work on a copy so changes made to df below do not alter movies_df.
df = movies_df.copy()

# Exploratory Data Analysis

# Content-Based Recommender using Cosine Similarity

## Recommender using Synopsis

In [None]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
synopsis_text = df['Synopsis'].fillna('')

# 'Add a Plot' is the site's placeholder for a missing synopsis (it is the most
# frequent Synopsis value in the summary above). Blank it out so movies without
# a plot are not scored as near-identical to each other.
synopsis_text = synopsis_text.mask(synopsis_text.str.strip() == 'Add a Plot', '')
df['Synopsis'] = synopsis_text

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df['Synopsis'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

In [None]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix.
# linear_kernel returns plain dot products; these equal cosine similarities
# when the rows are unit length, which is TfidfVectorizer's default (norm='l2').
# NOTE(review): the result appears to be a dense n_movies x n_movies array, so
# memory grows quadratically with the row count - confirm it fits before
# running on the full scrape.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# Sanity check: the similarity matrix has one row and one column per movie.
cosine_sim.shape

In [None]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(df.index, index=df['Movie_Name'])

# Keep only the first row for each title. Series.drop_duplicates() compares the
# *values* (row positions, which are already unique) and therefore never removes
# a repeated title; a repeated title would make indices[title] return a Series
# instead of a single position.
indices = indices[~indices.index.duplicated(keep='first')]
indices[:10]

In [None]:
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the names of the 10 movies most similar to `title`.

    Parameters
    ----------
    title : str
        Movie name to look up in the module-level `indices` Series.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix whose row/column order matches `df`.
        The default is the `cosine_sim` object that existed when this
        function was defined.

    Returns
    -------
    pandas.Series
        `Movie_Name` values of the top matches, best first, excluding the
        queried movie itself.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    if title not in indices.index:
        raise KeyError(f"Movie title not found: {title!r}")

    #Get index of the movie with the input title
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # A title that occurs more than once yields several positions;
        # use the first so `scores` below stays one-dimensional.
        idx = idx.iloc[0]

    #Get the cosine similarity pairwise score of the movie vs others
    scores = cosine_sim[idx]

    #Pair every row position with its score and sort from highest to lowest
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    #Drop the queried movie explicitly rather than assuming it sorted first
    #(ties with duplicate or identical synopses can put another row on top),
    #then keep the 10 best remaining positions
    movie_indices = [position for position, _ in ranked if position != idx][:10]

    #Return the names of the top 10 similar movies
    return df['Movie_Name'].iloc[movie_indices]

In [344]:
# Example: the ten titles most similar to 'The Power of the Dog' by synopsis TF-IDF.
get_recommendations('The Power of the Dog', cosine_sim)

303    Every Breath You Take
281            The Old Guard
12                      CODA
448                 Red Joan
132                Cry Macho
33       The French Dispatch
283             The Marksman
472                  Bruised
227                Swan Song
248         Coming 2 America
Name: Movie_Name, dtype: string

In [348]:
# Save model for reuse
filename = 'content_based_imdb_webscraping_model.sav'

# The context manager closes (and flushes) the file even if pickling fails;
# a bare open(...) inside the call left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(cosine_sim, model_file)

# Create a web app

## Design frontend with streamlit

## Improve Cosine Similarity with Credits, Genres and Keywords

In [None]:
# Turn the pipe-separated genre string into space-separated tokens.
# regex=False treats '|' as a literal character; as a regular expression '|'
# is an empty alternation that matches at every position, which would insert a
# space between all characters instead.
# NOTE(review): the scraped frame built above names this column 'Genre'
# (comma-separated), not 'genres' - confirm which dataset this cell targets.
df['genres_split'] = df['genres'].str.replace('|', " ", regex=False)

In [None]:
def clean_data(x):
    """Lower-case text and remove every space character from it.

    A list is processed element by element and returned as a new list, a
    single string is returned cleaned, and any other value yields ''.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    return ''

In [None]:
# Normalise the metadata columns listed below by running clean_data over
# every value and writing the result back to the same column.
features = [
    'production_companies',
    'production_countries',
    'actor1_name',
    'actor2_name',
    'actor3_name',
    'director_name',
    'tag',
]

for column_name in features:
    cleaned_column = df[column_name].apply(clean_data)
    df[column_name] = cleaned_column

In [None]:
# Inspect the first two rows after cleaning.
df.head(2)

In [None]:
# Build one space-separated text field ("soup") per movie from the metadata
# columns. The expression is wrapped in parentheses so it can span several
# lines; ending a line with a bare '+' is a SyntaxError.
# NOTE(review): every operand must be a string column for '+' to concatenate;
# confirm 'year' is stored as text.
df['soup'] = (
    df['overview'] + " " + df['production_countries'] + " " +
    df['production_companies'] + " " + df['actor1_name'] + " " +
    df['actor2_name'] + " " + df['actor3_name'] + " " +
    df['director_name'] + " " + df['year'] + " " + df['genres_split'] + " " +
    df['tag']
)

In [None]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

# Raw token counts over the 'soup' text with English stop words removed;
# the result has one row per movie.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])

In [None]:
# Dimensions of the count matrix.
count_matrix.shape

In [None]:
# Index array of the stored non-zero entries
# (presumably column indices of a CSR matrix - confirm the sparse format).
count_matrix.indices

In [None]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise similarity of every movie against every other, stored separately
# from the synopsis-based cosine_sim so the two can be compared.
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [None]:
# Inspect the first two rows, now including the 'soup' column.
df.head(2)

In [None]:
get_recommendations('Toy Story (1995)', cosine_sim)

In [None]:
get_recommendations('Toy Story (1995)', cosine_sim2)

# Save model for reuse

In [None]:
filename = 'content_based_imdb_webscraping_model.sav'

# `svd` is never defined in this notebook (NameError); the model produced in
# this section is cosine_sim2, so that is what gets saved.
# NOTE(review): this is the same file name as the earlier synopsis-only save,
# so the earlier file is overwritten - confirm that is intended.
with open(filename, 'wb') as model_file:
    pickle.dump(cosine_sim2, model_file)