In [36]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import base64
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
import json
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import ast
from ydata_profiling import ProfileReport
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

Data Scraping

Functions required for scraping the data

In [2]:
def load_data(url, driver_path, interact, scrolls=0, loadSleep=5):
    """Fetch ``url`` and return its HTML parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        Page to download.
    driver_path : str
        Path to the ChromeDriver executable; only used when ``interact`` is true.
    interact : bool
        When true, open the page in Chrome via Selenium, then repeatedly scroll
        to the bottom and click the "see more" button before capturing the page
        source. When false, do a plain HTTP GET with ``requests``.
    scrolls : int, optional
        Number of scroll-and-click rounds in interactive mode. Defaults to 0 so
        that non-interactive callers do not have to pass it.
    loadSleep : int or float, optional
        Seconds to wait after the initial page load in interactive mode.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed document.
    """
    if not interact:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # A timeout keeps one unresponsive page from hanging the whole scrape.
        response = requests.get(url, headers=headers, timeout=30)
        return BeautifulSoup(response.content, 'html.parser')

    SCROLL_DOWN = "window.scrollTo(0, document.body.scrollHeight);"
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service)
    try:
        driver.get(url)
        time.sleep(loadSleep)

        load_button = driver.find_element(
            By.XPATH,
            "//button[contains(@class, 'ipc-see-more__button')]"
        )

        for _ in range(scrolls):
            print("Scrolling down...")
            driver.execute_script(SCROLL_DOWN)
            time.sleep(2)
            load_button.click()
            # Fixed pause so the newly requested results can be added to the page.
            time.sleep(10)

        return BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Always close the browser, even if a lookup or click above fails.
        driver.quit()

def get_TitleCode(HTML_CONTENT, imdb_movie_title_code):
    """Append the title code of every listed movie to ``imdb_movie_title_code``.

    Each matching title ``div`` is expected to contain an ``<a>`` whose ``href``
    has the code as its third '/'-separated segment (index 2).
    """
    title_blocks = HTML_CONTENT.find_all(
        'div',
        class_="ipc-title ipc-title--base ipc-title--title ipc-title-link-no-icon ipc-title--on-textPrimary sc-479faa3c-9 dkLVoC dli-title"
    )
    for block in title_blocks:
        link_target = block.find('a')['href']
        segments = link_target.split('/')
        imdb_movie_title_code.append(segments[2])

def get_Name(CONTENT, movie_names):
    """Append the text of the first matching title ``span`` to ``movie_names``.

    Raises ``AttributeError`` if the page has no such span (``find`` returns None).
    """
    title_span = CONTENT.find('span', class_="sc-7f1a92f5-1 benbRT")
    title_text = title_span.text
    movie_names.append(title_text)
    print("DONE", end=" ")

def get_UserRating(CONTENT, movie_ratings):
    """Append the rating found in the first matching ``span``, as a float."""
    rating_span = CONTENT.find('span', class_='sc-bde20123-1 cMEQkK')
    rating_value = float(rating_span.text)
    movie_ratings.append(rating_value)
    print("DONE", end=" ")
    
def get_Release(CONTENT, movie_release_date):
    """Append the text of the sixth matching link (index 5) to ``movie_release_date``.

    NOTE(review): the release year is assumed to sit at position 5 among these
    links; confirm against the current page markup.

    If the page has fewer than six matching links, ``np.nan`` is appended instead
    of raising ``IndexError``, so the per-movie lists stay aligned (the same
    missing-value convention ``get_MetaScore`` uses).
    """
    RELEASE_LINK_INDEX = 5
    links = CONTENT.find_all(
        'a',
        class_='ipc-link ipc-link--baseAlt ipc-link--inherit-color'
    )
    if len(links) > RELEASE_LINK_INDEX:
        movie_release_date.append(links[RELEASE_LINK_INDEX].text)
    else:
        movie_release_date.append(np.nan)
    print("DONE", end=" ")

def get_MetaScore(CONTENT, movie_metascore):
    """Append the Metacritic score as an int, or ``np.nan`` when the page has none."""
    score_box = CONTENT.find(
        'span',
        class_='sc-b0901df4-0 bcQdDJ metacritic-score-box'
    )
    # A falsy lookup result means the score box is absent from this page.
    parsed = int(score_box.text) if score_box else np.nan
    movie_metascore.append(parsed)
    print("DONE", end=" ")
    
def get_Duration(CONTENT, movie_duration):
    """Append the text of the seventh inline-list item (index 6) to ``movie_duration``.

    NOTE(review): the runtime is assumed to sit at position 6 among the
    ``ipc-inline-list__item`` elements; confirm against the current page markup.

    If fewer than seven items are present, ``np.nan`` is appended instead of
    raising ``IndexError``, so the per-movie lists stay aligned.
    """
    DURATION_ITEM_INDEX = 6
    # Named `items`, not `time`, so the imported `time` module is not shadowed.
    items = CONTENT.find_all(
        'li',
        class_='ipc-inline-list__item'
    )
    if len(items) > DURATION_ITEM_INDEX:
        movie_duration.append(items[DURATION_ITEM_INDEX].text)
    else:
        movie_duration.append(np.nan)
    print("DONE", end=" ")

def get_Genre(CONTENT, movie_genres):
    """Append one list holding the text of every matching genre chip."""
    chips = CONTENT.find_all('a', class_='ipc-chip ipc-chip--on-baseAlt')
    labels = [chip.text for chip in chips]
    movie_genres.append(labels)
    print("DONE", end=" ")

def get_Director(CONTENT, movie_directors):
    """Append the text of the first matching metadata link to ``movie_directors``.

    NOTE(review): the first such link is presumed to be the director; verify
    against the page markup.
    """
    first_credit = CONTENT.find(
        'a',
        class_='ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link'
    )
    director_name = first_credit.text
    movie_directors.append(director_name)
    print("DONE", end=" ")

def get_Star(CONTENT, movie_stars):
    """Append one list with the text of each child of the third matching ``ul``.

    NOTE(review): index 2 is presumed to be the "Stars" credit list; verify
    against the page markup.
    """
    credit_lists = CONTENT.find_all(
        'ul',
        class_='ipc-inline-list ipc-inline-list--show-dividers ipc-inline-list--inline ipc-metadata-list-item__list-content baseAlt'
    )
    cast_entries = credit_lists[2]
    movie_stars.append([entry.text for entry in cast_entries])
    print("DONE", end=" ")

def get_review(TITLE_CODE, movie_reviews):
    """Download the reviews page for ``TITLE_CODE`` and append its review titles.

    One list is appended to ``movie_reviews`` per call, holding the text of every
    ``<a class="title">`` element on the page (an empty list when none match).
    """
    review_url = f'https://www.imdb.com/title/{TITLE_CODE}/reviews?spoiler=hide&sort=reviewVolume&dir=desc&ratingFilter=0'
    # Same browser-like User-Agent that load_data sends, so both request paths
    # identify themselves to the server in the same way.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # A timeout keeps one unresponsive page from hanging the whole scrape.
    response = requests.get(review_url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    review_links = soup.find_all('a', class_='title')
    movie_reviews.append([link.text for link in review_links])
    print("DONE", end=" ")

# Compiled once at definition time instead of on every call.
_DURATION_PATTERN = re.compile(r'(?:(?P<hours>\d+)h)? ?(?:(?P<minutes>\d+)m)?')


def convertMin(text):
    """Convert a duration such as ``'2h 32m'``, ``'1h'`` or ``'45m'`` to minutes.

    Parameters
    ----------
    text : str
        Duration text. Surrounding whitespace is ignored.

    Returns
    -------
    int or float
        Total minutes, or ``np.nan`` when ``text`` is not a string (e.g. a NaN
        cell read back from Excel) or contains no non-zero hour/minute part.
    """
    if not isinstance(text, str):
        return np.nan

    # Every part of the pattern is optional, so match() always returns a match
    # object; absent parts come back as None groups.
    match = _DURATION_PATTERN.match(text.strip())
    hours = int(match.group('hours') or 0)
    minutes = int(match.group('minutes') or 0)

    total = hours * 60 + minutes
    return total if total else np.nan

def convert_to_numeric(value):
    """Return ``int(value)``, or ``np.nan`` when the value cannot be converted.

    Covers non-numeric strings and NaN (``ValueError``), unsupported types such
    as ``None`` (``TypeError``) and infinite floats (``OverflowError``).
    """
    try:
        return int(value)
    except (ValueError, TypeError, OverflowError):
        return np.nan

# Filled on the first call so the corpus is read once, not once per review.
_ENGLISH_STOPWORDS = None


def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is split with ``word_tokenize`` and the remaining tokens are joined
    with single spaces. Tokens are compared in lowercase so that capitalised
    stop words ("The", "A", ...) are removed too; the surviving tokens keep
    their original case.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

    kept_tokens = [
        token for token in word_tokenize(text)
        if token.lower() not in _ENGLISH_STOPWORDS
    ]
    return " ".join(kept_tokens)


Constant Variables Initialization

In [5]:
# Path to the local ChromeDriver binary. Written as a raw string so the
# backslashes are literal path separators rather than (invalid) escape sequences.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
DRIVER_PATH = r"A:\chromedriver-win64\chromedriver.exe"

# Search listing: "top_1000" group, sorted by user rating ascending, 250 per page.
URL_MOVIES = 'https://www.imdb.com/search/title/?sort=user_rating,asc&groups=top_1000&count=250'

# Open the listing in a real browser and click the "see more" button three times
# (presumably loading 250 further titles per click; verify) before parsing it.
HTML_TITLECODE = load_data(
    URL_MOVIES,
    DRIVER_PATH,
    interact=True,
    scrolls=3,
    loadSleep=5
)


Scrolling down...
Scrolling down...
Scrolling down...


In [17]:
# Scraping the data from the website

# One list per output column; position k in every list belongs to the same movie.
imdb_movie_title_code = []
movie_names = []
movie_reviews = []
movie_user_ratings = []
movie_release_date = []
movie_metascore = []
movie_duration_time = []
movie_genres = []
movie_directors = []
movie_stars = []

# getting title codes for each movie to be used in the url
get_TitleCode(HTML_TITLECODE, imdb_movie_title_code)

for i, code in enumerate(imdb_movie_title_code):
    IND_MOV_URL = f'https://www.imdb.com/title/{code}/?ref_=tt_urv'

    # getting the HTML of each movie page
    # (`scrolls` is passed explicitly: load_data declares it even though the
    # plain-HTTP path never scrolls)
    HTML_CONTENT = load_data(
        IND_MOV_URL,
        DRIVER_PATH,
        interact=False,
        scrolls=0,
        loadSleep=5
    )

    # getting the data from the HTML
    get_Name(HTML_CONTENT, movie_names)
    get_UserRating(HTML_CONTENT, movie_user_ratings)
    get_Release(HTML_CONTENT, movie_release_date)
    get_MetaScore(HTML_CONTENT, movie_metascore)
    get_Duration(HTML_CONTENT, movie_duration_time)
    get_Genre(HTML_CONTENT, movie_genres)
    get_Director(HTML_CONTENT, movie_directors)
    get_Star(HTML_CONTENT, movie_stars)
    get_review(code, movie_reviews)

    print(f'\nFinished for {i + 1}: [{code}]')

    

DONE DONE DONE DONE DONE DONE DONE DONE DONE 
Finished for 1: [tt0241527]
DONE DONE DONE DONE DONE DONE DONE DONE DONE 
Finished for 2: [tt7131622]
DONE DONE DONE DONE DONE DONE DONE DONE DONE 
Finished for 3: [tt0144084]
DONE DONE DONE DONE DONE DONE DONE DONE DONE 
Finished for 4: [tt0093870]
DONE DONE DONE DONE DONE DONE DONE DONE DONE 
Finished for 5: [tt0387564]
DONE DONE DONE DONE DONE DONE DONE DONE DONE 
Finished for 6: [tt0119116]
DONE DONE DONE DONE DONE DONE DONE DONE DONE 
Finished for 7: [tt0416449]
DONE DONE DONE DONE DONE DONE DONE DONE DONE 
Finished for 8: [tt0409459]
DONE DONE DONE DONE DONE DONE DONE DONE DONE 
Finished for 9: [tt0088680]
DONE DONE DONE DONE DONE DONE DONE DONE DONE 
Finished for 10: [tt0829482]
DONE DONE DONE DONE DONE DONE DONE DONE DONE 
Finished for 11: [tt2582782]
DONE DONE DONE DONE DONE DONE DONE DONE DONE 
Finished for 12: [tt5463162]
DONE DONE DONE DONE DONE DONE DONE DONE DONE 
Finished for 13: [tt0151804]
DONE DONE DONE DONE DONE DONE DONE

In [None]:
# Collect the scraped columns under their output headers and write them to Excel.
# Column order in the file follows the order of the pairs below.
dataframe = dict((
    ('Name', movie_names),
    ('Title Code', imdb_movie_title_code),
    ('Release Year', movie_release_date),
    ('Duration', movie_duration_time),
    ('Genre', movie_genres),
    ('Rating', movie_user_ratings),
    ('MetaScore', movie_metascore),
    ('Directors', movie_directors),
    ('Stars', movie_stars),
    ('Reviews', movie_reviews),
))

pd.DataFrame(data=dataframe).to_excel(excel_writer='1000_IMDB_Movies.xlsx', index=False)


In [6]:
# df = pd.read_excel('1000_IMDB_Movies_Cleaned.xlsx')
# movie_names = df['Name'].tolist()
# imdb_movie_title_code = df['Title Code'].tolist()
# movie_release_date = df['Release Year'].tolist()
# movie_duration_time = df['Duration'].tolist()
# movie_genres = df['Genre'].tolist()
# movie_ratings = df['Rating'].tolist()
# movie_metascore = df['MetaScore'].tolist()
# movie_directors = df['Directors'].tolist()
# movie_stars = df['Stars'].tolist()
# movie_reviews = df['Reviews'].tolist()

Data Wrangling

In [69]:
# Reload the scraped table from the Excel file written by the scraping step.
# The 'Reviews' and 'Genre' columns are parsed with ast.literal_eval in the
# wrangling cell below, i.e. they are expected to arrive here as strings.
df = pd.read_excel('1000_IMDB_Movies.xlsx')
# Bare last expression: displays the frame as the cell output.
df

Unnamed: 0,Name,Title Code,Release Year,Duration,Genre,Rating,MetaScore,Directors,Stars,Reviews
0,Harry Potter and the Sorcerer's Stone,tt0241527,2001,2h 32m,"['Adventure', 'Family', 'Fantasy']",7.6,65.0,Chris Columbus,"['Daniel Radcliffe', 'Rupert Grint', 'Richard ...","[' Great fun!\n', ' Great chemistry set up for..."
1,Once Upon a Time in... Hollywood,tt7131622,2019,2h 41m,"['Comedy', 'Drama']",7.6,83.0,Quentin Tarantino,"['Leonardo DiCaprio', 'Brad Pitt', 'Margot Rob...",[' Really good...up until the incredibly blood...
2,American Psycho,tt0144084,2000,1h 42m,"['Crime', 'Drama', 'Horror']",7.6,64.0,Mary Harron,"['Christian Bale', 'Justin Theroux', 'Josh Luc...","[' useful\n', ' American Psycho\n', ' Behind t..."
3,RoboCop,tt0093870,1987,1h 42m,"['Action', 'Crime', 'Sci-Fi']",7.6,70.0,Paul Verhoeven,"['Peter Weller', 'Nancy Allen', ""Dan O'Herlihy""]","[' Fun 80s icon\n', ' Classic Action in Gory D..."
4,Saw,tt0387564,2004,1h 43m,"['Horror', 'Mystery', 'Thriller']",7.6,46.0,James Wan,"['Cary Elwes', 'Leigh Whannell', 'Danny Glover']","[' A puzzling game worth playing\n', ' convolu..."
...,...,...,...,...,...,...,...,...,...,...
995,The Godfather Part II,tt0071562,1974,3h 22m,"['Crime', 'Drama']",9.0,90.0,Francis Ford Coppola,"['Al Pacino', 'Robert De Niro', 'Robert Duvall']",[' My review of this film is pretty unimportan...
996,The Lord of the Rings: The Return of the King,tt0167260,2003,3h 21m,"['Action', 'Adventure', 'Drama']",9.0,94.0,Peter Jackson,"['Elijah Wood', 'Viggo Mortensen', 'Ian McKell...","[' Pretty much outstanding\n', ' awards winnin..."
997,12 Angry Men,tt0050083,1957,1h 36m,"['Crime', 'Drama']",9.0,97.0,Sidney Lumet,"['Henry Fonda', 'Lee J. Cobb', 'Martin Balsam']","[' Brilliant, brilliant, brilliant!!\n', ' A V..."
998,The Godfather,tt0068646,1972,2h 55m,"['Crime', 'Drama']",9.2,100.0,Francis Ford Coppola,"['Marlon Brando', 'Al Pacino', 'James Caan']","["" #2 on IMDb's Top 100, a multi-Oscar-winner ..."


In [70]:
# converting the data to the required format
# (unparseable years / durations become NaN / <NA>)
df['Release Year'] = df['Release Year'].apply(convert_to_numeric)
df['Duration'] = df['Duration'].apply(convertMin).astype('Int64')

# getting info about the dataframe
print(df.shape, end="\n"*2)
print(df.dtypes, end="\n"*2)
print(df.isnull().sum(), end="\n"*2)
# df.info() prints by itself and returns None, so it is not wrapped in print().
df.info()
print()

# filling missing values with the mode and mean as the data is rich
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the in-place form on df[col] is chained assignment, which pandas
# deprecates and which does not update `df` under Copy-on-Write.
df['Release Year'] = df['Release Year'].fillna(df['Release Year'].mode().values[0])
df['Duration'] = df['Duration'].fillna(int(df['Duration'].mean()))
df['MetaScore'] = df['MetaScore'].fillna(df['MetaScore'].mean())

# list-valued columns are parsed from their string form back into Python lists
df['Reviews'] = df['Reviews'].apply(ast.literal_eval)
df['Genre'] = df['Genre'].apply(ast.literal_eval)

# checking if the missing values are filled
df.info()


(1000, 10)

Name             object
Title Code       object
Release Year    float64
Duration          Int64
Genre            object
Rating          float64
MetaScore       float64
Directors        object
Stars            object
Reviews          object
dtype: object

Name              0
Title Code        0
Release Year     16
Duration         34
Genre             0
Rating            0
MetaScore       155
Directors         0
Stars             0
Reviews           0
dtype: int64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Name          1000 non-null   object 
 1   Title Code    1000 non-null   object 
 2   Release Year  984 non-null    float64
 3   Duration      966 non-null    Int64  
 4   Genre         1000 non-null   object 
 5   Rating        1000 non-null   float64
 6   MetaScore     845 non-null    float64
 7   Directors     1000 no

In [71]:
# Clean every review in one pass, applying the steps in this order:
#   1. drop stop words, 2. strip everything except letters, digits and spaces,
#   3. lowercase the result.
df['Reviews'] = df['Reviews'].apply(
    lambda reviews: [
        re.sub(r'[^A-Za-z0-9 ]+', '', remove_stopwords(review)).lower()
        for review in reviews
    ]
)

In [None]:
# generating a report of the data containing visualizations and statistics
# The report is built from `df` as it stands at this point in the notebook and
# is written to "IMDB_Movies_Report.html" (path relative to the working directory).
profile = ProfileReport(df)
profile.to_file("IMDB_Movies_Report.html")

BOW Model

In [72]:
# One row per individual review. explode() yields NaN for a movie whose review
# list is empty; those rows are dropped because the vectorizers only accept
# strings as documents.
df_Review_explode = df['Reviews'].explode().dropna()

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the text data (result is a sparse document-term matrix)
X = vectorizer.fit_transform(df_Review_explode)

# Convert the result to a dense array
# NOTE: this materialises every zero entry; keep using `X` for anything that
# does not strictly need a dense array.
bag_of_words = X.toarray()

# Get the feature names (words)
feature_names = vectorizer.get_feature_names_out()

# Print the Bag of Words model
print("Bag of Words: ", bag_of_words)
print("Feature Names(BOW): ", feature_names)

Bag of Words:  [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Feature Names(BOW):  ['007' '03' '05' ... 'zwartboek' 'zwick' 'zzzzzzzzzzz']


TF-IDF Model

In [73]:
# Initialize the TfidfVectorizer with its default settings (no custom tokenizer
# is passed). Note: this rebinds `vectorizer`, replacing the CountVectorizer
# created in the Bag-of-Words cell.
vectorizer = TfidfVectorizer()

# Fit and transform the text data (same exploded review series as the BOW model)
X_tfidf = vectorizer.fit_transform(df_Review_explode)

# Convert the result to an array
tfidf_matrix = X_tfidf.toarray()

# Get the feature names (words)
feature_names_tfidf = vectorizer.get_feature_names_out()

# Print the TF-IDF matrix
print("TF-IDF Matrix: ", tfidf_matrix)
print("Feature Names (TF-IDF): ", feature_names_tfidf)


TF-IDF Matrix:  [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Feature Names (TF-IDF):  ['007' '03' '05' ... 'zwartboek' 'zwick' 'zzzzzzzzzzz']


One-hot encoding the Genre column

In [74]:
# Create an instance of MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Apply one-hot encoding and create a new DataFrame: one 0/1 column per genre.
# The frame reuses df's index so that the column-wise concat below pairs each
# movie with its own genre flags even if df's index is not the default 0..n-1.
encoded_genres = pd.DataFrame(
    mlb.fit_transform(df['Genre']),
    columns=mlb.classes_,
    index=df.index,
)

# Concatenate the new DataFrame with the original DataFrame
df_encoded = pd.concat([df, encoded_genres], axis=1)

In [75]:
# Remove the columns that are not needed in the cleaned output ...
df_encoded = df_encoded.drop(columns=['Title Code', 'Release Year', 'Duration', 'Genre'])

# ... and write the remaining columns to an Excel file, without the row index.
df_encoded.to_excel('1000_IMDB_Movies_Cleaned.xlsx', index=False)