## <center> Scraping Movie Ratings </center>

Prerequisites:
* Make sure to place chromedriver.exe in the same directory as your code, in "\chromedriver-win64" subfolder.
* Make sure you have mysql installed. 
* Make sure you have a database named "scraping". Otherwise, execute this command.
<br>CREATE DATABASE scraping;
* Make sure you have table named "movie" in scraping database. Otherwise, execute this command.
<br>
<span style="color:green;">CREATE TABLE movie (movieid VARCHAR(255) PRIMARY KEY, movie_title VARCHAR(255), movie_year INT, movie_release_date DATE, movie_genre VARCHAR(255), movie_rating INT, movie_href VARCHAR(255), movie_desc TEXT, movie_cast TEXT, movie_tag TEXT, budget DECIMAL(15, 2), revenue DECIMAL(15, 2), page_count INT, download_flag INT);</span>
* Make sure you have table named "rating" in scraping database. Otherwise, execute this command.
<br>
<span style="color:green;">CREATE TABLE rating (reviewid VARCHAR(255) PRIMARY KEY, userid VARCHAR(255), movieid VARCHAR(255), movie_title VARCHAR(255), rating INT, review_date DATE, review_text TEXT, user_href VARCHAR(255), review_href VARCHAR(255), download_flag INT);</span>
* Make sure you have table named "user" in scraping database. Otherwise, execute this command.
<br>
<span style="color:green;">CREATE TABLE user (userid VARCHAR(50) PRIMARY KEY, user_name VARCHAR(100), user_href VARCHAR(255), user_join_date DATE);</span>

In [1]:
import pandas as pd
import re
import os
from tqdm.notebook import tqdm
from datetime import datetime
import time

import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup

from selenium import webdriver 
from selenium.webdriver.chrome.options import Options 
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

import pymysql

In [2]:
def getPage(driver, url):
    driver.get(url)
    time.sleep(5) # Wait for the page to load, also respecting TMDB rate limit: 40 request per 10 seconds
    bsObj = BeautifulSoup(driver.page_source, 'lxml') # Get the page source and parse it with Beautiful Soup
    return bsObj

def download_Ratings (driver, conn, cur, main_url):
    
    sub_url = "/reviews"
    count = 0

    reviewid = ""
    userid = ""
    movieid = ""
    movie_title= ""
    rating = ""
    review_date = ""
    review_text = ""
    user_href = ""
    review_href = ""
    download_flag = 0

    filtered_movie_df = select_movie (cur)
    filtered_movie_df_len = len(filtered_movie_df)

    progress_bar = tqdm(desc='Rating download progress for movie #', total=filtered_movie_df_len)  # progress bar

    for index, row in filtered_movie_df.iterrows():
        try:
            url = main_url + row['movie_href'] + sub_url
            # print(url)
            movieid = row['movieid']
            movie_title = re.sub(r"['\"]", r"\\'", row['movie_title'])
            # print(movieid, movie_title)

            # Open the page
            bsObj = getPage(driver, url)

            time.sleep(2)
            progress_bar.update(1)
            
            # Find all review containers
            div_info = bsObj.find_all('div', {"class":"info"})
            # print("===========================================")

            if div_info:
                for div in div_info:
                    children = div.findChildren()
                    for child in children:
                        if child.name:

                            if child.get('href') and '/review/' in child.get('href'):
                                review_href = child.get('href')
                                reviewid = review_href.replace('/review/', '')
                                # print(reviewid, review_href)
                            
                            if child.get('href') and '/u/' in child.get('href'):
                                user_href = child.get('href')
                                userid = user_href.replace('/u/','')
                                # print(user_href, userid)

                            rating_tag = child.find('div', {"class": "rating_border rating"})
                            if rating_tag:
                                if rating_tag.get_text().strip()!='':
                                    rating = re.sub('%','',rating_tag.get_text().strip())
                                else:
                                    rating=0
                                # print('rating', rating)

                            date_tag = child.find('h5')
                            if date_tag:
                                # review_date = re.search(r'(\w+ \d{0,2}, \d{4})', date_tag.get_text().strip())
                                review_date = ' '.join(date_tag.get_text().strip().split()[-3:])
                                if review_date:
                                    review_date = datetime.strptime(review_date, "%B %d, %Y").date()
                                    review_date = review_date.strftime("%Y-%m-%d")
                                else:
                                    review_date = '1900-01-01'
                                    # print('review date', review_date)
                                
                    # Populate review text
                    url_review = main_url + review_href
                    bsObj_review = getPage(driver, url_review)
                    div_review = bsObj_review.find('div',{'class':'content column pad'})

                    if div_review:
                        reviews = div_review.get_text(separator="|", strip=True).split("|")
                        reviews = reviews[4:]
                        review_text = ' '.join (reviews)
                        review_text = re.sub(r"['\"]", r"\\'", review_text)

                    # print(f'{reviewid}, {userid}, {movieid}, {movie_title}, {rating}, {review_date}, {review_text}, {user_href}, {review_href}, {download_flag}')
                    # print(f'{movieid}, {review_href}')

                    # Insert new record to database
                    insert_update_rating_to_db(cur, reviewid, userid, movieid, movie_title, rating, review_date, review_text, user_href, review_href, download_flag)
                    conn.commit()

                    # Log entry to a text file in case of download stops due to reached limit
                    with open('log_rating.txt', 'a') as file:
                        file.write(f'{movieid}, {review_href}, {user_href}\n')
                    
                    count +=1
            
            # Update download_flag in movie table. This is to address the daily request limit in TMDB. Only movies without rating will be downloaded in next session.
            update_movie_downloadflag(cur, movieid)
            conn.commit()

        except Exception as e:
            print(f"Error processing {review_href}: {e}")
            print(f'{reviewid}, {userid}, {movieid}, {movie_title}, {rating}, {review_date}, {review_text}, {user_href}, {review_href}, {download_flag}')
            continue

    progress_bar.close()
    
    rating_df = select_all_rating (cur)

    return rating_df

def insert_update_rating_to_db (cursor, reviewid, userid, movieid, movie_title, rating, review_date, review_text, user_href, review_href, download_flag):
    sql = '''
    INSERT INTO rating (reviewid, userid, movieid, movie_title, rating, review_date, review_text, user_href, review_href, download_flag)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON DUPLICATE KEY UPDATE
        userid = values(userid),
        movieid = values(movieid),
        movie_title = values(movie_title),
        rating = values(rating),
        review_date = values(review_date),
        review_text = values(review_text),
        user_href = values(user_href),
        review_href = values(review_href);
    '''
    cursor.execute ("USE scraping;")
    cursor.execute(sql, (reviewid, userid, movieid, movie_title, rating, review_date, review_text, user_href, review_href, download_flag))

def select_all_movie (cursor):
    cursor.execute ("USE scraping;")
    cursor.execute("SELECT * from movie;")
    rows = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]
    df_mysql = pd.DataFrame(rows, columns=columns)
    return df_mysql

def select_all_rating (cursor):
    cursor.execute ("USE scraping;")
    cursor.execute("SELECT * from rating;")
    rows = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]
    df_mysql = pd.DataFrame(rows, columns=columns)
    return df_mysql

def select_movie (cursor):
    cursor.execute ("USE scraping;")
    cursor.execute("SELECT * from movie where download_flag = 0 or download_flag is null order by page_count, movieid;")
    rows = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]
    df_mysql = pd.DataFrame(rows, columns=columns)
    return df_mysql

def update_movie_downloadflag (cursor, movieid):
    sql = "UPDATE movie SET download_flag = 1 where movieid ='" + movieid + "';"
    cursor.execute ("USE scraping;")
    cursor.execute(sql)

In [3]:
main_url = "https://www.themoviedb.org"
user_agent = {"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"}

# Set up Chrome options
chrome_options = Options()
# chrome_options.add_argument("--headless=new")
# chrome_options.add_argument("--window-position=-2400,-2400") # hide window
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("--disable-web-security")
chrome_options.add_argument("--disable-site-isolation-trials")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--disable-blink-features=BlockCredentialedSubresources")
chrome_options.add_argument(f"user-agent={user_agent}")

# Set up the WebDriver
chrome_driver = os.getcwd() + "\\chromedriver-win64" + "\\chromedriver.exe"
service = Service(chrome_driver)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Connect to database
try:
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='scraping')
    cur = conn.cursor()
    print("Connection successful!")
except pymysql.MySQLError as e:
    print(f"Error connecting to the database: {e}")

Connection successful!


In [4]:
rating_df = download_Ratings (driver, conn, cur, main_url)
print(rating_df.shape)
display(rating_df.tail(2))

# Export movie to csv
rating_df.to_csv("rating.csv", index=False)

Rating download progress for movie #:   0%|          | 0/4796 [00:00<?, ?it/s]

(10468, 10)


Unnamed: 0,reviewid,userid,movieid,movie_title,rating,review_date,review_text,user_href,review_href,download_flag
10466,67267f06554083a56a0d4e20,GenerationofSwine,9411-fallen,Fallen,0,2024-11-02,I generally have a distaste for movies where D...,/u/GenerationofSwine,/review/67267f06554083a56a0d4e20,0
10467,672794d7c0bc0749d0d89f29,BiankaMalburg,1184918-the-wild-robot,The Wild Robot,90,2024-11-03,Dreamworks at its best!!!! 😍😭❤️,/u/BiankaMalburg,/review/672794d7c0bc0749d0d89f29,0


In [5]:
cur.close()
conn.close()
driver.close
driver.quit

<bound method ChromiumDriver.quit of <selenium.webdriver.chrome.webdriver.WebDriver (session="2675a55c5fd4e183112336a76c0b0c4e")>>