## <center> Scraping User Reviews from TMDB</center>

Prerequisites:
* Make sure to place chromedriver.exe in the same directory as your code, in "\chromedriver-win64" subfolder.

In [1]:
import pandas as pd
import re
import os
from tqdm.notebook import tqdm
from datetime import datetime
import time

import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup

from selenium import webdriver 
from selenium.webdriver.chrome.options import Options 
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

In [2]:
def getPage(driver, url):
    driver.get(url)
    time.sleep(5) # Wait for the page to load, also respecting TMDB rate limit: 40 request per 10 seconds
    bsObj = BeautifulSoup(driver.page_source, 'lxml') # Get the page source and parse it with Beautiful Soup
    return bsObj

def download_Ratings (driver, movie_df, main_url, fname):
    
    sub_url = "/reviews"
    count = 0

    movieid = ""
    reviewid = ""
    userid = ""
    user_href = ""
    rating = ""
    review_date = ""
    review_href = ""
    review_text = ""

    if os.path.exists(fname):
        rating_df = pd.read_csv(fname)
    else:
        rating_df = pd.DataFrame(columns=['reviewid', 'userid', 'movieid', 'movie_title', 'rating', 'review_date', 'review_text', 'user_href', 'review_href','download_flag'])

    # The download flag is added to address the daily request limit in TMDB. Only movies without rating will be downloaded in next run.
    filtered_movie_df = movie_df[(movie_df['download_flag'] == 0) | (movie_df['download_flag'].isnull())]
    filtered_movie_df_len = len(filtered_movie_df)

    progress_bar = tqdm(desc='Rating download progress for movie #', total=filtered_movie_df_len)  # progress bar

    for index, row in filtered_movie_df.iterrows():
        try:
            url = main_url + row['movie_href'] + sub_url
            # print(url)
            movieid = row['movieid']
            movie_title = re.sub(r"['\"]", r"\\'", row['movie_title'])
            # print(movieid, movie_title)

            # Open the page
            bsObj = getPage(driver, url)

            time.sleep(2)
            progress_bar.update(1)
            
            # Find all review containers
            div_info = bsObj.find_all('div', {"class":"info"})
            # print("===========================================")

            if div_info:
                for div in div_info:
                    children = div.findChildren()
                    for child in children:
                        if child.name:

                            if child.get('href') and '/review/' in child.get('href'):
                                review_href = child.get('href')
                                reviewid = review_href.replace('/review/', '')
                                # print(reviewid, review_href)
                            
                            if child.get('href') and '/u/' in child.get('href'):
                                user_href = child.get('href')
                                userid = user_href.replace('/u/','')
                                # print(user_href, userid)

                            rating_tag = child.find('div', {"class": "rating_border rating"})
                            if rating_tag: 
                                rating = re.sub('%','',rating_tag.get_text().strip())
                            else:  
                                rating=0
                            # print('rating', rating)

                            date_tag = child.find('h5')
                            if date_tag:
                                # review_date = re.search(r'(\w+ \d{0,2}, \d{4})', date_tag.get_text().strip())
                                review_date = ' '.join(date_tag.get_text().strip().split()[-3:])
                                if review_date:
                                    review_date = datetime.strptime(review_date, "%B %d, %Y").date()
                                    review_date = review_date.strftime("%Y-%m-%d")
                                else:
                                    review_date = '1900-01-01'
                                    # print('review date', review_date)
                                
                    # Populate review text
                    url_review = main_url + review_href
                    bsObj_review = getPage(driver, url_review)
                    div_review = bsObj_review.find('div',{'class':'content column pad'})

                    if div_review:
                        reviews = div_review.get_text(separator="|", strip=True).split("|")
                        reviews = reviews[4:]
                        review_text = ' '.join (reviews)
                        review_text = re.sub(r"['\"]", r"\\'", review_text)

                    # print(f'{reviewid}, {userid}, {movieid}, {movie_title}, {rating}, {review_date}, {review_text}, {user_href}, {review_href}, {download_flag}')
                    # print(f'{movieid}, {review_href}')

                    # Check duplicate
                    if reviewid not in rating_df['reviewid'].values:
                        # Insert row to df
                        new_row = [reviewid, userid, movieid, movie_title, rating, review_date, review_text, user_href, review_href, 0]
                        rating_df.loc[len(rating_df)] = new_row

                        new_row_df = pd.DataFrame([new_row], columns=rating_df.columns)
                        new_row_df.to_csv(fname, index=False, mode='a', header=not os.path.exists(fname))

                    count +=1
                    # print(count)
        
        except Exception as e:
            print(f"Error processing {review_href}: {e}")
            continue

    progress_bar.close()

    # rating_df.drop_duplicates(keep="first", inplace=True)
    # rating_df.to_csv(fname, index=False)

    # Update review_count in movie_df. This is to address the daily request limit in TMDB. Only movies without rating will be downloaded in next session.
    rating_counts = rating_df.groupby('movieid')['rating'].count().reset_index()

    filtered_movie_df = filtered_movie_df.merge(rating_counts, on='movieid', how='left')
    filtered_movie_df ['download_flag'] = filtered_movie_df['rating']
    filtered_movie_df = filtered_movie_df.drop(columns=['rating'])

    movie_df = movie_df[movie_df['download_flag'] != 0]
    movie_df = pd.concat([movie_df, filtered_movie_df], ignore_index=True)
    movie_df.to_csv('output/movie.csv', index=False)

    return rating_df

In [3]:
main_url = "https://www.themoviedb.org"
user_agent = {"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"}

# Set up Chrome options
chrome_options = Options()
# chrome_options.add_argument("--headless=new")
# chrome_options.add_argument("--window-position=-2400,-2400") # hide window
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("--disable-web-security")
chrome_options.add_argument("--disable-site-isolation-trials")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--disable-blink-features=BlockCredentialedSubresources")
chrome_options.add_argument(f"user-agent={user_agent}")

# Set up the WebDriver
chrome_driver = os.getcwd() + "\\chromedriver-win64" + "\\chromedriver.exe"
service = Service(chrome_driver)
driver = webdriver.Chrome(service=service, options=chrome_options)

In [4]:
movie_df = pd.read_csv('output/movie.csv')
print(movie_df.shape)
movie_df.head(2)

(2, 13)


Unnamed: 0,movieid,movie_title,movie_release_date,movie_genre,movie_rating,movie_href,movie_desc,movie_cast,movie_tag,budget,revenue,page_count,download_flag
0,19995-avatar,Avatar,2010-08-26,Action|Adventure|Fantasy|Science Fiction|,76,/movie/19995-avatar,"In the 22nd century, a paraplegic Marine is di...",Sam Worthington|Zoe Saldaña|Sigourney Weaver|S...,paraplegic|culture clash|attachment to nature|...,237000000.0,2923706000.0,1,0
1,299534-avengers-endgame,Avengers: Endgame,2019-04-24,Adventure|Science Fiction|Action|,82,/movie/299534-avengers-endgame,After the devastating events of Avengers: Infi...,Robert Downey Jr.|Chris Evans|Mark Ruffalo|Chr...,space travel|time travel|time machine|sequel|s...,356000000.0,2799439000.0,1,0


In [5]:
rating_df = download_Ratings (driver, movie_df, main_url, "output/rating.csv")
print(rating_df.shape)
rating_df.head(2)

Rating download progress for movie #:   0%|          | 0/2 [00:00<?, ?it/s]

(16, 10)


Unnamed: 0,reviewid,userid,movieid,movie_title,rating,review_date,review_text,user_href,review_href,download_flag
0,5b6964e1925141404f0d1094,daisyrowley,19995-avatar,Avatar,0,2018-08-07,"Avatar is an overwhelming, immersive spectacle...",/u/daisyrowley,/review/5b6964e1925141404f0d1094,0
1,5ea448e4bdc34c0020261327,John+Chard,19995-avatar,Avatar,0,2020-04-25,"It\'s cinema Jim, but not as we know it. So he...",/u/John+Chard,/review/5ea448e4bdc34c0020261327,0


In [6]:
driver.close
driver.quit

<bound method ChromiumDriver.quit of <selenium.webdriver.chrome.webdriver.WebDriver (session="432d7988ab1bf2ba97828d983c96fa30")>>