## <center> Scraping Movies from TMDB</center>

Prerequisites:
* Place `chromedriver.exe` inside a `chromedriver-win64` subfolder of the directory this notebook runs from (the driver is loaded from `<working directory>\chromedriver-win64\chromedriver.exe`).

In [6]:
import pandas as pd
import re
import os
from tqdm.notebook import tqdm
from datetime import datetime
import time

import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup

from selenium import webdriver 
from selenium.webdriver.chrome.options import Options 
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

In [7]:
def getPage(driver, url, wait_seconds=5):
    """Open `url` in the browser and return the rendered page as a BeautifulSoup tree.

    Args:
        driver: Selenium WebDriver used to navigate to the page.
        url: Address of the page to load.
        wait_seconds: Fixed pause after navigation, in seconds. It gives the page
            time to load and throttles requests (the notebook targets the TMDB
            limit of 40 requests per 10 seconds). Defaults to 5.

    Returns:
        BeautifulSoup object built from `driver.page_source` with the 'lxml' parser.
    """
    driver.get(url)
    time.sleep(wait_seconds)  # Let the page finish loading and stay under the rate limit
    return BeautifulSoup(driver.page_source, 'lxml')

# def load_movie_links (fname='ml-32m/links.csv'):
#     df = pd.read_csv(fname)
#     df=df[['movieId', 'tmdbId']]
#     return df

def _join_tag_texts(tags):
    """Return the stripped text of each tag, every entry followed by a '|'."""
    return "".join(tag.get_text().strip() + "|" for tag in tags)


def _first_amount(pattern, text):
    """Return the first amount captured by `pattern` in `text` as a float, or 0.0 if absent."""
    found = re.search(pattern, text)
    return float(found.group(1).replace(",", "")) if found else 0.0


def download_Movies (driver, main_url, fname, max_count):
    """Scrape movies from the revenue-sorted listing and append them to a CSV file.

    Listing pages are walked in order (page 1, 2, ...). For every movie card whose
    id is not already known, the movie's detail page is opened and one row is
    appended both to `fname` and to the returned DataFrame. A movie that raises
    while being parsed or saved is reported with `print` and skipped.

    Args:
        driver: Selenium WebDriver passed on to `getPage`.
        main_url: Site root; listing and detail URLs are built from it.
        fname: CSV path. If it exists it is loaded and used for duplicate
            detection; new rows are appended to it.
        max_count: Number of *new* movies to save before stopping.

    Returns:
        DataFrame holding the previously saved rows (if any) plus the new ones.
    """
    columns = ['movieid', 'movie_title','movie_release_date','movie_genre', 'movie_rating', 'movie_href', 'movie_desc', 'movie_cast', 'movie_tag', 'budget', 'revenue', 'page_count', 'download_flag']

    if os.path.exists(fname):
        movie_df = pd.read_csv(fname)
    else:
        movie_df = pd.DataFrame(columns=columns)
        # Create the target folder up front: otherwise every save would fail, be
        # swallowed by the per-movie handler, and the page loop would never end.
        out_dir = os.path.dirname(fname)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # Ids already on disk, so duplicates are skipped before the slow detail fetch
    known_ids = set(movie_df['movieid'].astype(str))

    # Sort movies by revenue to include popular movies
    sub_url = main_url + "/movie?with_original_language=en&sort_by=revenue.desc&page="
    budget_pattern = r"Budget\s*\$(\d{1,3}(?:,\d{3})*(?:\.\d{2})?|\d+\.\d{2})"
    revenue_pattern = r"Revenue\s*\$(\d{1,3}(?:,\d{3})*(?:\.\d{2})?|\d+\.\d{2})"

    page_count = 0
    download_flag = 0
    count = 0
    progress_bar = tqdm(desc='Movie download progress', total=max_count)  # progress bar

    try:
        while count < max_count:
            page_count += 1
            listing_page = getPage(driver, sub_url + str(page_count))
            cards = listing_page.find_all("div", {"class":"card style_1"})

            # An empty listing means there is nothing left to scrape; without this
            # guard the loop would keep requesting pages forever.
            if not cards:
                print(f"No movie cards found on page {page_count}; stopping.")
                break

            for card in cards:
                # Reset per-movie fields before the try block so the error report
                # below never touches unbound or stale names.
                movie_href = ""
                movieid = ""
                movie_title = ""
                movie_release_date = ""
                movie_genre = ""
                movie_rating = 0
                movie_desc = ""
                movie_cast = ""
                movie_tag = ""
                budget = 0.0
                revenue = 0.0

                try:
                    # Populate movie id, title, link
                    link = card.find('a', {"class": "image"})
                    movie_href = link.get('href').strip()
                    movieid = re.sub('/movie/','', movie_href).strip()

                    # Check duplicate before paying for the detail-page request
                    if movieid in known_ids:
                        continue

                    movie_title = link.get('title').strip()
                    # Quotes are replaced by a backslash-escaped single quote
                    movie_title = re.sub(r"['\"]", r"\\'", movie_title)

                    # Listing shows e.g. "Aug 26, 2010"; store it as YYYY-MM-DD
                    movie_release_date = card.find('p').get_text().strip()
                    if movie_release_date:
                        movie_release_date = datetime.strptime(movie_release_date, "%b %d, %Y").date()
                        movie_release_date = movie_release_date.strftime("%Y-%m-%d")

                    detail_page = getPage(driver, main_url + movie_href)

                    # Populate movie genres (a missing genres block raises and skips the movie)
                    genre_span = detail_page.find('span', {"class":"genres"})
                    movie_genre = "".join(a.get_text() + "|" for a in genre_span.find_all("a")).strip()

                    # Populate movie rating
                    score_div = detail_page.find('div', {"class":"user_score_chart"})
                    if score_div:
                        movie_rating = score_div.get('data-percent').strip()

                    # Populate movie description
                    overview_div = detail_page.find('div', {"class":"overview"})
                    if overview_div:
                        movie_desc = overview_div.get_text().strip()
                        movie_desc = re.sub(r"['\"]", r"\\'", movie_desc)

                    # Populate casts: person links that carry visible text
                    cast_list = detail_page.find('ol', {"class":"people scroller"})
                    if cast_list:
                        person_links = cast_list.find_all('a', href=lambda href: href and '/person/' in href)
                        movie_cast = _join_tag_texts(tag for tag in person_links if tag.get_text(strip=True))

                    # Populate keywords
                    keyword_links = detail_page.find_all('a', href=lambda href: href and '/keyword/' in href)
                    movie_tag = _join_tag_texts(tag for tag in keyword_links if tag.get_text(strip=True))

                    # Populate revenue & budget (a missing facts section raises and skips the movie)
                    facts = detail_page.find('section', {"class":"facts left_column"})
                    facts_text = " ".join(tag.get_text() for tag in facts.find_all('p'))
                    budget = _first_amount(budget_pattern, facts_text)
                    revenue = _first_amount(revenue_pattern, facts_text)

                    new_row = [movieid, movie_title, movie_release_date, movie_genre, movie_rating, movie_href, movie_desc, movie_cast, movie_tag, budget, revenue, page_count, download_flag]

                    # Persist first: if the write fails, the in-memory frame stays
                    # consistent with what is on disk.
                    new_row_df = pd.DataFrame([new_row], columns=movie_df.columns)
                    new_row_df.to_csv(fname, index=False, mode='a', header=not os.path.exists(fname))
                    movie_df.loc[len(movie_df)] = new_row
                    known_ids.add(movieid)

                    count += 1
                    time.sleep(2)
                    progress_bar.update(1)

                except Exception as e:
                    print(f"Error processing {movie_href}: {e}")
                    print(movieid, movie_title, movie_release_date, movie_genre, movie_rating, movie_href, movie_desc, movie_cast, movie_tag, budget, revenue)
                    continue

                if count >= max_count:
                    break
    finally:
        # Close the progress bar even if a listing-page request raises
        progress_bar.close()

    return movie_df

In [8]:
main_url = "https://www.themoviedb.org"
# Must be a plain string: a {...} set literal here would be rendered into the
# user-agent argument below with its braces and quotes included.
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"

# Set up Chrome options
chrome_options = Options()
# chrome_options.add_argument("--headless=new")
# chrome_options.add_argument("--window-position=-2400,-2400") # hide window
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--disable-infobars")
# NOTE(review): this switch relaxes browser security checks for the session; confirm it is needed for scraping.
chrome_options.add_argument("--disable-web-security")
chrome_options.add_argument("--disable-site-isolation-trials")
# NOTE(review): the same switch is passed twice below; verify Chrome honors both values and not only the last.
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--disable-blink-features=BlockCredentialedSubresources")
chrome_options.add_argument(f"user-agent={user_agent}")

# Set up the WebDriver: chromedriver.exe is expected in the "chromedriver-win64"
# subfolder of the current working directory.
chrome_driver = os.path.join(os.getcwd(), "chromedriver-win64", "chromedriver.exe")
service = Service(chrome_driver)
driver = webdriver.Chrome(service=service, options=chrome_options)

In [9]:
# Small trial run that saves two new movies. For the full download, use
# fname="ml-32m/links_tmdb.csv" and max_count=5000 instead.
movie_df = download_Movies(
    driver=driver,
    main_url=main_url,
    fname="output/movie.csv",
    max_count=2,  # number of movies to download
)
print(movie_df.shape)
movie_df.head(n=2)

Movie download progress:   0%|          | 0/2 [00:00<?, ?it/s]

(4, 13)


Unnamed: 0,movieid,movie_title,movie_release_date,movie_genre,movie_rating,movie_href,movie_desc,movie_cast,movie_tag,budget,revenue,page_count,download_flag
0,19995-avatar,Avatar,2010-08-26,Action|Adventure|Fantasy|Science Fiction|,76,/movie/19995-avatar,"In the 22nd century, a paraplegic Marine is di...",Sam Worthington|Zoe Saldaña|Sigourney Weaver|S...,paraplegic|culture clash|attachment to nature|...,237000000.0,2923706000.0,1,6
1,299534-avengers-endgame,Avengers: Endgame,2019-04-24,Adventure|Science Fiction|Action|,82,/movie/299534-avengers-endgame,After the devastating events of Avengers: Infi...,Robert Downey Jr.|Chris Evans|Mark Ruffalo|Chr...,space travel|time travel|time machine|sequel|s...,356000000.0,2799439000.0,1,10


In [10]:
# quit() closes every window and ends the WebDriver session. A preceding close()
# is redundant, and if the window is already gone it raises before quit() can run,
# leaving the chromedriver process alive.
driver.quit()