## <center> Scraping User Profiles </center>

Prerequisites:
* Make sure to place chromedriver.exe in the same directory as your code, in "\chromedriver-win64" subfolder.
* Make sure you have mysql installed. 
* Make sure you have a database named "scraping". Otherwise, execute this command.
<br>CREATE DATABASE scraping;
* Make sure you have table named "movie" in scraping database. Otherwise, execute this command.
<br>
<span style="color:green;">CREATE TABLE movie (movieid VARCHAR(255) PRIMARY KEY, movie_title VARCHAR(255), movie_year INT, movie_release_date DATE, movie_genre VARCHAR(255), movie_rating INT, movie_href VARCHAR(255), movie_desc TEXT, movie_cast TEXT, movie_tag TEXT, budget DECIMAL(15, 2), revenue DECIMAL(15, 2), page_count INT, download_flag INT);</span>
* Make sure you have table named "rating" in scraping database. Otherwise, execute this command.
<br>
<span style="color:green;">CREATE TABLE rating (reviewid VARCHAR(255) PRIMARY KEY, userid VARCHAR(255), movieid VARCHAR(255), movie_title VARCHAR(255), rating INT, review_date DATE, review_text TEXT, user_href VARCHAR(255), review_href VARCHAR(255), download_flag INT);</span>
* Make sure you have table named "user" in scraping database. Otherwise, execute this command.
<br>
<span style="color:green;">CREATE TABLE user (userid VARCHAR(50) PRIMARY KEY, user_name VARCHAR(100), user_href VARCHAR(255), user_join_date DATE);</span>

In [1]:
import pandas as pd
import re
import os
from tqdm.notebook import tqdm
from datetime import datetime
import time

import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup

from selenium import webdriver 
from selenium.webdriver.chrome.options import Options 
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

import pymysql

In [None]:
def getPage(driver, url):
    driver.get(url)
    time.sleep(5) # Wait for the page to load, also respecting TMDB rate limit: 40 request per 10 seconds
    bsObj = BeautifulSoup(driver.page_source, 'lxml') # Get the page source and parse it with Beautiful Soup
    return bsObj

def download_Users (driver, conn, cur, main_url):

    count = 0
    unique_user_df_len = 0

    userid = ""
    user_href = ""
    user_join_date = ""
    user_genres = ""

    filtered_rating_df = select_rating (cur)
    filtered_rating_df_len = len(filtered_rating_df)

    progress_bar = tqdm(desc='User profile download progress', total=filtered_rating_df_len)  # progress bar

    for index, row in filtered_rating_df.iterrows():
        try:
            url = main_url + row['user_href']
            # print(url)

            reviewid = row['reviewid']
            userid = row['userid']
            user_href = row['user_href']
            # print(reviewid, userid, user_href)

            # Open the page
            bsObj = getPage(driver, url)
            
            time.sleep(1)
            progress_bar.update(1)
            
            # Find all review containers
            h2_tag = bsObj.find('h2')
            user_name = h2_tag.get_text().strip()
            # print(f'{user_name}')

            user_join_date = '1900-01-01'
            if 'deleted' not in user_name and 'suspended' not in user_name and 'can\'t find' not in user_name: # handling deleted user account
                h3_tag = bsObj.find('h3')
                if h3_tag:
                    user_join_text = re.search(r'(\w+) (\d{4})', h3_tag.get_text())
                    user_join_date = datetime.strptime(user_join_text[0], "%B %Y").date()
                else:
                    user_join_date = '1900-01-01'
            # print(user_join_date)

            # print(f'{userid}, {user_name}, {user_href}, {user_join_date}')

            # Insert new record to database
            insert_user_to_db(cur, userid, user_name, user_href, user_join_date)
            
            # Log entry to a text file in case of download stops due to reached limit
            with open('log_user.txt', 'a') as file:
                file.write(f'{userid}, {user_name}, {user_href}, {user_join_date}\n')

            count +=1
            
            # Update download_flag in rating table. This is to address the daily request limit in TMDB. Only movies without rating will be downloaded in next session.       
            update_rating_downloadflag(cur, reviewid)
            conn.commit()

        except Exception as e:
            print(f"Error processing {user_href}: {e}")
            print(f'{userid}, {user_name}, {user_href}, {user_join_date}')
            continue

    progress_bar.close()

    user_df = select_all_user (cur)
    
    return user_df

def insert_user_to_db (cursor, userid, user_name, user_href, user_join_date):
    # Insert new if user record is not found, else update the user record to keep user info in db up-to-date
    sql = '''
    INSERT INTO user (userid, user_name, user_href, user_join_date)
    VALUES (%s, %s, %s, %s)
    ON DUPLICATE KEY UPDATE
        user_name = values(user_name),
        user_href = values(user_href),
        user_join_date = values(user_join_date);
    '''
    cursor.execute ("USE scraping;")
    cursor.execute(sql, (userid, user_name, user_href, user_join_date))

def select_all_rating (cursor):
    cursor.execute ("USE scraping;")
    cursor.execute("SELECT * from rating;")
    rows = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]
    df_mysql = pd.DataFrame(rows, columns=columns)
    return df_mysql

def select_all_user (cursor):
    cursor.execute ("USE scraping;")
    cursor.execute("SELECT * from user;")
    rows = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]
    df_mysql = pd.DataFrame(rows, columns=columns)
    return df_mysql

def select_rating (cursor):
    cursor.execute ("USE scraping;")
    cursor.execute("SELECT rating.* from rating left join movie on movie.movieid = rating.movieid where rating.download_flag = 0 or rating.download_flag is null order by movie.page_count, movie.movieid, rating.reviewid;")
    rows = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]
    df_mysql = pd.DataFrame(rows, columns=columns)
    return df_mysql

def update_rating_downloadflag (cursor, reviewid):
    sql = "UPDATE rating SET download_flag = 1 where reviewid ='" + reviewid + "';"
    cursor.execute ("USE scraping;")
    cursor.execute(sql)

In [3]:
main_url = "https://www.themoviedb.org"
user_agent = {"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"}

# Set up Chrome options
chrome_options = Options()
# chrome_options.add_argument("--headless=new")
# chrome_options.add_argument("--window-position=-2400,-2400") # hide window
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("--disable-web-security")
chrome_options.add_argument("--disable-site-isolation-trials")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--disable-blink-features=BlockCredentialedSubresources")
chrome_options.add_argument(f"user-agent={user_agent}")

# Set up the WebDriver
chrome_driver = os.getcwd() + "\\chromedriver-win64" + "\\chromedriver.exe"
service = Service(chrome_driver)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Connect to database
try:
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='scraping')
    cur = conn.cursor()
    print("Connection successful!")
except pymysql.MySQLError as e:
    print(f"Error connecting to the database: {e}")

Connection successful!


In [4]:
user_df = download_Users (driver, conn, cur, main_url)
print(user_df.shape)
display(user_df.head(2))

# Export movie to csv
user_df.to_csv("user.csv", index=False)

User profile download progress:   0%|          | 0/31 [00:00<?, ?it/s]

(1203, 4)


Unnamed: 0,userid,user_name,user_href,user_join_date
0,007ace,007ace,/u/007ace,2016-07-01
1,15bq1a05k9,Waseem Farooq Shaik,/u/15bq1a05k9,2019-01-01


In [None]:
cur.close()
conn.close()
driver.close
driver.quit

<bound method ChromiumDriver.quit of <selenium.webdriver.chrome.webdriver.WebDriver (session="dcd57b3e1d46f5bd20824d6525a84e38")>>

: 