In [1]:
# pip install beautifulsoup4 requests selenium

In [2]:
# Imports for scraping
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time


# Imports for data formatting
import uuid
import hashlib
import datetime
import json


In [3]:
# Collection date stamped on every record, fixed once when this cell runs.
TODAY = datetime.date.today().strftime("%Y-%m-%d")


def generate_json(author, text, date_created):
    """Build the corpus record (a JSON-serialisable dict) for one review.

    Args:
        author: Author identifier; hashed into ``authorIDs`` and stored
            verbatim under ``sourceSpecific.authorName``.
        text: Full review text.
        date_created: Review date formatted like ``"Jan 5, 2020"``
            (``%b %d, %Y``). Surrounding whitespace is ignored.

    Returns:
        dict: The record. ``documentID`` and ``authorIDs`` are the MD5 digests
        of ``text`` and ``author`` rendered as UUID strings, so the same input
        always yields the same IDs.

    Raises:
        ValueError: If ``date_created`` does not match ``%b %d, %Y``.
    """
    output = {}

    # Deterministic document ID: MD5 of the text, formatted as a UUID.
    output['documentID'] = str(uuid.UUID(hashlib.md5(text.encode('utf-8')).hexdigest()))

    # Deterministic author ID, derived the same way from the author string.
    author_id = str(uuid.UUID(hashlib.md5(author.encode('utf-8')).hexdigest()))
    output['authorIDs'] = [author_id]

    output['fullText'] = text
    # The whole text is attributed to the single author.
    output["spanAttribution"] = [{"authorID": author_id,
                                  "start": 0,
                                  "end": len(text)}]
    output["isNeedle"] = False
    output["collectionNum"] = "HRS 1"
    output["source"] = 'https://www.rottentomatoes.com/'
    output["dateCollected"] = TODAY
    # Scraped text may carry stray whitespace, which strptime would reject.
    output["dateCreated"] = datetime.datetime.strptime(
        date_created.strip(), "%b %d, %Y"
    ).strftime("%Y-%m-%d")
    output["publiclyAvailable"] = True
    output["deidentified"] = True
    output["languages"] = ["en"]
    # split() with no argument splits on any whitespace run, so double spaces
    # and newlines do not inflate the count and empty text counts as 0 words.
    output["lengthWords"] = len(text.split())
    output["sourceSpecific"] = {"authorName": author}
    return output

In [4]:
def scrape_reviews(critic):
    """Scrape every review listed on a critic's Rotten Tomatoes movies page.

    Opens a Chrome WebDriver, parses the reviews table on each page and clicks
    the "Next" button until it can no longer be clicked or the table stops
    refreshing.

    Args:
        critic: Critic slug used in the URL
            (``https://www.rottentomatoes.com/critics/{critic}/movies/``).

    Returns:
        list[dict]: One ``generate_json`` record per review whose text is
        longer than 5 characters.
    """
    # Local import keeps the narrowed exception handling self-contained in
    # this cell; TimeoutException and NoSuchElementException both derive
    # from WebDriverException.
    from selenium.common.exceptions import WebDriverException

    page_url = f'https://www.rottentomatoes.com/critics/{critic}/movies/'
    table_selector = 'table[data-qa="critic-reviews-table"]'
    reviews = []

    driver = webdriver.Chrome()
    try:
        driver.get(page_url)
        # Short wait for the "Next" button, longer wait for the table refresh.
        next_wait = WebDriverWait(driver, 4)
        table_wait = WebDriverWait(driver, 10)

        while True:
            # Parse the reviews table of the currently displayed page.
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            table = soup.find('table', {'data-qa': 'critic-reviews-table'})
            tbody = table.find('tbody') if table else None
            rows = tbody.find_all('tr', {'data-qa': 'row'}) if tbody else []

            for row in rows:
                review_td = row.find('td', {'data-qa': 'critic-review'})
                if not review_td:
                    continue
                text_span = review_td.find('span')
                date_div = review_td.find('div')
                date_span = date_div.find('span') if date_div else None
                # Skip rows missing the text or date element instead of
                # crashing on a None dereference.
                if text_span is None or date_span is None:
                    continue
                review_text = text_span.text.strip()
                if len(review_text) > 5:
                    reviews.append(
                        generate_json(critic, review_text, date_span.text.strip())
                    )

            try:
                next_button = next_wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, 'rt-button.next'))
                )
                # Grab the current table BEFORE clicking: looking it up
                # afterwards can return the already-refreshed table, which
                # never goes stale and would end pagination early.
                current_table = driver.find_element(By.CSS_SELECTOR, table_selector)
                next_button.click()
                # Wait for the old table to be replaced by the next page's.
                table_wait.until(EC.staleness_of(current_table))
                time.sleep(0.2)
            except WebDriverException:
                # No clickable "Next" button or no refresh: treat as last page.
                break
    finally:
        # Always release the browser, even if parsing or navigation fails.
        driver.quit()

    print("reviews found for critic " + critic + ": " + str(len(reviews)))
    return reviews


def get_reviews(critics):
    """Scrape each critic in turn.

    Returns a list with one entry per critic, in input order; each entry is
    the list of review records produced by ``scrape_reviews`` for that critic.
    """
    return [scrape_reviews(slug) for slug in critics]



In [5]:
# Critic slugs to scrape (as they appear in the Rotten Tomatoes URL).
critics = ["ella-taylor"]
scraped = get_reviews(critics)

# Append one JSON line per review to the corpus and one "critic: count" line
# per critic to the summary file.
with open("rtcorpus.jsonl", 'a', encoding='utf-8') as corpus, \
        open("critics.txt", 'a', encoding='utf-8') as critic_list:
    # get_reviews returns results in input order, so pair each critic with its
    # result list rather than reading the name from rl[0], which raises
    # IndexError when a critic has no reviews.
    for critic, rl in zip(critics, scraped):
        critic_list.write(critic + ': ' + str(len(rl)) + '\n')
        for review in rl:
            corpus.write(json.dumps(review) + '\n')
        

reviews found for critic ella-taylor: 1202
