In [None]:
# Imports for scraping
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

from bs4 import BeautifulSoup
import time

# Imports for data formatting
import json
from datetime import datetime, date

In [None]:
# Date of this run as a YYYY-MM-DD string.
# `datetime` is the class here (from `from datetime import datetime, date`),
# so `datetime.date` is the instance-method descriptor and has no `.today()`;
# the `date` class must be used instead.
TODAY = date.today().strftime("%Y-%m-%d")

def generate_url_json(author, urls):
    """
    Bundles a critic identifier and its review URLs into a single record.

    Parameters:
    author: Value stored under the "critic" key.
    urls: Sized collection of review URLs; stored as-is (not copied) under "urls".

    Returns:
    dict: {"critic": author, "review_count": len(urls), "urls": urls}
    """
    return {
        "critic": author,
        "review_count": len(urls),
        "urls": urls,
    }

In [None]:
def scrape_reviews(critic):
    """
    Scrapes movie review urls from a Rotten Tomatoes critic's page.

    Opens the critic's page in Chrome and walks through the review table by
    repeatedly clicking the "Next" button. On every page, the "Read More" link
    of each review dated before 2021-01-01 is collected. Rows that lack a date,
    have an unparsable date, or have no "Read More" link are skipped.

    Parameters:
    critic (str): The name or ID of the critic to scrape reviews for; it is
        inserted into the critic page URL.

    Returns:
    dict: A dictionary containing critic information and a list of review URLs
        (as built by generate_url_json). If the table does not refresh after a
        "Next" click within the wait time, the URLs gathered so far are returned.
    """

    # Only reviews dated strictly before this moment are kept.
    cutoff = datetime(2021, 1, 1)
    table_selector = 'table[data-qa="critic-reviews-table"]'

    reviews = []

    # Create a WebDriver instance (Chrome in this case)
    driver = webdriver.Chrome()

    # try/finally guarantees the browser is closed on every exit path,
    # including unexpected exceptions raised while loading or parsing a page.
    try:
        # Construct the URL for the critic's Rotten Tomatoes page
        page_url = f'https://www.rottentomatoes.com/critics/{critic}/movies/'
        driver.get(page_url)

        # wait time for the next button to become clickable
        next_wait = WebDriverWait(driver, 3)
        # wait time for table data to change
        table_wait = WebDriverWait(driver, 7.5)

        while True:
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Find the review table on the page; tolerate a missing table/tbody
            table = soup.find('table', {'data-qa': 'critic-reviews-table'})
            tbody = table.find('tbody') if table else None
            rows = tbody.find_all('tr', {'data-qa': 'row'}) if tbody else []

            for row in rows:
                review_td = row.find('td', {'data-qa': 'critic-review'})
                if not review_td:
                    continue

                # The review date is read from the first <span> of the first <div>
                date_div = review_td.find('div')
                date_span = date_div.find('span') if date_div else None
                if date_span is None:
                    continue
                try:
                    date_created = datetime.strptime(date_span.text.strip(), "%b %d, %Y")
                except ValueError:
                    # Unparsable date: skip this row instead of aborting the
                    # whole scrape and losing everything collected so far.
                    continue
                if date_created >= cutoff:
                    continue

                # Not every row necessarily has a "Read More" anchor
                anchor = review_td.find('a', string="Read More")
                link = anchor.get('href') if anchor else None
                if link:
                    reviews.append(link)

            try:
                # Find the "Next" button
                next_button = next_wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, 'rt-button.next'))
                )
                # Grab the current table BEFORE clicking: looking it up after
                # the click could return the already-refreshed table, which
                # would never go stale and cause a spurious timeout.
                old_table = driver.find_element(By.CSS_SELECTOR, table_selector)
                next_button.click()

                try:
                    # Wait for the old table to become stale
                    table_wait.until(EC.staleness_of(old_table))
                except TimeoutException:
                    print("TDS timeout. Critic: " + critic + " scraping terminated with " + str(len(reviews)) + " reviews.")
                    return generate_url_json(critic, reviews)

                # Short pause before re-reading the page source
                time.sleep(0.15)
            except Exception:
                # Best effort: any failure to locate or click "Next" (including
                # the wait timing out) is treated as the end of pagination.
                break
    finally:
        # Quit the WebDriver instance
        driver.quit()

    # Print the number of review URLs found
    print("URLs found for critic " + critic + ": " + str(len(reviews)))

    # Return a JSON representation containing critic information and review URLs
    return generate_url_json(critic, reviews)


In [None]:
def scrape_reviews(critic):
    """
    Scrapes movie review urls from a Rotten Tomatoes critic's page.

    NOTE: this file defines scrape_reviews in an earlier cell as well; when the
    cells run top to bottom, this definition is the one that stays in effect.

    Pages through the critic's review table via the "Next" button and collects
    the "Read More" link of every review dated before 2021-01-01. Rows without
    a date, with an unparsable date, or without a "Read More" link are skipped.

    Parameters:
    critic (str): The name or ID of the critic, inserted into the page URL.

    Returns:
    dict: The record built by generate_url_json(critic, urls). If the table
        does not refresh after a "Next" click within the wait time, the URLs
        gathered so far are returned.
    """
    # Only reviews dated strictly before this moment are kept.
    cutoff = datetime(2021, 1, 1)
    table_selector = 'table[data-qa="critic-reviews-table"]'
    reviews = []

    driver = webdriver.Chrome()
    # try/finally closes the browser on every exit path, including unexpected
    # exceptions raised while loading or parsing a page.
    try:
        page_url = f'https://www.rottentomatoes.com/critics/{critic}/movies/'
        driver.get(page_url)

        next_wait = WebDriverWait(driver, 3)     # "Next" button clickable
        table_wait = WebDriverWait(driver, 7.5)  # table refresh after click

        while True:
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            table = soup.find('table', {'data-qa': 'critic-reviews-table'})
            # Tolerate a page with no table or no <tbody>
            tbody = table.find('tbody') if table else None
            rows = tbody.find_all('tr', {'data-qa': 'row'}) if tbody else []

            for row in rows:
                review_td = row.find('td', {'data-qa': 'critic-review'})
                if not review_td:
                    continue

                # Review date: first <span> inside the first <div> of the cell
                date_div = review_td.find('div')
                date_span = date_div.find('span') if date_div else None
                if date_span is None:
                    continue
                try:
                    date_created = datetime.strptime(date_span.text.strip(), "%b %d, %Y")
                except ValueError:
                    # Skip a malformed date rather than abort the whole scrape
                    continue
                if date_created >= cutoff:
                    continue

                # A row may have no "Read More" anchor; skip it in that case
                anchor = review_td.find('a', string="Read More")
                link = anchor.get('href') if anchor else None
                if link:
                    reviews.append(link)

            try:
                next_button = next_wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, 'rt-button.next'))
                )
                # Reference the current table BEFORE clicking; a lookup after
                # the click could return the refreshed table, which would never
                # go stale and would trigger a spurious timeout.
                old_table = driver.find_element(By.CSS_SELECTOR, table_selector)
                next_button.click()

                try:
                    table_wait.until(EC.staleness_of(old_table))
                except TimeoutException:
                    print("TDS timeout. Critic: " + critic + " scraping terminated with " + str(len(reviews)) + " reviews.")
                    return generate_url_json(critic, reviews)

                # Short pause before re-reading the page source
                time.sleep(0.15)
            except Exception:
                # Best effort: any failure to locate or click "Next" (including
                # the wait timing out) is treated as the end of pagination.
                break
    finally:
        driver.quit()

    print("urls found for critic " + critic + ": " + str(len(reviews)))
    return generate_url_json(critic, reviews)


def get_reviews(critics):
    """
    Scrapes review URLs for several critics, one after another.

    Parameters:
    critics (iterable): Critic identifiers, each passed to scrape_reviews.

    Returns:
    list: The scrape_reviews result for every critic, in input order.
    """
    return [scrape_reviews(name) for name in critics]



In [None]:
# Critic identifiers as used in the Rotten Tomatoes critic page URL.
critics = [
    "alonso-duralde",
    "david-sims",
    "jami-bernard",
    "ed-gonzalez",
    "nell-minow",
    "sara-michelle-fetters-8689",
    "valerie-complex",
    "victoria-luxford",
    "wenlei-ma",
]
scraped = get_reviews(critics)

# Both files are opened in append mode, so re-running this cell adds new lines
# after any existing content:
#   rtlinks.jsonl - one JSON record per critic
#   critics.txt   - one "<critic>: <review_count>" line per critic
with open("rtlinks.jsonl", 'a') as link_jsonl, open("critics.txt", 'a') as critic_list:
    for rd in scraped:
        critic = rd["critic"]
        critic_list.write(f"{critic}: {rd['review_count']}\n")
        link_jsonl.write(f"{json.dumps(rd)}\n")
        
        