In [26]:
# Imports for scraping
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

from bs4 import BeautifulSoup
import time


# Imports for data formatting
import json
from datetime import datetime, date

In [2]:
# TODAY = date.today().strftime("%Y-%m-%d")  # `date` is imported directly from the datetime module above

def generate_url_json(author, urls):
    """Package a critic's collected links into a JSON-serialisable record.

    Args:
        author: Identifier of the critic the links belong to.
        urls: Sized collection of review links gathered for that critic.

    Returns:
        dict with the keys "critic", "review_count" (``len(urls)``) and
        "urls" (the same object that was passed in, not a copy).
    """
    return {
        "critic": author,
        "review_count": len(urls),
        "urls": urls,
    }

In [31]:
def _extract_review_links(page_source, cutoff):
    """Return the "Read More" hrefs of reviews dated before `cutoff` on one page.

    Rows that lack a review cell, a date span, a parsable date or a
    "Read More" link are skipped instead of aborting the whole scrape.
    """
    soup = BeautifulSoup(page_source, 'html.parser')
    table = soup.find('table', {'data-qa': 'critic-reviews-table'})
    if table is None:
        return []

    # Some markup omits an explicit <tbody>; search the table itself then.
    body = table.find('tbody')
    if body is None:
        body = table

    links = []
    for row in body.find_all('tr', {'data-qa': 'row'}):
        review_td = row.find('td', {'data-qa': 'critic-review'})
        if review_td is None:
            continue

        first_div = review_td.find('div')
        date_span = first_div.find('span') if first_div is not None else None
        if date_span is None:
            continue
        try:
            date_created = datetime.strptime(date_span.get_text(strip=True), "%b %d, %Y")
        except ValueError:
            # Date text is not in the "Jan 01, 2020" form; skip this row.
            continue
        if date_created >= cutoff:
            continue

        anchor = review_td.find('a', string="Read More")
        link = anchor.get('href') if anchor is not None else None
        if link:
            links.append(link)
    return links


def scrape_reviews(critic, cutoff=None):
    """Collect review links for one critic by paging through their review table.

    Opens https://www.rottentomatoes.com/critics/<critic>/movies/ in Chrome,
    gathers the "Read More" links of every review dated before `cutoff` on
    the current page, then clicks the "next" button and repeats until the
    button can no longer be clicked or the table fails to refresh in time.

    Args:
        critic: Critic slug used in the page URL.
        cutoff: Only reviews dated strictly before this datetime are kept.
            Defaults to datetime(2021, 1, 1).

    Returns:
        The record built by generate_url_json(critic, links). Links gathered
        before a pagination failure or table-refresh timeout are still
        returned.
    """
    # Local import: the notebook's import cell only brings in TimeoutException.
    from selenium.common.exceptions import WebDriverException

    if cutoff is None:
        cutoff = datetime(2021, 1, 1)

    table_selector = 'table[data-qa="critic-reviews-table"]'
    next_selector = 'rt-button.next'
    reviews = []

    driver = webdriver.Chrome()
    try:
        driver.get(f'https://www.rottentomatoes.com/critics/{critic}/movies/')
        next_wait = WebDriverWait(driver, 3)  # time to wait for next button to be clickable
        table_wait = WebDriverWait(driver, 7.5)  # time to wait for the table to change

        while True:
            reviews.extend(_extract_review_links(driver.page_source, cutoff))

            try:
                # Grab the table BEFORE clicking: looking it up afterwards can
                # return the already-refreshed table, which never goes stale.
                old_table = driver.find_element(By.CSS_SELECTOR, table_selector)
                next_button = next_wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, next_selector))
                )
                next_button.click()
            except WebDriverException:
                # No table / no clickable next button: treat as the last page.
                break

            try:
                # Wait for the previous page's table to be detached from the DOM
                table_wait.until(EC.staleness_of(old_table))
            except TimeoutException:
                print("TDS timeout. Critic: " + critic + " scraping ended with " + str(len(reviews)) + " reviews.")
                return generate_url_json(critic, reviews)

            time.sleep(0.15)
    finally:
        # Always release the browser, including on errors and KeyboardInterrupt.
        driver.quit()

    print("urls found for critic " + critic + ": " + str(len(reviews)))
    return generate_url_json(critic, reviews)


def get_reviews(critics):
    """Scrape every critic in `critics`, in order.

    Returns:
        list with one scrape_reviews(critic) result per input critic.
    """
    return [scrape_reviews(slug) for slug in critics]



In [32]:
critics = ["alonso-duralde", "david-sims", "jami-bernard", "ed-gonzalez", "nell-minow", "sara-michelle-fetters-8689", "valerie-complex", "victoria-luxford", "wenlei-ma"]
scraped = []
# Write each critic's record as soon as it is scraped, so an interrupted or
# failed run keeps everything completed so far instead of losing the batch.
# Both files are opened in append mode, so re-running this cell adds new lines.
with open("rtlinks.jsonl", 'a') as link_jsonl, open("critics.txt", 'a') as critic_list:
    for critic in critics:
        rd = scrape_reviews(critic)
        scraped.append(rd)
        critic_list.write(rd["critic"] + ': ' + str(rd["review_count"]) + '\n')
        link_jsonl.write(json.dumps(rd) + '\n')
        # Flush so the lines reach disk even if a later critic's scrape is aborted.
        critic_list.flush()
        link_jsonl.flush()
        
        

TDS timeout. Critic: alonso-duralde scraping ended with 548 reviews.
TDS timeout. Critic: david-sims scraping ended with 0 reviews.
TDS timeout. Critic: jami-bernard scraping ended with 69 reviews.
urls found for critic ed-gonzalez: 0


KeyboardInterrupt: 