In [3]:
# Imports for scraping RT
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

# Imports for scraping individual websites
import requests
import re
from collections import defaultdict

# Imports for data formatting
import uuid
import hashlib
import json
from datetime import datetime, date

In [4]:
# Collection date stamped on every generated document; computed once when this cell runs.
TODAY = date.today().strftime("%Y-%m-%d")

def generate_json(author, text, summary, date_created, source_url):
    """
    Generate a JSON-serialisable record describing one document and its author.

    Parameters:
    author (str): The name or identifier of the document's author.
    text (str): The full text content of the document.
    summary (str): A summary or brief description of the document
        (stored under sourceSpecific["rtSummary"]).
    date_created (str): The date the document was created. Stored verbatim;
        the caller in this file passes it formatted as "%Y-%m-%d".
    source_url (str): The source url of the document.

    Returns:
    dict: A dictionary representing the document in JSON format with various attributes.
        "documentID" and the single entry of "authorIDs" are deterministic: the MD5
        digest of the UTF-8 encoded text / author, rendered as a UUID string.
    """
    def md5_uuid(value):
        # Same input always maps to the same ID, so re-scraping yields stable identifiers.
        return str(uuid.UUID(hashlib.md5(value.encode('utf-8')).hexdigest()))

    author_id = md5_uuid(author)

    # Key order is kept stable because records are dumped line-by-line with json.dumps.
    return {
        'documentID': md5_uuid(text),
        'authorIDs': [author_id],
        'fullText': text,
        # The whole document is attributed to the single author.
        'spanAttribution': [{"authorID": author_id,
                             "start": 0,
                             "end": len(text)}],
        'isNeedle': False,
        'collectionNum': "HRS 1",
        'source': source_url,
        'dateCollected': TODAY,
        'dateCreated': date_created,
        'publiclyAvailable': True,
        'deidentified': True,
        'languages': ["en"],
        # split() with no argument splits on any whitespace run and yields [] for
        # empty text; split(' ') counted empty strings between repeated spaces and
        # reported 1 word for an empty document.
        'lengthWords': len(text.split()),
        'sourceSpecific': {
            "authorName": author,
            "rtSummary": summary,
        },
    }

In [5]:
def scrape_page(url, timeout=30):
    """
    Scrapes and processes text from a webpage, focusing on <p> tags.

    Paragraphs with more than five space-separated tokens are grouped by their
    CSS class; the group containing the most paragraphs is assumed to be the
    article body and is returned as a single whitespace-normalised string.

    Parameters:
    url (str): The URL of the webpage to scrape.
    timeout (float): Seconds to wait for the server before giving up, passed to
        requests.get. Without it a single unresponsive site blocks the scrape forever.

    Returns:
    str: The cleaned-up text extracted from the web page, or an empty string if the
        request fails, the request was redirected, or no paragraph qualifies.
    """

    # returns class attributes of an HTML element or "&&NOCLASS&&" if it doesn't have any.
    def get_class(p):
        return ''.join(p['class']) if p.has_attr('class') else "&&NOCLASS&&"

    p_dict = defaultdict(list)

    try:
        response = requests.get(url, timeout=timeout)

        # if we've been redirected, return.
        if response.url != url:
            return ""

        soup = BeautifulSoup(response.text, 'html.parser')

        # bucket every sufficiently long <p> by its class
        for p in soup.find_all('p'):
            if len(p.text.split(' ')) > 5:
                p_dict[get_class(p)].append(p.text)

    except Exception:
        # Deliberately best-effort: any network or parsing failure means "no text".
        return ""

    # No paragraph passed the length filter; max() on an empty dict would raise ValueError.
    if not p_dict:
        return ""

    # Find the class with the longest list of paragraphs
    longest_key = max(p_dict, key=lambda k: len(p_dict[k]))

    # Combine and clean text
    return re.sub(r'\s+', ' ', ' '.join(p_dict[longest_key])).strip()


In [6]:
def _iter_page_reviews(page_source, critic, cutoff):
    """Yield one generate_json() record per usable review row found in a rendered critic page."""
    soup = BeautifulSoup(page_source, 'html.parser')
    table = soup.find('table', {'data-qa': 'critic-reviews-table'})
    tbody = table.find('tbody') if table else None
    if tbody is None:
        return

    for row in tbody.find_all('tr', {'data-qa': 'row'}):
        review_td = row.find('td', {'data-qa': 'critic-review'})
        if not review_td:
            continue

        # The review date is read from the first <span> inside the cell's first <div>.
        try:
            date_created = datetime.strptime(review_td.find('div').find('span').text, "%b %d, %Y")
        except (AttributeError, ValueError):
            # Missing markup or an unparsable date: skip this row rather than abort the scrape.
            continue

        # Only reviews strictly older than the cutoff are collected.
        if date_created >= cutoff:
            continue

        rt_summary = review_td.find('span').text.strip()

        # Rows without a "Read More" link have no external article to scrape.
        link = review_td.find('a', string="Read More")
        review_url = link.get('href', '') if link else ''
        if len(review_url) == 0:
            continue

        fullText = scrape_page(review_url)
        if fullText != '':
            yield generate_json(
                critic,
                fullText,
                rt_summary,
                date_created.strftime("%Y-%m-%d"),
                review_url
            )


def scrape_reviews(critic, cutoff=datetime(2021, 1, 1)):
    """
    Scrapes the reviews of a rotten tomatoes critic, page by page, into a list of JSON records.

    Args:
    critic (str): The critic's slug as used in the rotten tomatoes URL
        (https://www.rottentomatoes.com/critics/<critic>/movies/).
    cutoff (datetime): Only reviews dated strictly before this moment are kept.
        Defaults to 2021-01-01.

    Returns:
    list: Returns a list of scraped review JSONs (dicts built by generate_json).
        May be empty. If the table does not refresh after clicking "next", the
        reviews gathered so far are returned.
    """
    table_selector = 'table[data-qa="critic-reviews-table"]'

    # Set up Chrome driver
    driver = webdriver.Chrome()

    # Initialize an empty list to save reviews
    reviews = []

    # try/finally guarantees the browser is closed on every exit path, including
    # unexpected parsing errors and a manual KeyboardInterrupt.
    try:
        # Define and open the target URL
        page_url = f'https://www.rottentomatoes.com/critics/{critic}/movies/'
        driver.get(page_url)

        # How long to wait for next button to be clickable and table data to change, respectively.
        next_wait = WebDriverWait(driver, 3)
        table_wait = WebDriverWait(driver, 7.5)

        while True:
            for review in _iter_page_reviews(driver.page_source, critic, cutoff):
                reviews.append(review)
                print(reviews[-1])

            try:
                next_button = next_wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, 'rt-button.next'))
                )
                # Grab the current table BEFORE clicking. Looking it up afterwards can
                # return the already-refreshed table, which never goes stale and would
                # end the scrape with a spurious timeout.
                old_table = driver.find_element(By.CSS_SELECTOR, table_selector)
                next_button.click()

                try:
                    table_wait.until(EC.staleness_of(old_table))
                except TimeoutException:
                    print("TDS timeout. Critic " + critic + " scraping terminated with " + str(len(reviews)) + " reviews.")
                    return reviews

            except Exception:
                # No clickable "next" button (or the click failed): treat as the last page.
                break

        print("reviews found for critic " + critic + ": " + str(len(reviews)))
        return reviews
    finally:
        driver.quit()


def get_reviews(critics):
    """
    Scrape the reviews of several critics.

    Args:
    critics (iterable of str): Critic names, each passed to scrape_reviews in turn.

    Returns:
    list: One entry per critic, in input order; each entry is whatever
        scrape_reviews returned for that critic.
    """
    return [scrape_reviews(name) for name in critics]



In [7]:
critics = ["ella-taylor"]
review_jsons = get_reviews(critics)
with open("rtcorpus.jsonl", 'a') as corpus, open("critics.txt", 'a') as critic_list:
    # get_reviews returns one list per critic in input order, so pair them by position.
    # Reading the name from rl[0] raised IndexError for any critic with zero reviews.
    for critic, critic_reviews in zip(critics, review_jsons):
        critic_list.write(critic + ': ' + str(len(critic_reviews)) + '\n')
        # One JSON document per line (JSON Lines).
        for review in critic_reviews:
            corpus.write(json.dumps(review) + '\n')

# critics = ["alonso-duralde", "david-sims", "jami-bernard", "ed-gonzalez", "nell-minow", "sara-michelle-fetters-8689", "valerie-complex", "victoria-luxford", "wenlei-ma"]
# scraped = get_reviews(critics)
# with open("rtlinks.jsonl", 'a') as link_jsonl, open("critics.txt", 'a') as critic_list:
#     for rd in scraped: 
#         critic = rd["critic"]
#         critic_list.write(critic + ': ' + str(rd["review_count"]) + '\n')
#         link_jsonl.write(json.dumps(rd) + '\n')
        
        

{'documentID': '0f50038c-9c25-876f-3646-b733e83f99b5', 'authorIDs': ['a80b433f-e737-206c-625e-973e341a972f'], 'fullText': 'Romance Rejuvenated: Inge (Ursula Werner) and Karl (Horst Westphal) find that hearts don\'t always grow wiser with age in Andreas Dresen\'s Cloud 9. Music Box Films hide caption Not RatedWith: Ursula Werner, Horst Westphal, Steffi Kuhnert If the woman caught in a love triangle in Cloud 9 passed you on the street, it\'s unlikely you\'d give her a second look — let alone imagine the passions bestowed on the 67-year-old seamstress (and the two men she loves) by German director Andreas Dresen. On an impulse, Inge (Ursula Werner) hand-delivers the pants she\'s just repaired for a customer, Karl (Horst Westphal). After a hasty come-hither glance or two, she climbs straight into bed with him. As in the real world, their robust lovemaking involves full-frontal nudity, and it\'s shown in unforgiving close-up, which delivers precisely the bracing shock Dresen doubtless inten

KeyboardInterrupt: 