In [2]:
# Imports for scraping individual websites
import json
import re
from bs4 import BeautifulSoup
from collections import defaultdict

In [3]:
def scrape_soup(soup):
    """
    Scrapes and processes text from a parsed web page, focusing on <p> tags.

    Elements whose class names contain "caption" or "newsletter"
    (case-insensitive) are removed from the tree first. The remaining <p> tags
    with more than five whitespace-separated words are grouped by their class
    attribute, and the texts of the largest group are joined into one string
    with all runs of whitespace collapsed to a single space. When several
    groups share the largest size, the first one encountered wins.

    Note: the caption/newsletter elements are extracted from `soup` in place,
    so the object passed in is modified by this call.

    Parameters:
    soup (BeautifulSoup): The parsed HTML document to extract text from.

    Returns:
    str: The cleaned-up text of the largest paragraph group, or an empty string
    if no paragraph qualifies or an error occurs while walking the tree.
    """

    # Grouping key for <p> tags that carry no class attribute.
    no_class_key = "&&NOCLASS&&"
    # Substrings that mark an element as non-article content.
    unwanted_markers = ("caption", "newsletter")

    # returns the class attribute of an HTML element as one grouping key,
    # or the NOCLASS sentinel if it doesn't have any.
    def get_class(p):
        if not p.has_attr('class'):
            return no_class_key
        classes = p['class']
        if isinstance(classes, str):
            return classes
        # Join with a space so that class="ab" and class="a b" stay distinct.
        return ' '.join(classes)

    # True when any class name of the element contains an unwanted marker.
    def is_unwanted(element):
        if not element.has_attr('class'):
            return False
        return any(marker in class_name.lower()
                   for class_name in element['class']
                   for marker in unwanted_markers)

    p_dict = defaultdict(list)
    try:
        # find_all() returns a list snapshot, so extracting while looping is safe.
        for element in soup.find_all():
            if is_unwanted(element):
                element.extract()

        # finds all remaining <p> tags
        for p in soup.find_all('p'):
            text = p.text
            # Count words on any whitespace so newline/tab separated text is
            # measured the same way as space separated text.
            if len(text.split()) > 5:
                p_dict[get_class(p)].append(text)
    except Exception:
        # Deliberate best-effort: callers rely on getting "" rather than an
        # exception when the tree cannot be walked.
        return ""

    # No paragraph passed the length filter.
    if not p_dict:
        return ""

    # Find the class with the longest list of paragraphs
    longest_key = max(p_dict, key=lambda k: len(p_dict[k]))

    # Combine and clean text
    return re.sub(r'\s+', ' ', ' '.join(p_dict[longest_key])).strip()

In [4]:
jsonl_file_path = 'output/reviewsoups.jsonl'

# Read the JSONL file one record per line, parse the HTML stored under
# 'fullText', and print the text extracted by scrape_soup.
# The encoding is given explicitly so decoding does not depend on the platform
# default (assumes the file was written as UTF-8/ASCII JSON - confirm against
# the producer of this file).
with open(jsonl_file_path, 'r', encoding='utf-8') as jsonl_file:
    for line in jsonl_file:
        line = line.strip()
        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
        if not line:
            continue
        json_obj = json.loads(line)
        soup = BeautifulSoup(json_obj['fullText'], "html.parser")
        print(scrape_soup(soup))

The first time I saw Prisoners was at a friend’s apartment in college. Looking back, I can’t imagine a worse setting to experience an atmospheric crime thriller: several dudes packed like sardines on a small, stained couch, sipping whiskey out of red Solo cups. (As all good whiskey should be served.) Nevertheless, over the next two and a half hours, we all sat there in stunned silence—completely at the mercy of the French Canadian director responsible for this bleak, gorgeous film. Admittedly, none of us had any idea who the guy was, but everyone on that crappy sofa was in agreement: We will watch your career with great interest. While Denis Villeneuve was by no means an unknown commodity among cinephiles—his 2010 drama, Incendies, was nominated for Best Foreign Language Film at the Oscars—Prisoners marked the filmmaker’s thrilling introduction to the mainstream. Now, a decade after Prisoners’ debut, Villeneuve’s résumé speaks for itself: Enemy (featuring an all-time WTF ending), Sicar