In [37]:
# Imports for scraping individual websites
import json
import re
from bs4 import BeautifulSoup
from collections import defaultdict


In [38]:
def scrape_soup(soup):
    """
    Extract the main body text of a parsed web page from its <p> tags.

    The soup is modified in place: the first element whose class contains
    "ssba-classic-2" is removed together with everything that follows it
    (presumably a share-button widget marking the end of the article --
    TODO confirm). The remaining <p> tags that have more than five
    space-separated tokens and contain no nested list are grouped by their
    class attribute, and the text of the largest group is returned.

    Parameters:
    soup: A parsed document exposing the BeautifulSoup API
          (find_all, find_all_next, extract).

    Returns:
    str: The whitespace-normalised text of the largest paragraph group, or an
         empty string if no paragraph qualifies or an error occurs.
    """

    # Grouping key for a <p>: its class names, or a sentinel when it has none.
    # Names are joined with a space so that e.g. ['a', 'bc'] and ['ab', 'c']
    # do not collapse into the same key.
    def get_class(p):
        return ' '.join(p['class']) if p.has_attr('class') else "&&NOCLASS&&"

    p_dict = defaultdict(list)
    try:
        # Cut the document at the first "ssba-classic-2" element.
        for element in soup.find_all():
            if element.attrs and 'class' in element.attrs:
                if any("ssba-classic-2" in class_name.lower() for class_name in element['class']):
                    for sibling in element.find_all_next():
                        sibling.extract()
                    element.extract()
                    break

        # Bucket the qualifying paragraphs by class.
        for p in soup.find_all('p'):
            if len(p.text.split(' ')) > 5 and not p.find(['ol', 'ul', 'dl']):
                p_dict[get_class(p)].append(p.text)

        # A page without qualifying paragraphs is an ordinary outcome, not an
        # error: return early instead of letting max() raise on an empty dict.
        if not p_dict:
            return ""

        # The class with the most paragraphs is taken to be the article body
        # (on ties the class encountered first wins).
        longest_key = max(p_dict, key=lambda k: len(p_dict[k]))

        # Join the paragraphs and collapse every whitespace run to one space.
        full_text = re.sub(r'\s+', ' ', ' '.join(p_dict[longest_key])).strip()

    except Exception as e:
        # Best effort: one malformed page must not abort a whole scraping run.
        print("Exception "  + str(e) + " occured")
        return ""

    return full_text


In [39]:
def process_json(json_obj):
    """
    Replace a record's markup with its extracted text and add derived fields.

    The record is modified in place and also returned.

    Parameters:
    json_obj (dict): A record with a 'fullText' entry holding page markup and
                     a non-empty 'authorIDs' sequence.

    Returns:
    dict: The same record, where
          - 'fullText' is the text extracted by scrape_soup ("" on failure),
          - 'spanAttribution' attributes the whole text (0..len) to the first
            author ID,
          - 'lengthWords' is the number of words in the extracted text.

    Raises:
    KeyError / IndexError: if 'fullText' or 'authorIDs' is missing, or
                           'authorIDs' is empty.
    """
    fullText = scrape_soup(BeautifulSoup(json_obj['fullText'], "html.parser"))
    json_obj['fullText'] = fullText
    json_obj["spanAttribution"] = [{"authorID":json_obj['authorIDs'][0],
                                    "start":0,
                                    "end":len(fullText)}]
    # split() without an argument yields [] for an empty string, whereas
    # split(' ') yields [''] and would report one word for an empty review.
    # For non-empty text the count is unchanged, because scrape_soup already
    # collapses whitespace runs to single spaces and strips the ends.
    json_obj["lengthWords"] = len(fullText.split())
    return json_obj


In [40]:
# Input: JSONL file with one record per line; process_json parses each
# record's "fullText" as HTML.
jsonl_file_path = '/shared/3/projects/hiatus/rotten_tomatoes/raw_output/reviewsoups.jsonl'
# Output: JSONL corpus of cleaned, de-duplicated reviews.
rt_corpus_path = '/shared/3/projects/hiatus/rotten_tomatoes/rtcorpus.jsonl'
# rt_corpus_path = './output/reviewsoups.jsonl'

# Texts that are already in the corpus. The corpus is opened in append mode
# below, so the set is seeded from the existing file; otherwise re-running
# this cell would write every review a second time.
seen_text = set()
try:
    with open(rt_corpus_path, 'r') as existing_corpus:
        for line in existing_corpus:
            if line.strip():
                seen_text.add(json.loads(line).get('fullText'))
except FileNotFoundError:
    # First run: there is no corpus yet, so nothing has been seen.
    pass

with open(jsonl_file_path, 'r') as jsonl_file, \
     open(rt_corpus_path, 'a') as corpus:
    for line in jsonl_file:
        line = line.strip()
        if not line:
            # Blank lines are not records; json.loads would raise on them.
            continue
        json_obj = json.loads(line)
        processed_json = process_json(json_obj)
        # Keep only reviews of at least 50 words whose text is not a duplicate.
        if processed_json['lengthWords'] >= 50 and processed_json["fullText"] not in seen_text:
            seen_text.add(processed_json['fullText'])
            corpus.write(json.dumps(processed_json) + '\n')


        
    
        
 

In [41]:
# Count the distinct authors in the corpus written above.
with open(rt_corpus_path, 'r') as file:
    # Each line of the corpus is one JSON record; parse them all up front.
    json_list = list(map(json.loads, file))

    # Placeholder list; nothing is added to it in this cell.
    filtered_json_list = []

    # The first entry of a record's "authorIDs" identifies its author.
    unique_ids = {record.get("authorIDs")[0] for record in json_list}

    print(len(unique_ids))



168
