Code to scrape websites and return the results as JSON objects.

The project is called OPaL: Opposing Positions and Lingo.

In [1]:
import requests
from bs4 import BeautifulSoup
import json

In [2]:
def make_request(urls: list) -> list:
    """Fetch every URL in ``urls`` with HTTP GET and return the response bodies.

    Args:
        urls: URLs to request, processed in order.

    Returns:
        A list with the text body of each response, in the same order as
        ``urls``. An empty ``urls`` yields an empty list.

    Raises:
        requests.HTTPError: If any request fails. The first failure aborts
            the whole batch. When the failed request produced a response,
            it is attached to the raised error and its status code appears
            in the message. The original exception is chained as the cause
            so the underlying reason (timeout, connection error, ...) stays
            visible in the traceback.
    """
    responses = []
    for url in urls:
        try:
            response = requests.get(url, timeout=5)
            response.raise_for_status()
        except requests.exceptions.RequestException as exception:
            failed_response = getattr(exception, 'response', None)
            if failed_response is not None:
                raise requests.HTTPError(
                    f"HTTP Error: {failed_response.status_code} occurred",
                    response=failed_response,
                ) from exception
            # No response was received at all (e.g. timeout or connection
            # failure); callers still get an HTTPError, as before.
            raise requests.HTTPError("Non-Specific Request Error: 500") from exception
        else:
            responses.append(response.text)
    return responses

In [3]:
# 1819 specific parser

def _extract_title_1819(soup):
    """Return the stripped text of the page's <title>, or '' if there is none."""
    title_tag = soup.title
    if title_tag is None:
        return ''
    # .string is None when <title> does not contain a single plain string;
    # fall back to the tag's combined text instead of crashing.
    title = title_tag.string
    if title is None:
        title = title_tag.get_text()
    return title.strip()


def _extract_author_date_1819(soup):
    """Return (author, date) read from the 'author-date' div, '' for missing parts."""
    author_date_div = soup.find('div', class_='author-date')
    if author_date_div is None:
        return '', ''

    author_link = author_date_div.find('a')
    author = author_link.text if author_link is not None else ''

    # The date is the segment after the first '|' in the div's text.
    segments = author_date_div.text.split('|')
    date = segments[1].strip() if len(segments) > 1 else ''
    return author, date


def _extract_lines_1819(soup):
    """Return every non-empty, stripped text line found inside <p> elements."""
    lines = []
    for paragraph in soup.find_all(['p']):
        # A paragraph's text may itself contain line breaks; treat each
        # non-blank line as its own entry.
        for raw_line in paragraph.get_text().split('\n'):
            line = raw_line.strip()
            if line:
                lines.append(line)
    return lines


def article_parser_1819(urls):
    """Download the given article pages and return their contents as JSON.

    Each page is fetched through ``make_request`` and parsed with
    BeautifulSoup's 'html.parser'. The selectors used here (a <title> tag,
    a ``div.author-date`` holding an <a> and a '|'-separated date, and <p>
    elements for the body) are specific to this parser's target site.

    Args:
        urls: List of article URLs to fetch and parse.

    Returns:
        A JSON string (4-space indent, non-ASCII characters kept as-is)
        encoding a list with one object per URL. Each object has the keys
        'title', 'author', 'date', 'line_count' and 'line_content', where
        'line_content' maps "line 1", "line 2", ... to the text lines.
        Fields that cannot be found on a page are left as '' rather than
        raising.

    Raises:
        requests.HTTPError: Propagated from ``make_request`` when a URL
            cannot be fetched.
    """
    all_articles = []

    for page_html in make_request(urls):
        soup = BeautifulSoup(page_html, 'html.parser')

        author, date = _extract_author_date_1819(soup)
        lines = _extract_lines_1819(soup)

        all_articles.append({
            'title': _extract_title_1819(soup),
            'author': author,
            'date': date,
            'line_count': len(lines),
            # Line keys are 1-based: "line 1", "line 2", ...
            'line_content': {
                f"line {i}": line for i, line in enumerate(lines, 1)
            },
        })

    return json.dumps(all_articles, indent=4, ensure_ascii=False)


In [4]:
# Script entry point: parse a fixed set of sample articles and print the
# resulting JSON string. Runs only when executed directly, not on import.
if __name__ == "__main__":
    # Sample article URLs passed to article_parser_1819.
    urls = ['https://1819news.com/news/item/allen-mendenhall-returning-to-the-fundamental-truths-that-positively-shaped-society', 
            'https://1819news.com/news/item/house-speaker-ledbetter-on-senate-adva-bill-theres-a-good-chance-we-may-pass-it-as-is',
            'https://1819news.com/news/item/devin-foley-the-real-debate-is-over-human-nature']
    # article_parser_1819 returns an already-serialized JSON string.
    result = article_parser_1819(urls)
    print(result)

[
    {
        "title": "Allen Mendenhall: Returning to the fundamental truths that positively shaped society",
        "author": "Allen Mendenhall",
        "date": "02.21.25",
        "line_count": 22,
        "line_content": {
            "line 1": "The pale London winter light filtered through the taxi’s windows as we wound our way from Heathrow’s concrete sprawl toward the city proper. My driver spoke with an accent that sounded of somewhere far warmer than this gray English morning. Through delicate conversational choreography – “How long have you called London home?” – his story emerged like a flower slowly opening: Zimbabwe, then London in 2000.",
            "line 2": "Politics tumbled out unexpectedly. Trump, he declared with the fervor of the converted, was a champion of the working man, a disruptor of calcified systems, and perhaps even a potential peacemaker for that distant war in Ukraine. His enthusiasm sparkled in the rearview mirror as he spoke.",
            "line 3"