Code to scrape news websites and return the parsed articles as JSON objects.

The project is called OPAL: Oppositional Positions in Alabama.

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
import json
import os
from datetime import datetime

In [None]:
def get_all_news_urls(base_url, suffix: str, max_pages: int = None, output_file: str = None):
    """
    Collect every link containing ``suffix`` from a paginated listing.

    First step in the web scraping process. Page 1 is ``base_url`` itself;
    later pages are requested as ``<base_url>/page/<n>``. The crawl stops
    when a request fails, when ``max_pages`` pages have been read, or when
    a page contributes no URL that has not been seen before.

    Args:
    base_url: str: The base URL to start the search (a trailing slash is fine)
    suffix: str: Substring a resolved link must contain to be collected
    max_pages: int: The maximum number of pages to search (None = no limit)
    output_file: str: Optional file path to save the URLs, one per line

    Returns:
    list: The unique URLs found, in discovery order. Errors are reported
    with print() and whatever was collected so far is still returned.
    """
    news_urls = []
    # Mirror of news_urls for O(1) duplicate checks; the list keeps the order.
    seen_urls = set()
    page = 1

    # Browser-like headers to avoid being blocked. Adjacent literals are
    # concatenated by the parser; a backslash continuation *inside* a literal
    # would drag the next line's indentation into the header value.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36'
        ),
        'Accept': (
            'text/html,application/xhtml+xml,application/xml;q=0.9,'
            'image/webp,*/*;q=0.8'
        ),
    }

    try:
        # Drop trailing slashes so pagination never yields ".../​/page/N".
        paged_root = base_url.rstrip('/')

        while True:
            current_url = base_url if page == 1 else f"{paged_root}/page/{page}"

            try:
                response = requests.get(current_url, headers=headers, timeout=5)
                response.raise_for_status()  # Raise exception for bad status codes
            except requests.exceptions.RequestException as exception:
                failed_response = getattr(exception, 'response', None)
                if failed_response is not None:
                    status_code = failed_response.status_code
                    print(f"HTTP Error: {status_code} occurred while fetching page {page}")
                else:
                    print(f"Non-Specific Request Error on page {page}: {str(exception)}")
                # Stop the search if we can't get this page
                break

            soup = BeautifulSoup(response.text, 'html.parser')
            found_on_page = 0

            for link in soup.find_all('a'):
                href = link.get('href')
                if not href:
                    continue
                # Resolve relative links against the page they appeared on.
                full_url = str(urljoin(current_url, href))
                if suffix in full_url and full_url not in seen_urls:
                    seen_urls.add(full_url)
                    news_urls.append(full_url)
                    found_on_page += 1

            print(f"Page {page}: Found {found_on_page} new URLs")

            if max_pages is not None and page >= max_pages:
                print(f"Reached maximum pages limit: {max_pages}")
                break

            # A page with nothing new is treated as the end of the listing.
            if found_on_page == 0:
                print("No new URLs found on this page")
                break

            page += 1
            time.sleep(1)  # Polite delay between requests

        # Save URLs to file if output_file is provided and there is something to save
        if output_file and news_urls:
            try:
                # Explicit encoding so the file does not depend on the platform default.
                with open(output_file, 'w', encoding='utf-8') as f:
                    f.writelines(f"{url}\n" for url in news_urls)
                print(f"Saved {len(news_urls)} URLs to {output_file}")
            except OSError as e:
                print(f"Error saving URLs to file: {e}")

        return news_urls

    except Exception as e:
        # Deliberate best-effort boundary: report and keep the partial result.
        print(f"Unexpected error in URL collection: {e}")
        return news_urls


In [None]:
# In the Jupyter notebook this parser is a standalone function;
# in the Python app it is implemented as a subclass.
def article_parser_1819(urls, output_file: str = None):
    """
    Parser specific to 1819 News site format.

    Each URL is fetched directly with requests and parsed with
    BeautifulSoup. A URL that fails to download or parse is reported with
    print() and skipped, so one bad article never aborts the run.

    For every article the result holds:
      - 'title': stripped text of the page's <title> tag
      - 'author': stripped text of the first link inside div.author-date
      - 'date': stripped text after the first '|' in div.author-date
      - 'line_count': number of non-empty text lines found
      - 'line_content': {"line 1": ..., "line 2": ...} built from every
        <p> element on the page
    Fields that cannot be found keep their empty defaults.

    Args:
    urls (list): List of article URLs to parse
    output_file: str: Optional file path to save the JSON data; it is only
        written when at least one article was parsed

    Returns:
    str: JSON string with parsed article data (a JSON list, "[]" if empty)
    """
    all_articles = []

    # Adjacent literals are concatenated by the parser; a backslash
    # continuation *inside* a literal would drag the next line's indentation
    # into the header value.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36'
        ),
        'Accept': (
            'text/html,application/xhtml+xml,application/xml;q=0.9,'
            'image/webp,*/*;q=0.8'
        ),
    }

    for url in urls:
        try:
            print(f"Fetching: {url}")
            response = requests.get(url, headers=headers, timeout=5)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            json_soup = {
                'title': '',
                'author': '',
                'date': '',
                'line_count': 0,
                'line_content': {}
            }

            # <title> may be missing, or have no single string child
            title = soup.title.string if soup.title else None
            if title:
                json_soup['title'] = title.strip()

            author_date_div = soup.find('div', class_='author-date')
            if author_date_div:
                author_link = author_date_div.find('a')
                if author_link:
                    # Strip like title/date so markup whitespace is not stored
                    json_soup['author'] = author_link.get_text().strip()

                # The date is whatever follows the first '|' separator
                text_parts = author_date_div.get_text().split('|')
                if len(text_parts) > 1:
                    json_soup['date'] = text_parts[1].strip()

            # Flatten every <p> into individual non-empty, stripped lines
            paragraph_texts = []
            for paragraph in soup.find_all('p'):
                for line in paragraph.get_text().strip().split('\n'):
                    cleaned = line.strip()
                    if cleaned:
                        paragraph_texts.append(cleaned)

            json_soup['line_count'] = len(paragraph_texts)
            json_soup['line_content'] = {
                f"line {i}": line for i, line in enumerate(paragraph_texts, 1)
            }

            all_articles.append(json_soup)

            # Add a small delay between requests
            time.sleep(0.5)

        except requests.exceptions.RequestException as exception:
            failed_response = getattr(exception, 'response', None)
            if failed_response is not None:
                status_code = failed_response.status_code
                print(f"HTTP Error: {status_code} occurred while fetching {url}")
            else:
                print(f"Request Error for {url}: {str(exception)}")
            # Continue with next URL instead of failing completely
            continue
        except Exception as e:
            # Per-article boundary: a parsing surprise skips only this URL
            print(f"Error processing {url}: {str(e)}")
            continue

    # ensure_ascii=False keeps non-ASCII article text readable in the output
    json_data = json.dumps(all_articles, indent=4, ensure_ascii=False)

    if output_file and all_articles:
        try:
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(json_data)
            print(f"Saved parsed data to {output_file}")
        except Exception as e:
            print(f"Error saving JSON to file: {e}")

    return json_data

In [None]:
def ensure_output_directory(directory):
    """
    Creates the output directory (and any missing parents) if it doesn't exist.

    Args:
    directory: str: Path of the directory that must exist afterwards

    Raises:
    FileExistsError: If the path exists but is not a directory
    """
    already_present = os.path.isdir(directory)
    # exist_ok=True closes the gap between the check above and the creation:
    # a directory that appears in between is not an error.
    os.makedirs(directory, exist_ok=True)
    if not already_present:
        print(f"Created directory: {directory}")

In [None]:
if __name__ == "__main__":
    # Demo run: crawl the 1819 News listing, then parse every article found.
    base_url = 'https://1819news.com/'
    suffix = '/news/item'
    max_pages = 1  # Raise this to crawl deeper into the paginated listing

    # Every run writes into its own timestamped directory
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = f"opal_output_{timestamp}"

    # Paths of the two artifacts produced by a run
    urls_file = os.path.join(output_dir, "collected_urls.txt")
    json_file = os.path.join(output_dir, "parsed_articles.json")

    ensure_output_directory(output_dir)

    # Step 1: collect the article URLs and store them as plain text
    urls = get_all_news_urls(
        base_url,
        suffix,
        max_pages=max_pages,
        output_file=urls_file,
    )
    print(f"Found {len(urls)} articles to parse")

    # Step 2: download and parse each article, storing the result as JSON
    result = article_parser_1819(urls, output_file=json_file)
    print(f"Parsing complete. Results saved to {output_dir} directory.")

Created directory: opal_output_20250305_124718
Page 1: Found 27 new URLs
Reached maximum pages limit: 1
Saved 27 URLs to opal_output_20250305_124718/collected_urls.txt
Found 27 articles to parse
Fetching: https://1819news.com/news/item/gop-leaders-applaud-trumps-powerful-joint-session-speech-americas-golden-age-has-only-just-begun
Fetching: https://1819news.com/news/item/hollis-towns-resigns-as-aldotcom-editor
Fetching: https://1819news.com/news/item/recount-possible-in-2022-contested-conecuh-county-sheriffs-race-after-in-court-precinct-recount-places-doubt
Fetching: https://1819news.com/news/item/col-john-eidsmoe-and-becky-gerritson-myths-and-realities-about-the-controversial-veterans-bill
Fetching: https://1819news.com/news/item/elon-musks-doge-cancels-13-more-federal-office-leases-in-alabama
Fetching: https://1819news.com/news/item/troy-carico-think-the-thinkable-and-call-for-iveys-resignation
Fetching: https://1819news.com/news/item/theodore-man-left-journal-from-his-last-days-duri