In [24]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import json
import os

def get_subpages(timeout=30):
    """Collect issue-page URLs listed on the GW Hatchet archive index.

    Fetches the archive page, inspects the first link inside every ``<li>``
    element and keeps links whose URL contains ``/issue/`` followed by a
    ``YYYY-MM-DD`` date that falls in 2009 or later and is not in the future.

    Args:
        timeout: Seconds to wait for the archive page before giving up.

    Returns:
        list[str]: Matching issue-page URLs in page order, without duplicates.

    Raises:
        requests.RequestException: If the archive page cannot be fetched or
            the server answers with an HTTP error status.
    """
    base_url = "https://www.gwhatchet.com/archives/"
    # Without an explicit timeout a stalled connection would hang the cell.
    response = requests.get(base_url, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty result.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Evaluate "now" once so every link is compared against the same instant.
    now = datetime.now()
    subpages = []
    seen = set()

    for issue_element in soup.find_all("li"):
        link = issue_element.find("a")
        if link is None or not link.has_attr("href"):
            continue

        subpage_url = link["href"]
        if "/issue/" not in subpage_url or len(subpage_url) < 26:
            continue

        date_str = subpage_url.split("/issue/")[1][:10]
        try:
            subpage_date = datetime.strptime(date_str, "%Y-%m-%d")
        except ValueError:
            # Whatever follows /issue/ is not a date, so this is not an issue page.
            continue

        if subpage_date.year < 2009 or subpage_date > now:
            continue

        # Nested <li> elements can surface the same first link more than once;
        # keep only the first occurrence so each issue is fetched a single time.
        if subpage_url not in seen:
            seen.add(subpage_url)
            subpages.append(subpage_url)

    return subpages

def get_article_links_and_dates(timeout=30):
    """Scrape article links for every archived issue and persist them as JSON.

    Existing data in ``gw_article_pages.json`` is loaded first and extended,
    so links that are already recorded are reported and not duplicated. The
    merged mapping is written back to the same file.

    Args:
        timeout: Seconds to wait for each issue page before skipping it.

    Returns:
        dict[str, list[str]]: Issue date (``YYYY_MM_DD``, taken from the issue
        URL) mapped to the article URLs found on that issue page.
    """
    subpages = get_subpages()
    article_data = {}

    # Load existing data if available
    if os.path.isfile("gw_article_pages.json"):
        try:
            with open("gw_article_pages.json", "r") as json_file:
                article_data = json.load(json_file)
        except FileNotFoundError:
            # The file disappeared between the check and the open; start empty.
            pass

    for subpage in subpages:
        # The key depends only on the issue URL, so compute it once per page.
        article_date = subpage.split("/issue/")[1][:10].replace("-", "_")

        try:
            response = requests.get(subpage, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable issue page must not discard everything else
            # gathered in this run (the JSON is only written at the end).
            print(f"Skipping {subpage}: {exc}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")

        for article in soup.find_all("article", class_="post"):
            # find() returns None when the element is absent, so check each
            # step instead of chaining lookups.
            title = article.find("h2", class_="post-title")
            link = title.find("a") if title is not None else None
            if link is None or not link.has_attr("href"):
                continue
            article_url = link["href"]

            if article_url in article_data.get(article_date, []):
                print(f"Link already exists for {article_date}: {article_url}")
                continue

            # Create the date entry only when there is a link to store, so
            # issues without articles do not produce empty keys.
            article_data.setdefault(article_date, []).append(article_url)
            print(f"Added link: {article_url}")

    with open("gw_article_pages.json", "w") as json_file:
        json.dump(article_data, json_file, indent=4)

    return article_data

# Run the scrape; as the cell's last expression, the returned mapping is echoed as the cell output.
get_article_links_and_dates()

Added link: https://www.gwhatchet.com/2023/05/26/gw-to-create-global-food-institute-research-food-system-solutions/
Added link: https://www.gwhatchet.com/2023/05/25/d-c-council-asks-gw-hospital-officials-to-allow-union-vote-stop-alleged-unfair-labor-practices/
Added link: https://www.gwhatchet.com/2023/05/24/revolutionaries-to-replace-colonials-as-next-university-moniker/
Added link: https://www.gwhatchet.com/2023/05/27/photo-essay-gw-celebrates-2023-commencement/
Added link: https://www.gwhatchet.com/2023/05/24/quick-take-theres-nothing-revolutionary-about-revolutionaries/
Added link: https://www.gwhatchet.com/2023/05/22/bowser-selects-interim-police-chief-as-contee-exits/
Added link: https://www.gwhatchet.com/2023/05/22/crime-log-unknown-man-allegedly-attempts-to-punch-student-uses-homophobic-slur/
Added link: https://www.gwhatchet.com/2023/05/22/staff-editorial-new-dining-halls-are-a-step-up-but-need-better-labeling-food-options/
Added link: https://www.gwhatchet.com/2023/05/22/op-e

{'2023_05_25': ['https://www.gwhatchet.com/2023/05/26/gw-to-create-global-food-institute-research-food-system-solutions/',
  'https://www.gwhatchet.com/2023/05/25/d-c-council-asks-gw-hospital-officials-to-allow-union-vote-stop-alleged-unfair-labor-practices/',
  'https://www.gwhatchet.com/2023/05/24/revolutionaries-to-replace-colonials-as-next-university-moniker/'],
 '2023_05_22': ['https://www.gwhatchet.com/2023/05/27/photo-essay-gw-celebrates-2023-commencement/',
  'https://www.gwhatchet.com/2023/05/24/quick-take-theres-nothing-revolutionary-about-revolutionaries/',
  'https://www.gwhatchet.com/2023/05/22/bowser-selects-interim-police-chief-as-contee-exits/',
  'https://www.gwhatchet.com/2023/05/22/crime-log-unknown-man-allegedly-attempts-to-punch-student-uses-homophobic-slur/',
  'https://www.gwhatchet.com/2023/05/22/staff-editorial-new-dining-halls-are-a-step-up-but-need-better-labeling-food-options/',
  'https://www.gwhatchet.com/2023/05/22/op-ed-students-shouldnt-have-to-carry-bu

In [1]:
from helpers.txt_to_text import get_article_text
from helpers.make_txt_entry import make_txt_entry
import os
import json
import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError

# Label used for the output directory and for the stored entries.
school_name = "GW"

# Read the issue-date -> article-URL mapping from gw_article_pages.json.
with open("gw_article_pages.json", "r") as f:
    all_links_dates = json.load(f)

# Issue-date keys in ascending order.
dates = sorted(all_links_dates)

def process_article(article):
    """Return the text of one article, or "" when extraction fails.

    Args:
        article: Value forwarded unchanged to ``get_article_text``
            (presumably an article URL from ``all_links_dates`` — confirm
            against the helper).

    Returns:
        Whatever ``get_article_text`` returns, or an empty string if it
        raised; the failure is reported on stdout instead of propagating.
    """
    try:
        text = get_article_text(article)
    except Exception as err:
        # Best effort: a single bad article must not abort the whole date.
        print(f"Error processing article: {article}")
        print(f"Error message: {str(err)}")
        text = ""  # Failed articles contribute an empty string
    return text

def process_date(date):
    """Fetch every article of one issue date and store the combined text.

    Does nothing (beyond a message) when the output file for ``date`` already
    exists. Otherwise the articles listed in ``all_links_dates[date]`` are
    processed concurrently, joined with newlines and handed to
    ``make_txt_entry``. Progress, errors and elapsed time are printed.

    Args:
        date: Key of ``all_links_dates`` identifying the issue to process.

    Returns:
        None.
    """
    start_time = time.time()
    articles = all_links_dates[date]
    file_path = f"journal_data/txt/{school_name.replace(' ', '_')}/{date}.txt"

    if os.path.exists(file_path):
        print(f"Date {date} already added.")
        return

    with ThreadPoolExecutor() as executor:
        try:
            # executor.map() is lazy: the timeout is only enforced while the
            # results are being iterated. Drain the iterator here, inside the
            # try, so a TimeoutError is actually caught. (Previously the
            # iterator was consumed after the with-block had already waited
            # for every worker, so the timeout could never trigger.)
            article_texts = list(
                executor.map(process_article, articles, timeout=30)
            )
        except TimeoutError:
            # Not-yet-started tasks are cancelled by the map iterator; tasks
            # already running are still waited for when the with-block exits.
            print(f"Timeout occurred while processing articles for date: {date}")
            return

    article_text = "\n".join(article_texts)
    try:
        # Use the same school_name as the path check above instead of a
        # separately hard-coded literal.
        make_txt_entry(school_name=school_name, publication_date=date, text=article_text)
        print(f"Date {date} added.")
    except Exception as e:
        print(f"Error occurred while adding date: {date}")
        print(f"Error message: {str(e)}")

    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Execution time: {execution_time} seconds")

if __name__ == "__main__":
    # Walk the sorted issue dates from last to first.
    for date in dates[::-1]:
        process_date(date)

Date 2023_05_25 already added.
Date 2023_05_22 already added.
Date 2023_05_17 already added.
Date 2023_05_16 already added.
Date 2023_05_15 already added.
Date 2023_05_11 already added.
Date 2023_05_09 already added.
Date 2023_05_04 already added.
Date 2023_05_01 already added.
Date 2023_04_27 already added.
Date 2023_04_24 already added.
Date 2023_04_20 already added.
Date 2023_04_17 already added.
Date 2023_04_13 already added.
Date 2023_04_10 already added.
Date 2023_04_06 already added.
Date 2023_04_03 already added.
Date 2023_03_30 already added.
Date 2023_03_27 already added.
Date 2023_03_23 already added.
Date 2023_03_09 already added.
Date 2023_03_06 already added.
Date 2023_03_02 already added.
Date 2023_02_27 already added.
Date 2023_02_23 already added.
Date 2023_02_16 already added.
Date 2023_02_13 already added.
Date 2023_02_09 already added.
Date 2023_02_06 already added.
Date 2023_02_02 already added.
Date 2023_01_30 already added.
Date 2023_01_26 already added.
Date 202