In [37]:
import requests
from bs4 import BeautifulSoup
import re
import json
from datetime import datetime

def get_year_subpages(url, start_year=2009):
    """Build the sitemap URL for every year from ``start_year`` to the current year.

    Args:
        url: Base sitemap URL. Each year is appended as ``"<year>/"``. If a
            non-empty ``url`` does not end with ``/``, one is inserted so the
            year does not get glued onto the last path segment.
        start_year: First year to include. Defaults to 2009, the value that
            was previously hard-coded.

    Returns:
        list[str]: One URL per year, in ascending year order. The list is
        empty when ``start_year`` is later than the current year.
    """
    # Without this, "https://host/sitemap" would produce "https://host/sitemap2009/".
    base = f"{url}/" if url and not url.endswith("/") else url
    current_year = datetime.now().year
    return [f"{base}{year}/" for year in range(start_year, current_year + 1)]

def get_subpages(year_links, timeout=30):
    """Collect the per-day sitemap URLs linked from each yearly sitemap page.

    Every ``<li class="sitemap-cell">`` element on a year page is inspected
    and the ``href`` of its first ``<a>`` is prefixed with the site origin.

    Args:
        year_links: Iterable of yearly sitemap URLs to fetch.
        timeout: Seconds to wait for each HTTP request before ``requests``
            raises, so a stalled connection cannot hang the notebook.

    Returns:
        list[str]: Absolute day-page URLs in the order they were found.

    Side effects:
        Performs one HTTP GET per year link, prints a notice for every year
        page that does not return HTTP 200, and prints the final count.
    """
    day_subpages = []

    for year_link in year_links:
        response = requests.get(year_link, timeout=timeout)
        if response.status_code != 200:
            # Make skipped years visible instead of silently losing them.
            print(f"Skipping {year_link}: HTTP {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        for cell in soup.find_all('li', class_='sitemap-cell'):
            # find() returns None when the cell has no <a>; indexing it would crash.
            anchor = cell.find('a')
            href = anchor.get('href') if anchor is not None else None
            if not href:
                continue
            day_subpages.append(f"https://www.thecrimson.com{href}")

    print(f"Total day subpages found: {len(day_subpages)}")
    return day_subpages

def get_article_links(day_subpage, verbose=True, timeout=30):
    """Fetch one day sitemap page and group its article URLs by date.

    The date is taken from the first ``/YYYY/M/D/`` segment of each link's
    ``href``; month and day are zero-padded so keys look like ``2009_01_04``.
    Links whose ``href`` carries no such segment are not included in the result.

    Args:
        day_subpage: URL of the day sitemap page to fetch.
        verbose: When true (the default, matching earlier behaviour) every
            discovered link is printed. Pass ``False`` to keep notebook
            output small during a full crawl.
        timeout: Seconds to wait for the HTTP request before ``requests``
            raises.

    Returns:
        dict[str, list[str]]: Maps ``YYYY_MM_DD`` to the absolute article URLs
        found for that date, in page order. Empty when the page does not
        return HTTP 200.
    """
    article_links = {}

    response = requests.get(day_subpage, timeout=timeout)
    if response.status_code != 200:
        return article_links

    soup = BeautifulSoup(response.content, 'html.parser')
    for cell in soup.find_all('li', class_='sitemap-cell'):
        # Look the anchor up once; find() returns None for cells without a link.
        anchor = cell.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue

        link = f"https://www.thecrimson.com{href}"
        if verbose:
            print(link)

        date_match = re.search(r'/(\d{4})/(\d{1,2})/(\d{1,2})/', href)
        if date_match is None:
            continue

        year, month, day = date_match.groups()
        date = f"{year}_{month.zfill(2)}_{day.zfill(2)}"
        article_links.setdefault(date, []).append(link)

    return article_links

In [38]:
import json

# Crawl the sitemap (year pages -> day pages -> article links) and save the
# date-keyed article index as JSON.
url = "https://www.thecrimson.com/sitemap/"
year_subpages = get_year_subpages(url)
day_subpages = get_subpages(year_subpages)

# Merge per-page results. dict.update() would replace the whole list whenever
# two day pages yield the same date key, silently dropping earlier links.
article_links = {}
for day_subpage in day_subpages:
    for date, links in get_article_links(day_subpage).items():
        article_links.setdefault(date, []).extend(links)

# Sort the article links dictionary by keys (dates) in descending order.
# Keys are zero-padded YYYY_MM_DD strings, so lexical order is date order.
sorted_article_links = dict(sorted(article_links.items(), key=lambda x: x[0], reverse=True))

# Write the output to a JSON file
output_file = "harvard_article_pages.json"
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(sorted_article_links, f, indent=4)

print(f"Output written to {output_file}.")

Total day subpages found: 4932
https://www.thecrimson.com/article/2009/1/2/mather-house-senior-remembered-for-love/
https://www.thecrimson.com/article/2009/1/3/team-effort-paces-mens-hoops-win/
https://www.thecrimson.com/article/2009/1/3/william-mary-takes-basketball-battle/
https://www.thecrimson.com/article/2009/1/4/a-legacy-to-be-proud-of/
https://www.thecrimson.com/article/2009/1/4/mexicos-newest-luxury-item-several-weeks/
https://www.thecrimson.com/article/2009/1/4/mumbai-bias-visiting-the-sites-of/
https://www.thecrimson.com/article/2009/1/4/barack-like-me-far-be-it/
https://www.thecrimson.com/article/2009/1/4/harvard-law-school-dean-kagan-wins/
https://www.thecrimson.com/article/2009/1/4/cha-budget-comes-under-city-scrutiny/
https://www.thecrimson.com/article/2009/1/4/budget-crunch-hits-radcliffe-hds-as/
https://www.thecrimson.com/article/2009/1/4/late-game-dry-spell-downs-harvard-williamsburg/
https://www.thecrimson.com/article/2009/1/4/first-victory-lone-bright-spot-in/
https:

In [46]:
from helpers.txt_to_text import get_article_text
from helpers.make_txt_entry import make_txt_entry
import os
import json 

# Load the date -> article-URL index produced by the scraping cell and write
# one text entry per date, newest date first, skipping dates already on disk.
with open('harvard_article_pages.json', 'r', encoding='utf-8') as f:
    all_links_dates = json.load(f)

school_name = "Harvard"
dates = sorted(all_links_dates.keys())

# NOTE(review): the skip check assumes make_txt_entry writes to
# journal_data/txt/<school>/<date>.txt — confirm against the helper.
school_dir = os.path.join("journal_data", "txt", school_name.replace(" ", "_"))

for date in reversed(dates):
    articles = all_links_dates[date]
    file_path = os.path.join(school_dir, date + ".txt")
    if os.path.exists(file_path):
        print(f"Date {date} already added.")
        continue

    # Join once instead of growing a string inside the loop; every article's
    # text is still followed by a single newline.
    article_text = "".join(get_article_text(article) + "\n" for article in articles)

    # Use the same school_name as the path check so the two cannot drift apart.
    make_txt_entry(school_name=school_name, publication_date=date, text=article_text)
    print(f"Date {date} added.")

Date 2023_06_07 already added.
Date 2023_06_05 already added.
Date 2023_06_02 already added.
Date 2023_05_31 already added.
Date 2023_05_30 already added.
Date 2023_05_29 already added.
Date 2023_05_26 already added.
Date 2023_05_25 already added.
Date 2023_05_24 already added.
Date 2023_05_23 already added.
Date 2023_05_22 already added.
Date 2023_05_21 already added.
Date 2023_05_20 already added.
Date 2023_05_19 already added.
Date 2023_05_18 already added.
Date 2023_05_17 already added.
Date 2023_05_16 already added.
Date 2023_05_15 already added.
Date 2023_05_14 already added.
Date 2023_05_13 already added.
Date 2023_05_12 already added.
Date 2023_05_11 already added.
Date 2023_05_10 already added.
Date 2023_05_09 already added.
Date 2023_05_08 already added.
Date 2023_05_07 already added.
Date 2023_05_06 already added.
Date 2023_05_05 already added.
Date 2023_05_04 already added.
Date 2023_05_03 already added.
Date 2023_05_02 already added.
Date 2023_05_01 already added.
Date 202