In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re

def get_subpages(url):
    """Build the monthly archive URLs from September 2010 through the current month.

    Args:
        url: Archive index URL. The page is fetched and parsed, and the same
            string is used as the prefix of every generated month URL; any
            "archives/" segment is stripped from each generated URL.

    Returns:
        A ``(subpages, soup)`` tuple. ``subpages`` is a set of URLs shaped like
        ``<url without "archives/">YYYY/MM/``; ``soup`` is the BeautifulSoup
        parse of the page at ``url``.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # `datetime` here is the class (from datetime import datetime), so call
    # now() on it directly, and take a single snapshot so that the year and
    # month used for the upper bound always belong to the same instant.
    now = datetime.now()
    first_year, first_month = 2010, 9  # earliest month that is generated

    subpages = set()
    for year in range(first_year, now.year + 1):
        for month in range(1, 13):
            # Skip months before the first generated month.
            if (year, month) < (first_year, first_month):
                continue
            # Skip months that have not happened yet.
            if (year, month) > (now.year, now.month):
                continue
            subpage_url = f"{url}{year}/{month:02d}/".replace("archives/", "")
            subpages.add(subpage_url)

    return subpages, soup

def get_article_links_and_dates(subpages):
    """Collect article links from archive pages, grouped by publication date.

    Every page in ``subpages`` is fetched and each ``<article>`` element is
    inspected. The first anchor carrying an ``href`` inside the element is
    taken as the article link, and the date is read from the first
    ``/YYYY/MM/DD/`` segment of that link.

    Args:
        subpages: Iterable of archive page URLs to fetch.

    Returns:
        A dict mapping ``"YYYY_MM_DD"`` strings to lists of article links,
        with keys in ascending order. Each distinct link is recorded once.
        Articles without a linked anchor, and links without a date segment,
        are skipped.
    """
    date_pattern = re.compile(r'/(\d{4})/(\d{2})/(\d{2})/')
    articles = {}
    links_set = set()

    for subpage in subpages:
        response = requests.get(subpage)
        soup = BeautifulSoup(response.content, 'html.parser')

        for article_html in soup.find_all('article'):
            # An <article> may have no anchor, or anchors without href;
            # skip it instead of failing on a None/KeyError lookup.
            anchor = article_html.find('a', href=True)
            if anchor is None:
                continue

            link = anchor['href']
            if link in links_set:
                continue
            links_set.add(link)

            # Check for a date BEFORE using it: links without a
            # /YYYY/MM/DD/ segment are skipped rather than raising.
            match = date_pattern.search(link)
            if match is None:
                continue

            date = "_".join(match.groups())
            articles.setdefault(date, []).append(link)

    return dict(sorted(articles.items(), key=lambda item: item[0]))


In [None]:
import json
# get_subpages returns a (subpages, soup) tuple; only the set of month URLs
# is passed on, since get_article_links_and_dates iterates over URLs.
sub_pages, _archive_soup = get_subpages("https://www.excal.on.ca/archives/")

# Call the function to get the scraped data: {"YYYY_MM_DD": [article links]}
scraped_data = get_article_links_and_dates(sub_pages)

# Open a new file for writing and save the scraped data to it
with open('york_article_pages.json', 'w') as outfile:
    json.dump(scraped_data, outfile, indent=4)

In [None]:
from helpers.txt_to_text import get_article_text
from helpers.make_txt_entry import make_txt_entry
import os

school_name = "York"
dates = sorted(scraped_data.keys())

# Walk the publication dates from newest to oldest, writing one text entry
# per date that does not already have a file on disk.
for date in reversed(dates):
    articles = scraped_data[date]
    file_path = f"journal_data/txt/{school_name.replace(' ', '_')}/{date}.txt"

    if os.path.exists(file_path):
        print(f"Date {date} already added.")
        continue

    # Concatenate the text of every article for this date, one per line.
    article_text = "".join(get_article_text(article) + "\n" for article in articles)

    make_txt_entry(school_name=school_name, publication_date=date, text=article_text)
    print(f"Date {date} added.")


In [None]:
from helpers.graphers import monthly_grapher
from helpers.queries import hillel_counter

# Names used to locate the text corpus and to label/save the figure.
school_name = "York"
newspaper_name = "Excalibur"
school_name_lower = school_name.lower()

# No extra pipeline steps are configured for this school.
York_pipeline = []

directory = f'journal_data/txt/{school_name}'
figure_path = f'figures/{school_name}/{school_name}_hillel_mentions.png'
figure_title = f"Mentions of Hillel in {newspaper_name}"

# Plot monthly results of the hillel_counter query over the text directory.
monthly_grapher(
    directory=directory,
    pipeline=York_pipeline,
    query=hillel_counter,
    y_label="Hillel Mentions Per Month",
    title=figure_title,
    save_path=figure_path,
)