In [36]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import os
import json


In [37]:
# Find a list of all links leading to each month's articles
def issue_link_finder():
    """Return the issue links found on the UF00028290 title page.

    The previous body downloaded and parsed the title page, created an empty
    ``links`` list and then fell off the end of the function, so every caller
    received ``None``. The fetch-and-filter logic already exists in
    ``extract_links`` (same URL, same parser), so this function now delegates
    to it instead of keeping a second, unfinished copy.

    Returns:
        list[str]: the value returned by ``extract_links()``.

    Raises:
        Any exception raised by ``extract_links`` (it performs an HTTP
        request, so network/SSL errors from ``requests`` propagate).
    """
    # extract_links is defined later in the same cell; it only needs to exist
    # by the time this function is *called*, not when it is defined.
    return extract_links()

def extract_links():
    """Scrape the UF00028290 title page and return the matching hrefs.

    Every ``<a>`` tag that carries an ``href`` is inspected; an href is kept
    only when it matches the UF00028290 prefix pattern below. The result
    preserves the order in which ``find_all`` yields the anchors.

    Returns:
        list[str]: the matching href values.
    """
    title_page = requests.get('https://newspapers.uflib.ufl.edu/titles/UF00028290')
    parsed = BeautifulSoup(title_page.content, 'html.parser')

    issue_pattern = "^https://newspapers.uflib.ufl.edu/UF00028290/.*$"
    hrefs = (anchor.get('href') for anchor in parsed.find_all('a', href=True))
    return [href for href in hrefs if re.match(issue_pattern, href)]

# Example usage: run the scraper once, bind its result to the module-level
# name `links`, and print the list so the collected hrefs can be inspected.
links = extract_links()
print(links)


# Find every paginated sub-page URL for a given listing URL
def get_subpages(url):
    """Return ``url`` followed by the URLs of its paginated sub-pages.

    The page is downloaded and searched for ``<nav class="pagination">``.
    The last whitespace-separated token of the
    ``<span class="pagination-meta">`` text inside it is read as the highest
    page number, and ``<url>/page/<i>/`` is generated for ``i`` in
    ``2..max_page``.

    Args:
        url: address of the first listing page, with or without a trailing
            slash.

    Returns:
        list[str]: ``[url]`` when the page has no pagination bar (or the bar
        has no ``pagination-meta`` span); otherwise ``url`` followed by the
        generated sub-page URLs.

    Raises:
        ValueError: if the last token of the pagination label is not an
            integer.
        IndexError: if the pagination label is empty.
        Exceptions from ``requests.get`` propagate unchanged.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    nav_tag = soup.find('nav', class_='pagination')
    if not nav_tag:
        return [url]

    meta_tag = nav_tag.find('span', class_='pagination-meta')
    if meta_tag is None:
        # A pagination bar without a page-count label used to crash with
        # AttributeError on `.text`; treat it like an unpaginated page.
        return [url]

    max_page = int(meta_tag.text.split()[-1])

    # Guarantee exactly one separator before "page/": a URL without a
    # trailing slash used to yield ".../namepage/2/". URLs that already end
    # in "/" produce the same strings as before.
    base = url if url.endswith('/') else url + '/'
    subpages = [f"{base}page/{i}/" for i in range(2, max_page + 1)]

    return [url] + subpages

def get_article_links_and_dates(url):
    """Collect the article links on one page, grouped by a date key.

    For every ``<article>`` element the first ``<a>`` that has an ``href`` is
    taken as the article link. The date key is built from elements 3, 4 and 5
    of the href split on ``/``, joined with ``_``. For an absolute URL of the
    form ``scheme://host/a/b/c/...`` those are the first three path segments
    (presumably year/month/day; confirm against the site's URL scheme).

    Args:
        url: address of the listing page to scan.

    Returns:
        tuple[dict[str, list[str]], set[str]]: a mapping from date key to the
        links found for it (in page order), and the set of those date keys.

    Raises:
        Exceptions from ``requests.get`` propagate unchanged.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    article_links = {}
    for article in soup.find_all('article'):
        # An <article> without any linked anchor used to abort the whole page
        # with TypeError/KeyError; skip it instead.
        anchor = article.find('a', href=True)
        if anchor is None:
            continue

        link = anchor['href']
        date = '_'.join(link.split('/')[3:6])
        article_links.setdefault(date, []).append(link)

    # The date set is exactly the dict's key set, so derive it rather than
    # maintaining a second collection in the loop.
    return article_links, set(article_links)

def get_article_text(url):
    """Download ``url`` and return the text of all of its ``<p>`` tags.

    The paragraph texts are joined with a single newline, in the order
    ``find_all`` returns them.

    Args:
        url: address of the article page.

    Returns:
        str: the newline-joined paragraph text (empty when there is no
        ``<p>`` tag).
    """
    page = BeautifulSoup(requests.get(url).content, "html.parser")
    paragraphs = (paragraph.get_text() for paragraph in page.find_all('p'))
    return "\n".join(paragraphs)

from make_txt_entry import make_txt_entry

def make_table_entry(school_name="UF",
                     journal_name="The Independent Florida Alligator",
                     publication_date=None,
                     raw_data_directory=None,
                     reference_link=None):
    """Validate the arguments for a table entry.

    Args:
        school_name: short name of the school.
        journal_name: name of the publication.
        publication_date: required; must not be ``None``.
        raw_data_directory: required; must not be ``None``.
        reference_link: required; must not be ``None``.

    Returns:
        None. NOTE(review): the visible body only validates its arguments and
        does not build or store an entry; confirm whether the remainder of
        the function was lost.

    Raises:
        RuntimeError: if ``publication_date``, ``raw_data_directory`` or
            ``reference_link`` is ``None``.
    """
    required = (publication_date, raw_data_directory, reference_link)
    if any(value is None for value in required):
        # The original *returned* the exception object, so an invalid call
        # looked like a success to the caller. Raise it instead.
        raise RuntimeError("Function make_table_entry has invalid arguments.")

SSLError: HTTPSConnectionPool(host='newspapers.uflib.ufl.edu', port=443): Max retries exceeded with url: /titles/UF00028290 (Caused by SSLError(CertificateError("hostname 'newspapers.uflib.ufl.edu' doesn't match 'netaccess.noc.ucla.edu'")))

In [None]:
# Run the month-link discovery and bind whatever it returns to `issue_links`.
issue_links = issue_link_finder()

In [None]:
# Accumulators: every date key seen, and date key -> list of article links.
article_dates = set()
article_pages = {}

# `sub_pages` was read by the loop below without ever being assigned in this
# notebook, so the cell raised NameError on a fresh kernel. Build it here:
# one list of paginated sub-page URLs per link in the module-level `links`
# list (bound from extract_links() earlier in the notebook).
# NOTE(review): confirm `links` is the intended source of month pages.
sub_pages = [get_subpages(month_link) for month_link in links]

for month_sub_pages in sub_pages:
    for sub_page in month_sub_pages:
        try:
            article_links, dates = get_article_links_and_dates(sub_page)
        except (ConnectionError, requests.exceptions.ConnectionError) as ce:
            # requests raises its own ConnectionError class, which does not
            # derive from the builtin of the same name, so the original
            # `except ConnectionError` never caught network failures.
            print(ce, month_sub_pages)
            continue

        for date in dates:
            if date in article_pages:
                article_pages[date].extend(article_links[date])
            else:
                # Copy the list so later extend() calls never touch the
                # per-page result object.
                article_pages[date] = list(article_links[date])
                print(f"Date {date} added.")
        article_dates.update(dates)

# NOTE(review): the file name says "usc" although this notebook scrapes a UF
# title; left unchanged because other code may read this exact path.
with open("usc_article_pages.json", "w") as outfile:
    json.dump(article_pages, outfile, indent=4)

In [None]:
# Summary of the scrape: number of distinct date keys, then the total number
# of article links across all of them.
print(f"There are {len(article_pages)} dates with articles")
total_size = sum(len(day_links) for day_links in article_pages.values())
print(f"There are {total_size} articles to iterate over")

There are 0 dates with articles
There are 0 articles to iterate over


In [None]:
# Download the text of every article, one output entry per date key.
school_name = "UF"
school_dir = school_name.replace(" ", "_")

for date, articles in article_pages.items():
    file_path = f"journal_data/txt/{school_dir}/{date}.txt"
    if os.path.exists(file_path):
        print(f"Date {date} already added.")
        continue

    # One article per block, each terminated by a newline.
    article_text = "".join(get_article_text(article) + "\n" for article in articles)

    # The existence check above uses `school_name`, but the write used a
    # hard-coded "USC", so the guard and the write disagreed. Use the same
    # school for both.
    # NOTE(review): assumes make_txt_entry derives its output location from
    # school_name and publication_date; confirm against make_txt_entry.
    make_txt_entry(school_name=school_name, publication_date=date, text=article_text)
    print(f"Date {date} added.")

In [None]:
# Dev aid (disabled): `import importlib; importlib.reload(queries)` re-imports
# the queries module after editing it, without restarting the kernel.

from processors import text_removal_processing

# Read the string handed to text_removal_processing as `removable_string`
# (presumably boilerplate to strip from each entry; confirm in processors).
with open("UF_text.txt", "r", encoding="utf8") as removable_file:
    UF_TEXT = removable_file.read()

# Processing pipeline: a single removal step built from the text read above.
UF_pipeline = [text_removal_processing(removable_string=UF_TEXT)]

from queries import no_query, mention_tracker

In [None]:
# Run one stored entry through the pipeline and compare its length before
# and after processing.
# NOTE(review): the path points at the USC directory although this notebook
# processes UF; confirm the intended file.
# NOTE(review): the saved output shows the entry growing from 96169 to
# 192339 characters, which is unexpected for a removal step; verify what
# mention_tracker returns.
with open("journal_data/txt/USC/2009_05_27.txt", "r", encoding="utf8") as entry_file:
    example_entry = entry_file.read()
print(f"Before processing there are {len(example_entry)} characters in the entry.")
processed_example_entry = mention_tracker(example_entry, UF_pipeline, no_query)
print(f"After processing there are {len(processed_example_entry)} characters in the entry.")

Before processing there are 96169 characters in the entry.
After processing there are 192339 characters in the entry.
