In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import os
import json


In [175]:
def get_daily_links(base_link):
    downloads_link = base_link.replace("newspapers","api.fdnl.patron") + '/downloads'
    response = requests.get(downloads_link)

    

    daily_links = []
    if response.status_code == 200:
        data = response.json()
        for file_name in data['pdf']:
            if file_name.endswith('pdf'):
                file_url = f"{base_link}/{file_name}"

                # Split the URL by '/'
                parts = file_url.split('/')

                # Extract the relevant parts of the URL
                id = parts[3]
                path = '/'.join(parts[5:])

                # Reformat the URL
                new_url = f'https://ufdcimages.uflib.ufl.edu/{id[:2]}/{id[2:4]}/{id[4:6]}/{id[6:8]}/{id[8:10]}/{parts[4]}/{path}'

                daily_links.append(new_url)
                break # Only add first pdf link 
    return daily_links

def get_article_links_and_dates(url, school=None):
    if school == "USC": 
        # Fetch the HTML content for the page
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all the article links and dates within the page
        article_links = {}
        for article in soup.find_all('article'):
            link = article.find('a')['href']
            date = link.split('/')[3:6]
            date = '_'.join(date)
            if date in article_links:
                article_links[date].append(link)
            else:
                article_links[date] = [link]

        return article_links
    elif school == "UF":
        import requests
        import pandas as pd
        from datetime import datetime

        api_url = "https://api.fdnl.patron.uflib.ufl.edu/serialhierarchy?bibid=UF00028290"

        response = requests.get(api_url)
        data = response.json()

        if os.path.exists('uf_article_pages.json'):
            # Open the JSON file for reading
            with open('uf_article_pages.json', 'r') as f:
               article_links = json.load(f)
        else:
            article_links = {}
        for year in data:
            # print(year)
            if int(year['key']) >= 2009:

                for month in year['values']:
                    for entry in month['values']:
                        date_str = entry['text']
                        
                        try:
                            date = datetime.strptime(date_str, '%B %d, %Y')
                            year_month_day = date.strftime('%Y_%m_%d')
                            if year_month_day in article_links:
                                print(f"Date {year_month_day} exists.")
                                continue
                            else:
                                article_links[year_month_day] = get_daily_links(url + entry['vid'])
                                print(f"Date {year_month_day} added.")
                        except ValueError:
                            try:
                                date = datetime.strptime(f"{year['key']} {month['key']} {date_str}", '%Y %B %d')
                                year_month_day = date.strftime('%Y_%m_%d')
                                if year_month_day in article_links:
                                    print(f"Date {year_month_day} exists.")
                                    continue
                                else:
                                    article_links[year_month_day] = get_daily_links(url + entry['vid'])
                                    print(f"Date {year_month_day} added.")
                                article_links[year_month_day] = get_daily_links(url + entry['vid'])
                            except ValueError as ve:
                                # if there is an error parsing the date string, print an error message and continue to the next iteration
                                print(f"{ve} Error parsing date string: {year_month_day}")
                            except TypeError as te:
                                print(f"{te} Error parsing date string: {year_month_day}") 
                        
                print(f"{year['key']} saved to json.")
                with open("uf_article_pages.json", "w") as outfile:
                    json.dump(article_links, outfile, indent=4)           
        return(article_links)



def get_article_text(url):
    response = requests.get(url)
    html_content = response.content
    soup = BeautifulSoup(html_content, "html.parser")

    p_tags = soup.find_all('p')
    text = "\n".join([p.get_text() for p in p_tags])

    return text

from make_txt_entry import make_txt_entry

def make_table_entry(school_name="UF",
                     journal_name="The Independent Florida Alligator",
                     publication_date=None,
                     raw_data_directory=None,
                     reference_link=None):
    if publication_date is None or raw_data_directory is None or reference_link is None:
        return RuntimeError("Function make_table_entry has invalid arguments.")

In [176]:
article_pages = get_article_links_and_dates(url="https://newspapers.uflib.ufl.edu/UF00028290/", school="UF")

Date 2009_01_06 exists.
Date 2009_01_07 exists.
Date 2009_01_08 exists.
Date 2009_01_09 exists.
Date 2009_01_12 exists.
Date 2009_01_13 exists.
Date 2009_01_14 exists.
Date 2009_01_15 exists.
Date 2009_01_16 exists.
Date 2009_01_20 exists.
Date 2009_01_21 exists.
Date 2009_01_22 exists.
Date 2009_01_23 exists.
Date 2009_01_26 exists.
Date 2009_01_27 exists.
Date 2009_01_28 exists.
Date 2009_01_29 exists.
Date 2009_01_30 exists.
Date 2009_02_02 exists.
Date 2009_02_03 exists.
Date 2009_02_04 exists.
Date 2009_02_05 exists.
Date 2009_02_06 exists.
Date 2009_02_09 exists.
Date 2009_02_10 exists.
Date 2009_02_11 exists.
Date 2009_02_12 exists.
Date 2009_02_13 exists.
Date 2009_02_16 exists.
Date 2009_02_17 exists.
Date 2009_02_18 exists.
Date 2009_02_19 exists.
Date 2009_02_20 exists.
Date 2009_02_23 exists.
Date 2009_02_24 exists.
Date 2009_02_25 exists.
Date 2009_02_26 exists.
Date 2009_02_27 exists.
Date 2009_03_02 exists.
Date 2009_03_03 exists.
Date 2009_03_04 exists.
Date 2009_03_05 

TypeError: list indices must be integers or slices, not str

In [147]:
import importlib
import pdf_to_text
importlib.reload(pdf_to_text)

from pdf_to_text import extract_pdf_text

print(extract_pdf_text(article_pages['2009_01_15'][0]))

Today
visit www.alligator.orgWe Inform. You Decide.VOLUME 102 ISSUE 82 THURSDAY, JANUARY 15, 2009Not officially associated with the University of Florida Published by Campus Communications, Inc. of Gainesville, FloridaRecycle
Partly 
cloudy
62/31FORECAST 2
OPINIONS 6
the AVENUE 9
CLASSIFIEDS 13
CROSSWORD 14
SPORTS 18
David Cumming / Alligator
Across the Americas
Taylor Sincich traverses from one tree to another across the Plaza of the Americas on Wednesday afternoon.
n MAYOR ALSO PROMISES              
INTEGRITY AMONG OFFICERS.
By HUNTER SIZEMORE
Alligator Contributing Writer
Economic and environmental sustainability 
were themes of the 5th annual State of the City 
address given by Mayor Peegen Hanrahan Tuesday morning.
She announced several new additions 
to the Gainesville Police Department and Gainesville Fire Rescue, praised the city’s en-vironmental friendliness and spoke against an upcoming amendment.
An Intelligence Center will be built behind 
the GPD’s Operations Center, whic

In [None]:
print(f"There are {len(article_pages)} dates with articles")
total_size = sum([len(article_pages[key]) for key in article_pages])
print(f"There are {total_size} articles to iterate over")

There are 0 dates with articles
There are 0 articles to iterate over


In [None]:
school_name = "UF"
for date, articles in article_pages.items():
    file_path = "journal_data/txt/"+school_name.replace(" ","_")+"/"+date+".txt"
    if(os.path.exists(file_path)):
        print(f"Date {date} already added.")
    else:
        article_text = ""
        for article in articles:
            article_text += get_article_text(article) + "\n"
            
        make_txt_entry(school_name="USC", publication_date=date, text=article_text)
        print(f"Date {date} added.")

In [None]:
# import importlib
# importlib.reload(queries)

from processors import text_removal_processing

with open("UF_text.txt", "r", encoding="utf8") as f:
    UF_TEXT = f.read()

UF_pipeline = [text_removal_processing(removable_string=UF_TEXT)]

from queries import no_query

from queries import mention_tracker

In [None]:
with open("journal_data/txt/USC/2009_05_27.txt", "r", encoding="utf8") as f:
    example_entry = f.read()
print(f"Before processing there are {len(example_entry)} characters in the entry.")
processed_example_entry = mention_tracker(example_entry, UF_pipeline, no_query)
print(f"After processing there are {len(processed_example_entry)} characters in the entry.")

Before processing there are 96169 characters in the entry.
After processing there are 192339 characters in the entry.
