In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import os

In [16]:
# Find a list of all links leading to each month's articles
def issue_link_finder():
    url = "https://dailytrojan.com/archives/#tab-id-5"

    # Make a request to the website
    response = requests.get(url)
    html_content = response.content

    # Parse the html content using BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")

    # Initialize an empty list to store links of each possible subdirectory
    links = []  

    # Find all the links on the website
    li_tags = soup.find_all("li")
    for li in li_tags:
        a = li.find('a')
        link = a.get('href')
        if bool(re.match("^https://dailytrojan\.com/20\d{2}/\d{2}/$", link)):
            links.append(link)
    
    return links

# Find all possible dates (representing all issues) for a given
def get_subpages(url):
    response = requests.get(url)
    html_content = response.content
    soup = BeautifulSoup(html_content, "html.parser")

    nav_tag = soup.find('nav', class_='pagination')
    if not nav_tag:
        return [url]
    
    max_page = int(nav_tag.find('span', class_='pagination-meta').text.split()[-1])
    subpages = [f"{url}page/{i}/" for i in range(2, max_page+1)]

    return [url] + subpages

def get_article_links_and_dates(url):
    # Fetch the HTML content for the page
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all the article links and dates within the page
    article_links = {}
    article_dates = set()
    for article in soup.find_all('article'):
        link = article.find('a')['href']
        date = link.split('/')[3:6]
        date = '_'.join(date)
        article_dates.add(date)
        if date in article_links:
            article_links[date].append(link)
        else:
            article_links[date] = [link]

    return article_links, article_dates

def get_article_text(url):
    response = requests.get(url)
    html_content = response.content
    soup = BeautifulSoup(html_content, "html.parser")

    p_tags = soup.find_all('p')
    text = "\n".join([p.get_text() for p in p_tags])

    return text

from make_txt_entry import make_txt_entry

def make_table_entry(school_name="USC",
                     journal_name="Daily Trojan",
                     publication_date=None,
                     raw_data_directory=None,
                     reference_link=None):
    if publication_date is None or raw_data_directory is None or reference_link is None:
        return RuntimeError("Function make_table_entry has invalid arguments.")
    

In [14]:
issue_links = issue_link_finder()

['https://dailytrojan.com/2023/04/', 'https://dailytrojan.com/2023/03/', 'https://dailytrojan.com/2023/02/', 'https://dailytrojan.com/2023/01/', 'https://dailytrojan.com/2022/12/', 'https://dailytrojan.com/2022/11/', 'https://dailytrojan.com/2022/10/', 'https://dailytrojan.com/2022/09/', 'https://dailytrojan.com/2022/08/', 'https://dailytrojan.com/2022/07/', 'https://dailytrojan.com/2022/06/', 'https://dailytrojan.com/2022/05/', 'https://dailytrojan.com/2022/04/', 'https://dailytrojan.com/2022/03/', 'https://dailytrojan.com/2022/02/', 'https://dailytrojan.com/2022/01/', 'https://dailytrojan.com/2021/12/', 'https://dailytrojan.com/2021/11/', 'https://dailytrojan.com/2021/10/', 'https://dailytrojan.com/2021/09/', 'https://dailytrojan.com/2021/08/', 'https://dailytrojan.com/2021/07/', 'https://dailytrojan.com/2021/06/', 'https://dailytrojan.com/2021/05/', 'https://dailytrojan.com/2021/04/', 'https://dailytrojan.com/2021/03/', 'https://dailytrojan.com/2021/02/', 'https://dailytrojan.com/20

In [40]:
sub_pages = []
for url in issue_links:
    sub_pages.append(get_subpages(url))

In [43]:
print(f"There are {len(issue_links)} months")
sub_page_count = sum([len(sub_page) for sub_page in sub_pages])
print(f"There are {sub_page_count} total sub-pages to traverse")

There are 167 articles
There are 3246 total sub-pages to traverse


In [76]:
article_dates = set()
article_pages = {}

for month_sub_pages in sub_pages:
    for sub_page in month_sub_pages:
        try:
            article_links, dates = get_article_links_and_dates(sub_page)
            for date in dates:
                if date in article_pages:
                    article_pages[date].extend(article_links[date])
                else:
                    article_pages[date] = article_links.copy()[date]
                    print(f"Date {date} added.")
            article_dates.update(dates)
        except ConnectionError as ce:
            print(ce, month_sub_pages)

Date 2023_04_07 added.
Date 2023_04_06 added.
Date 2023_04_05 added.
Date 2023_04_04 added.
Date 2023_04_03 added.
Date 2023_04_02 added.
Date 2023_03_31 added.
Date 2023_03_30 added.
Date 2023_03_29 added.
Date 2023_03_28 added.
Date 2023_03_27 added.
Date 2023_03_24 added.
Date 2023_03_26 added.
Date 2023_03_23 added.
Date 2023_03_22 added.
Date 2023_03_21 added.
Date 2023_03_20 added.
Date 2023_03_10 added.
Date 2023_03_17 added.
Date 2023_03_15 added.
Date 2023_03_18 added.
Date 2023_03_11 added.
Date 2023_03_13 added.
Date 2023_03_09 added.
Date 2023_03_08 added.
Date 2023_03_07 added.
Date 2023_03_06 added.
Date 2023_03_05 added.
Date 2023_03_03 added.
Date 2023_03_02 added.
Date 2023_03_01 added.
Date 2023_02_28 added.
Date 2023_02_27 added.
Date 2023_02_24 added.
Date 2023_02_26 added.
Date 2023_02_23 added.
Date 2023_02_22 added.
Date 2023_02_17 added.
Date 2023_02_16 added.
Date 2023_02_15 added.
Date 2023_02_14 added.
Date 2023_02_13 added.
Date 2023_02_10 added.
Date 2023_0

In [80]:
print(f"There are {len(article_pages)} dates with articles")
total_size = sum([len(article_pages[key]) for key in article_pages])
print(f"There are {total_size} articles to iterate over")

There are 3117 dates with articles
There are 31709 articles to iterate over


In [81]:
for date, articles in article_pages.items():
    # if(os.path.exists(f"journal_data/txt/{school_name.replace(" ","_")}/{publication_date}.txt")):
    #     skip
    # else:
    article_text = ""
    for article in articles:
        article_text += get_article_text(article) + "\n"
    make_txt_entry(school_name="USC", publication_date=date, text=article_text)
    print(f"Date {date} added.")

Date 2023_04_07 added.
Date 2023_04_06 added.
Date 2023_04_05 added.
Date 2023_04_04 added.
Date 2023_04_03 added.
Date 2023_04_02 added.
Date 2023_03_31 added.
Date 2023_03_30 added.
Date 2023_03_29 added.
Date 2023_03_28 added.
Date 2023_03_27 added.
Date 2023_03_24 added.
Date 2023_03_26 added.
Date 2023_03_23 added.
Date 2023_03_22 added.
Date 2023_03_21 added.
Date 2023_03_20 added.
Date 2023_03_10 added.
Date 2023_03_17 added.
Date 2023_03_15 added.
Date 2023_03_18 added.
Date 2023_03_11 added.
Date 2023_03_13 added.
Date 2023_03_09 added.
Date 2023_03_08 added.
Date 2023_03_07 added.
Date 2023_03_06 added.
Date 2023_03_05 added.
Date 2023_03_03 added.
Date 2023_03_02 added.
Date 2023_03_01 added.
Date 2023_02_28 added.
Date 2023_02_27 added.
Date 2023_02_24 added.
Date 2023_02_26 added.
Date 2023_02_23 added.
Date 2023_02_22 added.
Date 2023_02_17 added.
Date 2023_02_16 added.
Date 2023_02_15 added.
Date 2023_02_14 added.
Date 2023_02_13 added.
Date 2023_02_10 added.
Date 2023_0

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [15]:
from processors import text_removal_processing

# USC_text = "This site uses cookies. By continuing to browse the site, you are agreeing to our use of cookies.
#     We may request cookies to be set on your device. We use cookies to let us know when you visit our websites, how you interact with us, to enrich your user experience, and to customize your relationship with our website. 
#     Click on the different category headings to find out more. You can also change some of your preferences. Note that blocking some types of cookies may impact your experience on our websites and the services we are able to offer.
#     These cookies are strictly necessary to provide you with services available through our website and to use some of its features.
#     Because these cookies are strictly necessary to deliver the website, refusing them will have impact how our site functions. You always can block or delete cookies by changing your browser settings and force blocking all cookies on this website. But this will always prompt you to accept/refuse cookies when revisiting our site.
#     We fully respect if you want to refuse cookies but to avoid asking you again and again kindly allow us to store a cookie for that. You are free to opt out any time or opt in for other cookies to get a better experience. If you refuse cookies we will remove all set cookies in our domain.
#     We provide you with a list of stored cookies on your computer in our domain so you can check what we stored. Due to security reasons we are not able to show or modify cookies from other domains. You can check these in your browser security settings.
#     These cookies collect information that is used either in aggregate form to help us understand how our website is being used or how effective our marketing campaigns are, or to help us customize our website and application for you in order to enhance your experience.
#     If you do not want that we track your visit to our site you can disable tracking in your browser here:
#     We also use different external services like Google Webfonts, Google Maps, and external Video providers. Since these providers may collect personal data like your IP address we allow you to block them here. Please be aware that this might heavily reduce the functionality and appearance of our site. Changes will take effect once you reload the page.
#     Google Webfont Settings:
#     Google Map Settings:
#     Google reCaptcha Settings:
#     Vimeo and Youtube video embeds:
#     The following cookies are also needed - You can choose if you want to allow them:"

USC_pipeline = [text_removal_processing(removable_string="Test")]

def mention_tracker(text, pipeline, query):
    processed_text = text
    for step in pipeline:
        processed_text = step(text=processed_text)
    query_result = query(processed_text)

AttributeError: module 'processors' has no attribute 'text_removal_processing1'