In [23]:

# Load the site URLs, one per line, with trailing whitespace/newlines removed.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('sites.txt') as f:
    lines = [line.rstrip() for line in f]

# Substrings looked for in div class names to spot event-related markup.
keywords = ["event", "content", "detail", "card", "views","location","time", "date", "notes", "evt"]

# Years treated as "old" when filtering text: 2010 through 2023 inclusive.
years = [str(i) for i in range(2010, 2024)]

# Title-case month names treated as already past (January to March only).
old_months = ["January", "February", "March"]

In [22]:
# Test GET request of the 5th site (lines is zero-indexed, so lines[4]).
import requests
from bs4 import BeautifulSoup
html = requests.get(lines[4]).text
html



### Main Functions


In [38]:
# Custom HTML Parsing as solution vs. LLM Text Extraction w/ HTML Filtering
import bs4
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

"""
1. Retrieve HTML from a site
2. Extract event text from HTML
2.5 Preprocess Event Text
3. Store event text in a file
4. Convert to JSON or CSV via LLM
5. Store in a database

"""

# 1
def get_html(site, timeout=30):
    """Download the raw HTML of a page with a plain HTTP GET.

    Args:
        site: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text, or None when the request fails
        (connection error, timeout, or an HTTP error status).
    """
    try:
        response = requests.get(site, timeout=timeout)
        # requests.get never returns None; failures surface as exceptions,
        # and HTTP error codes only do so once raise_for_status() is called.
        response.raise_for_status()
    except requests.RequestException as exc:
        print('Failed to retrieve html from site', site, exc)
        return None
    return response.text

# 1.1
def get_html_selenium(site, wait_class="s-lc-mc-evt", timeout=30):
    """Fetch a page's rendered HTML through a Selenium-driven Chrome browser.

    Args:
        site: URL of the page to load.
        wait_class: CSS class name whose presence signals that the content
            of interest has been rendered.
        timeout: Maximum number of seconds to wait for that element.

    Returns:
        The page source once the awaited element is present.

    Raises:
        Whatever Selenium raises when the page cannot be loaded or the
        element does not appear in time; the browser is closed regardless.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(site)
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, wait_class))
        )
        print('Element found', element)
        return driver.page_source
    finally:
        # Always shut the browser down, even if the wait above fails,
        # so a failed fetch does not leave a Chrome process behind.
        driver.quit()

# 2
def extract_event_text(soup):
    """Return the text lines found in the event-looking divs of a parsed page.

    Args:
        soup: Parsed document exposing ``find_all``.

    Returns:
        List of text lines as produced by extract_text_from_event_divs.
    """
    divs = soup.find_all('div')
    # Class names are lower-cased in place before the keyword filter runs.
    lowercase_all_divs_classes(divs)
    return extract_text_from_event_divs(filter_event_divs(divs))

# 2.5
def lowercase_all_divs_classes(divs):
    """Lower-case every class name of each div, modifying the divs in place.

    Divs without a ``class`` attribute are left untouched.
    """
    for tag in divs:
        if not tag.has_attr('class'):
            continue
        lowered = []
        for class_name in tag['class']:
            lowered.append(class_name.lower())
        tag['class'] = lowered

def filter_event_divs(all_divs, class_keywords=None):
    """Keep the divs whose first class name contains one of the keywords.

    Args:
        all_divs: Iterable of div elements exposing ``get('class')``.
        class_keywords: Substrings to look for; defaults to the module-level
            ``keywords`` list.

    Returns:
        List of matching divs, in their original order.
    """
    if class_keywords is None:
        class_keywords = keywords
    event_divs = []
    for div in all_divs:
        classes = div.get('class')
        # Skip divs with no class attribute and divs whose class list is
        # empty; indexing [0] on an empty list would raise IndexError.
        if not classes:
            continue
        first_class = classes[0]  # only the first class name is matched
        if any(keyword in first_class for keyword in class_keywords):
            event_divs.append(div)
    return event_divs

def extract_text_from_event_divs(event_divs):
    """Collect the non-empty, non-old text lines of the given divs.

    Args:
        event_divs: Iterable of elements exposing ``get_text()``.

    Returns:
        List of lines, each terminated with a newline. Lines flagged by
        is_old_event are left out.
    """
    event_text = []
    for div in event_divs:
        # Splitting on '\n' already guarantees no line contains a newline,
        # so no further newline replacement is needed.
        for line in div.get_text().split('\n'):
            if line == '' or is_old_event(line):
                continue
            event_text.append(line + '\n')
    return event_text


def is_old_event(line):
    """Return True when the line mentions a year from ``years`` or a month
    from ``old_months`` (plain substring match), otherwise False."""
    mentions_year = any(year in line for year in years)
    return mentions_year or any(month in line for month in old_months)

def remove_duplicates(event_text):
    """Return the unique entries of event_text, keeping first-seen order.

    A plain set() would scramble the order of the lines; dict keys keep
    insertion order, so the text stays in document order.
    """
    return list(dict.fromkeys(event_text))

# 3
def write_event_text(event_text, filename, folder="extracted_txt"):
    """Write the extracted lines to ``<folder>/<filename>``.

    Non-ASCII characters are dropped from every line, and lines that are
    empty or whitespace-only are not written.

    Args:
        event_text: Iterable of text lines (each expected to carry its own
            trailing newline).
        filename: Name of the output file inside ``folder``.
        folder: Output directory; created if it does not exist.

    Returns:
        0 once the file has been written.
    """
    import os  # local import: keeps this block self-contained

    event_text = [x.encode('ascii', 'ignore').decode('ascii') for x in event_text]
    os.makedirs(folder, exist_ok=True)
    file_path = os.path.join(folder, filename)
    # The with-block closes the file; no explicit close() is required.
    with open(file_path, 'w') as f:
        for event in event_text:
            # The old check `len(event) > 0 or event != ' '` was always true;
            # strip() is what actually filters blank entries.
            if event.strip():
                f.write(event)
    return 0

# 4
def convert_to_json(event_text):
    """Placeholder for step 4 (convert event text to JSON or CSV via an LLM).

    Not implemented yet: the argument is ignored and 0 is returned.
    """
    # TODO: call LLM API or make one
    return 0

def single_site(site):
    """Run the Selenium-based pipeline on one site.

    Fetches the page with get_html_selenium, extracts the event text,
    writes it to 'site.txt' via write_event_text, then prints the number
    of lines and the lines themselves. Returns 0.
    """
    page_source = get_html_selenium(site)
    extracted = extract_event_text(bs4.BeautifulSoup(page_source, 'html.parser'))
    write_event_text(extracted, 'site.txt')

    print(len(extracted))
    print(extracted)
    return 0

def main():
    """Scrape every site in ``lines`` and write one text file per site.

    Site number i (position in ``lines``) is written to 'site_<i>.txt' via
    write_event_text. Sites whose HTML could not be retrieved are skipped.
    Returns 0.
    """
    for i, site in enumerate(lines):
        html = get_html(site)
        if html is None:
            # get_html signals failure with None; skip the site instead of
            # handing None to the parser. i still follows the position in
            # ``lines``, so file names stay stable.
            continue
        soup = bs4.BeautifulSoup(html, 'html.parser')
        event_text = extract_event_text(soup)
        write_event_text(event_text, f'site_{i}.txt')
    return 0

In [27]:
# Fetch the fifth site listed in sites.txt (index 4) with a plain GET request.
get_html(lines[4])



In [39]:
# Fetch the fifth site listed in sites.txt (index 4) through Selenium/Chrome.
get_html_selenium(lines[4])



In [37]:
# Run the Selenium pipeline on the fifth site (index 4); prints the line count and the lines.
single_site(lines[4])

12
['Previous«2024Next»JanFebMarAprMayJunJulAugSepOctNovDecTodayClear\n', 'Previous«2000-2900Next»190020002100220023002400250026002700280029003000TodayClear\n', '            iCal\n', '        \n', '    Time Zone:\n', '    Eastern Time - US & Canada\n', '    (change)\n', '        To subscribe to this calendar, copy the link below into any application that supports the iCal format.\n', '        \n', '    Time Zone:\n', '    Eastern Time - US & Canada\n', '    (change)\n']


0

In [18]:
# Scrape every site in sites.txt, writing one site_<i>.txt file per site.
main()

0

### Test Functions

In [13]:
def test_parsings(sites):
    """Print how many text lines the parser extracts from each site.

    For every URL whose HTML could be fetched, prints the number of
    extracted lines followed by the URL. Returns 0.
    """
    for url in sites:
        page = get_html(url)
        if page is None:
            continue
        parsed = bs4.BeautifulSoup(page, 'html.parser')
        print(len(extract_event_text(parsed)), url)
    return 0

In [25]:
# Print the extracted-line count for every site listed in sites.txt.
test_parsings(lines)

84 https://scholarslab.lib.virginia.edu/events/
21 https://www.virginia.edu/calendar
595 https://education.virginia.edu/events
88 https://global.virginia.edu/events
10 https://cal.lib.virginia.edu/calendar/events?cid=4299&t=m&d=0000-00-00&cal=4299&ct=69160,33395,66337,31015,30813,51597,58853,58854,58855,58856,70846,45972,31362,27888,30045,27381,57994,54907,26930,29624,56703,66253,66255,66338,46136,70848,33496,70427,27725,29618,63738,28898,33396,38996,50481,70849,51598,29985&inc=0
1152 https://engineering.virginia.edu/news-events/events
24 https://commcal.mcintire.virginia.edu/
22 https://www.arch.virginia.edu/events?search=&start=&end=&range=upcoming&events=&pageindex=1&pagesize=12
135 https://news.med.virginia.edu/
0 https://events.batten.virginia.edu/
986 https://economics.virginia.edu/calendar/month?date=2024-04
436 https://career.virginia.edu/Employers


0