In [184]:

with open ('sites.txt') as f:
    lines = f.readlines()
    lines = [line.rstrip() for line in lines]
f.close()

# manual div class keywords
keywords = ["event", "content", "views"]

# previous 10 years
years = [str(i) for i in range(2010, 2024)]

# all monnth in title case
old_months = ["January", "February", "March"]

### Main Functions


In [185]:
# Custom HTML Parsing as solution vs. LLM Text Extraction w/ HTML Filtering
import bs4

"""
1. Retrieve HTML from a site
2. Extract event text from HTML
2.5 Preprocess Event Text
3. Store event text in a file
4. Convert to JSON or CSV via LLM
5. Store in a database

"""

# 1
def get_html(site):
    import requests
    response = requests.get(site)
    if response is None:
        print('Failed to retrieve html from site')
        return None
    return response.text

# 2
def extract_event_text(soup):
    all_divs = soup.find_all('div')
    lowercase_all_divs_classes(all_divs)
    event_divs = filter_event_divs(all_divs)
    event_text = extract_text_from_event_divs(event_divs)
    return event_text

# 2.5
def lowercase_all_divs_classes(divs):
    for div in divs:
        if div.has_attr('class'):
            div['class'] = [x.lower() for x in div['class']]

def filter_event_divs(all_divs):
    event_divs = []
    for div in all_divs:
        if div.get('class') is not None and any(keyword in div.get('class')[0] for keyword in keywords):
            event_divs.append(div)
    return event_divs

def extract_text_from_event_divs(event_divs):
    event_text = []
    for div in event_divs:
        text = div.get_text()
        text = [x for x in text.split('\n') if x != '']
        for line in text:
            if is_old_event(line):
                continue
            while '\n' in line:
                line = line.replace('\n', ' ')
            event_text.append(line + '\n')
    return event_text


def is_old_event(line):
    if any(year in line for year in years):
        return True
    elif any(month in line for month in old_months):
        return True
    return False

def remove_duplicates(event_text):
    return list(set(event_text))

# 3
def write_event_text(event_text, filename):
    event_text = [x.encode('ascii', 'ignore').decode('ascii') for x in event_text]
    folder = "extracted_txt"
    file_path = folder + "/" + filename
    with open(file_path, 'w') as f:
        for event in event_text:
            if len(event) > 0 or event != ' ':
                f.write(event)
    f.close()
    return 0

# 4
def convert_to_json(event_text):
    # TODO: call LLM API or make one
    return 0

def single_site(site):
    html = get_html(site)
    soup = bs4.BeautifulSoup(html, 'html.parser')
    event_text = extract_event_text(soup)
    write_event_text(event_text, 'site.txt')
    
    print(len(event_text))
    print(event_text)
    return 0

def main():
    for i, site in enumerate(lines):
        html = get_html(site)
        soup = bs4.BeautifulSoup(html, 'html.parser')
        event_text = extract_event_text(soup)
        write_event_text(event_text, f'site_{i}.txt')
    return 0

In [186]:
single_site(lines[0])

84
['Events\n', "Current and past events hosted, sponsored, or partnered on by the Scholars' Lab.\n", 'Upcoming Events\n', 'Apr17\n', 'Make a leather book cover\n', 'When: Wednesday, April 17, 2024, 1:00PM-3:00PM\n', "Where: Scholars' Lab Makerspace - Alderman 308i\n", 'Details ›\n', 'May3\n', 'May the 4th\n', 'When: Friday, May 3, 2024, -\n', "Where: Scholars' Lab Makerspace - Alderman 308i\n", 'Details ›\n', 'May6\n', "Mother's Day\n", 'When: Monday, May 6, 2024, -\n', "Where: Scholars' Lab Makerspace - Alderman 308i\n", 'Details ›\n', ' \n', 'Previous Events\n', '2024\n', 'Make a Website (Wednesday, April 3, 2024)\n', "April Fools' Make-a-Prank (Monday, April 1, 2024)\n", ' \n', 'Events\n', "Current and past events hosted, sponsored, or partnered on by the Scholars' Lab.\n", 'Upcoming Events\n', 'Apr17\n', 'Make a leather book cover\n', 'When: Wednesday, April 17, 2024, 1:00PM-3:00PM\n', "Where: Scholars' Lab Makerspace - Alderman 308i\n", 'Details ›\n', 'May3\n', 'May the 4th\n', '

0

In [140]:
main()

0

### Test Functions

In [86]:
def test_parsings(sites):
    for site in sites:
        html = get_html(site)
        if html is not None:
            soup = bs4.BeautifulSoup(html, 'html.parser')
            event_text = extract_event_text(soup)
            print(len(event_text), site)
    return 0 

In [97]:
test_parsings(lines)

23 https://scholarslab.lib.virginia.edu/events/
7 https://www.virginia.edu/calendar
22 https://education.virginia.edu/events
11 https://global.virginia.edu/events
0 https://cal.lib.virginia.edu/calendar/events?cid=4299&t=m&d=0000-00-00&cal=4299&ct=69160,33395,66337,31015,30813,51597,58853,58854,58855,58856,70846,45972,31362,27888,30045,27381,57994,54907,26930,29624,56703,66253,66255,66338,46136,70848,33496,70427,27725,29618,63738,28898,33396,38996,50481,70849,51598,29985&inc=0
93 https://engineering.virginia.edu/news-events/events
24 https://commcal.mcintire.virginia.edu/
1 https://www.arch.virginia.edu/events?search=&start=&end=&range=upcoming&events=&pageindex=1&pagesize=12
1 https://news.med.virginia.edu/
0 https://events.batten.virginia.edu/
117 https://economics.virginia.edu/calendar/month?date=2024-04
102 https://career.virginia.edu/Employers


0