In [80]:
with open ('sites.txt') as f:
    lines = f.readlines()
    lines = [line.rstrip() for line in lines]
f.close()

# manual div class keywords
keywords = ["event", "content", "views"]

In [81]:
# Custom HTML Parsing as solution vs. LLM Text Extraction w/ HTML Filtering
import bs4

"""
1. Retrieve HTML from a site
2. Extract event text from HTML
3. Process Event Text via LLM
"""

# retrieve html from a site
def get_html(site):
    import requests
    response = requests.get(site)
    if response is None:
        print('Failed to retrieve html from site')
        return None
    return response.text

def extract_event_text(soup):
    all_divs = soup.find_all('div')  
    for div in all_divs:
        if div.has_attr('class'):
            div['class'] = [x.lower() for x in div['class']]
            
    event_divs = []
    for div in all_divs:
        if div.get('class') is not None and any(keyword in div.get('class')[0] for keyword in keywords):
            event_divs.append(div)

    event_text = []
    for div in event_divs:
        event_text.append(div.get_text())
    return event_text

# test html parsing
def test_parsings(sites):
    for site in sites:
        html = get_html(site)
        if html is not None:
            soup = bs4.BeautifulSoup(html, 'html.parser')
            event_text = extract_event_text(soup)
            print(len(event_text), site)
    return 0 

In [82]:
test_parsings(lines)

23 https://scholarslab.lib.virginia.edu/events/
7 https://www.virginia.edu/calendar
22 https://education.virginia.edu/events
11 https://global.virginia.edu/events
0 https://cal.lib.virginia.edu/calendar/events?cid=4299&t=m&d=0000-00-00&cal=4299&ct=69160,33395,66337,31015,30813,51597,58853,58854,58855,58856,70846,45972,31362,27888,30045,27381,57994,54907,26930,29624,56703,66253,66255,66338,46136,70848,33496,70427,27725,29618,63738,28898,33396,38996,50481,70849,51598,29985&inc=0
93 https://engineering.virginia.edu/news-events/events
24 https://commcal.mcintire.virginia.edu/
1 https://www.arch.virginia.edu/events?search=&start=&end=&range=upcoming&events=&pageindex=1&pagesize=12
1 https://news.med.virginia.edu/
0 https://events.batten.virginia.edu/
117 https://economics.virginia.edu/calendar/month?date=2024-04
102 https://career.virginia.edu/Employers


0

In [73]:
get_html(lines[1])

'<!DOCTYPE html>\n  <!--[if IEMobile 7]><html class="no-js ie iem7" lang="en" dir="ltr"><![endif]-->\n  <!--[if lte IE 6]><html class="no-js ie lt-ie9 lt-ie8 lt-ie7" lang="en" dir="ltr"><![endif]-->\n  <!--[if (IE 7)&(!IEMobile)]><html class="no-js ie lt-ie9 lt-ie8" lang="en" dir="ltr"><![endif]-->\n  <!--[if IE 8]><html class="no-js ie lt-ie9" lang="en" dir="ltr"><![endif]-->\n  <!--[if (gte IE 9)|(gt IEMobile 7)]><html class="no-js ie" lang="en" dir="ltr" prefix="fb: https://ogp.me/ns/fb# og: https://ogp.me/ns#"><![endif]-->\n  <!--[if !IE]><!--><html class="no-js" lang="en" dir="ltr" prefix="fb: https://ogp.me/ns/fb# og: https://ogp.me/ns#"><!--<![endif]-->\n<head>\n  <meta charset="utf-8" /><script type="text/javascript">(window.NREUM||(NREUM={})).init={ajax:{deny_list:["bam.nr-data.net"]}};(window.NREUM||(NREUM={})).loader_config={licenseKey:"5e6a8310c7",applicationID:"55446903"};;/*! For license information please see nr-loader-rum-1.255.0.min.js.LICENSE.txt */\n(()=>{var e,t,r={

In [63]:
soup = bs4.BeautifulSoup(html, 'html.parser')
event_classes = extract_event_text(soup)
with open('event_classes.txt', 'w') as f:
    for event_class in event_classes:
        f.write(event_class)



['\n\nEvents\nCurrent and past events hosted, sponsored, or partnered on by the Scholars\' Lab.\n\n\n\nUpcoming Events\n\n\n\nApr17\n\n\nMake a leather book cover\n\n\n\n\n\nWhen: Wednesday, April 17, 2024, 1:00PM-3:00PM\nWhere: Scholars\' Lab Makerspace - Alderman 308i\n\n\nDetails ›\n\n\n\n\nMay3\n\n\nMay the 4th\n\n\n\n\n\nWhen: Friday, May 3, 2024, -\nWhere: Scholars\' Lab Makerspace - Alderman 308i\n\n\nDetails ›\n\n\n\n\nMay6\n\n\nMother\'s Day\n\n\n\n\n\nWhen: Monday, May 6, 2024, -\nWhere: Scholars\' Lab Makerspace - Alderman 308i\n\n\nDetails ›\n\n\n\n \nPrevious Events\n\n\n2024\n\n\nMake a Website (Wednesday, April 3, 2024)\n\n\nApril Fools\' Make-a-Prank (Monday, April 1, 2024)\n\n\nIntroduction to ArcGIS StoryMaps (Wednesday, March 27, 2024)\n\n\nArduino Basics (Wednesday, March 20, 2024)\n\n\nApp Your Map with Web AppBuilder (Wednesday, March 20, 2024)\n\n\nTote Bag Workshop (Sunday, March 17, 2024)\n\n\nPi Day (Thursday, March 14, 2024)\n\n\nCollect Spatial Data in the F

['\n\n\n', '\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nHome\nAbout\nBlog\nMakerspace\nOur Work\nEvents\nFor Students\nResearch & Development\nSpatial Tech\nAccessibility\nYear of Blogging\nCharter\nLibrary\nPeople\nSearch\nSite Map\n\nMore›\n\nHome\nAbout\nBlog\nMakerspace\nOur Work\nEvents\nFor Students\nResearch & Development\nSpatial Tech\nAccessibility\nYear of Blogging\nCharter\nLibrary\nPeople\nSearch\nSite Map\n\n\n\n \n', '\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nHome\nAbout\nBlog\nMakerspace\nOur Work\nEvents\nFor Students\nResearch & Development\nSpatial Tech\nAccessibility\nYear of Blogging\nCharter\nLibrary\nPeople\nSearch\nSite Map\n\nMore›\n\nHome\nAbout\nBlog\nMakerspace\nOur Work\nEvents\nFor Students\nResearch & Development\nSpatial Tech\nAccessibility\nYear of Blogging\nCharter\nLibrary\nPeople\nSearch\nSite Map\n\n\n\n', '\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n', 'More›', '\nThe Scholars’ Lab staff is offering a mix of remote and in-person\nconsultations, workshops, and eve

In [None]:
# TODO: extract the correct information to fit the schema
