In [2]:

with open ('sites.txt') as f:
    lines = f.readlines()
    lines = [line.rstrip() for line in lines]
f.close()

# manual div class keywords
keywords = ["event", "content", "detail", "card", "views","location","time", "date", "notes", "evt"]

# previous 10 years
years = [str(i) for i in range(2010, 2024)]

# all monnth in title case
old_months = ["January", "February", "March"]

### Main Functions


In [78]:
# Custom HTML Parsing as solution vs. LLM Text Extraction w/ HTML Filtering
import bs4
import requests
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

"""
1. Retrieve HTML from a site
1.1 Retrieve HTML from a site using Selenium
2. Extract event text from HTML
2.5 Preprocess Event Text
3. Store event text in a file
4. Convert to JSON or CSV via LLM
5. Store in a database

"""

# 1 - Retrieve HTML from a site
def get_html(site):
    response = requests.get(site)
    if response is None:
        print('Failed to retrieve html from site')
        return None
    return response.text

# 1.1 - Retrieve HTML from a site using Selenium
def get_html_selenium(site):
    driver = webdriver.Chrome()
    driver.get(site)
    element = None
    # Library Calendar: 4
    if "lib" in site:
        element = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, "s-lc-mc-evt"))
        )

    # Battern  Calendar: 9
    elif "batten" in site:
        print("Batten")
        iframe_id = "spud913f6613-59b3-4547-ba85-97693a7c9dbb.iframe"
        element = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.ID, iframe_id))
        )
        time.sleep(5)
        driver.switch_to.frame(iframe_id)
        print("Found IFRAME")
        time.sleep(5)

    # University Calendar: 1
    elif "virginia.edu/calendar" in site:
        print('Virginia')
        iframe_id = "trumba.spud.5.iframe"
        element = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.ID, iframe_id))
        )
        driver.switch_to.frame(iframe_id)
        print("Found IFRAME")
        time.sleep(5)

    html = driver.page_source
    driver.quit()
    return html

# --------------------------------------------------------------------------------------------

# 2 - Extract event text from HTML
def extract_event_text(soup):
    all_divs = soup.find_all('div')
    lowercase_all_divs_classes(all_divs)
    event_divs = filter_event_divs(all_divs)
    event_text = extract_text_from_event_divs(event_divs)
    return event_text

# 2.5 - Preprocess Event Text
def lowercase_all_divs_classes(divs):
    for div in divs:
        if div.has_attr('class'):
            div['class'] = [x.lower() for x in div['class']]

def filter_event_divs(all_divs):
    event_divs = []
    for div in all_divs:
        if div.get('class') is not None and any(keyword in div.get('class')[0] for keyword in keywords):
            event_divs.append(div)
    return event_divs

def extract_text_from_event_divs(event_divs):
    event_text = []
    for div in event_divs:
        text = div.get_text()
        text = [x for x in text.split('\n') if x != '']
        for line in text:
            if is_old_event(line):
                continue
            while '\n' in line:
                line = line.replace('\n', ' ')
            event_text.append(line + '\n')
    return event_text


def is_old_event(line):
    if any(year in line for year in years):
        return True
    elif any(month in line for month in old_months):
        return True
    return False

def remove_duplicates(event_text):
    return list(set(event_text))

# --------------------------------------------------------------------------------------------


# 3 - Store event text in a file
def write_event_text(event_text, filename):
    event_text = [x.encode('ascii', 'ignore').decode('ascii') for x in event_text]
    folder = "extracted_txt"
    file_path = folder + "/" + filename
    with open(file_path, 'w') as f:
        for event in event_text:
            if len(event) > 0 or event != ' ':
                f.write(event)
    f.close()
    return 0

# 4 - Convert to JSON or CSV via LLM
def convert_to_json(event_text):
    # TODO: call LLM API or make one
    
    return 0

# --------------------------------------------------------------------------------------------

def single_site(site):
    html = get_html_selenium(site)
    print(html)
    soup = bs4.BeautifulSoup(html, 'html.parser')
    event_text = extract_event_text(soup)
    write_event_text(event_text, 'site.txt')
    
    print(len(event_text))
    print(event_text)
    return 0

def main():
    for i, site in enumerate(lines):
        if i == 1 or i == 4 or i == 9:
            html = get_html_selenium(site)
        else:
            html = get_html(site)
        soup = bs4.BeautifulSoup(html, 'html.parser')
        event_text = extract_event_text(soup)
        write_event_text(event_text, f'site_{i}.txt')
    return 0

In [79]:
main()

Virginia
Found IFRAME
Batten
Found IFRAME


0

### LLM Playground

In [9]:
input_text = """Convert this text to JSON for events with this schema: 
Event:
- ID: Unique identifier for the event (Integer)
- Title: The name of the event (String)
- Description: Detailed information about the event (String)
- Start Time: The date and time when the event starts (DateTime)
- End Time: The date and time when the event ends (DateTime)
- Location: Where the event takes place (String)
- Organizer: The person or organization responsible for the event (String)
- Guest Speaker: The person or organization responsible for the event (String)
"""
with open('extracted_txt/site_1.txt') as f:
    site_text = f.read()
f.close()
input_text += site_text

In [13]:
input_text

"Convert this text to JSON for events with this schema: \nEvent:\n- ID: Unique identifier for the event (Integer)\n- Title: The name of the event (String)\n- Description: Detailed information about the event (String)\n- Start Time: The date and time when the event starts (DateTime)\n- End Time: The date and time when the event ends (DateTime)\n- Location: Where the event takes place (String)\n- Organizer: The person or organization responsible for the event (String)\n- Guest Speaker: The person or organization responsible for the event (String)\nEvents\nCurrent and past events hosted, sponsored, or partnered on by the Scholars' Lab.\nUpcoming Events\nApr17\nMake a leather book cover\nWhen: Wednesday, April 17, 2024, 1:00PM-3:00PM\nWhere: Scholars' Lab Makerspace - Alderman 308i\nDetails \nMay3\nMay the 4th\nWhen: Friday, May 3, 2024, -\nWhere: Scholars' Lab Makerspace - Alderman 308i\nDetails \nMay6\nMother's Day\nWhen: Monday, May 6, 2024, -\nWhere: Scholars' Lab Makerspace - Alderman

In [7]:
import os 
import google.generativeai as genai
import pathlib
import textwrap
from IPython.display import Markdown


GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

# call genai with the api key
# genai.extract_text_from_image('image.jpg')

In [10]:
model = genai.GenerativeModel('gemini-pro')
response = model.generate_content(input_text)

In [11]:
to_markdown(response.text)


> ```json
> [
>   {
>     "ID": 1,
>     "Title": "Facilities Management Apprenticeship & Community Job Fair",
>     "Description": "Join us on April 10 at UVA's Alumni Hall for the annual Apprenticeship Job Fair, featuring UVA FM Apprenticeship and other local employers offering apprenticeships & entry-level positions.",
>     "Start Time": "2024-04-10T09:11:00-04:00",
>     "End Time": null,
>     "Location": "UVA's Alumni Hall",
>     "Organizer": null,
>     "Guest Speaker": null
>   },
>   {
>     "ID": 2,
>     "Title": "In The Frame: Indigenous Vernacular Photography Across Two Continents",
>     "Description": "Join Micheal Aird and Karen Hughes at the Lemon Lounge, 946 Grady Ave, suite 100 for a discussion about how candid and everyday photographs of Indigenous people across multiple communities disrupt assumed settler historical narratives. Aird and Hughes will present their research project that investigates, connects, and exhibits Indigenous community-controlled photography across four communities in Australia and North America.This program will be of interest particularly to community members and people at UVA who wish to work with Indigenous and descendant communities.Michael Aird is a First Nations Australian photographer and curator and director of the University of Queensland Anthropology Museum. He has worked in the area of Aboriginal arts and cultural heritage since 1985 maintaining an interest in documenting aspects of urban Aboriginal history and culture.Karen Hughes is Associate Professor of Indigenous Studies at Swinburne University of Technology. Her research focuses on Indigenous and cross-cultural social and political histories in Australia and North America.",
>     "Start Time": "2024-04-10T14:00:00-04:00",
>     "End Time": null,
>     "Location": "Lemon Lounge, 946 Grady Ave, suite 100",
>     "Organizer": null,
>     "Guest Speaker": null
>   },
>   {
>     "ID": 3,
>     "Title": "M.S. in Commerce Information Session",
>     "Description": "Join us for this information session to learn about the UVA McIntire M.S. in Commerce (MSC) program with tracks in Biotechnology, Business Analytics, Finance, and Marketing & Management. This session will include a comprehensive overview of the academic program, student experience, and career development support as well as an overview of the admissions process and next steps.",
>     "Start Time": "2024-04-10T15:30:00-04:00",
>     "End Time": "2024-04-10T16:30:00-04:00",
>     "Location": null,
>     "Organizer": null,
>     "Guest Speaker": null
>   },
>   {
>     "ID": 4,
>     "Title": "Morven Student Days",
>     "Description": "UVA students are invited to explore, experience, and enjoy Morvens unique landscape!From 10am-4pm, we invite you to use our indoor and outdoor spaces to study and relax, enjoy Arepas food truck, therapy dogs, and more!These spaces include our Formal Garden, Main House, beautiful historic buildings, and more!Free transportation from UVA Grounds to Morven available on a first come first serve basis.Visit our website for more details!",
>     "Start Time": "2024-04-11T10:00:00-04:00",
>     "End Time": "2024-04-11T16:00:00-04:00",
>     "Location": "Morven",
>     "Organizer": null,
>     "Guest Speaker": null
>   },
>   {
>     "ID": 5,
>     "Title": "CANCELLED - North Grounds E-Bike Demo Day",
>     "Description": "Come test ride a selection of e-bikes to celebrate Earth Month and in preparation for Cville Bike Month in May! If you are curious about switching to this healthy and sustainable mode of transportation, this a great opportunity to test e-bikes before making a commitment.",
>     "Start Time": "2024-04-11T11:00:00-04:00",
>     "End Time": "2024-04-11T14:00:00-04:00",
>     "Location": null,
>     "Organizer": null,
>     "Guest Speaker": null
>   },
>   {
>     "ID": 6,
>     "Title": "Miraculous Objects: Relics and Buddha Images in Xuanzangs Datang Xiyu ji",
>     "Description": "Join in person or online as Professor Deeg of Cardiff University presents on one of the major sources used for the study of early medieval Buddhism: \"Record of the Western Regions of the Great Tang\" by Chinese monk Xuanzang (600/602-664).The focus has been and still in on the value of the text for archaeology and historical geography: through the information given in the \"Record\" places and sites linked to the historical Buddha Sakyamuni have been identified and excavated. Much less attention has been given to the narratives recorded or paraphrased by Xuanzang, particularly when they differ from the \"standard\" versions in extant Indian Buddhist texts. Even less research has been done on the soemtimes quite detailed stories about relics or/and images of the Buddha in different places on the South Asian subcontinent. This talk will discuss and contextualize the cases of the famous Buddha image and relics in the Northwest (Gandhara) and the miraculous Buddha statue in Bodhgaya.",
>     "Start Time": "2024-04-11T12:15:00-04:00",
>     "End Time": "2024-04-11T13:15:00-04:00",
>     "Location": null,
>     "Organizer": null,
>     "Guest Speaker": null
>   },
>   {
>     "ID": 7,
>     "Title": "National defense strategy: Asking what you can do for your country",
>     "Description": "The Commission on the National Defense Strategy is holding public conversations and meeting with senior national security leaders, business leaders, members of Congress, and foreign allies and partners.We invite you to join Commission Chair Jane Harman and Vice Chair Eric Edelman, along with commissioners Tom Mahnken, General Jack Keane, and Mara Rudman, for apublic conversation moderated by Miller Center Director and CEO William Antholis. ONLINE and IN PERSON",
>     "Start Time": "2024-04-11T14:00:00-04:00",
>     "End Time": "2024-04-11T15:00:00-04:00",
>     "Location": null,
>     "Organizer": null,
>     "Guest Speaker": null
>   },
>   {
>     "ID": 8,
>     "Title": "Social Entrepreneurs and Ventures in Food Justice",
>     "Description": "Join us for an insightful panel discussion on food entrepreneurship and waste reduction as part of the annual FEED Week (Food Engagement, Education, and Discussion Week) on Thursday, April 11th at 4pm at Darden - North Grounds, CLA 140. The panel, co-sponsored by Food Assist and Net Impact at Darden, will feature esteemed guests Clara Camber, a UVA alumni at Goodr, and Eric Walter, the owner of Black Bear Composting. are eager to hear your perspectives and experiences regarding the intersection of business and sustainability. These Panelists insights will shed light on the creative strategies employed by businesses on and off grounds to reduce food waste and promote a more sustainable food system. This panel discussion promises to be an enlightening and inspiring event, highlighting the power of entrepreneurship in addressing critical issues within our food system. Attendees will gain valuable insights into the challenges and opportunities surrounding food waste reduction and the role of businesses in driving positive market change in food systems. To reserve a ticket visit: www.eventbrite.com",
>     "Start Time": "2024-04-11T16:00:00-04:00",
>     "End Time": "2024-04-11T18:00:00-04:00",
>     "Location": "Darden - North Grounds, CLA 140",
>     "Organizer": null,
>     "Guest Speaker": null
>   },
>   {
>     "ID": 9,
>     "Title": "Matthew Olzmann Poetry Reading",
>     "Description": "Matthew Olzmann reads from his work as part of his time at UVA as a Rea Visiting Writer in Poetry. Olzmann is the author of Constellation Route as well as two previous collections of poetry: Mezzanines and Contradictions in the Design. A recipient of fellowships from Kundiman, MacDowell, and the National Endowment for the Arts, Olmanns poems have appeared in the New York Times, Best American Poetry, the Pushcart Prizes, Kenyon Review, and elsewhere. He is a

In [5]:

from transformers import T5Tokenizer, TFT5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = TFT5ForConditionalGeneration.from_pretrained("google/flan-t5-small")

input_ids = tokenizer(input_text, return_tensors="pt").input_ids
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
Token indices sequence length is longer than the specified maximum sequence length for this model (791 > 512). Running this sequence through the model will result in indexing errors


ImportError: Unable to convert output to PyTorch tensors format, PyTorch is not installed.

### Test Functions

In [76]:
def test_parsings(sites):
    for id, site in enumerate(sites):
        if id == 1 or id == 4 or id == 9:
            print(site)
            html = get_html_selenium(site)
        else:
            html = get_html(site)
        if html is not None:
            soup = bs4.BeautifulSoup(html, 'html.parser')
            event_text = extract_event_text(soup)
            print(len(event_text), site)
    return 0 

In [77]:
test_parsings(lines)

84 https://scholarslab.lib.virginia.edu/events/
https://www.virginia.edu/calendar
Virginia
Found IFRAME
33 https://www.virginia.edu/calendar
595 https://education.virginia.edu/events
88 https://global.virginia.edu/events
https://cal.lib.virginia.edu/calendar/events?cid=4299&t=m&d=0000-00-00&cal=4299&ct=69160,33395,66337,31015,30813,51597,58853,58854,58855,58856,70846,45972,31362,27888,30045,27381,57994,54907,26930,29624,56703,66253,66255,66338,46136,70848,33496,70427,27725,29618,63738,28898,33396,38996,50481,70849,51598,29985&inc=0
57 https://cal.lib.virginia.edu/calendar/events?cid=4299&t=m&d=0000-00-00&cal=4299&ct=69160,33395,66337,31015,30813,51597,58853,58854,58855,58856,70846,45972,31362,27888,30045,27381,57994,54907,26930,29624,56703,66253,66255,66338,46136,70848,33496,70427,27725,29618,63738,28898,33396,38996,50481,70849,51598,29985&inc=0
1152 https://engineering.virginia.edu/news-events/events
24 https://commcal.mcintire.virginia.edu/
22 https://www.arch.virginia.edu/events?sea

0