In [10]:
import os 
import google.generativeai as genai
import pathlib
import textwrap
from IPython.display import Markdown

# Read the Gemini API key from the environment instead of hard-coding it.
# os.environ.get returns None when GOOGLE_API_KEY is unset, in which case
# configure() is still called, with api_key=None.
# NOTE(review): presumably later generate_content calls fail without a key —
# confirm, and consider failing fast here if so.
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

def to_markdown(text):
  """Wrap ``text`` in a Markdown blockquote for rich notebook display.

  Bullet characters ('•') are rewritten as indented Markdown list markers,
  then every line (blank ones included) is prefixed with '> '.
  """
  bulleted = text.replace('•', '  *')
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
  return Markdown(quoted)

# Load the event-page URLs, one per line, stripping trailing whitespace/newlines.
# The `with` block closes the file on exit, so no explicit close() is needed.
# NOTE(review): blank lines are kept as '' entries on purpose — process_sites
# selects sites by their position in this list, so filtering would shift them.
with open('sites.txt') as f:
    lines = [line.rstrip() for line in f]

# Hand-picked substrings looked for in a <div>'s class name to spot event content
keywords = [
    "event", "content", "detail", "card", "views",
    "location", "time", "date", "notes", "evt",
]

# Years 2010 through 2023 as strings; a line mentioning one is treated as an old event
years = list(map(str, range(2010, 2024)))

# Title-case names of months treated as already past (currently January to March)
old_months = ["January", "February", "March"]

### Main Functions


In [28]:
# Custom HTML Parsing as solution vs. LLM Text Extraction w/ HTML Filtering
import bs4
import requests
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

"""
1. Retrieve HTML from a site
1.1 Retrieve HTML from a site using Selenium
2. Extract event text from HTML
2.5 Preprocess Event Text
3. Store event text in a file
4. Convert to JSON or CSV via LLM
5. Store in a database

"""

# 1 - Retrieve HTML from a site
def get_html(site, timeout=30):
    """Fetch the raw HTML of ``site`` with a plain HTTP GET.

    Args:
        site: URL of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text, or None when the request could not be
        completed or the server answered with an error status. Callers must
        check for None before parsing.
    """
    try:
        # requests.get never returns None: it either raises or hands back a
        # Response, so failures have to be detected via exceptions and status.
        response = requests.get(site, timeout=timeout)
    except requests.RequestException as exc:
        print('Failed to retrieve html from site', site, exc)
        return None

    if not response.ok:
        # 4xx/5xx bodies are error pages, not event listings
        print('Failed to retrieve html from site', site, response.status_code)
        return None
    return response.text

# 1.1 - Retrieve HTML from a site using Selenium
def get_html_selenium(site):
    """Load ``site`` in a Selenium-driven Chrome and return the page source.

    The site is recognised by substring checks on its URL. Depending on the
    match, this waits up to 30 seconds for a marker element, and for the two
    iframe-based calendars switches into the iframe before reading the source.
    URLs matching none of the checks are read right after the initial load.

    Args:
        site: URL of the page to load.

    Returns:
        ``driver.page_source`` as reported after any wait / iframe switch.

    Raises:
        selenium.common.exceptions.TimeoutException: if the awaited element
            does not appear within 30 seconds. The browser is still closed.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(site)

        # Library calendar (labelled 4 in the original notes — presumably its
        # position in sites.txt): wait for an element with the event class.
        if "lib" in site:
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.CLASS_NAME, "s-lc-mc-evt"))
            )

        # Batten calendar (labelled 9): content sits inside an iframe
        elif "batten" in site:
            print("Batten")
            iframe_id = "spud913f6613-59b3-4547-ba85-97693a7c9dbb.iframe"
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.ID, iframe_id))
            )
            # Fixed pauses around the switch — presumably to let the frame
            # finish loading; TODO confirm they are still needed.
            time.sleep(5)
            driver.switch_to.frame(iframe_id)
            print("Found IFRAME")
            time.sleep(5)

        # University calendar (labelled 1): content sits inside an iframe
        elif "virginia.edu/calendar" in site:
            print('Virginia')
            iframe_id = "trumba.spud.5.iframe"
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.ID, iframe_id))
            )
            driver.switch_to.frame(iframe_id)
            print("Found IFRAME")
            time.sleep(5)

        return driver.page_source
    finally:
        # Always release the browser, including when a wait times out
        driver.quit()

# --------------------------------------------------------------------------------------------

# 2 - Extract event text from HTML
def extract_event_text(soup):
    """Collect event-related text lines from a parsed page.

    Gathers every <div>, lower-cases their class names in place, keeps the
    divs that pass filter_event_divs, and returns the text lines produced by
    extract_text_from_event_divs for those divs.
    """
    divs = soup.find_all('div')
    lowercase_all_divs_classes(divs)  # mutates the tags, returns nothing
    return extract_text_from_event_divs(filter_event_divs(divs))

# 2.5 - Preprocess Event Text
def lowercase_all_divs_classes(divs):
    """Lower-case every class name on each tag in ``divs``, in place.

    Tags without a ``class`` attribute are left untouched. Returns None.
    """
    for tag in divs:
        if not tag.has_attr('class'):
            continue
        tag['class'] = [name.lower() for name in tag['class']]

def filter_event_divs(all_divs, event_keywords=None):
    """Keep the divs whose first class name contains an event keyword.

    Args:
        all_divs: Iterable of tags exposing ``.get('class')``.
        event_keywords: Substrings to look for. Defaults to the module-level
            ``keywords`` list when omitted.

    Returns:
        A list of the matching divs, in their original order.
    """
    if event_keywords is None:
        event_keywords = keywords

    event_divs = []
    for div in all_divs:
        classes = div.get('class')
        # No class attribute, or an empty class list (e.g. from class=""),
        # has no first entry — indexing it would raise IndexError.
        if not classes:
            continue
        # NOTE(review): only the first class name is inspected, so a div like
        # class="col event-card" is not matched — confirm this is intended.
        first_class = classes[0]
        if any(keyword in first_class for keyword in event_keywords):
            event_divs.append(div)
    return event_divs

def extract_text_from_event_divs(event_divs):
    """Flatten the text of each div into newline-terminated lines.

    Each div's text is split on '\\n'; empty fragments and lines flagged by
    is_old_event are dropped. Because the text is already split on newlines,
    no fragment can still contain one, so no further newline cleanup is needed.

    Args:
        event_divs: Iterable of tags exposing ``get_text()``.

    Returns:
        A list of strings, each ending with '\\n', in document order.
    """
    event_text = []
    for div in event_divs:
        for line in div.get_text().split('\n'):
            # Skip empty fragments first, then anything that looks like a past event
            if line == '' or is_old_event(line):
                continue
            event_text.append(line + '\n')
    return event_text

# TODO: Update to use datetime
def is_old_event(line):
    """Return True when ``line`` mentions a past year or month.

    Uses plain substring checks against the module-level ``years`` list first,
    then ``old_months``; returns False when neither matches.
    """
    mentions_old_year = any(year in line for year in years)
    return mentions_old_year or any(month in line for month in old_months)

def remove_duplicates(event_text):
    """Return ``event_text`` without repeated entries, keeping first-seen order.

    Going through a set would shuffle the lines, separating a title from the
    time and location lines that follow it; dict keys preserve insertion order.

    Args:
        event_text: Iterable of hashable items (here: text lines).

    Returns:
        A new list holding each distinct item once, in order of first appearance.
    """
    return list(dict.fromkeys(event_text))

# --------------------------------------------------------------------------------------------


# 3 - Store event text in a file
def write_event_text(event_text, filename):
    """Write extracted event lines to ``extracted_txt/<filename>`` as ASCII text.

    Args:
        event_text: Iterable of strings. Each is written as given, so callers
            supply their own trailing newlines.
        filename: Name of the file to create inside the ``extracted_txt`` folder.

    Returns:
        0 once the file has been written.
    """
    import unicodedata  # local import: only this function needs it

    folder = "extracted_txt"
    # Create the output folder on first use instead of failing on open()
    os.makedirs(folder, exist_ok=True)
    file_path = os.path.join(folder, filename)

    with open(file_path, 'w') as f:
        for event in event_text:
            # Decompose characters before dropping non-ASCII so that accented
            # letters degrade to their base letter ('é' -> 'e') and
            # non-breaking spaces become plain spaces, rather than vanishing.
            ascii_event = (
                unicodedata.normalize('NFKD', event)
                .encode('ascii', 'ignore')
                .decode('ascii')
            )
            # Skip entries that are empty or whitespace-only; the previous
            # `len(event) > 0 or event != ' '` test was true for every string.
            if ascii_event.strip():
                f.write(ascii_event)
    return 0

# --------------------------------------------------------------------------------------------

# 4 - Convert to JSON or CSV via LLM
def convert_to_json(site_index):
    """Ask the LLM to turn one site's extracted text into event JSON.

    Reads ``extracted_txt/site_<site_index>.txt``, appends it to a fixed
    prompt describing the event schema, sends the result to the 'gemini-pro'
    model, and writes the reply to ``event_jsons/site_<site_index>.json``.

    Args:
        site_index: Index used to build the input and output file names.

    Returns:
        0 once the reply has been written.
    """
    input_text = """ 
    Convert this text to JSON for events with this schema: 
    Event:
    - Title: The name of the event (String)
    - Start Time: Military time when the event starts (DateTime)
    - End Time: Military time when the event ends (DateTime)
    - Date: (Month, Day) of the event (DateTime)
    - Location: Where the event takes place (String)
    - Guest Speaker: The person or organization responsible for the event (String)

    Do not add '''JSON''' to the start or end of the file.
    """
    filename = f'site_{site_index}.txt'
    with open(f'extracted_txt/{filename}') as f:
        site_text = f.read()
    input_text += site_text

    model = genai.GenerativeModel('gemini-pro')
    response = model.generate_content(input_text)

    # Create the output folder on first use instead of failing on open()
    os.makedirs('event_jsons', exist_ok=True)
    with open(f'event_jsons/site_{site_index}.json', 'w') as f:
        # The model can wrap its reply in a Markdown code fence despite the
        # prompt; strip it so the file holds the bare payload.
        f.write(_strip_code_fence(response.text))

    return 0


def _strip_code_fence(text):
    """Return ``text`` without a surrounding ``` fence; unfenced text is returned unchanged."""
    stripped = text.strip()
    if not stripped.startswith('```'):
        return text
    # Drop the opening fence line, which may carry a language tag such as ```JSON
    body_lines = stripped.splitlines()[1:]
    if body_lines and body_lines[-1].strip().startswith('```'):
        body_lines = body_lines[:-1]
    return '\n'.join(body_lines)

# --------------------------------------------------------------------------------------------

# 5 - Read JSON and store in a database
def read_json(site_index):
    """Return the raw contents of ``event_jsons/site_<site_index>.json`` as a string.

    The text is returned as read from disk; it is not parsed into Python objects.
    """
    json_path = f'event_jsons/site_{site_index}.json'
    with open(json_path) as handle:
        return handle.read()

# --------------------------------------------------------------------------------------------


def single_site(site):
    """Debug helper: run the Selenium fetch and extraction for a single URL.

    Prints the fetched HTML, writes the extracted lines to
    ``extracted_txt/site.txt`` via write_event_text, then prints the number of
    extracted lines and the lines themselves. Always returns 0.
    """
    page_html = get_html_selenium(site)
    print(page_html)
    extracted = extract_event_text(bs4.BeautifulSoup(page_html, 'html.parser'))
    write_event_text(extracted, 'site.txt')
    print(len(extracted))
    print(extracted)
    return 0

def process_sites():
    """Run the fetch -> extract -> write -> LLM-convert pipeline for every site.

    Iterates over the module-level ``lines`` list. Sites at the positions in
    ``selenium_indices`` are fetched through Selenium, all others with a plain
    HTTP request. A site whose HTML could not be retrieved is skipped.

    Returns:
        0 after all sites have been visited.
    """
    # Positions in `lines` of the sites fetched through Selenium
    # (mirrors the same hard-coded list in test_parsings)
    selenium_indices = {1, 4, 9}

    for i, site in enumerate(lines):
        if i in selenium_indices:
            html = get_html_selenium(site)
        else:
            html = get_html(site)

        # Nothing to parse: skip the site (and the LLM call) rather than crash
        if html is None:
            print(f'Skipping site {i}: no HTML retrieved')
            continue

        soup = bs4.BeautifulSoup(html, 'html.parser')
        event_text = extract_event_text(soup)
        write_event_text(event_text, f'site_{i}.txt')
        convert_to_json(i)
        # gemini pro free can only process 2 requests per minute
        time.sleep(30)
    return 0

In [79]:
# Run the full fetch -> extract -> LLM-convert pipeline over every URL in `lines`
process_sites()

Virginia
Found IFRAME
Batten
Found IFRAME


0

### Json Playground

In [32]:
import json

# Parse one converted site and show the length of the top-level JSON value,
# followed by the parsed value itself
with open('event_jsons/site_4.json') as f:
    data = json.load(f)
print(len(data))
print(data)

# Dump the raw extracted text of another site for comparison
with open('extracted_txt/site_1.txt') as f:
    data = f.read()
print(data)

9
[{'Title': 'Graduate Writing Caf', 'Start Time': '12:00pm', 'End Time': None, 'Date': None, 'Location': 'Holloway Hall (Bavaro 116)', 'Guest Speaker': None}, {'Title': 'Second Tuesday Forum: Jeremy Boggs', 'Start Time': '2:00pm', 'End Time': None, 'Date': None, 'Location': 'Common Room (Rm 308)', 'Guest Speaker': 'Jeremy Boggs'}, {'Title': 'DH Portfolio Design Workshop', 'Start Time': '3:30pm', 'End Time': None, 'Date': None, 'Location': 'Common Room (Rm 308)', 'Guest Speaker': None}, {'Title': 'EHD Faculty Shannon Library Tour', 'Start Time': '11:00am', 'End Time': None, 'Date': None, 'Location': 'Other', 'Guest Speaker': None}, {'Title': 'A Librarian Like No Other: Belle da Costa Greene and Self-Invention', 'Start Time': '2:00pm', 'End Time': None, 'Date': None, 'Location': 'Shannon Seminar Room 330', 'Guest Speaker': None}, {'Title': 'Making Noise in the Library: The Dale Avenue Band', 'Start Time': '1:00pm', 'End Time': None, 'Date': None, 'Location': 'Other', 'Guest Speaker': 'T

### LLM Playground

In [5]:
# NOTE(review): `input_text` is not defined at module scope in any cell shown
# here (it is only a local inside convert_to_json), so this cell raises
# NameError on a fresh kernel unless a prompt is assigned first.
model = genai.GenerativeModel('gemini-pro')
response = model.generate_content(input_text)

In [6]:
# Display the raw model reply as a Markdown blockquote
to_markdown(response.text)


> ```JSON
> [
>   {
>     "Title": "Graduate Writing Cafe",
>     "Start Time": "12:00 PM",
>     "End Time": "3:00 PM",
>     "Date": "09 April",
>     "Location": "Holloway Hall, Bavaro",
>     "Guest Speaker": null
>   },
>   {
>     "Title": "Introduction to Graduate Study at the UVA School of Education and Human Development",
>     "Start Time": "4:00 PM",
>     "End Time": "5:00 PM",
>     "Date": "11 April",
>     "Location": "Virtual",
>     "Guest Speaker": null
>   },
>   {
>     "Title": "UVA Communication Science and Disorders Accreditation Public Meeting",
>     "Start Time": "4:30 PM",
>     "End Time": "6:00 PM",
>     "Date": "17 April",
>     "Location": "Room 318A, Bavaro Hall",
>     "Guest Speaker": null
>   },
>   {
>     "Title": "Wheelchair Takeover: Paralympic Pick-up",
>     "Start Time": "3:00 PM",
>     "End Time": "6:00 PM",
>     "Date": "19 April",
>     "Location": "Memorial Gym Outdoor Courts",
>     "Guest Speaker": null
>   },
>   {
>     "Title": "EHD Black Alumni Weekend Events: Honoring the Past, Advancing in the Future",
>     "Start Time": "3:30 PM",
>     "End Time": "5:30 PM",
>     "Date": "19 April",
>     "Location": "Holloway Hall, Bavaro",
>     "Guest Speaker": null
>   },
>   {
>     "Title": "M.Ed. in Administration & Supervision Information Session",
>     "Start Time": "4:00 PM",
>     "End Time": "5:00 PM",
>     "Date": "22 April",
>     "Location": "Virtual",
>     "Guest Speaker": null
>   },
>   {
>     "Title": "M.Ed. in Administration & Supervision Information Session",
>     "Start Time": "6:00 PM",
>     "End Time": "7:00 PM",
>     "Date": "23 April",
>     "Location": "Virtual",
>     "Guest Speaker": null
>   },
>   {
>     "Title": "How to Become a Teacher Information Session April 2024",
>     "Start Time": "7:00 PM",
>     "End Time": "8:00 PM",
>     "Date": "23 April",
>     "Location": "Virtual",
>     "Guest Speaker": null
>   },
>   {
>     "Title": "EHD Yoga",
>     "Start Time": "9:00 AM",
>     "End Time": "10:00 AM",
>     "Date": "29 April",
>     "Location": "Holloway/Bavaro atrium",
>     "Guest Speaker": null
>   },
>   {
>     "Title": "Ice Cream Social",
>     "Start Time": "12:00 PM",
>     "End Time": "2:00 PM",
>     "Date": "30 April",
>     "Location": "Bavaro Atrium/Courtyard",
>     "Guest Speaker": null
>   }
> ]
> ```

### Test Functions

In [76]:
def test_parsings(sites):
    """Smoke-test extraction: print how many text lines each site yields.

    Sites at positions 1, 4 and 9 are echoed and fetched through Selenium; all
    others use a plain HTTP request. Sites with no HTML are skipped silently.
    Always returns 0.
    """
    for index, url in enumerate(sites):
        needs_browser = index in (1, 4, 9)
        if needs_browser:
            print(url)
        html = get_html_selenium(url) if needs_browser else get_html(url)
        if html is None:
            continue
        parsed = bs4.BeautifulSoup(html, 'html.parser')
        print(len(extract_event_text(parsed)), url)
    return 0 

In [77]:
# Smoke-test every configured site; prints the extracted line count next to each URL
test_parsings(lines)

84 https://scholarslab.lib.virginia.edu/events/
https://www.virginia.edu/calendar
Virginia
Found IFRAME
33 https://www.virginia.edu/calendar
595 https://education.virginia.edu/events
88 https://global.virginia.edu/events
https://cal.lib.virginia.edu/calendar/events?cid=4299&t=m&d=0000-00-00&cal=4299&ct=69160,33395,66337,31015,30813,51597,58853,58854,58855,58856,70846,45972,31362,27888,30045,27381,57994,54907,26930,29624,56703,66253,66255,66338,46136,70848,33496,70427,27725,29618,63738,28898,33396,38996,50481,70849,51598,29985&inc=0
57 https://cal.lib.virginia.edu/calendar/events?cid=4299&t=m&d=0000-00-00&cal=4299&ct=69160,33395,66337,31015,30813,51597,58853,58854,58855,58856,70846,45972,31362,27888,30045,27381,57994,54907,26930,29624,56703,66253,66255,66338,46136,70848,33496,70427,27725,29618,63738,28898,33396,38996,50481,70849,51598,29985&inc=0
1152 https://engineering.virginia.edu/news-events/events
24 https://commcal.mcintire.virginia.edu/
22 https://www.arch.virginia.edu/events?sea

0