In [1]:
from populate_events_functions import get_active_venues
import psycopg2
import os
import requests
from bs4 import BeautifulSoup
from meta_scraping_functions import discover_html_selectors, generate_scraping_config, test_scraping_config
from openai import OpenAI
import re
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from populate_events_functions import start_selenium, parse_date
import time
from populate_events_functions import parse_date
from datetime import date, datetime

# Connect to Postgres using the DSN held in the DATABASE_URL_UNPOOLED
# environment variable; the cursor is what get_active_venues queries with.
conn = psycopg2.connect(os.getenv('DATABASE_URL_UNPOOLED'))
cur = conn.cursor()

venues = get_active_venues(cur)

# Map each venue's base URL to its full scraping config so later cells can
# iterate over the URLs and still look the config up.
urls = dict(
    (venue['scraping_config']['base_url'], venue['scraping_config'])
    for venue in venues
)


🎵 Found 13 active venues:
   - Ace of Spades
   - Avogardro's Number
   - Goldfield Trading Post
   - Neck of the Woods
   - Rickshaw Stop
   - The Aggie Theatre
   - The Armory
   - The Chapel
   - The Great American Music Hall
   - The Independent
   - The Mishawaka 
   - The Warfield
   - Washington's


In [2]:
def get_selectors(soup):
    """Ask an OpenAI chat model for CSS selectors that locate events on a page.

    The parsed page is first stripped of non-content markup (scripts, styles,
    embedded media, metadata, and nav/header/footer sections). The remaining
    ``<body>`` (or the whole document when there is no body) is serialized and
    embedded in the prompt.

    Note: tags are removed with ``decompose()``, so ``soup`` is modified in
    place; the caller's object no longer contains those elements afterwards.

    Args:
        soup: BeautifulSoup document of the venue's events page.

    Returns:
        dict: the model's reply parsed as JSON. The prompt asks for the keys
        ``container``, ``artist`` and ``date``; the reply is not validated
        here, so callers should not assume every key is present.

    Raises:
        json.JSONDecodeError: if the reply is not valid JSON once any Markdown
            code fence has been stripped.
    """
    for tag in soup(['script', 'style', 'svg', 'iframe', 'noscript', 'meta', 'link']):
        tag.decompose()

    # Remove common non-content sections
    for tag in soup.find_all(['nav', 'header', 'footer']):
        tag.decompose()
    body = soup.find('body') or soup
    # Convert to string and send to API
    html = str(body)
    client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

    # The JSON template in the prompt must itself be valid JSON: a trailing
    # comma after the last member is rejected by json.loads, and the model
    # tends to mirror the template it is shown.
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # cheapest model
        messages=[
            {
                "role": "system",
                "content": "You are a web scraping expert. Find CSS selectors."
            },
            {
                "role": "user", 
                "content": f"""
    Look at this HTML and tell me:
    1. The CSS selector for the container that holds each event
    2. The CSS selector for the artist name
    3. The CSS selector for the date
    HTML:
    {html}

    Return JSON only:
    {{
    "container": "...",
    "artist": "...",
    "date": "..."
    }}
    """
            }
        ]
    )

    result_text = response.choices[0].message.content.strip()

    # Clean markdown if present (leading ```/```json fence and trailing ```)
    result_text = re.sub(r'^```(?:json)?\s*|\s*```$', '', result_text.strip(), flags=re.MULTILINE)

    # Parse JSON
    return json.loads(result_text)

def get_date_format(dates):
    """Ask an OpenAI chat model for a ``strptime`` format matching ``dates``.

    Args:
        dates: sequence of raw date strings scraped from one venue. The whole
            sequence is interpolated into the prompt.

    Returns:
        str: the format string proposed by the model, with Markdown fences and
        surrounding quotes/backticks removed, or ``"INVALID"``. ``"INVALID"``
        is returned immediately, without calling the API, when ``dates`` is
        empty; the model is also instructed to answer ``"INVALID"`` when the
        dates lack enough information.
    """
    # Nothing to infer from: skip the API call (and the dates[0] lookup below,
    # which would otherwise raise IndexError after the request was paid for).
    if not dates:
        return "INVALID"

    client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

    response = client.chat.completions.create(
        model="gpt-4o-mini",  # cheapest model
        messages=[
            {
                "role": "system",
                "content": "You are a syntax expert specifically in dates in python. Return ONLY valid Python strptime format codes."
            },
            {
                "role": "user", 
                "content": f"""

                Look at the following dates and return JUST the Python strptime format string that fits the dates.
                IMPORTANT: Use ONLY standard Python strptime codes like %a, %A, %b, %B, %d, %m, %Y, %y, etc.
                DO NOT include literal text like 'th', 'st', 'nd', 'rd' in the format string.
                
                Examples:
                - If dates look like "Fri Oct 24", the format would be "%a %b %d"
                - If dates look like "10/24/2025", the format would be "%m/%d/%Y"
                - If dates look like "Friday October 24th", the format would be "%A %B %d" (removed 'th')
                - If dates look like "24", this cannot be parsed - return "INVALID"
                
                {dates}
                Return just the format string or "INVALID" if the dates don't have enough info.

                """
            }
        ]
    )

    result_text = response.choices[0].message.content.strip()

    # Clean markdown if present
    result_text = re.sub(r'^```(?:json)?\s*|\s*```$', '', result_text.strip(), flags=re.MULTILINE)
    # Remove quotes (or inline-code backticks) wrapped around the format
    result_text = result_text.strip('"\'`')

    print(f'date format: {dates[0]} --> {result_text}')
    return result_text

def parse_date(raw_date_text, date_format, today=None):
    """Parse a scraped date string into a ``datetime.date``.

    This definition replaces the ``parse_date`` imported from
    ``populate_events_functions`` for the rest of the notebook.

    Ordinal suffixes ("27th", "1st") are removed before parsing. When the
    format carries no year, the result is the first occurrence of that
    month/day on or after ``today``: the current year if the date has not
    passed yet, otherwise a later year.

    Args:
        raw_date_text: date text as scraped, e.g. ``"Monday October 27th"``.
        date_format: ``strptime`` format describing the text *without* any
            ordinal suffix, e.g. ``"%A %B %d"``.
        today: ``datetime.date`` used as "now" for year inference; defaults
            to ``date.today()``.

    Returns:
        datetime.date: the parsed date.

    Raises:
        ValueError: if the format has neither a year nor a month code, or the
            text does not match the format.
    """
    if today is None:
        today = date.today()

    # Remove ordinal suffixes (st, nd, rd, th) from the text. The trailing \b
    # keeps a number glued to a word ("30thursday") intact, and IGNORECASE
    # also handles upper-case suffixes ("27TH").
    raw_date_text = re.sub(
        r'(\d+)(?:st|nd|rd|th)\b', r'\1', raw_date_text, flags=re.IGNORECASE
    )
    text = raw_date_text.strip()

    has_year = '%Y' in date_format or '%y' in date_format
    if has_year:
        # Full format with year
        return datetime.strptime(text, date_format).date()

    has_month = any(code in date_format for code in ('%m', '%b', '%B'))
    if not has_month:
        # Only day number - this is invalid, can't parse
        raise ValueError(f"Cannot parse date '{raw_date_text}' with incomplete format '{date_format}'")

    # Has month but no year: try this year first, then following years, and
    # keep the first date that is not in the past. Looking up to 8 years ahead
    # is enough to reach the next Feb 29 even across a non-leap century year.
    first_error = None
    for year in range(today.year, today.year + 9):
        try:
            candidate = datetime.strptime(f"{text} {year}", f"{date_format} %Y").date()
        except ValueError as exc:
            # Either the text does not match the format at all, or the day
            # does not exist in this particular year (Feb 29).
            if first_error is None:
                first_error = exc
            continue
        if candidate >= today:
            return candidate

    if first_error is None:
        first_error = ValueError(
            f"Cannot parse date '{raw_date_text}' with format '{date_format}'"
        )
    raise first_error

def check_date_format(raw_dates, date_format):
    """Return the fraction of ``raw_dates`` that parse with ``date_format``.

    Args:
        raw_dates: sequence of raw date strings.
        date_format: candidate ``strptime`` format, or ``"INVALID"`` (any
            letter case) to signal that no format could be determined.

    Returns:
        float: share of entries for which ``parse_date`` succeeded, between
        0.0 and 1.0. An ``"INVALID"`` format or an empty sequence gives 0.0.
    """
    if date_format.upper() == "INVALID":
        return 0.0

    def _parses(text):
        # A ValueError/TypeError from parse_date just means "did not match".
        try:
            parse_date(text, date_format)
        except (ValueError, TypeError):
            return False
        return True

    parsed_count = sum(1 for text in raw_dates if _parses(text))
    if not raw_dates:
        return 0.0
    return parsed_count / len(raw_dates)

def get_events(soup, selectors):
    """Extract artist/date pairs from a page using the given CSS selectors.

    Args:
        soup: parsed page exposing ``select``; every matched container must
            expose ``select_one``.
        selectors: mapping with the keys ``container``, ``artist`` and
            ``date``, each holding a CSS selector.

    Returns:
        list[dict]: one ``{'artist': ..., 'raw_date': ...}`` entry for every
        container in which both values were found and non-empty. A short
        summary (count plus the first two events) is printed as a side effect.
    """
    def _text_of(node, css):
        # Stripped text of the first match inside the container, else None.
        match = node.select_one(css)
        return match.get_text(strip=True) if match else None

    events = []
    for node in soup.select(selectors['container']):
        artist = _text_of(node, selectors['artist'])
        raw_date = _text_of(node, selectors['date'])
        # Skip containers missing either piece of information.
        if not (artist and raw_date):
            continue
        events.append({'artist': artist, 'raw_date': raw_date})

    if not events:
        print('No events found :( ')
        return events

    print(f"Yay! {len(events)} events found")
    for sample in events[:2]:
        print(f"             {sample['artist']} - {sample['raw_date']}")
    return events

In [3]:
# Render every venue page in one Selenium session, ask the model for
# selectors, and collect the events those selectors yield.
driver = start_selenium()
all_events, scraping_params = [], {}
try:
    for url in urls:
        print(url)
        driver.get(url)
        # Fixed pause to give client-side rendering time to finish.
        time.sleep(3)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # get_selectors strips non-content tags from `soup` in place, so
        # get_events below runs on the cleaned document.
        selectors = get_selectors(soup)
        print(selectors)

        events = get_events(soup, selectors)
        all_events.append(events)
        scraping_params[url] = selectors
finally:
    # Always release the browser, even if a venue fails part-way through.
    driver.quit()
    print('i QUIT!')


https://www.aceofspadessac.com/shows
{'container': '.chakra-linkbox.css-8092ru', 'artist': '.chakra-text.css-zvlevn', 'date': '.chakra-text.css-dwkl9v'}
Yay! 36 events found
             Spafford - Wed Oct 29, 2025
             Reggaetonlandia 18+ - Thu Oct 30, 2025
https://avogadros.com/fort-collins-avogadro-s-number-events
{'container': '.events-holder section', 'artist': 'h2', 'date': 'h3:not(.event-time)'}
Yay! 29 events found
             Blue Monday - Monday October 27th
             Open Mic! - Tuesday October 28th
https://goldfieldtradingpost.com/
{'container': '.events-list', 'artist': '.event-title h3 a', 'date': '.event-date .date h4'}
Yay! 20 events found
             A Lot Like Birds - Oct 27
             Surfer Girl - Oct 28
https://www.neckofthewoodssf.com/page/2/
{'container': '.tw-section', 'artist': '.tw-name a', 'date': '.tw-event-date'}
Yay! 18 events found
             Neck of the Woods SF Open Mic Wednesdays - Wed, Nov 12
             Clement St Comedy - Thu, Nov 

In [5]:
# For each venue's scraped events, ask the model for a strptime format and
# measure how many of the raw dates actually parse with it.
for events in all_events:
    dates = [event['raw_date'] for event in events]
    if not dates:
        print('No dates found')
        continue

    date_format = get_date_format(dates)
    fraction_correct = check_date_format(dates, date_format)
    # Accept the format only when more than 90% of the raw dates parse.
    if fraction_correct > 0.9:
        print(f'Date format was correct! {fraction_correct:.2f}')
    else:
        # Message previously read "fdate format ..." (stray f-prefix in the text).
        print(f'Date format did not pass test {fraction_correct:.2f}')


date format: Wed Oct 29, 2025 --> %a %b %d, %Y
Date format was correct! 1.00
date format: Monday October 27th --> %A %B %d
Date format was correct! 1.00
date format: Oct 27 --> %b %d
Date format was correct! 1.00
date format: Wed, Nov 12 --> %a, %b %d
Date format was correct! 1.00
date format: Thu Oct 30 --> %a %b %d
Date format was correct! 1.00
date format: Oct30 --> %b%d
Date format was correct! 1.00
No dates found
date format: Tue Feb 10 --> %a %b %d
Date format was correct! 1.00
date format: Mon Oct 27 --> %a %b %d
Date format was correct! 1.00
date format: 10.27 --> %m.%d
Date format was correct! 1.00
date format: Sat, Dec 06, 2025 --> %a, %b %d, %Y
Date format was correct! 0.91
date format: Mon, Oct 27, 2025 --> %a, %b %d, %Y
Date format was correct! 0.97
No dates found


In [1]:
from meta_scraping_functions import discover_venue_scraping_config, format_for_database, add_venue_to_db

# 1. Discover scraping config
#    Runs the project's discovery helper against the venue's public page.
#    NOTE(review): judging by the `result` shown in the next cell, the return
#    value is a dict with url / selectors / date_format / events - confirm
#    against meta_scraping_functions.
url = "https://cafedunord.com/"
result = discover_venue_scraping_config(url)

# 2. Format for database
#    Converts the discovery result into the value passed below as the
#    venue's `scraping_config`.
scraping_config = format_for_database(result)

# 3. Add to database
#    NOTE(review): this writes to the database on every run of the cell; the
#    helper's log line says "added/updated", so it presumably upserts - verify.
venue_id = add_venue_to_db(
    name="Cafe du Nord & Swedish American Hall",
    city="San Francisco",
    url=url,
    scraping_config=scraping_config,
)

# 4. Test event population with your existing populate_events code
#    TODO: not implemented in this cell yet.


VENUE SCRAPING CONFIGURATION DISCOVERY

🎵 Analyzing: https://cafedunord.com/

📥 Fetching URL...
   Using Selenium (JavaScript rendering enabled)...

🔍 Discovering CSS selectors...
   Container: .tw-slick-slide-caption
   Artist: .tw-slick-slide-eventname
   Date: .tw-slick-slide-eventdate

🎯 Scraping events...
   Found 21 events
   1. Dan Mangan, and Israel Nebeker - February 3, 2026
   2. Rio Romeo with Jhariah (SOLD OUT) - October 28, 2025
   3. Haute & Freddy with Jack Powers (SOLD OUT) - October 30, 2025

📅 Discovering date format...
   Format detected: %B %d, %Y

✅ Validating date parsing...
   Success rate: 100.00%

🎉 Configuration discovery successful!
   ✅ Found 21 events
   ✅ Date format validated (100.00% success)
✅ Successfully added/updated: Cafe du Nord & Swedish American Hall (ID: 36)


In [2]:
# Display the raw discovery result (url, selectors, date_format, events) for inspection.
result

{'url': 'https://cafedunord.com/',
 'selectors': {'container': '.tw-slick-slide-caption',
  'artist': '.tw-slick-slide-attractions span',
  'date': '.tw-slick-slide-eventdate'},
 'date_format': '%B %d, %Y',
 'events': [{'artist': 'Dan Mangan', 'raw_date': 'February 3, 2026'},
  {'artist': 'Rio Romeo', 'raw_date': 'October 28, 2025'},
  {'artist': 'Haute & Freddy', 'raw_date': 'October 30, 2025'},
  {'artist': 'NoSo', 'raw_date': 'November 10, 2025'},
  {'artist': 'Vlad Holiday', 'raw_date': 'November 12, 2025'},
  {'artist': '\u200bThomas Dunford', 'raw_date': 'November 14, 2025'},
  {'artist': 'Youngmi Mayer', 'raw_date': 'November 21, 2025'},
  {'artist': 'Laura Stevenson', 'raw_date': 'November 22, 2025'},
  {'artist': 'Mdou Moctar', 'raw_date': 'December 5, 2025'},
  {'artist': 'Tyler Hilton', 'raw_date': 'January 19, 2026'},
  {'artist': 'Dan Mangan', 'raw_date': 'February 3, 2026'},
  {'artist': 'Rio Romeo', 'raw_date': 'October 28, 2025'},
  {'artist': 'Haute & Freddy', 'raw_dat

In [None]:
def needs_selenium(soup):
    """Heuristically decide whether a page needs a JavaScript-capable fetch.

    Checks, in order: an element with id ``root`` or ``app``, fewer than 500
    characters of visible text, and the word "loading" within the first 200
    characters of the lower-cased text.

    Args:
        soup: parsed page exposing ``get_text`` and ``find``.

    Returns:
        ``(True, reason)`` when one of the checks fires, where ``reason`` is a
        short explanation string; otherwise a bare ``False``.
        NOTE(review): the two shapes differ, so ``needed, reason = ...``
        unpacking fails on the negative path - confirm how callers use it.
    """
    page_text = soup.get_text(strip=True)

    # A dedicated mount node is a strong hint of a client-rendered app.
    if any(soup.find(id=mount_id) for mount_id in ('root', 'app')):
        return True, "React/Vue root detected"

    # Barely any text usually means the content is injected later.
    if len(page_text) < 500:
        return True, "Almost no text content"

    opening_text = page_text.lower()[:200]
    if 'loading' in opening_text:
        return True, "Loading placeholder detected"

    # None of the checks fired, so the static HTML is probably sufficient.
    return False