In [89]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
from datetime import datetime
import psycopg2
from psycopg2.extras import RealDictCursor
import os
import json
from functions import start_selenium, parse_date
import html


I have been having a hard time wrapping my head around how to efficiently organize scraping.

Chat wrote a good function based on the functions that I wrote, but it is confusing and i'd like to understand it better, so I'm goign to start by going through those functions and trying to rewrite them in a way that makes sense to me.

The goal here is to have a script that will pull from VENUES and scrape and then validate each of the artists performing at the venues, and then populate the ARTIST_EVENTS table.

Ok so thats 3 different tasks, lets break them down.

1. scrape the venues 

We have a few choices to make with the architeture. Ideally, we would be able to use something just like a scraping_params dict  
       "scraping_config": {
            "base_url": "https://rickshawstop.com/?list1page=1",
            "pagination": {
                "enabled": True,
                "pages": 2,
                "url_pattern": "https://rickshawstop.com/?list1page={page}"
            },
            "selectors": {
                "event_container": "div.event-info-block",
                "artist": "p.fs-12.headliners",
                "date": "p.fs-18.bold.mt-1r.date",
                "genre": None,
                "cancellation_indicator": None
            },
            "date_format": "%a %b %d",
            "filters": {
                "exclude_genres": [],
                "check_cancelled": False
            }}



In [90]:
conn = psycopg2.connect(os.getenv('DATABASE_URL_UNPOOLED'))

cur = conn.cursor()

# Get all active venues
cur.execute("""
    SELECT * 
    FROM venues 
    WHERE is_active = TRUE
    ORDER BY name;
""")

column_names = [desc[0] for desc in cur.description]
print("Columns:", column_names)

# Get results
res = cur.fetchall()
venues = [dict(zip(column_names, v)) for v in res]
# ret = dict(zip(column_names, venues[0]))

Columns: ['venue_id', 'name', 'address', 'city', 'url', 'scraping_config', 'is_active', 'created_at', 'updated_at']


In [91]:
def scrape_venue_html(soup, venue_id, scraping_config):
    """
    Scraper for venues using HTML/CSS selectors
    Returns list of events
    """
    selectors = scraping_config['selectors']
    date_format = scraping_config['date_format']
    event_containers = soup.select(selectors['event_container'])
    
    events = []
    
    for container in event_containers:
        try:
            # Extract artists
            artist_elem = container.select_one(selectors['artist'])
            artist = artist_elem.text.strip() if artist_elem else None
            
            # Extract date
            date_elem = container.select_one(selectors['date'])
            date_text = date_elem.text.strip() if date_elem else None
            parsed_date = parse_date(date_text, date_format) if date_text else None
            
            # Extract genre (if configured)
            genre = None
            if selectors.get('genre'):
                genre_elem = container.select_one(selectors['genre'])
                genre = genre_elem.text.strip() if genre_elem else None
            
            # Check if cancelled (if configured)
            is_cancelled = False
            if selectors.get('cancellation_indicator'):
                cancel_elem = container.select_one(selectors['cancellation_indicator'])
                if cancel_elem:
                    cancelled_text = scraping_config.get('filters', {}).get('cancelled_text', 'Cancelled')
                    is_cancelled = cancel_elem.text.strip() == cancelled_text
            
            # Only add if we got at minimum an artist and date
            if artist and date_text:
                events.append({
                    'venue_id': venue_id,
                    'raw_event_name': artist,
                    'raw_date_text': date_text,
                    'genres': genre,
                    'is_cancelled': is_cancelled,
                    'parsed_date': parsed_date
                })
                
        except Exception as e:
            print(f"  ⚠️ Error parsing event: {e}")
            continue
    
    return events

In [92]:
def scrape_venue_json_ld(soup, venue_id, scraping_config):
    """
    Scraper for venues using JSON-LD structured data
    Returns list of events
    """
    json_keys = scraping_config.get('json_keys')
    if not json_keys:
        print(f"  ❌ Missing json_keys in config")
        return []
    
    # Find all JSON-LD script tags
    script_tags = soup.find_all('script', type='application/ld+json')
    events = []
    
    for script_tag in script_tags:
        try:
            # Clean control characters before parsing
            json_text = script_tag.string
            if json_text:
                json_text = json_text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
                event_data = json.loads(json_text)
            else:
                continue
            
            # Skip if it's not an Event schema
            if event_data.get('@type') != 'Event':
                continue
            
            # Extract artist
            artist = get_nested_value(event_data, json_keys.get('artist', 'performer'))
            artist = html.unescape(str(artist)).strip() if artist else None
            
            # Extract date
            date_string = get_nested_value(event_data, json_keys.get('date', 'startDate'))
            
            # Parse date based on format
            parsed_date = None
            date_text = None
            if date_string:
                try:
                    date_format = scraping_config.get('date_format', 'iso')
                    if date_format == 'iso':
                        # Handle various ISO formats
                        parsed_date = datetime.fromisoformat(date_string.replace('+00:00', '').replace('Z', ''))
                        date_text = parsed_date.strftime('%Y-%m-%d')
                    else:
                        parsed_date = datetime.strptime(date_string, date_format)
                        date_text = parsed_date.strftime('%Y-%m-%d')
                except Exception as e:
                    print(f"  ⚠️ Error parsing date '{date_string}': {e}")
                    date_text = date_string  # Fallback to raw string
            
            # Only add if we got at minimum an artist and date
            if artist and date_text:
                events.append({
                    'venue_id': venue_id,
                    'raw_event_name': artist,
                    'raw_date_text': date_text,
                    'genres': None,
                    'is_cancelled': False,
                    'parsed_date': parsed_date
                })
                
        except json.JSONDecodeError as e:
            print(f"  ⚠️ Skipping malformed JSON-LD script")
            continue
        except Exception as e:
            print(f"  ⚠️ Error extracting event data: {e}")
            continue
    
    return events

def get_nested_value(data, key_path):
    """
    Get value from nested dict using dot notation
    e.g., 'location.name' returns data['location']['name']
    """
    if not key_path:
        return None
        
    keys = key_path.split('.')
    value = data
    
    for key in keys:
        if isinstance(value, dict):
            value = value.get(key)
        else:
            return None
            
        if value is None:
            return None
    
    return value

In [93]:
raw_events =[]
driver = start_selenium()
for target_venue in venues:
    scraping_config = target_venue.get('scraping_config', {})
    pagination = scraping_config.get('pagination', {})
    selectors = scraping_config.get('selectors', {})
    id = target_venue.get('venue_id')
    print(target_venue.get('name'))
    events_list=[]

    soups = []

    if pagination.get('enabled'):
        url_pattern = pagination.get('url_pattern')
        if pagination.get('enabled'): # bool
            pages = pagination.get('pages', 1)
            url_pattern = pagination.get('url_pattern')
            for i in range(1, 6):
                page_url = url_pattern.format(page=i)
                print(f"  → Getting page {i}: {page_url}")
                try:
                    driver.get(page_url)
                    time.sleep(1)
                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                    events = scrape_venue_html(soup, 9, target_venue['scraping_config'])
                    raw_events.extend(events)
                    print('added ', len(events))

                except Exception as e:
                    print(f"  ⚠️  Timeout/error loading page {i}: {e}")
                    break

    else:
        driver.get(scraping_config.get('base_url'))
        time.sleep(1)
        method = scraping_config.get('scraping_method', 'html')
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        if method == 'html':
            events = scrape_venue_html(soup, id, scraping_config)
            raw_events.extend(events)
        elif method == 'json-ld':
            events = scrape_venue_json_ld(soup, id, scraping_config)
            raw_events.extend(events)
    print('added ', len(events))

driver.quit()

Neck of the Woods
added  36
Rickshaw Stop
  → Getting page 1: https://rickshawstop.com/?list1page=1
added  15
  → Getting page 2: https://rickshawstop.com/?list1page=2
added  15
  → Getting page 3: https://rickshawstop.com/?list1page=3
added  15
  → Getting page 4: https://rickshawstop.com/?list1page=4
added  15
  → Getting page 5: https://rickshawstop.com/?list1page=5
added  10
added  10
The Aggie Theatre
added  45
The Chapel
  → Getting page 1: https://thechapelsf.com/music/?list1page=1
added  15
  → Getting page 2: https://thechapelsf.com/music/?list1page=2
added  15
  → Getting page 3: https://thechapelsf.com/music/?list1page=3
added  15
  → Getting page 4: https://thechapelsf.com/music/?list1page=4
added  15
  → Getting page 5: https://thechapelsf.com/music/?list1page=5
added  15
added  15
The Great American Music Hall
added  10
The Independent
added  81
The Mishawaka 
  ⚠️ Error parsing event: time data 'Sat, June 20, 2026' does not match format '%a, %b %d, %Y'
added  16
The Warf

In [99]:
import pandas as pd
raw_df = pd.DataFrame(raw_events)
raw_df.to_csv('raw_events.csv', index=False)

In [96]:
# old functions
def scrape_venue(venue_id, venue_name, url, scraping_config, driver):
    """
    Generic scraper that works for any venue based on its config
    Returns list of raw events: [(artist, date_text, genre, is_cancelled), ...]
    """
    print(f"🔍 Scraping {venue_name}...")
    
    soups = []
    pagination = scraping_config.get('pagination', {})
    
    try:
        # Handle pagination
        if pagination.get('enabled'): # bool
            pages = pagination.get('pages', 1)
            url_pattern = pagination.get('url_pattern')
            
            for i in range(1, pages + 1):
                page_url = url_pattern.format(page=i)
                print(f"  → Getting page {i}: {page_url}")
                try:
                    driver.get(page_url)
                    time.sleep(3)
                    soups.append(BeautifulSoup(driver.page_source, 'html.parser'))
                except Exception as e:
                    print(f"  ⚠️  Timeout/error loading page {i}: {e}")
                    continue
        else:
            # No pagination, just get the base URL
            print(f"  → Getting {url}")
            driver.get(url)
            time.sleep(3)
            soups.append(BeautifulSoup(driver.page_source, 'html.parser'))
    
    except Exception as e:
        print(f"  ❌ Failed to scrape {venue_name}: {e}")
        return []
    
    # Extract events from all pages
    events = []
    selectors = scraping_config['selectors']
    
    for soup in soups:
        # Find all event containers
        event_containers = soup.select(selectors['event_container'])
        print(f"  → Found {len(event_containers)} events on this page")
        
        for container in event_containers:
            try:
                # Extract artist
                artist_elem = container.select_one(selectors['artist'])
                artist = artist_elem.text.strip() if artist_elem else None
                
                # Extract date
                date_elem = container.select_one(selectors['date'])
                date_text = date_elem.text.strip() if date_elem else None
                
                # Extract genre (if configured)
                genre = None
                if selectors.get('genre'):
                    genre_elem = container.select_one(selectors['genre'])
                    genre = genre_elem.text.strip() if genre_elem else None
                
                # Check if cancelled (if configured)
                is_cancelled = False
                if selectors.get('cancellation_indicator'):
                    cancel_elem = container.select_one(selectors['cancellation_indicator'])
                    if cancel_elem:
                        cancelled_text = scraping_config.get('filters', {}).get('cancelled_text', 'Cancelled')
                        is_cancelled = cancel_elem.text.strip() == cancelled_text
                
                # Only add if we got at minimum an artist and date
                if artist and date_text:
                    events.append({
                        'venue_id': venue_id,
                        'artist': artist,
                        'date_text': date_text,
                        'genre': genre,
                        'is_cancelled': is_cancelled
                    })
                    
            except Exception as e:
                print(f"  ⚠️  Error parsing event: {e}")
                continue
    
    print(f"✅ Scraped {len(events)} events from {venue_name}")
    return events

def scrape_all_venues(conn):
    """
    Scrape all active venues from the database
    Returns list of all events from all venues
    """
    cur = conn.cursor()
    
    # Get all active venues
    cur.execute("""
        SELECT venue_id, name, url, scraping_config 
        FROM venues 
        WHERE is_active = TRUE
        ORDER BY name;
    """)
    
    venues = cur.fetchall()
    print(f"📍 Found {len(venues)} active venues to scrape\n")
    
    # Start selenium once for all venues
    driver = start_selenium()
    
    all_events = []
    
    try:
        for venue_id, name, url, config in venues:
            events = scrape_venue(venue_id, name, url, config, driver)
            all_events.extend(events)
            print()  # Blank line between venues
    except Exception as e:
        print(f"❌ Error scraping {name}: {e}")
    finally:
        driver.quit()
        print("🛑 Browser closed")
    
    print(f"\n🎉 Total events scraped: {len(all_events)}")
    return all_events

In [97]:
import json
import psycopg2
import os
# scrape venue
# pagination
venues = [
    {"venue_id": "3", 
        "name": "The Warfield",
        "address": "982 Market Street",
        "city": "San Francisco",
        "scraping_config": {
            "base_url": "https://www.thewarfieldtheatre.com/events",
            "pagination": {
                "enabled": False,
            },
            "selectors": {
                "event_container": "div.entry.warfield.clearfix",
                "artist": "h3.carousel_item_title_small",
                "support": 'h4.animated',
                "date": "span.date",
                "genre": None,
                "cancellation_indicator": None
            },
            "date_format": "%a, %b %d, %Y",
            }},

                {
                    "venue_id": "2", 
        "name": "The Great American Music Hall",
        "address": "859 O’Farrell St.",
        "city": "San Francisco",
        "scraping_config": {
            "base_url": "https://gamh.com/",
            "pagination": {
                "enabled": False,
            },
            "selectors": {
                "event_container": "div.seetickets-list-event-content-container.position-relative",
                "artist": "p.fs-12.headliners",
                "support": 'p.fs-12.supporting-talent',
                "date": "p.fs-18.bold.mt-1r.event-date",
                "genre": None,
                "cancellation_indicator": None
            },
            "date_format": "%a %b %d",
            }},

                {
                    "venue_id": "4", 
        "name": "Neck of the Woods",
        "address": "406 Clement St",
        "city": "San Francisco",
        "scraping_config": {
            "base_url": "https://www.neckofthewoodssf.com/page/2/",
            "selectors": {
                "event_container": "div.tw-section",
                "artist": "div.tw-name",
                "support": None,
                "date": "div.tw-event-datetime ",
                "genre": None,
                "cancellation_indicator": None
            },
            "date_format": "%a, %b %d",
            }},

{"venue_id": "1", 
    "name": "The Aggie Theatre",
    "address": "204 S College Avenue",
    "city": "Fort Collins",
    "scraping_config": {
        "scraping_method": "json-ld",  # or "html" for regular scraping
        "base_url": "https://www.fortcollinsmusichall.com/events/",
        "json_keys": {
            "artist": "performer",
            "date": "startDate",
            "venue": "location.name",
            "url": "url"
        },
        "date_format": "iso"
    }
}, 
{"venue_id": "5", 
    "name": "Washington's",
    "address": "132 Laporte Ave",
    "city": "Fort Collins",
    "scraping_config": {
        "scraping_method": "html",
        "base_url": "https://bohemianlivemusic.org/our-venues/washingtons/",
        "selectors": {
            "event_container": "div.elementor-element.elementor-element-18ac28d.e-flex.e-con-boxed.e-con.e-child",
            "artist": "div.elementor-widget-container",
            "support": None,
            "date": "span.elementor-icon-list-text.elementor-post-info__item.elementor-post-info__item--type-custom",
            "genre": None,
            "cancellation_indicator": None
        },
        "date_format": "%A, %B %d @ %I:%M %p"
    }
},
{"venue_id": "6", 
    "name": "The Mishawaka ",
    "address": "13714 Poudre Canyon Highway",
    "city": "Fort Collins",
    "scraping_config": {
        "scraping_method": "html",
        "base_url": "https://www.themishawaka.com/events/?view=list",
        "selectors": {
            "event_container": "div.col-12.eventWrapper.rhpSingleEvent.py-4.px-0",
            "artist": "#eventTitle h2",
            "support": None,
            "date": "#eventDate",
            "genre": None,
            "cancellation_indicator": None
        },
        "date_format": "%a, %b %d, %Y"
    }
}
]
# Connect to database
conn = psycopg2.connect(os.getenv('DATABASE_URL_UNPOOLED'))
cur = conn.cursor()

insert_query = """
INSERT INTO venues (name, address, city, url, scraping_config)
VALUES (%s, %s, %s, %s, %s)
ON CONFLICT (url) DO UPDATE SET
    name = EXCLUDED.name,
    address = EXCLUDED.address,
    city = EXCLUDED.city,
    scraping_config = EXCLUDED.scraping_config,
    updated_at = CURRENT_TIMESTAMP
RETURNING venue_id, name;
"""

try:
    for venue in venues:
        cur.execute(insert_query, (
            venue["name"],
            venue["address"],
            venue["city"],
            venue["scraping_config"]["base_url"],  # Use base_url as url
            json.dumps(venue["scraping_config"])
        ))
        venue_id, name = cur.fetchone()
        print(f"✅ Inserted/Updated: {name} (ID: {venue_id})")
    
    conn.commit()
    print(f"\n🎉 Successfully processed {len(venues)} venues!")
    
except Exception as e:
    conn.rollback()
    print(f"❌ Error inserting venues: {e}")
    raise
finally:
    conn.close()

✅ Inserted/Updated: The Warfield (ID: 10)
✅ Inserted/Updated: The Great American Music Hall (ID: 11)
✅ Inserted/Updated: Neck of the Woods (ID: 12)
✅ Inserted/Updated: The Aggie Theatre (ID: 13)
✅ Inserted/Updated: Washington's (ID: 14)
✅ Inserted/Updated: The Mishawaka  (ID: 15)

🎉 Successfully processed 6 venues!
