# Concurrent Venue Scraping Experiment

This notebook tests a more scalable approach to scraping hundreds of venues concurrently.

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
from datetime import datetime, date
import psycopg2
from psycopg2.extras import RealDictCursor
import os
import json
from populate_events_functions import start_selenium, parse_date, scrape_venue_html, scrape_venue_json_ld
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
from dotenv import load_dotenv

load_dotenv()

True

## 1. Load Venues from Database

In [2]:
# Fail fast with a clear message instead of handing psycopg2 a None DSN.
database_url = os.getenv('DATABASE_URL_UNPOOLED')
if not database_url:
    raise RuntimeError(
        "DATABASE_URL_UNPOOLED is not set; define it in the environment or in .env"
    )

conn = psycopg2.connect(database_url)
try:
    # The cursor context manager closes the cursor even if the query raises.
    with conn.cursor() as cur:
        # Get all active venues
        cur.execute("""
            SELECT *
            FROM venues
            WHERE is_active = TRUE
            ORDER BY name;
        """)

        # Pair every row with the column names so each venue is a plain dict.
        column_names = [desc[0] for desc in cur.description]
        res = cur.fetchall()
        venues = [dict(zip(column_names, v)) for v in res]
finally:
    # `with conn:` would only end the transaction, so close the connection explicitly.
    conn.close()

print(f"Loaded {len(venues)} active venues:")
for venue in venues:
    print(f"  - {venue['name']}")

Loaded 13 active venues:
  - Ace of Spades
  - Avogardro's Number
  - Goldfield Trading Post
  - Neck of the Woods
  - Rickshaw Stop
  - The Aggie Theatre
  - The Armory
  - The Chapel
  - The Great American Music Hall
  - The Independent
  - The Mishawaka 
  - The Warfield
  - Washington's


## 2. Define Scraping Functions

Helper functions that reuse a single Selenium driver per worker thread, so a browser is launched once per worker rather than once per venue.

In [3]:
def scrape_single_venue_with_driver(venue_dict, driver):
    """
    Scrapes a single venue using a provided driver (for reuse).

    The venue's ``scraping_config`` selects the parser through
    ``scraping_method`` (``'html'`` when absent, or ``'json-ld'``) and may
    enable pagination through ``pagination.enabled``, ``pagination.url_pattern``
    (formatted with ``page=<number>``) and ``pagination.pages`` (default 5).
    The same parser is used for paginated and single-page venues.

    Args:
        venue_dict: Venue record with at least ``'name'`` and ``'venue_id'``;
            ``'scraping_config'`` may be missing or None.
        driver: Browser driver exposing ``get(url)`` and ``page_source``.

    Returns: (venue_name, events_list, error_message)
        ``error_message`` is None on success. Configuration problems
        (unsupported method, missing URL) and unexpected exceptions are
        reported through ``error_message`` rather than raised. An error on a
        single page stops pagination but keeps the events gathered so far.
    """
    venue_name = venue_dict['name']
    venue_id = venue_dict['venue_id']
    # A NULL config arrives as None, which the default of .get() does not cover.
    scraping_config = venue_dict.get('scraping_config') or {}

    events = []
    error = None

    try:
        # Extract config
        pagination = scraping_config.get('pagination') or {}
        base_url = scraping_config.get('base_url')
        method = scraping_config.get('scraping_method') or 'html'

        # Resolve the parser once so both branches honour the configured method.
        scrapers = {
            'html': scrape_venue_html,
            'json-ld': scrape_venue_json_ld,
        }
        if method not in scrapers:
            raise ValueError(f"Unsupported scraping_method: {method!r}")
        scrape_page = scrapers[method]

        if pagination.get('enabled'):
            # Paginated scraping
            url_pattern = pagination.get('url_pattern')
            if not url_pattern:
                raise ValueError("Pagination is enabled but 'url_pattern' is missing")
            max_pages = pagination.get('pages', 5)

            for page_num in range(1, max_pages + 1):
                page_url = url_pattern.format(page=page_num)

                try:
                    driver.get(page_url)
                    time.sleep(1)  # give the page a moment to render
                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                    page_events = scrape_page(soup, venue_id, scraping_config)
                except Exception as e:
                    # Log the page error and stop paginating; earlier pages are kept
                    print(f"  ⚠️  {venue_name} - Error on page {page_num}: {e}")
                    break

                # Stop if we got no events (likely end of pagination)
                if not page_events:
                    break
                events.extend(page_events)
        else:
            # Single page scraping
            if not base_url:
                raise ValueError("'base_url' is missing from scraping_config")
            driver.get(base_url)
            time.sleep(1)  # give the page a moment to render
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            # Normalise a None result so callers can always take len(events)
            events = scrape_page(soup, venue_id, scraping_config) or []

    except Exception as e:
        error = str(e)
        print(f"  ❌ {venue_name} - Fatal error: {e}")

    return venue_name, events, error


def scrape_venues_worker(venues_chunk):
    """
    Worker function that creates ONE driver and reuses it for all venues in the chunk.
    This is much more efficient than creating a driver per venue.

    An empty chunk returns immediately without launching a browser. If the
    driver cannot be started, every venue in the chunk is reported as failed
    instead of the whole chunk disappearing behind one exception.

    Args:
        venues_chunk: list of venue dicts handled by this worker.

    Returns: list of (venue_name, events_list, error_message) tuples
    """
    # Nothing to scrape: do not pay for a browser launch.
    if not venues_chunk:
        return []

    try:
        # Create one driver for this worker thread
        driver = start_selenium()
    except Exception as e:
        message = f"Could not start Selenium driver: {e}"
        return [(venue['name'], [], message) for venue in venues_chunk]

    results = []
    try:
        # Process all venues assigned to this worker
        for venue in venues_chunk:
            results.append(scrape_single_venue_with_driver(venue, driver))
    finally:
        # Clean up the driver when done with all venues
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                # A failed shutdown must not discard results already collected
                print(f"Failed to quit driver cleanly: {e}")

    return results

## 4. Scrape All Venues Concurrently

Split the venues into one chunk per worker and scrape the chunks in parallel threads to reduce total run time.

In [4]:
# Configuration
# Each worker thread starts its own Selenium driver, so this is also the
# number of browsers running at once.
MAX_WORKERS = 3  # Number of concurrent scrapers (adjust based on your system)

# Split venues into chunks for each worker
def chunk_list(lst, n):
    """Split list into n roughly equal chunks.

    The chunks are contiguous slices in the original order. The first
    ``len(lst) % n`` chunks receive one extra item, so sizes differ by at most
    one. Exactly ``n`` chunks are always returned; when ``n`` exceeds
    ``len(lst)`` the trailing chunks are empty lists.

    Args:
        lst: Sequence to split (anything supporting ``len`` and slicing).
        n: Number of chunks; must be at least 1.

    Returns:
        A list of ``n`` slices of ``lst``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # n == 0 used to raise ZeroDivisionError; n < 0 silently returned [].
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    base_size, extras = divmod(len(lst), n)
    chunks = []
    start = 0
    for index in range(n):
        # The first `extras` chunks absorb the remainder, one item each
        end = start + base_size + (1 if index < extras else 0)
        chunks.append(lst[start:end])
        start = end
    return chunks

venue_chunks = chunk_list(venues, MAX_WORKERS)

print(f"Processing {len(venues)} venues with {MAX_WORKERS} workers")
print(f"Chunk sizes: {[len(chunk) for chunk in venue_chunks]}\n")

all_events = []
failed_venues = []
completed = 0

# Use ThreadPoolExecutor with worker function that reuses drivers
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Remember which chunk each future owns so a crashed worker can still be
    # attributed to its venues. Empty chunks are skipped: they would only
    # launch a browser for nothing.
    future_to_chunk = {
        executor.submit(scrape_venues_worker, chunk): chunk
        for chunk in venue_chunks
        if chunk
    }

    # Process results as they complete
    for future in as_completed(future_to_chunk):
        chunk = future_to_chunk[future]
        try:
            # Each future returns a list of results from one worker
            worker_results = future.result()
        except Exception as e:
            print(f"Worker thread failed: {e}")
            # Count every venue of the lost chunk as failed so the summary
            # below does not report them as successful.
            worker_results = [
                (venue['name'], [], f"Worker thread failed: {e}")
                for venue in chunk
            ]

        for venue_name, events, error in worker_results:
            completed += 1

            if error:
                print(f"[{completed}/{len(venues)}] ❌ {venue_name} - Error: {error}")
                failed_venues.append({'name': venue_name, 'error': error})
            else:
                print(f"[{completed}/{len(venues)}] ✅ {venue_name} - {len(events)} events")
                all_events.extend(events)

# Summary
print(f"\n{'='*60}")
print("SCRAPING COMPLETE")
print(f"{'='*60}")
print(f"Total events scraped: {len(all_events)}")
print(f"Successful venues: {len(venues) - len(failed_venues)}")
print(f"Failed venues: {len(failed_venues)}")

if failed_venues:
    print("\nFailed venues:")
    for fail in failed_venues:
        print(f"  - {fail['name']}: {fail['error']}")

Processing 13 venues with 3 workers
Chunk sizes: [5, 4, 4]

  ⚠️ Error parsing event: time data 'Sat, June 20, 2026' does not match format '%a, %b %d, %Y'
[1/13] ✅ Ace of Spades - 0 events
[2/13] ✅ Avogardro's Number - 23 events
[3/13] ✅ Goldfield Trading Post - 20 events
[4/13] ✅ Neck of the Woods - 36 events
[5/13] ✅ Rickshaw Stop - 30 events
  ⚠️ Error parsing event: time data 'TBD' does not match format '%a, %b %d, %Y'
[6/13] ✅ The Aggie Theatre - 43 events
[7/13] ✅ The Armory - 9 events
[8/13] ✅ The Chapel - 27 events
[9/13] ✅ The Great American Music Hall - 12 events
[10/13] ✅ The Independent - 73 events
[11/13] ✅ The Mishawaka  - 15 events
[12/13] ✅ The Warfield - 30 events
[13/13] ✅ Washington's - 9 events

SCRAPING COMPLETE
Total events scraped: 327
Successful venues: 13
Failed venues: 0


## 5. Convert to DataFrame and Remove Duplicates

In [5]:
# One row per scraped event; rows that are identical in every column are dropped
raw_df = (
    pd.DataFrame(all_events)
    .drop_duplicates()
)

print("Total unique events:", len(raw_df))
print()
print("Sample of scraped data:")
raw_df.head(10)

Total unique events: 298

Sample of scraped data:


Unnamed: 0,venue_id,raw_event_name,raw_date_text,genres,is_cancelled,parsed_date
0,32,Al Loves El,Thursday October 23rd,,False,2025-10-23
1,32,Amanda Hofer Trio,Thursday October 23rd,,False,2025-10-23
2,32,Defunkl,Friday October 24th,,False,2025-10-24
3,32,High Desert Groove,Friday October 24th,,False,2025-10-24
4,32,Dracula's Disco,Saturday October 25th,,False,2025-10-25
5,32,Harmony Hotshots,Sunday October 26th,,False,2025-10-26
6,32,School of Rock Adult Band,Sunday October 26th,,False,2025-10-26
7,32,Blue Monday,Monday October 27th,,False,2025-10-27
8,32,Open Mic!,Tuesday October 28th,,False,2025-10-28
9,32,An Evening with That 1 Guy,Thursday October 30th,,False,2025-10-30


## 6. Explore the Results

In [6]:
# Events per venue
print("Events per venue:")
if 'venue_id' in raw_df.columns and len(raw_df) > 0:
    # Build the id -> name lookup once instead of rescanning `venues` per row.
    # setdefault keeps the first name seen for an id.
    venue_names = {}
    for v in venues:
        venue_names.setdefault(v['venue_id'], v['name'])

    venue_counts = raw_df['venue_id'].value_counts()
    for venue_id, count in venue_counts.items():
        # An id with no matching venue gets a placeholder instead of raising StopIteration
        venue_name = venue_names.get(venue_id, f"Unknown venue (id={venue_id})")
        print(f"  {venue_name}: {count} events")
else:
    print("  No data to display")

Events per venue:
  The Independent: 72 events
  The Aggie Theatre: 43 events
  The Warfield: 30 events
  Rickshaw Stop: 25 events
  Avogardro's Number: 23 events
  The Chapel: 23 events
  Goldfield Trading Post: 20 events
  Neck of the Woods: 18 events
  The Mishawaka : 14 events
  The Great American Music Hall: 12 events
  The Armory: 9 events
  Washington's: 9 events


## Next Steps

1. Test this approach with your current venues
2. Adjust `MAX_WORKERS` based on performance (3-5 is usually good)
3. If it works well, we can integrate this into `populate_events.ipynb`
4. Add retry logic for failed venues if needed