In [16]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
from datetime import datetime
import psycopg2
from psycopg2.extras import RealDictCursor
import os

def start_selenium():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.page_load_strategy = 'eager'
    chrome_options.add_argument("--disable-images")
    driver = webdriver.Chrome(options=chrome_options)
    driver.set_page_load_timeout(12)
    return driver

In [8]:
def scrape_venue(venue_id, venue_name, url, scraping_config, driver):
    """
    Generic scraper that works for any venue based on its config
    Returns list of raw events: [(artist, date_text, genre, is_cancelled), ...]
    """
    print(f"🔍 Scraping {venue_name}...")
    
    soups = []
    pagination = scraping_config.get('pagination', {})
    
    try:
        # Handle pagination
        if pagination.get('enabled'):
            pages = pagination.get('pages', 1)
            url_pattern = pagination.get('url_pattern')
            
            for i in range(1, pages + 1):
                page_url = url_pattern.format(page=i)
                print(f"  → Getting page {i}: {page_url}")
                try:
                    driver.get(page_url)
                    time.sleep(3)
                    soups.append(BeautifulSoup(driver.page_source, 'html.parser'))
                except Exception as e:
                    print(f"  ⚠️  Timeout/error loading page {i}: {e}")
                    continue
        else:
            # No pagination, just get the base URL
            print(f"  → Getting {url}")
            driver.get(url)
            time.sleep(3)
            soups.append(BeautifulSoup(driver.page_source, 'html.parser'))
    
    except Exception as e:
        print(f"  ❌ Failed to scrape {venue_name}: {e}")
        return []
    
    # Extract events from all pages
    events = []
    selectors = scraping_config['selectors']
    
    for soup in soups:
        # Find all event containers
        event_containers = soup.select(selectors['event_container'])
        print(f"  → Found {len(event_containers)} events on this page")
        
        for container in event_containers:
            try:
                # Extract artist
                artist_elem = container.select_one(selectors['artist'])
                artist = artist_elem.text.strip() if artist_elem else None
                
                # Extract date
                date_elem = container.select_one(selectors['date'])
                date_text = date_elem.text.strip() if date_elem else None
                
                # Extract genre (if configured)
                genre = None
                if selectors.get('genre'):
                    genre_elem = container.select_one(selectors['genre'])
                    genre = genre_elem.text.strip() if genre_elem else None
                
                # Check if cancelled (if configured)
                is_cancelled = False
                if selectors.get('cancellation_indicator'):
                    cancel_elem = container.select_one(selectors['cancellation_indicator'])
                    if cancel_elem:
                        cancelled_text = scraping_config.get('filters', {}).get('cancelled_text', 'Cancelled')
                        is_cancelled = cancel_elem.text.strip() == cancelled_text
                
                # Only add if we got at minimum an artist and date
                if artist and date_text:
                    events.append({
                        'venue_id': venue_id,
                        'artist': artist,
                        'date_text': date_text,
                        'genre': genre,
                        'is_cancelled': is_cancelled
                    })
                    
            except Exception as e:
                print(f"  ⚠️  Error parsing event: {e}")
                continue
    
    print(f"✅ Scraped {len(events)} events from {venue_name}")
    return events

def scrape_all_venues(conn):
    """
    Scrape all active venues from the database
    Returns list of all events from all venues
    """
    cur = conn.cursor()
    
    # Get all active venues
    cur.execute("""
        SELECT venue_id, name, url, scraping_config 
        FROM venues 
        WHERE is_active = TRUE
        ORDER BY name;
    """)
    
    venues = cur.fetchall()
    print(f"📍 Found {len(venues)} active venues to scrape\n")
    
    # Start selenium once for all venues
    driver = start_selenium()
    
    all_events = []
    
    try:
        for venue_id, name, url, config in venues:
            events = scrape_venue(venue_id, name, url, config, driver)
            all_events.extend(events)
            print()  # Blank line between venues
    except Exception as e:
        print(f"❌ Error scraping {name}: {e}")
    finally:
        driver.quit()
        print("🛑 Browser closed")
    
    print(f"\n🎉 Total events scraped: {len(all_events)}")
    return all_events

In [15]:
import pandas as pd
conn = psycopg2.connect(os.getenv('DATABASE_URL_UNPOOLED'))
all_events = scrape_all_venues(conn)
events = pd.DataFrame(all_events)

In [12]:
from functions import parse_date

conn = psycopg2.connect(os.getenv('DATABASE_URL_UNPOOLED'))
cur = conn.cursor()
cur.execute("""
    SELECT venue_id, scraping_config->>'date_format' as date_format
    FROM venues 
""")
venue_id_date_format = dict(cur.fetchall())

all_events['event_date'] = all_events.apply(lambda row: parse_date(row['raw_date_text'], venue_id_date_format[row['venue_id']]), axis=1)

all_events

AttributeError: 'list' object has no attribute 'apply'

In [None]:
import pandas as pd

# Convert to DataFrame
df = pd.DataFrame(all_events)

# Save as CSV
df.to_csv('events_data.csv', index=False)

print("Events saved to events_data.csv")

Events saved to events_data.csv
