In [None]:
"""
1. Scrape public Eventbrite search pages (correct URL & selector)
2. Extract event IDs via regex on <a href="/e/...-<digits>">
3. Fetch each event’s details via OAuth2 API
4. Filter next 30 days & exact SoCal cities
5. Output leap_events_socal.csv & leap_events_socal.md
"""

'\nThis notebook uses the Eventbrite API to find “career planning”, “leadership”, “job fairs” and “professional development” events in Southern California over the next 30 days.  \n\nIt will:\n- Query the API  \n- Filter & clean the results  \n- Display a preview of the data  \n- Export to CSV and Markdown files  \n'

In [10]:
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta, timezone
import pytz

In [11]:
# OAuth2 token for detail calls
TOKEN = "ADFEE3M6QKP4UHHK4RTJ"
HEADERS = {"Authorization": f"Bearer {TOKEN}"}

# Keywords → URL slugs
KEYWORDS = {
    "career planning":         "career-planning",
    "leadership":              "leadership",
    "job fair":                "job-fair",
    "professional development":"professional-development"
}

# SoCal cities → URL slug & normalized name
CITIES = {
    "Los Angeles":   "los-angeles",
    "Orange County": "orange-county",
    "San Diego":     "san-diego",
    "Riverside":     "riverside",
    "Santa Barbara": "santa-barbara"
}

# Time window: now → now+30 days in UTC
UTC_NOW = datetime.now(timezone.utc)
UTC_LIM = UTC_NOW + timedelta(days=30)
PACIFIC = pytz.timezone("America/Los_Angeles")

# Pause between requests
PAUSE = 0.5

# Regex for /e/...-1234567890 event links
EVENT_ID_RE = re.compile(r'/e/[^/"]+-(\d{8,12})')

In [12]:
def to_pacific(iso_str):
    """Convert ISO-8601 UTC → 'YYYY-MM-DD HH:MM' PT."""
    dt = datetime.fromisoformat(iso_str.replace('Z','+00:00'))
    return dt.astimezone(PACIFIC).strftime("%Y-%m-%d %H:%M")

def clean_html(html):
    """Strip tags & collapse whitespace."""
    text = BeautifulSoup(html or "", "html.parser").get_text(separator=" ")
    return " ".join(text.split())

def in_next_30_days(iso_str):
    """Check if UTC timestamp is within our 30-day window."""
    dt = datetime.fromisoformat(iso_str.replace('Z','+00:00'))
    return UTC_NOW <= dt <= UTC_LIM


In [13]:
records = []
seen_ids = set()

for city_name, city_slug in CITIES.items():
    for kw_display, kw_slug in KEYWORDS.items():
        page = 1
        while True:
            # Search URL
            search_url = f"https://www.eventbrite.com/d/ca--{city_slug}/{kw_slug}/"
            params = {"page": page}
            resp = requests.get(search_url, params=params)
            if resp.status_code != 200:
                break 
            html = resp.text

            # Extract event IDs from all <a href="/e/...-digits">
            ids = set(EVENT_ID_RE.findall(html))
            # if nothing new, stop paging
            new_ids = [i for i in ids if i not in seen_ids]
            if not new_ids:
                break

            for eid in new_ids:
                seen_ids.add(eid)

                # Fetch event detail
                detail = requests.get(
                    f"https://www.eventbriteapi.com/v3/events/{eid}/",
                    headers=HEADERS,
                    params={"expand":"venue,organizer,ticket_availability"}
                )
                if detail.status_code != 200:
                    continue
                data = detail.json()
                time.sleep(PAUSE)

                # Filter by date & city
                start_utc  = data["start"]["utc"]
                venue_city = data.get("venue",{}).get("address",{}).get("city","")
                if not (in_next_30_days(start_utc) and venue_city == city_name):
                    continue

                # Extract fields
                records.append({
                    "Title":       data["name"]["text"].strip(),
                    "Organizer":   data.get("organizer",{}).get("name","").strip(),
                    "Start (PT)":  to_pacific(start_utc),
                    "City":        venue_city,
                    "Venue":       data.get("venue",{}).get("name","").strip(),
                    "Description": clean_html(data.get("description",{}).get("html","")),
                    "URL":         data.get("url",""),
                    "Fee":         data.get("ticket_availability",{}) \
                                       .get("minimum_ticket_price",{}) \
                                       .get("display","Free")
                })

            # try next page
            page += 1

In [14]:
df = pd.DataFrame(records)
df.drop_duplicates(subset=["Title","Start (PT)"], inplace=True)
df.reset_index(drop=True, inplace=True)

df.to_csv("leap_events_socal.csv", index=False)
with open("leap_events_socal.md","w",encoding="utf-8") as md:
    for _,r in df.iterrows():
        md.write(f"### {r['Title']}\n")
        md.write(f"- **Organizer**: {r['Organizer']}\n")
        md.write(f"- **When (PT)**: {r['Start (PT)']}\n")
        md.write(f"- **Where**: {r['Venue']}, {r['City']}\n")
        md.write(f"- **Fee**: {r['Fee']}\n")
        md.write(f"- **URL**: [{r['URL']}]({r['URL']})\n")
        desc = (r["Description"][:200] + "...") if len(r["Description"])>200 else r["Description"]
        md.write(f"- **Description**: {desc}\n\n")

print(f"Saved {len(df)} events to leap_events_socal.csv & .md")

Saved 12 events to leap_events_socal.csv & .md
