In [80]:
import pandas as pd
import psycopg2
from psycopg2.extras import RealDictCursor
import os
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from dotenv import load_dotenv
from datetime import datetime, date
from fuzzywuzzy import fuzz
import random
import json
import re
from typing import Dict, List
from openai import OpenAI
import json
from collections import Counter

load_dotenv()
client_id = os.getenv('CLIENT_ID')
client_secret = os.getenv('CLIENT_SECRET')
refresh_token = os.getenv('REFRESH_TOKEN')
OPENAI_API_KEY= os.getenv('OPENAI_API_KEY')
current_date = datetime.now().date()
cutoff_in_days=21

In [117]:
raw_df = pd.read_csv('raw_events.csv').drop_duplicates(['raw_event_name', 'raw_date_text'])

auth_manager = SpotifyOAuth(
    client_id=client_id,
    client_secret=client_secret,
    redirect_uri="http://127.0.0.1:8080",
    scope="playlist-modify-public playlist-modify-private"
)
token_info = auth_manager.refresh_access_token(refresh_token)
sp = spotipy.Spotify(auth=token_info['access_token'])
print(len(raw_df))

352


In [82]:
raw_df

Unnamed: 0,venue_id,raw_event_name,raw_date_text,genres,is_cancelled,parsed_date
0,29,Kublai Khan TX,"Thu Oct 23, 2025",,False,2025-10-23
1,29,DJ Pauly D,"Fri Oct 24, 2025",,False,2025-10-24
2,29,Yung Gravy: Voluptuous Voyage Tour,"Sat Oct 25, 2025",,False,2025-10-25
3,29,The Buttertones,"Sun Oct 26, 2025",,False,2025-10-26
4,29,Spafford,"Wed Oct 29, 2025",,False,2025-10-29
...,...,...,...,...,...,...
411,14,105.5 The Colorado Sound Welcomes,"Saturday, November 1 @ 7:00 pm",,False,2025-11-01
412,14,Washington's and Live Nation present,"Friday, November 14 @ 7:30 pm",,False,2025-11-14
413,14,105.5 The Colorado Sound Welcomes,"Saturday, November 15 @ 7:00 pm",,False,2025-11-15
414,14,105.5 The Colorado Sound Welcomes,"Saturday, November 22 @ 7:00 pm",,False,2025-11-22


## PREVALIDATION 

use an ai to recognize common issues in each venues soup - things like cover bands or emo nights that we can remove to speed up our parsing

In [83]:
def analyze_venue_patterns(venue_id, event_names, max_retries=3):
      """
      Use LLM to identify recurring patterns in a venue's events.
      Run this periodically (monthly) or when adding a new venue.
      
      Args:
          venue_id: ID of the venue to analyze
          raw_events: DataFrame with 'raw_event_name' column (all historical events)
          max_retries: Number of retries on API failure
      
      Returns:
          dict: validation_config to store in venues table, or None on failure
      """
      client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

      # Get all unique event names from this venue

      if len(event_names) == 0:
          print(f"‚ö†Ô∏è  No events found for venue {venue_id}")
          return None

      # Count occurrences
      event_counts = Counter(event_names)

      # Show some stats
      total_events = len(event_names)
      recurring_count = sum(1 for count in event_counts.values() if count > 1)

      print(f"\nüîç Analyzing venue {venue_id}:")
      print(f"   Total unique events: {total_events}")
      print(f"   Recurring events: {recurring_count}")

      # Build the prompt
      prompt = f"""Analyze these event names from a music venue to identify patterns.

  Context: We scrape event listings and need to filter out non-musical events and clean artist names.

  Identify:
  1. RECURRING NON-ARTIST EVENTS - Events that repeat (karaoke nights, open mics, private events, event series like "Emo Nite")
     - Look for things with occurrence count > 1 that aren't artists
     - Include obvious non-music events even if they only appear once ("Private Event")

  2. COMMON TEXT PATTERNS TO STRIP - Text that appears in MANY event names that should be removed
     - Promotional phrases: "An Evening with", "Presents", "Live in Concert"
     - Location info: "at [venue] - [city]"
     - Tour names: "- World Tour", "Tour 2024"
     - But ONLY patterns that appear frequently (5+ times)

  3. MULTI-ARTIST SEPARATOR - What character(s) does this venue use to separate multiple artists on the same bill?
     - "/" = "Artist A/ Artist B/ Artist C"
     - "," = "Artist A, Artist B, Artist C"
     - "&" = "Artist A & Artist B"
     - Look at the patterns and pick the MOST COMMON one (or null if unclear)

  Event names (showing up to 100, with occurrence count):
  {json.dumps({name: event_counts[name] for name in list(event_names)[:100]}, indent=2)}

  IMPORTANT: Be conservative! Only flag things you're CONFIDENT about.
  - Don't flag actual band names as non-events
  - Don't add text patterns that only appear once or twice
  - If the multi-artist separator is unclear, return null

  Return ONLY valid JSON in this exact format:
  {{
    "recurring_non_events": ["Karaoke Tuesday", "Private Event"],
    "text_patterns_to_strip": ["at Rickshaw Stop - San Francisco, CA"],
    "multi_artist_separator": "/" 
  }}"""

      for attempt in range(max_retries):
          try:
              response = client.chat.completions.create(
                  model="gpt-4o-mini",
                  messages=[
                      {
                          "role": "system",
                          "content": "You are a music industry expert analyzing venue event patterns. Be conservative and only flag obvious non-artist events."
                      },
                      {"role": "user", "content": prompt}
                  ],
                  temperature=0.1,
                  response_format={"type": "json_object"}
              )

              result_text = response.choices[0].message.content.strip()

              # Clean markdown just in case
              result_text = re.sub(r'^```(?:json)?\s*|\s*```$', '', result_text.strip(), flags=re.MULTILINE)

              # Parse JSON
              patterns = json.loads(result_text)

              # Validate structure
              if not isinstance(patterns, dict):
                  raise ValueError(f"Expected dict, got {type(patterns)}")

              # Ensure required keys exist
              required_keys = ['recurring_non_events', 'text_patterns_to_strip', 'multi_artist_separator']
              for key in required_keys:
                  if key not in patterns:
                      patterns[key] = [] if key != 'multi_artist_separator' else None

              # Validate types
              if not isinstance(patterns['recurring_non_events'], list):
                  patterns['recurring_non_events'] = []
              if not isinstance(patterns['text_patterns_to_strip'], list):
                  patterns['text_patterns_to_strip'] = []

              # Add metadata
              patterns['last_pattern_analysis'] = datetime.now().isoformat()
              patterns['total_events_analyzed'] = total_events

              return patterns

          except json.JSONDecodeError as e:
              print(f"  ‚ö†Ô∏è  Attempt {attempt + 1}/{max_retries}: Invalid JSON: {e}")
              if attempt == max_retries - 1:
                  print(f"  ‚ùå Failed to parse after {max_retries} attempts")
                  return None

          except Exception as e:
              print(f"  ‚ö†Ô∏è  Attempt {attempt + 1}/{max_retries}: Error: {e}")
              if attempt == max_retries - 1:
                  print(f"  ‚ùå Failed after {max_retries} attempts")
                  return None

      return None


def quick_filter_events(raw_events, validation_config):
    """
    Apply venue-specific filters BEFORE expensive API calls.
    
    Args:
        raw_events: DataFrame with 'raw_event_name' column
        validation_config: Dict from venues table
    
    Returns:
        Filtered DataFrame
    """
    if not validation_config:
        return raw_events

    filtered = raw_events.copy()

    # Remove known non-events
    non_events = validation_config.get('recurring_non_events', [])
    if non_events:
        filtered = filtered[~filtered['raw_event_name'].isin(non_events)]

    # Strip common text patterns (but keep original in a backup column)
    text_patterns = validation_config.get('text_patterns_to_strip', [])
    if text_patterns:
        filtered['raw_event_name_original'] = filtered['raw_event_name']
        for pattern in text_patterns:
            filtered['raw_event_name'] = filtered['raw_event_name'].str.replace(
                pattern, '', regex=False, case=False
            ).str.strip()
        print(f"   ‚úÇÔ∏è  Stripped {len(text_patterns)} common text patterns")

    return filtered

In [84]:
# conn = psycopg2.connect(os.getenv('DATABASE_URL_UNPOOLED'))
# cur = conn.cursor()
# # MONTHLY: Analyze patterns for all venues
# for venue_id in list(set(raw_df['venue_id'].values)):
#     venue_id = int(venue_id)
#     # Get ALL historical events for this venue
#     all_events = raw_df[raw_df['venue_id'] == venue_id]['raw_event_name'].values

#     patterns = analyze_venue_patterns(venue_id, all_events)
#     if patterns:
#         # Update database
#         cur.execute("""
#             UPDATE venues 
#             SET validation_config = %s
#             WHERE venue_id = %s
#         """, (json.dumps(patterns), venue_id))
#         conn.commit()

In [85]:
conn = psycopg2.connect(os.getenv('DATABASE_URL_UNPOOLED'))
cur = conn.cursor()
cur.execute("""
    SELECT venue_id, name, validation_config
    FROM venues 
    WHERE is_active = TRUE
    ORDER BY name;
""")

column_names = [desc[0] for desc in cur.description]
res = cur.fetchall()
venues = [dict(zip(column_names, v)) for v in res]



In [86]:
all_filtered_events = []

for venue in venues:
    venue_id = int(venue['venue_id'])
    validation_config = venue.get('validation_config', {})

    venue_raw_events = raw_df[raw_df['venue_id'] == venue_id].copy()
    if len(venue_raw_events) == 0:
        continue

    # 2. Quick filter - removes known non-events for THIS venue
    filtered_events_df = quick_filter_events(venue_raw_events, validation_config)
    print(f"   ‚úÖ Removed: {len(venue_raw_events) - len(filtered_events_df)} from {venue['name']}")

    # 3. Collect them
    all_filtered_events.append(filtered_events_df)

# 4. Combine all venues into one DataFrame
filtered_df = pd.concat(all_filtered_events, ignore_index=True)

print(f"\nüìä Total: {len(raw_df)} raw events ‚Üí {len(filtered_df)} after pre-filtering")

# NOW validate the filtered events
# unique_names = filtered_df['raw_event_name'].unique().tolist()
# validated = validate_artists_parallel(sp, unique_names, max_workers=4)

   ‚úÖ Removed: 0 from Ace of Spades
   ‚úÖ Removed: 0 from Goldfield Trading Post
   ‚úÖ Removed: 9 from Neck of the Woods
   ‚úÖ Removed: 5 from Rickshaw Stop
   ‚úÖ Removed: 1 from The Aggie Theatre
   ‚úÇÔ∏è  Stripped 3 common text patterns
   ‚úÖ Removed: 0 from The Armory
   ‚úÖ Removed: 0 from The Great American Music Hall
   ‚úÖ Removed: 1 from The Independent
   ‚úÇÔ∏è  Stripped 1 common text patterns
   ‚úÖ Removed: 0 from The Mishawaka 
   ‚úÖ Removed: 0 from The Warfield
   ‚úÇÔ∏è  Stripped 1 common text patterns
   ‚úÖ Removed: 0 from Washington's

üìä Total: 352 raw events ‚Üí 336 after pre-filtering


In [100]:
filtered_df

Unnamed: 0,venue_id,raw_event_name,raw_date_text,genres,is_cancelled,parsed_date,raw_event_name_original
0,29,Kublai Khan TX,"Thu Oct 23, 2025",,False,2025-10-23,
1,29,DJ Pauly D,"Fri Oct 24, 2025",,False,2025-10-24,
2,29,Yung Gravy: Voluptuous Voyage Tour,"Sat Oct 25, 2025",,False,2025-10-25,
3,29,The Buttertones,"Sun Oct 26, 2025",,False,2025-10-26,
4,29,Spafford,"Wed Oct 29, 2025",,False,2025-10-29,
...,...,...,...,...,...,...,...
331,14,105.5 The Colorado Sound Welcomes,"Saturday, November 1 @ 7:00 pm",,False,2025-11-01,105.5 The Colorado Sound Welcomes
332,14,Washington's and Live Nation present,"Friday, November 14 @ 7:30 pm",,False,2025-11-14,Washington's and Live Nation present
333,14,105.5 The Colorado Sound Welcomes,"Saturday, November 15 @ 7:00 pm",,False,2025-11-15,105.5 The Colorado Sound Welcomes
334,14,105.5 The Colorado Sound Welcomes,"Saturday, November 22 @ 7:00 pm",,False,2025-11-22,105.5 The Colorado Sound Welcomes


In [87]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict, Optional
def validate_artist(sp, raw_event_name, similarity_threshold=90):
    """
    Validate a single artist name against Spotify.
    Returns dict with validated data or None.
    """
    try:
        results = sp.search(q=f'artist:{raw_event_name}', type='artist', limit=3)

        if not results['artists']['items']:
            return None

        spotify_artist = results['artists']['items'][0]
        name = spotify_artist['name']

        if fuzz.ratio(raw_event_name.lower(), name.lower()) < similarity_threshold:
            return None

        return {
            'raw_event_name': raw_event_name,
            'spotify_artist_name': name,
            'spotify_artist_id': spotify_artist['id'],
            'artist_popularity': spotify_artist['popularity'],
            'genres': spotify_artist['genres']
        }
    except Exception as e:
        print(f"‚ùå Error validating {raw_event_name}: {e}")
        return None

def validate_artists_parallel(sp, event_names: List[str], max_workers=4, delay_between_batches=0.5):
      """
      Validate multiple artists in parallel with rate limit protection.
      
      Args:
          sp: Spotipy client
          event_names: List of raw event names to validate
          max_workers: Number of parallel threads (3-4 recommended for Spotify)
          delay_between_batches: Seconds to wait between worker batches
      
      Returns:
          List of validated artist dicts
      """
      import time

      validated_artists = []

      print(f"üîç Validating {len(event_names)} artists with {max_workers} parallel workers...")

      with ThreadPoolExecutor(max_workers=max_workers) as executor:
          # Submit all tasks
          future_to_name = {
              executor.submit(validate_artist, sp, name): name
              for name in event_names
          }

          # Collect results as they complete
          completed = 0
          for future in as_completed(future_to_name):
              result = future.result()
              if result:
                  validated_artists.append(result)

              completed += 1
              # Add a tiny delay every N completions to avoid hammering the API
              if completed % (max_workers * 10) == 0:
                  time.sleep(delay_between_batches)

      print(f"‚úÖ Successfully validated {len(validated_artists)}/{len(event_names)} artists")

      return validated_artists

In [88]:
# try again but with filtered df and see if its different
unique_names = filtered_df['raw_event_name'].unique().tolist()
validated_artists = validate_artists_parallel(sp, unique_names, max_workers=4)

validated_df = pd.DataFrame(validated_artists)

# Step 2: Find what didn't validate
validated_event_names = set(validated_df['raw_event_name'])
unvalidated_df = filtered_df[~filtered_df['raw_event_name'].isin(validated_event_names)].copy()
print(f'missing {len(unvalidated_df)} out of {len(unique_names)}')

üîç Validating 319 artists with 4 parallel workers...




‚úÖ Successfully validated 197/319 artists
missing 128 out of 319


In [101]:
# Now merge with filtered_df to get venue_id, event_date, etc.
validated_df = filtered_df[['venue_id', 'raw_event_name', 'parsed_date', 'is_cancelled']].merge(
    validated_df,
    on='raw_event_name',
    how='inner'
)

# Rename and format to match validated_events schema
validated_df = validated_df.rename(columns={'parsed_date': 'event_date'})
validated_df['genres'] = validated_df['genres'].apply(
    lambda x: ','.join(x) if isinstance(x, list) and x else None
)
validated_df


Unnamed: 0,venue_id,raw_event_name,event_date,is_cancelled,spotify_artist_name,spotify_artist_id,artist_popularity,genres
0,29,Kublai Khan TX,2025-10-23,False,Kublai Khan TX,5BIOo2mCAokFcLHXO2Llb4,58,"hardcore,deathcore,metalcore,metal"
1,29,DJ Pauly D,2025-10-24,False,DJ Pauly D,4kGCUSJJlEKTuol4jSYGPb,28,
2,29,The Buttertones,2025-10-26,False,The Buttertones,1CMml5seBEaxQzlmaGxMPx,46,surf rock
3,29,Spafford,2025-10-29,False,Spafford,7fA0IDinGo27lmOeGy6oGV,35,jam band
4,29,Sir Chloe,2025-11-08,False,Sir Chloe,6rniTPs9zN26kYnkPdFl1U,58,
...,...,...,...,...,...,...,...,...
203,10,Ashnikko,2026-03-21,False,Ashnikko,3PyJHH2wyfQK3WZrk9rpmP,70,
204,10,St. Paul & The Broken Bones,2026-04-01,False,St. Paul & The Broken Bones,4fXkvh05wFhuH77MfD4m9o,55,retro soul
205,10,Bert Kreischer,2026-05-03,False,Bert Kreischer,5ockbqgGT6UUo35PgmTZnI,33,comedy
206,10,Chet Faker,2026-05-05,False,Chet Faker,6UcJxoeHWWWyT5HZP064om,60,


In [89]:
def parse_missed_artists_batch(unvalidated_df, batch_by_venue=True, max_batch_size=100):
    """
    Use OpenAI to filter out non-artists and split multi-artist bills.
    Batches by venue for better LLM context and to avoid token limits.
    
    Args:
        unvalidated_df: DataFrame with columns ['venue_id', 'raw_event_name']
        batch_by_venue: If True, process each venue separately for better context
        max_batch_size: Maximum number of artists per API call
    
    Returns:
        dict: Mapping of raw_event_name -> list of cleaned artist names
    """
    client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

    all_results = {}

    if batch_by_venue:
        # Group by venue
        grouped = unvalidated_df.groupby('venue_id')['raw_event_name'].unique()

        for venue_id, artist_names in grouped.items():
            print(f"\nüéµ Processing venue {venue_id}: {len(artist_names)} events")

            # Further batch if needed (in case one venue has tons of events)
            for i in range(0, len(artist_names), max_batch_size):
                batch = artist_names[i:i + max_batch_size]
                result = _call_openai_parse(client, batch.tolist())
                if result:
                    all_results.update(result)
    else:
        # Process all at once, but still respect max batch size
        unique_artists = unvalidated_df['raw_event_name'].unique()

        for i in range(0, len(unique_artists), max_batch_size):
            batch = unique_artists[i:i + max_batch_size]
            result = _call_openai_parse(client, batch.tolist())
            if result:
                all_results.update(result)

    # Final stats
    total_filtered = sum(len(artists) for artists in all_results.values())
    total_removed = sum(1 for artists in all_results.values() if len(artists) == 0)

    print(f"\nüìä Total Input: {len(all_results)} unique raw names")
    print(f"‚úÖ Total Output: {total_filtered} cleaned artist names")
    print(f"üóëÔ∏è  Total Filtered out: {total_removed} non-artists")

    return all_results


def _call_openai_parse(client, artist_list: List[str], max_retries=3) -> Dict[str, List[str]]:
    """
    Call OpenAI API to parse artist names with error handling.
    
    Returns:
        dict: Mapping of raw names to cleaned artist lists, or None on failure
    """

    prompt = f"""You are analyzing a list of names scraped from music venue websites. 
Some are actual musical artists/bands, and some are event names or non-musical events.

Your task: Extract and clean all PERFORMING MUSICAL ACT names.

STEP 1 - IDENTIFY if the entry contains performing musical acts:
    KEEP: Musicians, bands, DJs, tribute acts - anyone who performs music
    FILTER OUT: Event series (EMO NITE, Nerd Nite), private events, non-music events

STEP 2 - CLEAN the artist names:
    - Remove promotional text: "An Evening with", "Presented by", "Live in Concert"
    - Remove tour names: "- World Tour", "2024 Tour"  
    - Remove location info: "at [venue] - [city]" (but ONLY if it's part of an artist name, not if the whole thing is an event)
    - Remove "feat.", "featuring", "with special guest" and similar

STEP 3 - SPLIT multi-artist bills:
    - "Artist A, Artist B" ‚Üí ["Artist A", "Artist B"]
    - "Artist A & Artist B" ‚Üí ["Artist A", "Artist B"]  
    - "Artist A + Artist B" ‚Üí ["Artist A", "Artist B"]
    - BUT preserve band names with natural "&" or "," (like "The Army, The Navy" or "Simon & Garfunkel" or "Andy Frasco and the
U.N.")

Examples:
- "Legend Zeppelin" ‚Üí ["Legend Zeppelin"]
- "EMO NITE at Rickshaw Stop - San Francisco, CA" ‚Üí [] (entire thing is an event brand, filter out)
- "Nerd Nite SF" ‚Üí [] (event series, filter out)
- "Nora Brown, Stephanie Coleman" ‚Üí ["Nora Brown", "Stephanie Coleman"] (two artists)
- "Josh Ritter and the Royal City Band" ‚Üí ["Josh Ritter and the Royal City Band"] (one act)
- "Pete Yorn ‚Äì You and Me Solo Acoustic" ‚Üí ["Pete Yorn"] (remove tour name)
- "Private Event" ‚Üí [] (filter out)
- "Khalil, Amal, TRAVIE BOBBITO, KING MOST, BELLA D. & FRIENDS" ‚Üí ["Khalil", "Amal", "TRAVIE BOBBITO", "KING MOST", "BELLA 
D."] (split multi-artist showcase)

Return a JSON OBJECT (not array) where:
- Keys are the original raw names from the input
- Values are arrays of cleaned artist names (empty array if filtered out)

Example output format:
{{
"EMO NITE at Rickshaw Stop": [],
"Nora Brown, Stephanie Coleman": ["Nora Brown", "Stephanie Coleman"],
"XANA": ["XANA"]
}}

Names to evaluate:
{json.dumps(artist_list, indent=2)}

Respond with ONLY the JSON object, no other text."""

    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are a music industry expert who can distinguish between artist names  and event names."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.1,
                response_format={"type": "json_object"}  # Force JSON response
            )

            result_text = response.choices[0].message.content.strip()

            # Clean up any markdown (shouldn't happen with json_object mode, but just in case)
            result_text = re.sub(r'^```(?:json)?\s*|\s*```$', '', result_text.strip(), flags=re.MULTILINE)

            # Parse JSON
            filtered_mapping = json.loads(result_text)

            # Validate output structure
            if not isinstance(filtered_mapping, dict):
                raise ValueError(f"Expected dict, got {type(filtered_mapping)}")

            # Validate all values are lists
            for key, value in filtered_mapping.items():
                if not isinstance(value, list):
                    print(f"‚ö†Ô∏è  Warning: Key '{key}' has non-list value: {value}")
                    filtered_mapping[key] = [value] if value else []

            # Stats for this batch
            total_filtered = sum(len(artists) for artists in filtered_mapping.values())
            total_removed = sum(1 for artists in filtered_mapping.values() if len(artists) == 0)

            print(f"  ‚úÖ Batch: {len(artist_list)} input ‚Üí {total_filtered} artists, {total_removed} filtered")

            return filtered_mapping

        except json.JSONDecodeError as e:
            print(f"  ‚ö†Ô∏è  Attempt {attempt + 1}/{max_retries}: Invalid JSON response: {e}")
            if attempt == max_retries - 1:
                print(f"  ‚ùå Failed to parse after {max_retries} attempts")
                print(f"  Raw response: {result_text[:200]}...")
                return None

        except Exception as e:
            print(f"  ‚ö†Ô∏è  Attempt {attempt + 1}/{max_retries}: API error: {e}")
            if attempt == max_retries - 1:
                print(f"  ‚ùå Failed after {max_retries} attempts")
                return None

    return None

In [90]:
event_artist_map = parse_missed_artists_batch(unvalidated_df, batch_by_venue=True)


üéµ Processing venue 8: 6 events
  ‚úÖ Batch: 6 input ‚Üí 10 artists, 0 filtered

üéµ Processing venue 9: 23 events
  ‚úÖ Batch: 23 input ‚Üí 21 artists, 7 filtered

üéµ Processing venue 10: 8 events
  ‚úÖ Batch: 8 input ‚Üí 11 artists, 1 filtered

üéµ Processing venue 11: 6 events
  ‚úÖ Batch: 6 input ‚Üí 14 artists, 0 filtered

üéµ Processing venue 12: 9 events
  ‚úÖ Batch: 9 input ‚Üí 28 artists, 2 filtered

üéµ Processing venue 13: 17 events
  ‚úÖ Batch: 17 input ‚Üí 15 artists, 5 filtered

üéµ Processing venue 14: 6 events
  ‚úÖ Batch: 6 input ‚Üí 4 artists, 4 filtered

üéµ Processing venue 15: 14 events
  ‚úÖ Batch: 14 input ‚Üí 8 artists, 7 filtered

üéµ Processing venue 29: 25 events
  ‚úÖ Batch: 25 input ‚Üí 20 artists, 5 filtered

üéµ Processing venue 30: 3 events
  ‚úÖ Batch: 3 input ‚Üí 3 artists, 0 filtered

üéµ Processing venue 34: 5 events
  ‚úÖ Batch: 5 input ‚Üí 7 artists, 0 filtered

üìä Total Input: 122 unique raw names
‚úÖ Total Output: 141 cleaned arti

In [98]:
all_cleaned_artists = []
for artists_list in event_artist_map.values():
    all_cleaned_artists.extend(artists_list)

# Remove duplicates (same artist might appear at multiple venues)
unique_cleaned_artists = list(set(all_cleaned_artists))

res = validate_artists_parallel(sp, unique_cleaned_artists, max_workers=4)


üîç Validating 133 artists with 4 parallel workers...
‚ùå Error validating ATREYU: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
‚ùå Error validating Max Mackey Band: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
‚ùå Error validating ADULT.: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
‚ùå Error validating pH-1: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
‚úÖ Successfully validated 83/133 artists


[{'raw_event_name': 'The American Dream',
  'spotify_artist_name': 'The American Dream',
  'spotify_artist_id': '6UKMKYqr8U7kXZ0Tf2Mc6t',
  'artist_popularity': 9,
  'genres': ['melodic hardcore']},
 {'raw_event_name': 'Bitter Days',
  'spotify_artist_name': 'Bitter Days',
  'spotify_artist_id': '0lZ81rmzDesUE90XIXkwf4',
  'artist_popularity': 0,
  'genres': ['pop punk']},
 {'raw_event_name': 'Steph Strings',
  'spotify_artist_name': 'Steph Strings',
  'spotify_artist_id': '39qxIdIb1R6se4J3X6nRPB',
  'artist_popularity': 37,
  'genres': []},
 {'raw_event_name': 'Yung Gravy',
  'spotify_artist_name': 'Yung Gravy',
  'spotify_artist_id': '2YOYua8FpudSEiB9s88IgQ',
  'artist_popularity': 66,
  'genres': []},
 {'raw_event_name': 'Dan Mangan',
  'spotify_artist_name': 'Dan Mangan',
  'spotify_artist_id': '5ByjU6oarxRC2Y85JpKx1n',
  'artist_popularity': 40,
  'genres': []},
 {'raw_event_name': 'OsamaSon',
  'spotify_artist_name': 'OsamaSon',
  'spotify_artist_id': '0uj6QiPsPfK8ywLC7uwBE1',
  

In [103]:
validated_artist_list = res
# Convert to dict: key = cleaned artist name, value = validated spotify data
validated_artists_dict = {
    v['raw_event_name']: v  # This is actually the cleaned artist name from validate_artist
    for v in validated_artist_list
}

# Now build rows
new_rows = []

for raw_event_name, cleaned_artists in event_artist_map.items():
    # Get original event data
    original_events = unvalidated_df[unvalidated_df['raw_event_name'] == raw_event_name]
    if original_events.empty:
        continue

    original_event = original_events.iloc[0]

    for cleaned_artist_name in cleaned_artists:
        # Check if this artist validated
        if cleaned_artist_name in validated_artists_dict:
            validated = validated_artists_dict[cleaned_artist_name]

            new_row = {
                'venue_id': original_event['venue_id'],
                'event_date': original_event['parsed_date'],
                'spotify_artist_id': validated['spotify_artist_id'],
                'spotify_artist_name': validated['spotify_artist_name'],
                'artist_popularity': validated['artist_popularity'],
                'genres': ','.join(validated['genres']) if validated['genres'] else None,
                'raw_event_name': raw_event_name,
                'is_cancelled': original_event.get('is_cancelled', False)
            }
            new_rows.append(new_row)

additional_validated_df = pd.DataFrame(new_rows)
validated_df = pd.concat([validated_df, additional_validated_df], ignore_index=True)


Unnamed: 0,venue_id,raw_event_name,event_date,is_cancelled,spotify_artist_name,spotify_artist_id,artist_popularity,genres
0,29,Kublai Khan TX,2025-10-23,False,Kublai Khan TX,5BIOo2mCAokFcLHXO2Llb4,58,"hardcore,deathcore,metalcore,metal"
1,29,DJ Pauly D,2025-10-24,False,DJ Pauly D,4kGCUSJJlEKTuol4jSYGPb,28,
2,29,The Buttertones,2025-10-26,False,The Buttertones,1CMml5seBEaxQzlmaGxMPx,46,surf rock
3,29,Spafford,2025-10-29,False,Spafford,7fA0IDinGo27lmOeGy6oGV,35,jam band
4,29,Sir Chloe,2025-11-08,False,Sir Chloe,6rniTPs9zN26kYnkPdFl1U,58,
...,...,...,...,...,...,...,...,...
291,34,Stelth Ulvang & Dan Mangan with Frail Talk,2025-11-05,False,Dan Mangan,5ByjU6oarxRC2Y85JpKx1n,40,
292,34,Steph Strings ‚Äì North America 2025 Part One,2025-11-06,False,Steph Strings,39qxIdIb1R6se4J3X6nRPB,37,
293,34,Sean Hayes & Sway Wild,2025-11-20,False,Sean Hayes,478LE6WL6BARBaLU2NHefl,28,
294,34,Sean Hayes & Sway Wild,2025-11-20,False,Sway Wild,4gBQnCbOX2Okv38QKDmb5o,8,


In [115]:
# Get all the raw event names that were validated
validated_raw_names = set(validated_df['raw_event_name'].unique())

# Find failures from filtered_df (events that never made it through either round)
failed_events = filtered_df[~filtered_df['raw_event_name'].isin(validated_raw_names)].copy()

# Prepare for validation_failures table
failed_events_df = failed_events[['venue_id', 'raw_event_name', 'raw_date_text', 'parsed_date']].copy()
failed_events_df = failed_events_df.rename(columns={'parsed_date': 'event_date'})
failed_events_df['failure_reason'] = 'spotify_not_found_or_mismatch'

# Also check if any were filtered out by pre-validation
pre_filtered_names = set(raw_df['raw_event_name']) - set(filtered_df['raw_event_name'])
if pre_filtered_names:
    pre_filtered_events = raw_df[raw_df['raw_event_name'].isin(pre_filtered_names)].copy()
    pre_filtered_df = pre_filtered_events[['venue_id', 'raw_event_name', 'raw_date_text', 'parsed_date']].copy()
    pre_filtered_df = pre_filtered_df.rename(columns={'parsed_date': 'event_date'})
    pre_filtered_df['failure_reason'] = 'filtered_pre_validation'

    # Combine all failures
    all_failures_df = pd.concat([failed_events_df, pre_filtered_df], ignore_index=True)
else:
    all_failures_df = failed_events_df

print(f"\n‚ùå {len(all_failures_df)} events were scraped but not found on spotify")


‚ùå 100 events were scraped but not found on spotify


In [116]:
import psycopg2.extras
validated_df = validated_df.drop_duplicates(
      subset=['venue_id', 'spotify_artist_id', 'event_date'],
      keep='first'
  )

conn = psycopg2.connect(os.getenv('DATABASE_URL_UNPOOLED'))
cur = conn.cursor()

# 1. INSERT VALIDATED EVENTS
insert_validated_query = """
INSERT INTO validated_events (
    venue_id,
    event_date,
    spotify_artist_id,
    spotify_artist_name,
    artist_popularity,
    genres,
    raw_event_name,
    is_cancelled
) VALUES %s
ON CONFLICT (venue_id, spotify_artist_id, event_date) 
DO UPDATE SET
    artist_popularity = EXCLUDED.artist_popularity,
    genres = EXCLUDED.genres,
    raw_event_name = EXCLUDED.raw_event_name,
    is_cancelled = EXCLUDED.is_cancelled,
    scraped_at = CURRENT_TIMESTAMP
"""

# Convert validated_df to tuples
validated_tuples = []
for _, row in validated_df.iterrows():
    validated_tuple = (
        int(row['venue_id']),
        row['event_date'],
        row['spotify_artist_id'],
        row['spotify_artist_name'],
        int(row['artist_popularity']) if pd.notna(row['artist_popularity']) else None,
        row['genres'],
        row['raw_event_name'],
        bool(row['is_cancelled'])
    )
    validated_tuples.append(validated_tuple)

# Execute bulk insert
psycopg2.extras.execute_values(
    cur,
    insert_validated_query,
    validated_tuples,
    template=None,
    page_size=100
)

print(f"‚úÖ Inserted {len(validated_tuples)} validated events")

# 2. INSERT VALIDATION FAILURES
insert_failures_query = """
INSERT INTO validation_failures (
    venue_id,
    raw_event_name,
    raw_date_text,
    event_date,
    failure_reason
) VALUES %s
"""

# Convert all_failures_df to tuples
failure_tuples = []
for _, row in all_failures_df.iterrows():
    failure_tuple = (
        int(row['venue_id']),
        row['raw_event_name'],
        row.get('raw_date_text'),
        row['event_date'] if pd.notna(row['event_date']) else None,
        row['failure_reason']
    )
    failure_tuples.append(failure_tuple)

# Execute bulk insert
psycopg2.extras.execute_values(
    cur,
    insert_failures_query,
    failure_tuples,
    template=None,
    page_size=100
)

print(f"‚ö†Ô∏è  Inserted {len(failure_tuples)} validation failures")

# Commit both
conn.commit()
cur.close()
conn.close()

print(f"\nüéâ Database update complete!")
print(f"   ‚úÖ {len(validated_tuples)} validated events")
print(f"   ‚ùå {len(failure_tuples)} failures logged")

‚úÖ Inserted 293 validated events
‚ö†Ô∏è  Inserted 100 validation failures

üéâ Database update complete!
   ‚úÖ 293 validated events
   ‚ùå 100 failures logged
