In [83]:
from functions import start_selenium, scrape_chapel, scrape_independent, scrape_rickshaw, parse_chapel_soups, parse_independent_soup, parse_rickshaw_soups, spotify_connect, clear_playlist, add_songs_to_playlist
from datetime import datetime
from dotenv import load_dotenv
import os
from openai import OpenAI
from scrape_functions import *

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Spotify credentials; each is None when the variable is not set.
client_id = os.environ.get('CLIENT_ID')
client_secret = os.environ.get('CLIENT_SECRET')
refresh_token = os.environ.get('REFRESH_TOKEN')

# OpenAI key; also None when unset.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Run-level settings.
current_date = datetime.now().date()
cutoff_in_days = 21

# Venue URL -> CSS class string used to locate headliner names on that site.
known_classes = {
    'http://www.thechapelsf.com': "fs-12 headliners",
}



In [85]:
def whats_the_class(soup, failed_class=None):
    """Ask gpt-4o-mini which CSS class holds headliner/artist names in a venue page.

    Only half of the page's HTML is sent: the first half on a fresh attempt,
    and the second half when ``failed_class`` is given (i.e. on a retry).

    Args:
        soup: Parsed page (anything whose ``str()`` is the HTML text).
        failed_class: A class name that was already tried without success;
            when set, the prompt tells the model not to suggest it again.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    llm = OpenAI()

    page_html = str(soup)
    midpoint = len(page_html) // 2
    # Retry looks at the other half of the document than the first attempt did.
    html_half = page_html[midpoint:] if failed_class else page_html[:midpoint]

    if failed_class:
        retry_note = f"I already tried '{failed_class}' and it didn't work - don't suggest that again.\n"
    else:
        retry_note = ''

    system_message = {
        "role": "system",
        "content": "You are an expert at analyzing HTML structure. Return only the CSS class name(s) used for headliner/artist names, nothing else.",
    }
    # Assembled line by line so that significant whitespace (note the trailing
    # space after "artist names.") is explicit rather than hidden at a line end.
    user_message = {
        "role": "user",
        "content": "\n".join([
            "Analyze this venue website HTML and find the CSS class used specifically for headliner/artist names.",
            retry_note,
            "Look for patterns like:",
            "- Elements containing artist names",
            "- Event titles with performer names",
            "- Headliner sections",
            "Return ONLY ONE class name that is most likely to contain artist names. ",
            "Do not include multiple classes or special characters.",
            "Examples of good responses:",
            "event-title",
            "artist-name",
            "rhp-event__title",
            "",
            "HTML:",
            html_half,
        ]),
    }

    response = llm.chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
        temperature=0,
        max_tokens=50,
    )
    reply = response.choices[0].message.content
    return reply.strip()


In [86]:
def parse_venue_flow(venue_url, venue_soup, known_classes):
    """Extract headliners from a venue page, asking the LLM for the CSS class if needed.

    Flow:
      1. If ``known_classes`` already has a class for ``venue_url``, parse with it.
      2. Otherwise ask ``whats_the_class`` for a class and parse with that.
      3. If that yields nothing, ask once more (telling the model which class
         failed) and parse with the second suggestion.

    Whichever suggested class actually produced headliners is stored in
    ``known_classes[venue_url]`` (the dict is mutated in place).

    Args:
        venue_url: Key used to look up / store the class in ``known_classes``.
        venue_soup: Parsed page, passed through to ``parse_soup`` and ``whats_the_class``.
        known_classes: Dict mapping venue URL -> class string.

    Returns:
        Whatever ``parse_soup`` returned for the working class, or ``None``
        when both LLM suggestions produced no headliners.
    """
    # If the class for this venue is already known, skip the LLM entirely.
    if known_classes.get(venue_url):
        print('I already know the tags this website uses!')
        tag_class = known_classes[venue_url]
        return parse_soup(venue_soup, tag_class)

    # First attempt: ask chat for the class.
    print('I never parsed this site before! let me ask chat what the class is')
    chat_says = whats_the_class(venue_soup)
    print('chat thinks the class is', chat_says)
    headliners = parse_soup(venue_soup, chat_says)

    if headliners:
        print(f'found {len(headliners)} headliners, saving class')
        known_classes[venue_url] = chat_says
        return headliners

    # Second attempt: tell chat which class did not work.
    print('The tag chat gave me didnt work, let me try again!')
    chat_says_2 = whats_the_class(venue_soup, failed_class=chat_says)
    print('chat thinks the tag is', chat_says_2)
    headliners = parse_soup(venue_soup, chat_says_2)

    if headliners:
        print(f'found {len(headliners)} headliners, saving class')
        # Save the class that worked on this attempt, not the first (failed) one.
        known_classes[venue_url] = chat_says_2
        return headliners

    print('chat failed twice - give up 😡😡')
    return None


In [87]:
# NOTE: spotify_connect is already imported in the notebook's first cell, so this
# import is redundant when the cells are run top to bottom.
from functions import spotify_connect
# The call returns two values, unpacked as `sp` and `playlist_id`; later cells
# call sp.search / sp.artist_top_tracks and pass playlist_id along.
sp, playlist_id = spotify_connect(client_id, client_secret, refresh_token)

Using refresh token: AQAzE...
Got access token: BQC6J...
User found: a-dog 
Playlist found: Live & Local


In [96]:
def clean_headliners(headliners):
    """Filter out non-band events and strip promotional filler from headliner strings.

    Entries containing any skip phrase ('open mic', 'karaoke', 'stand up',
    case-insensitive) are discarded and reported with a "Skipping: ..." line.
    From the remaining entries, filler terms (e.g. "tour", "with",
    "presented by", "sold out") are removed as whole words only, so names such
    as "Without" or "Detour" are left intact. Whitespace left behind by a
    removal is collapsed to single spaces.

    Args:
        headliners: Iterable of event-title strings, or ``None``/empty.

    Returns:
        List of cleaned, non-empty strings in their original order. An empty
        list is returned for ``None`` or empty input.
    """
    import re  # local import: `re` is not imported in the visible import cell

    # The scraping step can hand back None when it finds nothing.
    if not headliners:
        return []

    # Events that are obviously not bands.
    skips = ['open mic', 'karaoke', 'stand up']
    # Filler words/phrases to remove from otherwise valid titles.
    drops = ["tour", "with", "presented by", "featuring", 'presents', 'sold out', 'second show added']

    skip_pattern = re.compile('|'.join(re.escape(skip) for skip in skips), re.IGNORECASE)
    # \b anchors keep the drop terms from matching inside longer words.
    drop_pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(drop) for drop in drops) + r')\b',
        re.IGNORECASE,
    )

    cleaned_headliners = []

    for headliner in headliners:
        if skip_pattern.search(headliner):
            print(f"Skipping: {headliner}")
            continue

        # Replace with a space so neighbours don't fuse, then collapse runs of whitespace.
        cleaned = ' '.join(drop_pattern.sub(' ', headliner).split())

        if cleaned:
            cleaned_headliners.append(cleaned)

    return cleaned_headliners

# Log of every successful lookup, bucketed by which query style found the artist.
# Each entry is [headliner_text, matched_artist_name].
search_results = {'strict': [], 'loose': []}


def find_artist_on_spotify(sp, headliner):
    """Look up an artist for a headliner string, strict query first, loose second.

    The strict query is ``artist:<headliner>``; the loose query is the raw
    headliner text and is only issued when the strict one returns no items.
    The first item of whichever query succeeds is taken, and the match is
    appended to the module-level ``search_results`` under 'strict' or 'loose'.

    Args:
        sp: Client exposing ``search(q=..., type=..., limit=...)`` whose result
            is indexed as ``result['artists']['items']``.
        headliner: Text to search for.

    Returns:
        ``(artist_name, artist_id)`` for the top hit, or ``(None, None)`` when
        neither query returns any items.
    """
    attempts = (
        ('strict', f'artist:{headliner}'),
        ('loose', headliner),
    )

    for bucket, query in attempts:
        response = sp.search(q=query, type='artist', limit=3)
        candidates = [(item['name'], item['id']) for item in response['artists']['items']]
        if not candidates:
            continue

        top_name, top_id = candidates[0]
        search_results[bucket].append([headliner, top_name])
        return top_name, top_id

    return None, None



In [89]:
def add_songs_from_artist(sp, artist_id, playlist_id, n=3, country_code='US'):
    """Collect the URIs of an artist's first ``n`` top tracks and report the count.

    The playlist write is currently commented out, so this function only
    fetches the tracks and prints "<count> songs added"; ``playlist_id`` is
    not used while that line stays disabled.

    Args:
        sp: Client exposing ``artist_top_tracks(artist_id, country=...)`` whose
            result is indexed as ``result['tracks']``.
        artist_id: Artist identifier passed to ``artist_top_tracks``.
        playlist_id: Target playlist for the (disabled) add call.
        n: Maximum number of tracks to take.
        country_code: Value passed as ``country``.

    Returns:
        None.
    """
    response = sp.artist_top_tracks(artist_id, country=country_code)

    track_uris = []
    for track in response['tracks'][:n]:
        track_uris.append(track['uri'])

    # sp.playlist_add_items(playlist_id, track_uris)
    print(len(track_uris), 'songs added')

In [97]:
# End-to-end run for one venue: scrape -> extract headliners -> clean ->
# look each one up on Spotify -> collect songs per matched artist.
url = 'https://tractortavern.com/'
max_artists = 21  # stop once this many distinct artists have been matched

driver = start_selenium()
try:
    soup = get_soup(url, driver)
finally:
    # Always shut the browser down, even if fetching the page raised.
    driver.quit()

headliners = parse_venue_flow(url, soup, known_classes)
# parse_venue_flow returns None when no class could be found; treat that as "no headliners".
c_heads = clean_headliners(headliners or [])
sp, playlist_id = spotify_connect(client_id, client_secret, refresh_token)

artist_ids = []
artist_names = []
processed_heads = set()

for head in c_heads:
    if len(artist_ids) >= max_artists:
        break

    # Identical cleaned titles only need one Spotify lookup.
    if head in processed_heads:
        continue

    processed_heads.add(head)
    artist_name, artist_id = find_artist_on_spotify(sp, head)
    print(head, "|||", artist_name)

    # Different titles can resolve to the same artist; add each artist once.
    if artist_id and (artist_id not in artist_ids):
        add_songs_from_artist(sp, artist_id, playlist_id)
        artist_ids.append(artist_id)
        artist_names.append(artist_name)

print(f"Added {len(artist_ids)} artists to playlist")

I already know the tags this website uses!
Using refresh token: AQAzE...
Got access token: BQA53...
User found: a-dog 
Playlist found: Live & Local
KBCS : Chaparelle w/ Santa Poco ||| None
The Lone Bellow 2025 Fall  w/ Valley James ||| The Lone Bellow
3 songs added
The Lone Bellow 2025 Fall  BOTH SHOWS ||| The Lone Bellow
Tractor : Deerlady w/ Richard Simeonoff, Soft Chain AT Conor Byrne Pub ||| Sir Richard Bishop
3 songs added
Babes in Canyon Album Release Show w/ Joyla Red & Waltzerr ||| Babes In Canyon
3 songs added
David Archuleta w/ Alexandra John ||| David Archuleta
3 songs added
! Tractor : White Denim w/ Johanna Samuels & AT The Sunset ||| White Denim
3 songs added
Sean Hayes w/ Sway Wild ||| Sean Hayes
3 songs added
2nd Show Added! Tractor : White Denim AT The Sunset ||| White Denim
KBCS : The Bones of J.R. Jones w/ Madeline Hawthorne ||| T-Bone
3 songs added
! Amble w/ Luke Tyler Shelton ||| Luke Tyler Shelton
3 songs added
Tractor : Kelsey Waldon w/ Sterling Drake, Adira Sha