# Matchmaking!
In this notebook I match my user profile with festival lineups scraped from Setlist.fm.

In [1]:
# import libraries
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup
from urllib.request import urlopen
from math import sqrt
from math import exp
from collections import defaultdict
import pickle
import sqlite3
from dotenv import load_dotenv
import os

In [4]:
# Base directory for the pickled profile/summary and the SQLite database.
# Set the FESTIVAL_MATCHMAKER_DIR environment variable to run the notebook on
# another machine; when it is unset the original location is used.
# os.path.join(..., '') guarantees a trailing separator, which the cells that
# build paths with `save_dir + filename` rely on.
save_dir = os.path.join(
    os.environ.get(
        'FESTIVAL_MATCHMAKER_DIR',
        '/Users/carolinashimabukuro/projects/festival-matchmaker/',
    ),
    '',
)

In [5]:
# restore the pickled user profile from the project directory ...
with open(save_dir + 'saved_user_profile.pkl', mode='rb') as profile_file:
    user_profile = pickle.load(profile_file)

# ... and read the spreadsheet listing the festivals to score
# (the festival loop reads its 'festi_name' and 'festi_url' columns)
festival_list = pd.read_excel("festi_list.xlsx")

In [None]:
# keep 2 festivals for testing purposes
# comment this out for the final version!
#festival_list = festival_list.iloc[:2]
#festival_list

# SQLite connection
I'm using SQLite to save the similarity scores in a database.

In [6]:
# connect db function
def connect_to_db(db_directory, db_name = 'festival_data.db'):
    """
    Opens the SQLite database stored in db_directory, creating the folder if needed.

    Parameters:
        db_directory: folder that holds the database file; missing folders
            (including parents) are created. An empty string means the
            current working directory.
        db_name: file name of the database inside db_directory.

    Returns:
        An open sqlite3.Connection; the caller is responsible for closing it.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs;
    # an empty path is skipped because os.makedirs('') raises
    if db_directory:
        os.makedirs(db_directory, exist_ok=True)

    # create full path for the db
    db_path = os.path.join(db_directory, db_name)

    # connect to SQLite db
    return sqlite3.connect(db_path)

In [7]:
# open the project database; the cursor is kept at notebook level because
# festival_data_exists and save_festival_data use it
conn = connect_to_db(save_dir)
cursor = conn.cursor()

# make sure the festival_data table is there before anything reads or writes it;
# leaving the `with conn` block commits once the statement has succeeded
with conn:
    cursor.execute('''
CREATE TABLE IF NOT EXISTS festival_data (
    festival_name TEXT PRIMARY KEY,
    data BLOB
)
''')

# Define some functions

In [8]:
def cosine_similarity(profile, artist_features):
    """
    Cosine similarity between the user profile and one artist's feature vector.

    Both arguments are dicts mapping a feature name to a numeric weight.
    Features of the profile that the artist lacks count as 0.
    Returns 0.0 when either vector has zero magnitude.
    """
    def magnitude(vector):
        # Euclidean length of the dict's values
        return sqrt(sum(weight ** 2 for weight in vector.values()))

    overlap = sum(
        weight * artist_features.get(feature, 0)
        for feature, weight in profile.items()
    )
    profile_norm = magnitude(profile)
    artist_norm = magnitude(artist_features)

    if profile_norm != 0 and artist_norm != 0:
        return overlap / (profile_norm * artist_norm)
    return 0.0

In [9]:
def scrape_setlistfm(festival_url):
    """
    Scrapes setlist.fm to get festival lineups, and venue + date information.

    Parameters:
        festival_url: URL of the festival page to scrape.

    Returns:
        (lineup, date_text, venue_text): lineup is the list of artist names in
        page order; date_text and venue_text are stripped strings, or None
        when the corresponding element is not found on the page.

    Raises:
        urllib.error.URLError (including HTTPError) if the page cannot be fetched.
    """
    # close the HTTP response as soon as the body has been read
    with urlopen(festival_url) as response:
        html = response.read()
    soupified = BeautifulSoup(html, 'html.parser')

    artists_list = soupified.find_all(attrs={"class": "FestivalSetlistListItem-artist"})

    lineup = []
    for artist in artists_list:
        link = artist.find("a", {"class": "Link-root_color-blue"})
        if link is None:
            # entry without the expected artist link: skip it instead of
            # failing the whole scrape with an AttributeError
            continue
        lineup.append(link.get_text().strip())

    # get general data like dates, venue, etc.
    date_strong = soupified.find('strong', class_='value')
    # format example: Wednesday, August 21, 2024 - Sunday, August 25, 2024
    date_text = date_strong.get_text().strip() if date_strong is not None else None

    # venue
    venue_tag = soupified.find('h2', class_='Text-root Text-root_variant-display2 Text-root_color-grayDark')
    venue_text = venue_tag.get_text(strip=True) if venue_tag is not None else None

    return lineup, date_text, venue_text

In [10]:
def normalise_scores(artist, similarity_score, df_norm, beta, df_raw=None):
    """
    Weighs cosine similarity scores based on my scrobbles.
    beta is an arbitrary value! I want it to be relatively large; I trust my scrobbles more than similarity based on tags.
    If I have no scrobbles for an artist, the final similarity score will be the same as the cosine similarity.

    Parameters:
        artist: artist name; matched case-insensitively against the 'artist' column.
        similarity_score: cosine similarity for this artist.
        df_norm: frame with 'artist' and normalised 'scrobbles' columns.
        beta: weight of the normalised scrobbles in the exponent.
        df_raw: frame with 'artist' and raw 'scrobbles' columns. When omitted,
            the notebook-level `df` is used, as before.

    Returns:
        (normalised scrobbles, raw scrobbles, final score). The first two are
        None when the artist has no scrobbles.
    """
    if df_raw is None:
        # backward compatible: fall back to the notebook-level scrobble table
        df_raw = df

    artist_lowercase = artist.lower() # lowercase in case there's a mismatch between setlist.fm and last.fm spelling

    # build each mask from the frame it indexes, so the lookups stay correct
    # even when the two frames are not index-aligned
    norm_match = df_norm['artist'].str.lower() == artist_lowercase
    raw_match = df_raw['artist'].str.lower() == artist_lowercase

    # have I scrobbled this artist?
    # if no, the final score is the same as cosine similarity
    if not norm_match.any():
        return None, None, similarity_score

    # if yes: look up normalised scrobbles
    this_norm_scrobble = df_norm.loc[norm_match, 'scrobbles'].values[0]
    # also store number of scrobbles
    this_scrobble = df_raw.loc[raw_match, 'scrobbles'].values[0] if raw_match.any() else None
    # calculate final score
    final_score = similarity_score * np.exp(beta * this_norm_scrobble)
    return this_norm_scrobble, this_scrobble, final_score


In [11]:
# read the API key from the environment (the .env file is loaded first)
load_dotenv()
API_KEY = os.environ.get('API_KEY')

# number of top tags taken per artist
n_tags = 10

def calc_scores(user_profile, lineup, df_norm, beta=75, timeout=30):
    """
    Calculates all scores for a given festival.

    For every artist in the lineup the top tags are requested from the
    audioscrobbler artist.getTopTags endpoint, turned into a tag -> count
    dict (first n_tags tags, 'seen live' excluded), compared with the user
    profile via cosine_similarity and then weighted by normalise_scores.

    Parameters:
        user_profile: dict of feature -> weight passed to cosine_similarity.
        lineup: iterable of artist names.
        df_norm: normalised scrobbles frame passed to normalise_scores.
        beta: scrobble weight passed to normalise_scores (default 75).
        timeout: seconds to wait for each HTTP request before giving up.

    Returns:
        DataFrame with columns artist_name, cosine_similarity, scrobbles_norm,
        scrobbles, final_score. Artists the API reports an error for get a row
        with empty scores; artists whose request fails are reported on stdout
        and left out.
    """
    lineup_columns = ['artist_name', 'cosine_similarity', 'scrobbles_norm', 'scrobbles', 'final_score']
    api_url = 'http://ws.audioscrobbler.com/2.0/'

    # rows are collected in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway
    records = []
    for artist in lineup:
        # let requests URL-encode the values; names containing '&', '+', '#'
        # or '/' would otherwise corrupt a hand-built query string
        params = {
            'method': 'artist.getTopTags',
            'artist': artist,
            'api_key': API_KEY,
            'format': 'json',
            'limit': 20,
        }
        try:
            response = requests.get(api_url, params=params, timeout=timeout)
            response.raise_for_status()
            artistdata = response.json()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data: {e}")
            continue

        if 'error' in artistdata: # error 6, artist could not be found
            records.append({
                'artist_name': artist,
                'cosine_similarity': None,
                'scrobbles_norm': None,
                'scrobbles': None,
                'final_score': None
            })
            continue

        newband_vector = dict()
        try:
            for tagname in artistdata['toptags']['tag'][:n_tags]:
                if tagname['name'] == 'seen live': # exclude 'seen live'
                    continue
                newband_vector[tagname['name']] = tagname['count']
        except (KeyError, TypeError):
            # unexpected payload shape: report it and score whatever tags were read so far
            print(f"Artist error: {artist}")
            print(f"{artistdata}")

        # cosine similarity does not depend on vector scale, so the raw tag counts are used as-is
        similarity_score = cosine_similarity(user_profile, newband_vector)
        scrobbles_norm, scrobbles, final_score = normalise_scores(artist, similarity_score, df_norm, beta)

        records.append({
            'artist_name': artist,
            'cosine_similarity': similarity_score,
            'scrobbles_norm': scrobbles_norm,
            'scrobbles': scrobbles,
            'final_score': final_score
        })

    # passing columns keeps the expected layout even when no row was collected
    return pd.DataFrame(records, columns = lineup_columns)


In [12]:
def festival_data_exists(festival_name):
    """
    Tells whether a row for this festival is already stored in festival_data.

    Returns True when a row with this festival_name exists, False otherwise.
    Uses the notebook-level cursor.
    """
    lookup = 'SELECT 1 FROM festival_data WHERE festival_name = ?'
    found = cursor.execute(lookup, (festival_name,)).fetchone()
    return found is not None


In [13]:
def save_festival_data(festival_name, data):
    """
    Pickles data and stores it under festival_name in the festival_data table.

    An existing row for the same festival is replaced, and the change is
    committed right away. Uses the notebook-level cursor and connection.
    """
    row = (festival_name, pickle.dumps(data))
    cursor.execute('INSERT OR REPLACE INTO festival_data (festival_name, data) VALUES (?, ?)', row)
    conn.commit()

In [14]:
def calc_summary_stats(df, festival_name):
    """
    Calculates summary statistics and puts them in a df.

    Parameters:
        df: per-artist scores with 'cosine_similarity' and 'final_score' columns.
        festival_name: value stored in the 'festi_name' column.

    Returns:
        One-row DataFrame with columns festi_name, n_artists (rows with a
        cosine similarity), n_artists_total (all rows), mean_score,
        median_score, date and venue. date and venue are not known here and
        are left empty.
    """
    summary_columns = ['festi_name', 'n_artists', 'n_artists_total', 'mean_score', 'median_score', 'date', 'venue']

    summary_row = {
        'festi_name': festival_name,
        'n_artists': df['cosine_similarity'].notna().sum(),
        'n_artists_total': len(df),
        'mean_score': df['final_score'].mean(),
        'median_score': df['final_score'].median()
    }

    # DataFrame.append was removed in pandas 2.0; build the one-row frame
    # directly. Columns missing from the row ('date', 'venue') come out as NaN.
    return pd.DataFrame([summary_row], columns = summary_columns)

In [15]:
# load last.fm data artist: scrobbles
# this is used to normalise the scores and to check whether I have scrobbled an artist or not
# (normalise_scores reads `df` as a notebook-level variable)
# NOTE(review): the rename below assumes the CSV has exactly two columns, artist then scrobble count — confirm
df = pd.read_csv("festivalrec_data.csv")
df.columns = ["artist","scrobbles"]
total_scrobbles = df["scrobbles"].sum() # get total scrobbles

In [16]:
# normalise scrobbles: every count is divided by the total number of scrobbles
# (assign returns a new frame, so df keeps the raw counts)
df_norm = df.assign(scrobbles = df["scrobbles"] / total_scrobbles)

In [17]:
# loop over festivals

summary_path = save_dir + 'festival_summary.pkl'

# Festivals already stored in the database are skipped below, so start from the
# summary saved by earlier runs. Otherwise a re-run would overwrite the file
# with only the festivals processed this time (possibly none at all).
if os.path.exists(summary_path):
    festi_summary = pd.read_pickle(summary_path).to_dict('records')
else:
    festi_summary = []

try:
    for _, row in festival_list.iterrows():
        festi_name = row['festi_name']
        festi_url = row['festi_url']

        # check if the similarity data for this festival already exists
        if festival_data_exists(festi_name):
            print(f'Skipping {festi_name}, data already exists.')
            continue

        print('*** ', festi_name, ' ***')

        # scrape lineup
        print('Scraping...')
        festival_lineup, date_info, venue_name = scrape_setlistfm(festi_url)

        # calculate cosine similarity
        print('Calculating cosine similarity...')
        similarities = calc_scores(user_profile, festival_lineup, df_norm)

        # drop a stale entry for this festival, if any, so it is listed only once
        festi_summary = [entry for entry in festi_summary if entry.get('festi_name') != festi_name]

        # calculate summary stats and put into the summary list
        festi_summary.append({
            'festi_name': festi_name,
            'n_artists': similarities['cosine_similarity'].notna().sum(),
            'n_artists_total': len(similarities),
            'mean_score': similarities['final_score'].mean(),
            'median_score': similarities['final_score'].median(),
            'date': date_info,
            'venue': venue_name
        })

        # save the results to the database
        save_festival_data(festi_name, similarities)

        print(f'Processed {festi_name}!')
finally:
    # convert summary list to df, then save -- also when a festival fails
    # half-way: festivals already written to the database are skipped on the
    # next run, so their summary rows would otherwise be lost for good
    festi_summary = pd.DataFrame(festi_summary)
    festi_summary.to_pickle(summary_path)

print('All done! :)')

***  Reading Festival  ***
Scraping...
Calculating cosine similarity...
Processed Reading Festival!
***  British Summer Time  ***
Scraping...
Calculating cosine similarity...
Processed British Summer Time!
***  Download Festival  ***
Scraping...
Calculating cosine similarity...
Processed Download Festival!
***  Rock in Rio  ***
Scraping...
Calculating cosine similarity...
Processed Rock in Rio!
***  Rock Werchter  ***
Scraping...
Calculating cosine similarity...
Processed Rock Werchter!
***  Lollapalooza  ***
Scraping...
Calculating cosine similarity...
Processed Lollapalooza!
***  Glastonbury Festival  ***
Scraping...
Calculating cosine similarity...
Processed Glastonbury Festival!
***  Summerfest  ***
Scraping...
Calculating cosine similarity...
Processed Summerfest!
***  Leeds Festival  ***
Scraping...
Calculating cosine similarity...
Processed Leeds Festival!
***  Bonnaroo  ***
Scraping...
Calculating cosine similarity...
Processed Bonnaroo!
***  Victorious Festival  ***
Scraping..