In [None]:
import csv
import ast
import os
import requests
from dotenv import load_dotenv
import mysql.connector
import ssl
import pymysql
import random
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np

# Notebook-wide setup shared by the cells below.

# Load settings from a local .env file; the DB_* values are read later with
# os.getenv() when the database connections are opened.
load_dotenv()

# Separator used to flatten list-valued fields (artists, artist IDs, playlist
# IDs) into a single string column, and to split them back apart on read.
DELIMITER = "<BRK>"

# Row counter for the CSV import cell. It starts at -1 so that the first CSV
# row (presumably the header) fails that cell's `counter >= 0` check.
counter = -1

# Seed Python's `random` module, which the split cell uses to sample examples.
random.seed(17)


In [None]:
# One-off import of final_tracks.csv into the CS_229_SONGS_ALL table.
# Adding this flag so we don't update every time
update_songs_flag = False

conn = pymysql.connect(
    user=os.getenv('DB_USERNAME'),
    password=os.getenv('DB_PASSWORD'),
    host=os.getenv('DB_HOST'),
    port=int(os.getenv('DB_PORT')),
    database=os.getenv('DB_NAME'),
    ssl={'ca': './ca-certificate.crt'}
)

# try/finally guarantees the connection is released even when a row fails to
# parse or insert part-way through the import.
try:
    cursor = conn.cursor()

    if update_songs_flag:
        with open('final_tracks.csv', mode='r') as file:
            csv_reader = csv.reader(file)

            for row in csv_reader:
                # `counter` starts at -1 (setup cell), so the first CSV row
                # (presumably the header) fails `counter >= 0` and is skipped.
                # The short-circuit also means `last_committed` is never read
                # before its first assignment at the bottom of this loop.
                # NOTE(review): `counter` is kernel state; re-running this cell
                # without re-running the setup cell continues from the old value.
                if counter >= 0 and counter > last_committed:
                    index = counter
                    name = row[1]
                    # List-valued columns are stored in the CSV as Python list
                    # literals; they are flattened with DELIMITER for storage.
                    artists = ast.literal_eval(row[2])
                    artists_str = DELIMITER.join(artists)
                    song_id = row[3]
                    popularity = row[4]
                    artist_ids = ast.literal_eval(row[8])
                    artist_ids_str = DELIMITER.join(artist_ids)
                    playlist_ids = ast.literal_eval(row[9])
                    num_playlists = len(playlist_ids)
                    playlist_ids_str = DELIMITER.join(playlist_ids)

                    print(str(index) + ":", name, "-", artists_str)

                    query = """
                    INSERT INTO CS_229_SONGS_ALL (SONG_NUM, NAME, ARTISTS, SONG_ID, POPULARITY, ARTIST_ID, PLAYLIST_IDS, NUM_PLAYLISTS)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                """

                    cursor.execute(query, (index, name, artists_str, song_id,
                                   popularity, artist_ids_str, playlist_ids_str, num_playlists))
                    # Commit per row so already-inserted songs survive a later failure.
                    conn.commit()

                last_committed = counter
                counter += 1
finally:
    conn.close()


In [None]:
import os
import pymysql
from dotenv import load_dotenv

# Accumulators that the following cells keep filling in.
playlist_set = set()   # playlist IDs that contain at least one top song
songs_set = set()      # SONG_NUM values collected so far
num_songs = 10         # how many top songs to seed the playlist set with
song_to_name = {}


# Pull settings from the .env file into the environment.
load_dotenv()

# Open the database connection.
conn = pymysql.connect(
    host=os.getenv('DB_HOST'),
    port=int(os.getenv('DB_PORT')),
    user=os.getenv('DB_USERNAME'),
    password=os.getenv('DB_PASSWORD'),
    database=os.getenv('DB_NAME')
)

try:
    with conn.cursor() as cursor:
        # The num_songs songs with the highest NUM_PLAYLISTS value.
        query = f"""
            SELECT SONG_NUM, NAME, NUM_PLAYLISTS, PLAYLIST_IDS, ARTISTS
            FROM CS_229_SONGS_ALL
            ORDER BY NUM_PLAYLISTS DESC
            LIMIT {num_songs}
        """
        cursor.execute(query)
        top_songs = cursor.fetchall()

        # Record each top song and every playlist it appears in.
        for song in top_songs:
            song_num, song_name, playlist_count, playlist_ids_joined = song[:4]
            playlist_ids_list = playlist_ids_joined.split(DELIMITER)
            playlist_set.update(playlist_ids_list)
            songs_set.add(song_num)
            print(
                f"Song Number: {song_num}, Name: {song_name}, Number of Playlists: {playlist_count}")

finally:
    # Always release the connection, even if the query fails.
    conn.close()


In [None]:
# Number of distinct playlists that contain at least one of the top songs.
print(len(playlist_set))


In [None]:
load_dotenv()

# Playlist ID -> list of SONG_NUMs found in that playlist (filled in below).
playlist_songs_dict = {playlist: [] for playlist in playlist_set}

conn = pymysql.connect(
    host=os.getenv('DB_HOST'),
    port=int(os.getenv('DB_PORT')),
    user=os.getenv('DB_USERNAME'),
    password=os.getenv('DB_PASSWORD'),
    database=os.getenv('DB_NAME')
)

try:
    with conn.cursor() as cursor:
        query = """
            SELECT SONG_NUM, PLAYLIST_IDS, NAME, ARTISTS
            FROM CS_229_SONGS_ALL
            ORDER BY NUM_PLAYLISTS DESC
        """
        cursor.execute(query)
        all_songs = cursor.fetchall()

        for song in all_songs:
            song_num = song[0]
            # Keep only the playlists we are tracking.
            tracked = [pid for pid in song[1].split(DELIMITER)
                       if pid in playlist_set]
            for playlist in tracked:
                playlist_songs_dict[playlist].append(song_num)

            # A song joins the vocabulary once it appears in any tracked playlist.
            if tracked:
                songs_set.add(song_num)

finally:
    # Always release the connection, even if the query fails.
    conn.close()


In [None]:
# Create train, val, and test sets, then sample labelled song pairs for evaluation.


def _has_adjacent_known_pair(playlists, known_songs):
    """Return True if any playlist has two neighbouring entries that are both in known_songs."""
    return any(
        first in known_songs and second in known_songs
        for songs in playlists
        for first, second in zip(songs, songs[1:])
    )


def _sample_positive_examples(playlists, known_songs, threshold):
    """Sample (song_a, song_b, 1) tuples from neighbouring entries of held-out playlists.

    Each pass draws one random position per playlist and keeps the pair only if
    both songs are in ``known_songs``. Passes repeat while the number of
    examples is <= ``threshold`` (so the result can overshoot it).

    Raises:
        ValueError: if no playlist can ever yield a pair, which would otherwise
            make the sampling loop spin forever.
    """
    # Playlists with fewer than two songs have no neighbouring pair, and
    # random.randint(0, len - 2) would raise on them.
    usable = [songs for songs in playlists if len(songs) >= 2]
    if not _has_adjacent_known_pair(usable, known_songs):
        raise ValueError(
            "No held-out playlist contains two neighbouring songs that are both in the training set")

    examples = []
    while len(examples) <= threshold:
        for songs in usable:
            i = random.randint(0, len(songs) - 2)
            if songs[i] in known_songs and songs[i + 1] in known_songs:
                examples.append((songs[i], songs[i + 1], 1))
    return examples


def _append_negative_examples(examples, playlists, candidate_songs, threshold,
                              max_attempts=1_000_000):
    """Append (song_a, song_b, 0) tuples until ``examples`` holds more than ``threshold`` items.

    A pair is accepted when the two songs are distinct and never appear
    together in any of ``playlists``.

    Raises:
        ValueError: if fewer than two candidate songs are available.
        RuntimeError: if ``max_attempts`` draws were not enough.
    """
    if len(examples) > threshold:
        return
    if len(candidate_songs) < 2:
        raise ValueError("Need at least two training songs to sample negative pairs")

    # Sets make the co-occurrence test O(1) per playlist instead of a list scan.
    playlist_song_sets = [set(songs) for songs in playlists]
    attempts = 0
    while len(examples) <= threshold:
        attempts += 1
        if attempts > max_attempts:
            raise RuntimeError("Could not find enough non-co-occurring song pairs")
        # random.sample picks two different positions; the candidates come
        # from a set, so the two songs are guaranteed to be distinct.
        song_1, song_2 = random.sample(candidate_songs, 2)
        if not any(song_1 in songs and song_2 in songs for songs in playlist_song_sets):
            examples.append((song_1, song_2, 0))


train_playlists = []
train_playlist_set = set()  # IDs of the training playlists
train_playlist_songs_dict = {}

validation_playlists = []
test_playlists = []

# Iterate in sorted order: playlist IDs are strings, and set iteration order
# for strings changes between kernel sessions, which would silently reshuffle
# the split on every restart.
# Split rule (unchanged): positions divisible by 9 go to test; of the rest,
# positions divisible by 10 go to validation; everything else is training.
for count, playlist in enumerate(sorted(playlist_set)):
    playlist_songs = playlist_songs_dict[playlist]
    if count % 9 == 0:
        test_playlists.append(playlist_songs)
    elif count % 10 == 0:
        validation_playlists.append(playlist_songs)
    else:
        train_playlists.append(playlist_songs)
        train_playlist_set.add(playlist)
        train_playlist_songs_dict[playlist] = playlist_songs

# Every song that appears in at least one training playlist. Built in one
# call so the name exists (as an empty set) even with no training playlists.
train_songs = set().union(*train_playlists)
train_songs_list = list(train_songs)

# Generate validation examples for hyperparam tuning eval
validation_examples = _sample_positive_examples(validation_playlists, train_songs, 100)
_append_negative_examples(validation_examples, validation_playlists, train_songs_list, 200)

# Generate test examples for final evaluation
test_examples = _sample_positive_examples(test_playlists, train_songs, 100)
_append_negative_examples(test_examples, test_playlists, train_songs_list, 200)

# From here on the vocabulary is restricted to songs seen in training playlists.
songs_set = train_songs

In [None]:
# songs_set was narrowed to the training songs by the split cell.
print("# of songs in the database is:", len(songs_set))


In [None]:
# Build the song-by-song co-occurrence matrix from the training playlists.

from scipy.sparse import lil_matrix, csr_matrix
import numpy as np

# The vocabulary is songs_set: give every song a fixed row/column position.
song_to_index = {song: position for position, song in enumerate(songs_set)}

vocab_size = len(songs_set)
cooccurrence_matrix = lil_matrix((vocab_size, vocab_size), dtype=int)

playlist_counter = 0
for playlist_counter, playlist in enumerate(train_playlist_set, start=1):
    # Matrix positions of the songs in this playlist.
    indexes = [song_to_index[song]
               for song in train_playlist_songs_dict[playlist]]

    # Count every unordered pair once, mirrored so the matrix stays symmetric.
    for offset, row_index in enumerate(indexes):
        for col_index in indexes[offset + 1:]:
            cooccurrence_matrix[row_index, col_index] += 1
            cooccurrence_matrix[col_index, row_index] += 1

    if playlist_counter % 100 == 0:
        print("Finished processing playlist #" + str(playlist_counter))


In [None]:
# Sanity checks on the co-occurrence matrix
# Convert to CSR for the aggregate operations below.
cooccurrence_matrix = cooccurrence_matrix.tocsr()
n_rows, n_cols = cooccurrence_matrix.shape
print("Co-occurrence matrix shape:", cooccurrence_matrix.shape)
print("Number of non-zero entries:", cooccurrence_matrix.nnz)
# Derive the cell count from the actual shape instead of a hard-coded
# vocabulary size, which goes stale whenever the split changes.
print("Number of zero entries:", n_rows * n_cols - cooccurrence_matrix.nnz)

# The matrix is built symmetrically, so row and column sums should agree.
row_sums = cooccurrence_matrix.sum(axis=1)
column_sums = cooccurrence_matrix.sum(axis=0)

# .sum() on a sparse matrix returns a 2-D result; flatten to 1-D arrays.
row_sums = np.array(row_sums).flatten()
column_sums = np.array(column_sums).flatten()

print("Row sums (first 10):", row_sums[:10])
print("Column sums (first 10):", column_sums[:10])


In [None]:
"""
Live functions to initialize training and let it run. 
"""
import numpy as np

def init_embeddings(num_songs, dim=25):
    """Create the starting parameters for training.

    Args:
        num_songs: number of rows (one per song).
        dim: length of each embedding vector.

    Returns:
        (embeddings, bias): a (num_songs, dim) array of standard-normal draws
        scaled by 0.01, and a zero vector of length num_songs.
    """
    shape = (num_songs, dim)
    embeddings = np.random.randn(*shape) * 0.01
    bias = np.zeros(num_songs)
    return embeddings, bias

def weighting_func(x, x_max, alpha=0.75):
    """Weight co-occurrence counts: (x / x_max) ** alpha below x_max, 1 otherwise.

    Args:
        x: co-occurrence count(s); scalars and NumPy arrays both work.
        x_max: count at which the weight saturates.
        alpha: exponent applied to the scaled count.
    """
    below_cap = x < x_max
    scaled = (x / x_max) ** alpha
    return np.where(below_cap, scaled, 1)

def basic_train(cooccurrence_matrix, vocab_size, emb_dim, lr=0.05, epochs=150, x_max=1000, alpha=0.75):
    """Fit song embeddings to log co-occurrence counts with per-sample gradient steps.

    For every stored entry (i, j, x_ij) of the matrix the model predicts
    ``embeddings[i] . embeddings[j] + bias[i] + bias[j]`` and is pushed towards
    ``log(x_ij + 1)`` under a weighted squared error.

    Args:
        cooccurrence_matrix: sparse matrix exposing ``.tocoo()``.
        vocab_size: number of embedding rows to create.
        emb_dim: length of each embedding vector.
        lr: step size for the parameter updates.
        epochs: maximum number of passes over the stored entries.
        x_max, alpha: forwarded to weighting_func.

    Returns:
        The (vocab_size, emb_dim) embedding array; the biases are not returned.
    """
    embeddings, bias = init_embeddings(vocab_size, emb_dim)

    # COO format exposes the stored entries as parallel row/col/data arrays.
    coo = cooccurrence_matrix.tocoo()
    pair_weights = weighting_func(np.array(coo.data), x_max, alpha)
    # One (i, j, x_ij, weight) record per stored entry, visited in the same
    # order every epoch.
    samples = list(zip(coo.row, coo.col, coo.data, pair_weights))

    loss_per_epoch = []

    for epoch in range(epochs):
        epoch_loss = 0

        for i, j, x_ij, weight in samples:
            prediction = np.dot(embeddings[i], embeddings[j]) + bias[i] + bias[j]
            target = np.log(x_ij + 1)  # Laplace Smoothing
            error = prediction - target

            epoch_loss += weight * error ** 2

            # Both gradients are evaluated before either vector is modified,
            # so the update of row i does not leak into the gradient for row j.
            grad_common = 2 * weight * error
            grad_emb_i = grad_common * embeddings[j]
            grad_emb_j = grad_common * embeddings[i]

            embeddings[i] -= lr * grad_emb_i
            embeddings[j] -= lr * grad_emb_j
            bias[i] -= lr * grad_common
            bias[j] -= lr * grad_common

        loss_per_epoch.append(epoch_loss)
        print(f"Epoch {epoch} Loss: {epoch_loss}")

        # Stop early once the epoch loss has effectively stopped moving.
        if len(loss_per_epoch) >= 2:
            if np.abs(loss_per_epoch[-1] - loss_per_epoch[-2]) < 1e-4:
                print(f"Convergence reached at epoch {epoch + 1}.")
                break

    return embeddings

In [None]:
# Testing for analysis of which songs to plot
# The loop below is disabled by wrapping it in a string literal.
# NOTE(review): song_to_name_embedding_dict is created inside
# evaluate_embeddings and only reaches the notebook namespace through that
# function's return value, so re-enabling this needs the result bound to that name.
"""for key in song_to_name_embedding_dict.keys(): 
    print(key, song_to_name_embedding_dict[key])"""


In [None]:
def evaluate_embeddings(embeddings, val, test):
    """Attach names to embeddings, optionally report accuracy, and plot two song groups.

    Looks up NAME and ARTISTS in CS_229_SONGS_ALL for every song in the
    notebook-level ``song_to_index`` mapping, then runs the accuracy reports
    and two t-SNE plots over hard-coded song IDs.

    Args:
        embeddings: array indexed by the positions stored in ``song_to_index``.
        val: when truthy, print validation accuracy.
        test: when truthy, print test accuracy.

    Returns:
        dict mapping song ID -> (name, list of artists, embedding vector).

    Raises:
        LookupError: if a song in ``song_to_index`` has no row in the table.
    """
    load_dotenv()

    conn = pymysql.connect(
        user=os.getenv('DB_USERNAME'),
        password=os.getenv('DB_PASSWORD'),
        host=os.getenv('DB_HOST'),
        port=int(os.getenv('DB_PORT')),
        database=os.getenv('DB_NAME')
    )

    song_ids = list(song_to_index.keys())
    song_to_name_embedding_dict = {}
    batch_size = 1000

    # try/finally so the connection is always released; it was previously
    # left open after every call.
    try:
        with conn.cursor() as cursor:
            # Fetch metadata in batches instead of issuing one query per song.
            for start in range(0, len(song_ids), batch_size):
                batch = song_ids[start:start + batch_size]
                # One %s placeholder per ID: values are passed as parameters
                # rather than interpolated into the SQL text.
                placeholders = ", ".join(["%s"] * len(batch))
                query = f"""
                    SELECT SONG_NUM, NAME, ARTISTS
                    FROM CS_229_SONGS_ALL
                    WHERE SONG_NUM IN ({placeholders})
                """
                cursor.execute(query, batch)

                # Keep the first row per SONG_NUM in case the table holds duplicates.
                rows_by_song = {}
                for song_num, name, artists_joined in cursor.fetchall():
                    rows_by_song.setdefault(song_num, (name, artists_joined))

                for song_id in batch:
                    if song_id not in rows_by_song:
                        raise LookupError(
                            f"Song {song_id!r} not found in CS_229_SONGS_ALL")
                    name, artists_joined = rows_by_song[song_id]
                    song_to_name_embedding_dict[song_id] = (
                        name,
                        artists_joined.split(DELIMITER),
                        embeddings[song_to_index[song_id]],
                    )

                if len(song_to_name_embedding_dict) % 1000 == 0:
                    print("Done processing", str(len(song_to_name_embedding_dict)), "songs")
    finally:
        conn.close()

    # Print a summary rather than the whole dict, which holds one embedding
    # vector per song and would flood the cell output.
    print("Loaded names and embeddings for", len(song_to_name_embedding_dict), "songs")

    selected_song_ids_1 = [816, 817, 58385, 862, 45032, 51494, 24175, 974, 72162, 955, 9048, 9082]
    # LADY GAGA: 816 - Just Dance, 817 - Paparazzi, 58385 - Applause
    # JUSTIN BIEBER: 862 - Ghost, 45032 - Off My Face, 51494 - 2U
    # KENDRICK LAMAR: 24175 - Alright, 974 - PRIDE., 72162 - Rigamortus
    # PITBULL: 955 - Time of Our Lives, 9048 - Timber, 9082 - Fireball

    selected_song_ids_2 = [142040, 13378, 13402, 74386, 74387, 75851, 72303, 47208, 53778, 259796, 4993, 5839]
    # LUKE COMBS (country) - 142040 - One Number Away, 13378 - Beautiful Crazy, 13402 - The Kind of Love We Make
    # HAMILTON - 74386 - I Know Him, 74387 - What Comes Next?, 75851 - Wait For It
    # BAD BUNNY - 72303 - Neverita, 47208 - Titi me Pregunto, 53778 - Safaera
    # QUEEN - 259796 - Radio Ga Ga, 4993 - Under Pressure, 5839 - Don't Stop Me Now

    if val:
        get_val_accuracy(embeddings)
    if test:
        get_test_accuracy(embeddings)

    generate_visual(song_to_name_embedding_dict, selected_song_ids_1)
    generate_visual(song_to_name_embedding_dict, selected_song_ids_2)
    return song_to_name_embedding_dict

def _score_examples(embeddings, examples, index_of, threshold=0.2):
    """Count confident predictions and how many of them match their label.

    A pair is predicted 1 when its cosine similarity is above ``threshold`` and
    0 when it is below ``-threshold``; pairs in between are left out entirely.

    Args:
        embeddings: array of embedding vectors.
        examples: iterable of (song_1, song_2, label) tuples.
        index_of: mapping from song to its row in ``embeddings``.
        threshold: confidence margin on the cosine similarity.

    Returns:
        (correct, counted): matching predictions and total confident predictions.
    """
    correct = 0
    counted = 0
    for song_1, song_2, label in examples:
        embedding_1 = embeddings[index_of[song_1]]
        embedding_2 = embeddings[index_of[song_2]]
        cosine_sim = np.dot(embedding_1, embedding_2) / (
            np.linalg.norm(embedding_1) * np.linalg.norm(embedding_2))
        if cosine_sim > threshold:
            predicted = 1
        elif cosine_sim < -threshold:
            predicted = 0
        else:
            # Not confident either way: excluded from the denominator.
            continue
        counted += 1
        if label == predicted:
            correct += 1
    return correct, counted


def _report_accuracy(split_name, correct, counted):
    """Print the accuracy line, guarding against an empty denominator."""
    if counted:
        print(f"{split_name} accuracy is {correct} / {counted} = {correct / counted}")
    else:
        # Previously this case raised ZeroDivisionError.
        print(f"{split_name} accuracy is undefined: no pair exceeded the cosine-similarity threshold")


def get_val_accuracy(embeddings):
    """Print accuracy over ``validation_examples``, counting only confident predictions."""
    correct_sum, count = _score_examples(embeddings, validation_examples, song_to_index)
    _report_accuracy("Validation", correct_sum, count)


def get_test_accuracy(embeddings):
    """Print accuracy over ``test_examples``, counting only confident predictions."""
    correct_sum, count = _score_examples(embeddings, test_examples, song_to_index)
    _report_accuracy("Test", correct_sum, count)


def generate_visual(song_to_name_embedding_dict, song_ids):
    """Plot a 2-D t-SNE projection of the embeddings for the given songs.

    Args:
        song_to_name_embedding_dict: mapping song ID -> (name, artists, embedding).
        song_ids: song IDs to plot. IDs missing from the mapping are reported
            and skipped instead of raising KeyError.

    Returns:
        None. Shows the figure; prints a message and returns early when fewer
        than two of the requested songs have an embedding.
    """
    checkpoint_embeddings_dict = song_to_name_embedding_dict

    # The requested IDs are hard-coded by the caller and are not guaranteed to
    # be in the mapping, so split them into plottable and missing.
    present_ids = [song_id for song_id in song_ids
                   if song_id in checkpoint_embeddings_dict]
    missing_ids = [song_id for song_id in song_ids
                   if song_id not in checkpoint_embeddings_dict]
    if missing_ids:
        print("Skipping song IDs without an embedding:", missing_ids)
    if len(present_ids) < 2:
        print("Need at least two songs with embeddings to run t-SNE; nothing plotted.")
        return

    # Get the corresponding embeddings
    selected_embeddings = np.array(
        [checkpoint_embeddings_dict[song_id][2] for song_id in present_ids])

    # Perform t-SNE to reduce dimensions to 2D. Perplexity has to stay below
    # the number of samples, so cap it when few songs are left.
    perplexity = min(10, len(present_ids) - 1)
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    embeddings_2d = tsne.fit_transform(selected_embeddings)

    # Create a dictionary for labels (song IDs to "name - artists")
    labels = {
        song_id: checkpoint_embeddings_dict[song_id][0] + " - " +
        ", ".join(checkpoint_embeddings_dict[song_id][1])
        for song_id in present_ids}

    # Plot the reduced 2D embeddings. No cmap here: without per-point colour
    # values a colormap is ignored and only produces a warning.
    plt.figure(figsize=(8, 6))
    plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], s=50)

    # Add text labels for each point, indexed by their position in present_ids
    for i, song_id in enumerate(present_ids):
        plt.text(embeddings_2d[i, 0], embeddings_2d[i, 1],
                 labels[song_id], fontsize=9, ha='right', color='red')

    # Add plot details
    plt.title('2D t-SNE Visualization of Embeddings')
    plt.xlabel('t-SNE Dimension 1')
    plt.ylabel('t-SNE Dimension 2')

    # Show plot
    plt.show()


In [None]:
# For hyperparam tuning, set epochs = 150
# Grid search on:
# (a) vector len = 25, 75, 150
# (b) alpha = 0.65, 0.75, 0.85
# (c) lr = 0.025, 0.05, 0.1

# NOTE: ALREADY TRAINED, DO NOT RE-RUN THIS CELL!
embeddings_dim_25 = basic_train(
    cooccurrence_matrix, cooccurrence_matrix.shape[0], emb_dim=25, alpha=0.75, lr=0.05)
# Bind the returned dict to a name: as a bare last expression Jupyter would
# render the whole song -> (name, artists, embedding) mapping as cell output.
song_info_dim_25 = evaluate_embeddings(embeddings_dim_25, True, False)


In [None]:
# Print validation and test accuracy for the 25-dimensional embeddings.
get_val_accuracy(embeddings_dim_25)
get_test_accuracy(embeddings_dim_25)

In [None]:
# Train 75-dimensional embeddings with the same alpha and learning rate.
embeddings_dim_75 = basic_train(
    cooccurrence_matrix, cooccurrence_matrix.shape[0], emb_dim=75, alpha=0.75, lr=0.05)
# Bind the returned dict to a name: as a bare last expression Jupyter would
# render the whole song -> (name, artists, embedding) mapping as cell output.
song_info_dim_75 = evaluate_embeddings(embeddings_dim_75, True, False)


In [None]:
# Print validation and test accuracy for the 75-dimensional embeddings.
get_val_accuracy(embeddings_dim_75)
get_test_accuracy(embeddings_dim_75)


In [None]:
# Train 150-dimensional embeddings with the same alpha and learning rate.
embeddings_dim_150 = basic_train(
    cooccurrence_matrix, cooccurrence_matrix.shape[0], emb_dim=150, alpha=0.75, lr=0.05)
# Bind the returned dict to a name: as a bare last expression Jupyter would
# render the whole song -> (name, artists, embedding) mapping as cell output.
song_info_dim_150 = evaluate_embeddings(embeddings_dim_150, True, False)


In [None]:
"""
ALREADY TRAINED, DO NOT RE-RUN THIS CELL!
"""
embeddings_dim_250 = basic_train(
    cooccurrence_matrix, cooccurrence_matrix.shape[0], emb_dim=250, alpha=0.75, lr=0.05)
evaluate_embeddings(embeddings_dim_250, True, False)


In [None]:
# Print validation and test accuracy for the 250-dimensional embeddings.
get_val_accuracy(embeddings_dim_250)
get_test_accuracy(embeddings_dim_250)


In [None]:
# Train 500-dimensional embeddings; evaluation happens in the next cell.
embeddings_dim_500 = basic_train(
    cooccurrence_matrix,
    vocab_size=cooccurrence_matrix.shape[0],
    emb_dim=500,
    lr=0.05,
    alpha=0.75,
)


In [None]:
# Validation report and plots for the 500-dim embeddings; keep the returned
# song -> (name, artists, embedding) mapping for later inspection.
important_dict = evaluate_embeddings(embeddings_dim_500, val=True, test=False)
