In [9]:
import csv
import ast
import os
import requests
from dotenv import load_dotenv
import mysql.connector
import ssl
import pymysql

# Initialize an empty array to store the data
# data = []

# Open the CSV file

load_dotenv()

DELIMITER = "<BRK>"
                                
counter = -1

In [10]:
# Adding this flag so we don't update every time 
update_songs_flag = False

conn = pymysql.connect(
    user=os.getenv('DB_USERNAME'),
    password=os.getenv('DB_PASSWORD'),
    host=os.getenv('DB_HOST'),
    port=int(os.getenv('DB_PORT')),
    database=os.getenv('DB_NAME'),
    ssl={'ca': './ca-certificate.crt'}
)

cursor = conn.cursor()

if update_songs_flag == True: 
    with open('final_tracks.csv', mode='r') as file:
        csv_reader = csv.reader(file)

        for row in csv_reader:
            if counter >= 0 and counter > last_committed:
                index = counter
                name = row[1]
                artists = ast.literal_eval(row[2])
                artists_str = DELIMITER.join(artists)
                song_id = row[3]
                popularity = row[4] 
                artist_ids = ast.literal_eval(row[8])
                artist_ids_str = DELIMITER.join(artist_ids)
                playlist_ids = ast.literal_eval(row[9])
                num_playlists = len(playlist_ids)
                playlist_ids_str = DELIMITER.join(playlist_ids)
                
                print(str(index) + ":", name, "-", artists_str)

                query = """
                    INSERT INTO CS_229_SONGS_ALL (SONG_NUM, NAME, ARTISTS, SONG_ID, POPULARITY, ARTIST_ID, PLAYLIST_IDS, NUM_PLAYLISTS)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                """

                cursor.execute(query, (index, name, artists_str, song_id, popularity, artist_ids_str, playlist_ids_str, num_playlists))
                conn.commit()

            last_committed = counter
            counter += 1

conn.close()

In [11]:
import os
import pymysql
from dotenv import load_dotenv

playlist_set = set()
songs_set = set()
num_songs = 2
# Load environment variables from .env file
load_dotenv()

# Establish a connection to the database
conn = pymysql.connect(
    user=os.getenv('DB_USERNAME'),
    password=os.getenv('DB_PASSWORD'),
    host=os.getenv('DB_HOST'),
    port=int(os.getenv('DB_PORT')),
    database=os.getenv('DB_NAME')
)

try:
    with conn.cursor() as cursor:
        # SQL query to find the top 10,000 songs based on NUM_PLAYLISTS
        query = f"""
            SELECT SONG_NUM, NAME, NUM_PLAYLISTS, PLAYLIST_IDS
            FROM CS_229_SONGS_ALL
            ORDER BY NUM_PLAYLISTS DESC
            LIMIT {num_songs}
        """
        cursor.execute(query)
        
        # Fetch all results
        top_songs = cursor.fetchall()

        # Print results
        for song in top_songs:
            playlist_ids_list = song[3].split(DELIMITER)
            playlist_set.update(set(playlist_ids_list))
            songs_set.add(song[0])
            print(f"Song Number: {song[0]}, Name: {song[1]}, Number of Playlists: {song[2]}")

finally:
    # Close the connection
    conn.close()

Song Number: 4970, Name: Sweater Weather, Number of Playlists: 97
Song Number: 4670, Name: Smells Like Teen Spirit, Number of Playlists: 88


In [12]:
print(str(len(playlist_set)))

182


In [13]:
load_dotenv()

# This dict maps playlist to the songs in those playlists 
playlist_songs_dict = {}

for playlist in playlist_set: 
    playlist_songs_dict[playlist] = []

conn = pymysql.connect(
    user=os.getenv('DB_USERNAME'),
    password=os.getenv('DB_PASSWORD'),
    host=os.getenv('DB_HOST'),
    port=int(os.getenv('DB_PORT')),
    database=os.getenv('DB_NAME')
)

try:
    with conn.cursor() as cursor:
        query = """
            SELECT SONG_NUM, PLAYLIST_IDS
            FROM CS_229_SONGS_ALL
            ORDER BY NUM_PLAYLISTS DESC
        """
        cursor.execute(query)

        all_songs = cursor.fetchall()

        for song in all_songs:
            add_song_flag = False
            for playlist in song[1].split(DELIMITER): 
                if playlist in playlist_set: 
                    playlist_songs_dict[playlist].append(song[0])
                    add_song_flag = True

            if add_song_flag: 
                songs_set.add(song[0])

finally:
    # Close the connection
    conn.close()

In [14]:
print("# of songs in the database is:", str(len(songs_set)))

# of songs in the database is: 8807


In [15]:
# Making co-occurrence matrix 

from scipy.sparse import lil_matrix, csr_matrix
import numpy as np

# Our vocab is the songs_set 
# Map song names to indexes 
song_to_index = {song: index for index, song in enumerate(list(songs_set))}

cooccurrence_matrix = lil_matrix((len(songs_set), len(songs_set)), dtype=int)

playlist_counter = 0
for playlist in playlist_set: 
    # Indexes as row, col in matrix to update, mapped from the songs associated with the playlist
    indexes = [song_to_index[song] for song in playlist_songs_dict[playlist]]
    for i in range(len(indexes)):
        for j in range(i + 1, len(indexes)): 
            cooccurrence_matrix[indexes[i], indexes[j]] += 1
            cooccurrence_matrix[indexes[j], indexes[i]] += 1
    playlist_counter += 1
    if playlist_counter % 100 == 0: 
        print("Finished processing playlist #" + str(playlist_counter))

Finished processing playlist #100


In [16]:
# Sanity checks on the co-occurrence matrix 
cooccurrence_matrix = cooccurrence_matrix.tocsr()
print("Co-occurrence matrix shape:", cooccurrence_matrix.shape)
print("Number of non-zero entries:", cooccurrence_matrix.nnz)
print("Number of zero entries:", str(108338 ** 2 - cooccurrence_matrix.nnz))

row_sums = cooccurrence_matrix.sum(axis=1)
column_sums = cooccurrence_matrix.sum(axis=0)

row_sums = np.array(row_sums).flatten()
column_sums = np.array(column_sums).flatten()

print("Row sums (first 10):", row_sums[:10])
print("Column sums (first 10):", column_sums[:10])


Co-occurrence matrix shape: (8807, 8807)
Number of non-zero entries: 1330210
Number of zero entries: 11735792034
Row sums (first 10): [ 99 194  99  97 146  94  99 546  99 295]
Column sums (first 10): [ 99 194  99  97 146  94  99 546  99 295]


In [17]:
# TODO: Initialize num_songs x embedding_dim 2D matrix of embeddings
# TODO: Start learning. Our loss function is the sum for each song pair dot product of two song embeddings - their co-occurrences (maybe log'd) squared
print(len(songs_set))
count = 0
for song in songs_set:  
    count += 1
    if count >= 10:
        break
    print(song)
    print(song_to_index[song])
    print(all_songs[song_to_index[song]])
# This is just to reference indexes of songs. So we iterate over every song in the song set
# song is its ID in our db but is useless for any sort of math
# song_to_index is the important index in our db
# all_songs[song_to_index] is the actual song data, which I guess right now we aren't caring about
    

8807
65552
0
(4970, 'spotify:playlist:0ea8BtJBTgllhyuEX9RQ8f<BRK>spotify:playlist:04dM82Yt2DcDdGuDsVxiC6<BRK>spotify:playlist:19hxMsnWeXhGBLXGHOXRWJ<BRK>spotify:playlist:2lYTHgnKdLF134LiyLsA9y<BRK>spotify:playlist:6oR2Yk40qHxmZsJNWID5Kv<BRK>spotify:playlist:6zzC8aBbcyX1AtPNJv4ebs<BRK>spotify:playlist:5FYVq0tTPVjogyZmBYtf1F<BRK>spotify:playlist:6DVEl9wfaVbvnYfxEuSE2d<BRK>spotify:playlist:6HyTihf1LpK5YUIndutRBX<BRK>spotify:playlist:0uDEMy8zboD1Ha2731j4qh<BRK>spotify:playlist:2RFmwA1UF7zhcalTC6YbUY<BRK>spotify:playlist:5jg16XBJkVrYYccI2tvL2B<BRK>spotify:playlist:1Gd2QkSfhmHZrFUa0oPpwm<BRK>spotify:playlist:4nuAs8oc3F0LklsMQyXeao<BRK>spotify:playlist:5HAUemLYN1kGSqx1LnGLPS<BRK>spotify:playlist:5cuBCukijEZGyTL83LvwlL<BRK>spotify:playlist:3slaIi3nw0gat5IXNdFgmi<BRK>spotify:playlist:6KaF2jOKBgirsyeASOXwxT<BRK>spotify:playlist:0wNHZTLriAeZxhD4A8K1WP<BRK>spotify:playlist:2cPccAKvCkPN2B6eWfiCnh<BRK>spotify:playlist:2KBb1d8YVICO3Gj3Km9Ju0<BRK>spotify:playlist:5xf47f8EISHQytbzqkkGbb<BRK>spotify:pla

In [48]:
# Ok so initialize our embeddings of size numSongs by numDimensions which I'm going to arbitrarily make 30
embeddingDim = 30
trainingLength = len(songs_set)
embeddingsSize = (trainingLength, embeddingDim)
embeddings = np.array([[0.01]*embeddingDim] *trainingLength)
print(embeddings)
learningRate = 0.05 #also arbitrary
threshold = 0.001
diff = 1
epsilon = 1e-8
dotProds = embeddings.dot(embeddings.T)
COM = cooccurrence_matrix.todense()
log_COM = cooccurrence_matrix.log1p().todense()
counter = 0
while diff > threshold:
    counter += 1
    gradients = np.zeros_like(embeddings)
    gradients += learningRate * (dotProds - COM).dot(embeddings)
    diff = np.sum(np.abs(gradients))
    embeddings -= gradients
    # if counter % 100 == 0:
    print(f"Epoch {counter}: {diff}")



[[0.01 0.01 0.01 ... 0.01 0.01 0.01]
 [0.01 0.01 0.01 ... 0.01 0.01 0.01]
 [0.01 0.01 0.01 ... 0.01 0.01 0.01]
 ...
 [0.01 0.01 0.01 ... 0.01 0.01 0.01]
 [0.01 0.01 0.01 ... 0.01 0.01 0.01]
 [0.01 0.01 0.01 ... 0.01 0.01 0.01]]
Epoch 1: 18549.51694500002
Epoch 2: 527012.5423001156
Epoch 3: 15467171.864689872
Epoch 4: 479052138.2232609
Epoch 5: 15125587166.354383
Epoch 6: 482682927554.0874
Epoch 7: 15502351374159.977
Epoch 8: 499941814483735.94
Epoch 9: 1.6166142183204948e+16
Epoch 10: 5.236693587488467e+17
Epoch 11: 1.6982791572700342e+19
Epoch 12: 5.5117515891669834e+20
Epoch 13: 1.7897219178495912e+22
Epoch 14: 5.8132971876827286e+23
Epoch 15: 1.8886515045259455e+25


KeyboardInterrupt: 