In [2]:
import pandas as pd
import boto3
import math
import utilities


In [None]:
s3 = boto3.resource('s3')

s3_obj = s3.Bucket("spotify-song-recommender-data").Object('tracks_features.csv').get()

song_data = pd.read_csv(s3_obj['Body'])
print(song_data.shape)
print(song_data.columns)


(1204025, 24)
Index(['id', 'name', 'album', 'album_id', 'artists', 'artist_ids',
       'track_number', 'disc_number', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature', 'year', 'release_date'],
      dtype='object')


Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],1,1,False,0.47,...,0.0727,0.0261,1.1e-05,0.356,0.503,117.906,210133,4.0,1999,1999-11-02
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],2,1,True,0.599,...,0.188,0.0129,7.1e-05,0.155,0.489,103.68,206200,4.0,1999,1999-11-02
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],3,1,False,0.315,...,0.483,0.0234,2e-06,0.122,0.37,149.749,298893,4.0,1999,1999-11-02


In [8]:
print(song_data.shape)
print(song_data.columns)
print(song_data['explicit'].unique())

(1204025, 24)
Index(['id', 'name', 'album', 'album_id', 'artists', 'artist_ids',
       'track_number', 'disc_number', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature', 'year', 'release_date'],
      dtype='object')
[False  True]


In [None]:
# turning Explicit into a binary
explicitDict = {
    False: 0,
    True: 1,
}
pd.set_option('future.no_silent_downcasting', True)
song_data['explicit'] = song_data['explicit'].replace(explicitDict)
print(song_data['explicit'].unique())

[False  True]


In [None]:
from sklearn.preprocessing import StandardScaler
X = song_data[[]]
# scaler = StandardScaler()
# X = scaler.fit_transform(X)
X = StandardScaler().fit_transform(X)

In [6]:
from sklearn.cluster import KMeans

kmeans = KMeans(init="k-means++", n_clusters = 344, n_init=4, random_state=1)
kmeans.fit(X)
song_data['cluster'] = kmeans.labels_

In [None]:
for cluster in sorted(song_data['cluster'].unique())[200:210]:
    print(f"\nCluster #{cluster}")
    songsInCluster = song_data[song_data['cluster'] == cluster].head(5)
    for songName in songsInCluster['song']:
        print(f" - {songName}")

In [7]:
datapoint = song_data.iloc[[4456]]
filteredDatapoint = datapoint.drop('artist', axis=1).drop('song', axis=1).drop('emotion', axis=1).drop('Key', axis=1).drop('Genre',axis=1).drop('cluster', axis=1)
print(datapoint)
test = kmeans.predict(filteredDatapoint)
print(test)

     artist             song emotion  variance Genre  Release Date     Key  \
4456  Drake  Best I Ever Had     joy  0.411265   rap          2024  C# Maj   

      Tempo  Loudness Explicit  Popularity  Energy  Danceability  \
4456    162     -2.18        1          80      92            40   

      Positiveness  Speechiness  Liveness  Acousticness  Instrumentalness  \
4456            56           38        13            19                 0   

      cluster  
4456      339  
[72]




In [8]:
print(song_data[song_data['cluster'] == test[0]].head(5))

                    artist                  song   emotion  variance  \
57867              theMIND     Animated Ambition       joy  0.833514   
58357  mike. fka mike stud          Captain Hook       joy  0.833514   
58821        iLoveMakonnen               Tuesday       joy  0.833514   
58824        iLoveMakonnen               No Maam       joy  0.833514   
58827        iLoveMakonnen  Down 4 So Long Remix  surprise  0.833514   

                             Genre  Release Date     Key  Tempo  Loudness  \
57867  Ghetto Sage,Unknown,Unknown          2021   G min    149     -5.19   
58357      Unknown,Unknown,Unknown          2020  A# min    164     -5.00   
58821                         soul          2014   C Maj    140     -6.00   
58824                         soul          2014   C Maj    140     -6.00   
58827                         soul          2014   C Maj    140     -6.00   

      Explicit  Popularity  Energy  Danceability  Positiveness  Speechiness  \
57867        1          7

In [7]:
# exporting model
print(utilities.dump_model(kmeans))

['./models/KMeans_20250511-2137']
./models/KMeans_20250511-2137


model = load_model()

In [None]:
# importing model
model = joblib.load('./models/test_model_joblib')

In [None]:
test = model.predict(filteredDatapoint)
print(test)