In [50]:
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import os
from sklearn.cluster import FeatureAgglomeration
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import matplotlib.pyplot as plt
import plotly.express as px

import string
import re

# FeatureAgglomeration Reduction + KNN Test

In [51]:
# Configuration comes from a local .env file: two values handed to the Spotify
# client-credentials manager and the path of the CSV dataset.
load_dotenv()
SPOTIFY_KEY1 = os.getenv('SPOTIFY_KEY1')
SPOTIFY_KEY2 = os.getenv('SPOTIFY_KEY2')
SPOTIFY_DATA = os.getenv('SPOTIFY_DATA')

# KEY1/KEY2 fill the first two positional slots of SpotifyClientCredentials
# (client id, then client secret).
spotify = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(
        client_id=SPOTIFY_KEY1,
        client_secret=SPOTIFY_KEY2,
    )
)

df_spotify = pd.read_csv(SPOTIFY_DATA)

# Basic sanity checks on the raw dataset: size, exact duplicate rows, and
# per-column presence of missing values.
num_duplicate_rows = df_spotify.duplicated().sum()
any_null_values = df_spotify.isnull().any()

print(f"Shape of Dataset: {df_spotify.shape}")
print(f"\nNumber of Duplicate Rows: {num_duplicate_rows}")
print(f"\nAny Null Values in DataFrame: \n{any_null_values}")

Shape of Dataset: (170653, 19)

Number of Duplicate Rows: 0

Any Null Values in DataFrame: 
valence             False
year                False
acousticness        False
artists             False
danceability        False
duration_ms         False
energy              False
explicit            False
id                  False
instrumentalness    False
key                 False
liveness            False
loudness            False
mode                False
name                False
popularity          False
release_date        False
speechiness         False
tempo               False
dtype: bool


In [52]:
# Drop identifier/metadata columns so that only the audio-feature columns
# remain as model input.
df_relevant_columns = df_spotify.drop(
    columns=['id', 'name', 'release_date', 'year', 'artists', 'popularity', 'explicit'],
)
display(df_relevant_columns.head(10))

Unnamed: 0,valence,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo
0,0.0594,0.982,0.279,831667,0.211,0.878,10,0.665,-20.096,1,0.0366,80.954
1,0.963,0.732,0.819,180533,0.341,0.0,7,0.16,-12.441,1,0.415,60.936
2,0.0394,0.961,0.328,500062,0.166,0.913,3,0.101,-14.85,1,0.0339,110.339
3,0.165,0.967,0.275,210000,0.309,2.8e-05,5,0.381,-9.316,1,0.0354,100.109
4,0.253,0.957,0.418,166693,0.193,2e-06,3,0.229,-10.096,1,0.038,101.665
5,0.196,0.579,0.697,395076,0.346,0.168,2,0.13,-12.506,1,0.07,119.824
6,0.406,0.996,0.518,159507,0.203,0.0,0,0.115,-10.589,1,0.0615,66.221
7,0.0731,0.993,0.389,218773,0.088,0.527,1,0.363,-21.091,0,0.0456,92.867
8,0.721,0.996,0.485,161520,0.13,0.151,5,0.104,-21.508,0,0.0483,64.678
9,0.771,0.982,0.684,196560,0.257,0.0,8,0.504,-16.415,1,0.399,109.378


In [54]:
# print_info taken from Alan/main.py
def print_info(track):
    """Print a short, human-readable summary of a Spotify track.

    Parameters
    ----------
    track : dict
        Track object such as the one returned by ``spotify.track``. The
        function reads ``artists[0]['name']``, ``name``, ``album['name']``,
        ``preview_url`` and ``album['images']``.

    Notes
    -----
    A missing/``None`` preview URL or an empty cover-art list is reported with
    a placeholder line instead of raising.
    """
    print('\nArtist: ' + track['artists'][0]['name'])
    print('Track: ' + track['name'])
    print('Album: ' + track['album']['name'])

    preview_url = track.get('preview_url')
    if preview_url is None:
        print("No Audio Preview available")
    else:
        print('Audio Preview: ' + preview_url)

    # Albums without artwork carry an empty image list; indexing [0] would
    # raise IndexError, so guard it.
    images = track['album'].get('images') or []
    if images:
        print('Cover Art: ' + images[0]['url'])
    else:
        print("No Cover Art available")

# Keep prompting until Spotify resolves the input to a track. spotipy signals
# an unknown or malformed ID by raising SpotifyException rather than returning
# a falsy value, so the error has to be caught for the retry loop to work.
track = None
while not track:
    track_id = input("Enter uri: ")
    try:
        track = spotify.track(track_id)
    except spotipy.SpotifyException as err:
        print(f"Could not find a track for {track_id!r}: {err}")

# Reuse the track fetched in the loop instead of requesting it a second time.
print_info(track)

artist_name = track['artists'][0]['name']
song_name = track['name']
song_features = spotify.audio_features(track_id)

# audio_features can come back empty or with a None entry when Spotify has no
# analysis for the track; fail with a clear message instead of an opaque
# KeyError from the column drop below.
if not song_features or song_features[0] is None:
    raise ValueError(f"No audio features available for track {track_id!r}")

df_song = pd.DataFrame(song_features)
df_song = df_song.drop(columns=['track_href','analysis_url','type','id','uri','time_signature'])
display(df_song)

# Column order of the training features shown above; the song frame must use
# the same order before it is passed to the scaler.
song_df_reorder = ['valence', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo']

df_song = df_song[song_df_reorder]

display(df_song)


Artist: Lena Raine
Track: First Steps
Album: Celeste (Original Soundtrack)
Audio Preview: https://p.scdn.co/mp3-preview/5ffd1e4c21d1b9cab50652893ae8135079e6e59f?cid=798c070d2d5e4ab98b36353e469dba19
Cover Art: https://i.scdn.co/image/ab67616d0000b273b351fafcb334009c3216a039


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,0.545,0.635,0,-7.874,1,0.0259,0.379,0.0579,0.158,0.0884,90.005,218708


Unnamed: 0,valence,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo
0,0.0884,0.379,0.545,218708,0.635,0.0579,0,0.158,-7.874,1,0.0259,90.005


In [55]:
# Fit the scaler on the dataset features only, then apply the same fitted
# scaling to both the dataset and the single query song.
standard = StandardScaler()
standard.fit(df_relevant_columns)

X_standard = standard.transform(df_relevant_columns)
song_standard = standard.transform(df_song)

# Show shape and values for the dataset first, then for the query song.
for scaled in (X_standard, song_standard):
    print(scaled.shape, "\n")
    print(scaled, "\n")

(170653, 12) 

[[-1.7828247   1.27618658 -1.46701268 ...  0.64391197 -0.37970638
  -1.16930675]
 [ 1.65068832  0.61134711  1.59877887 ...  0.64391197  1.94548067
  -1.82117959]
 [-1.858821    1.22034007 -1.18882049 ...  0.64391197 -0.3962973
  -0.21240379]
 ...
 [ 0.41194856 -1.06670771  0.5484614  ... -1.55300732 -0.10749235
  -0.81976118]
 [-1.26756976 -1.30876246  0.75852489 ...  0.64391197  1.28798856
  -1.36140375]
 [ 0.43094764 -0.98426761  1.80884237 ...  0.64391197  0.05903135
  -0.71220119]] 

(1, 12) 

[[-1.67263006 -0.32740622  0.04317353 -0.0970544   0.57020012 -0.34806608
  -1.47929458 -0.27367002  0.63075405  0.64391197 -0.44545559 -0.87456696]] 



In [56]:
# Merge the standardised features into 4 feature clusters, learned from the
# dataset and then applied unchanged to the query song.
agglo = FeatureAgglomeration(n_clusters=4)
agglo.fit(X_standard)

X_agglo = agglo.transform(X_standard)
song_agglo = agglo.transform(song_standard)

# Show shape and values for the dataset first, then for the query song.
for reduced in (X_agglo, song_agglo):
    print(reduced.shape, "\n")
    print(reduced, "\n")

(170653, 4) 

[[ 1.80393171 -1.23251066  1.77214424 -1.62491869]
 [ 0.48790804 -0.8400718   0.03928829  1.6247336 ]
 [ 0.23117189 -0.66269213  1.80004703 -1.52382074]
 ...
 [-0.37187881  0.73520809 -0.79972547  0.48020498]
 [ 0.87290079 -0.02671729 -0.92075446 -0.25452243]
 [ 0.15002031  0.44236815 -0.75100648  1.119895  ]] 

(1, 4) 

[[-0.33031252  0.10879574 -0.33773615 -0.81472826]] 



In [57]:
# Index the agglomerated dataset features for neighbour search; each query
# will return its 5 nearest rows of X_agglo.
knn_model = NearestNeighbors(n_neighbors=5)
knn_model.fit(X_agglo)

In [59]:
# Run the query song through the same pipeline as the dataset:
# align column order -> fitted scaler -> fitted feature agglomeration.
input_song = df_song[df_relevant_columns.columns]

input_song_scaled = standard.transform(input_song)
print(input_song_scaled)

input_song_agglo = agglo.transform(input_song_scaled)
print(input_song_agglo)

# kneighbors returns positional row indices into the matrix the model was fit
# on, so they are used with .iloc against the original dataset frame.
distances, indices = knn_model.kneighbors(input_song_agglo)
neighbor_rows = indices.ravel()
recommended_songs = df_spotify[['artists', 'name']].iloc[neighbor_rows]

print(recommended_songs)

[[-1.67263006 -0.32740622  0.04317353 -0.0970544   0.57020012 -0.34806608
  -1.47929458 -0.27367002  0.63075405  0.64391197 -0.44545559 -0.87456696]]
[[-0.33031252  0.10879574 -0.33773615 -0.81472826]]
                                     artists  \
70916   ['Jim Brickman', 'Michael W. Smith']   
123421                         ['Autoheart']   
28311                ['The Electric Prunes']   
146041      ['The Montclairs', 'Phil Perry']   
130538                         ['The Dells']   

                                            name  
70916   Love of My Life (feat. Michael W. Smith)  
123421              Hungover in the City of Dust  
28311                               Holy Are You  
146041                  Dreaming's out of Season  
130538                           O-O, I Love You  
