In [1]:
import pandas as pd
pd.options.display.max_columns = 200

songs = pd.read_csv('../data/spotify_clean.csv', index_col=[0])
songs = songs.drop_duplicates(['track_id'])
# Drop rows with missing values
songs.dropna(inplace=True)

In [2]:
from sklearn.preprocessing import MinMaxScaler
# Feautre transformation
# Normalize data before clustering since clustering relies on distances
scaler = MinMaxScaler()
numerical_data = songs[['popularity', 'duration_ms', 'danceability', 'energy', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo']] 
scaled_numerical_data = pd.DataFrame(scaler.fit_transform(numerical_data), columns=numerical_data.columns)
# One-hot encode categorical data
categorical_col_names = ['mode', 'key', 'time_signature']
categorical_data = songs[categorical_col_names]
onehot_data = pd.get_dummies(categorical_data, columns=categorical_col_names, prefix=categorical_col_names, dtype=int)
# Merge all the features together
songs_data = pd.concat([scaled_numerical_data, onehot_data.set_axis(scaled_numerical_data.index)], axis=1)
songs_data.head()

Unnamed: 0,popularity,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,mode_0,mode_1,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,time_signature_0,time_signature_1,time_signature_3,time_signature_4,time_signature_5
0,0.73,0.042473,0.686294,0.461,0.791392,0.148187,0.032329,1e-06,0.358,0.718593,0.361245,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0.55,0.026971,0.426396,0.166,0.597377,0.079067,0.927711,6e-06,0.101,0.268342,0.318397,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0.57,0.038679,0.44467,0.359,0.736123,0.05772,0.210843,0.0,0.117,0.120603,0.313643,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,0.71,0.036978,0.270051,0.0596,0.573701,0.037617,0.908635,7.1e-05,0.132,0.143719,0.746758,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0.82,0.036389,0.627411,0.443,0.737103,0.054508,0.470884,0.0,0.0829,0.167839,0.492863,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [3]:
# Convert explicit from bool to int
explicit = songs['explicit'].astype(int).to_numpy()
songs_data['explicit'] = explicit

In [4]:
from sklearn.neighbors import NearestCentroid
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

songs.reset_index(inplace=True)
genres = songs["track_genre"]
# Numerically encode the labels
label_encoder = LabelEncoder()
encoded_genres = label_encoder.fit_transform(genres)
# Using stratify might help because we have an imbalanced dataset
X_train, X_test, y_train, y_test = train_test_split(songs_data, encoded_genres, test_size=0.2, 
                                                    stratify=encoded_genres, shuffle=True, random_state=50)
# Train model
ncc = NearestCentroid(metric='euclidean')
ncc.fit(X_train, y_train)

In [26]:
songs_data.tail()

Unnamed: 0,popularity,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,mode_0,mode_1,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,time_signature_0,time_signature_1,time_signature_3,time_signature_4,time_signature_5,explicit
89735,0.21,0.07199,0.174619,0.235,0.612952,0.043731,0.64257,0.928,0.0863,0.03407,0.517705,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
89736,0.22,0.07199,0.17665,0.117,0.577345,0.041554,0.997992,0.976,0.105,0.035176,0.350242,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
89737,0.22,0.050276,0.638579,0.329,0.714648,0.043523,0.870482,0.0,0.0839,0.746734,0.543933,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
89738,0.41,0.052653,0.595939,0.506,0.714759,0.030777,0.38253,0.0,0.27,0.415075,0.558651,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
89739,0.22,0.044608,0.53401,0.487,0.727429,0.07513,0.683735,0.0,0.0893,0.711558,0.32542,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [24]:
genres

0           acoustic
1           acoustic
2           acoustic
3           acoustic
4           acoustic
            ...     
89735    world-music
89736    world-music
89737    world-music
89738    world-music
89739    world-music
Name: track_genre, Length: 89740, dtype: object

In [5]:
# Evaluate model
predictions = ncc.predict(X_test)
base_accuracy = accuracy_score(y_test, predictions)
base_f1_weighted = f1_score(y_test, predictions, average='weighted')
print("Baseline performance using an Nearest Centroids")
print(f"Accuracy: {base_accuracy:.4f}")
print(f"F1-score: {base_f1_weighted:.4f}")

Baseline performance using an Nearest Centroids
Accuracy: 0.1224
F1-score: 0.0990


In [75]:
# Replace ; for spaces in the artists column
tmp = [artist.replace(";", " ") if artist else None for artist in songs['artists']]
songs['artists'] = tmp
songs['artists']

0                    Gen Hoshino
1                   Ben Woodward
2         Ingrid Michaelson ZAYN
3                   Kina Grannis
4               Chord Overstreet
                   ...          
113995             Rainy Lullaby
113996             Rainy Lullaby
113997             Cesária Evora
113998          Michael W. Smith
113999             Cesária Evora
Name: artists, Length: 89740, dtype: object

In [8]:
# Use stemming and remove stopwords
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
nltk.download('stopwords')
import re

stemmer = PorterStemmer()
# Regex for excluding numbers
token_pattern = re.compile(r"(?u)\b[a-zA-Z]+\b")
# token_pattern = re.compile(r"(?u)\b\w\w+\b")
# Probably need to add stopwords in other languages
my_stopwords = set(stopwords.words('english'))

def tokenize(text):
    stems = []
    tokens = token_pattern.findall(text)
    for item in tokens:
        if item not in my_stopwords:
            stem = stemmer.stem(item)
            if len(stem) > 2:
                stems.append(stem)
    return stems

artists_vectorizer = TfidfVectorizer(max_features=5000)
artists_matrix = artists_vectorizer.fit_transform(songs['artists'])
print(f"Number of features related to artist names: {artists_matrix.shape[1]}")
artists_df = pd.DataFrame(artists_matrix.toarray(), columns=artists_vectorizer.get_feature_names_out())
tfid_vectorizer = TfidfVectorizer(tokenizer=tokenize, max_features=5000)
concatenated_features = (songs['album_name'] + ' ' + songs['track_name']).to_list()
tfid_matrix = tfid_vectorizer.fit_transform(concatenated_features)
print(f"Number of features (removing stopwords and using stemming): {tfid_matrix.shape[1]}")
df = pd.DataFrame(tfid_matrix.toarray(), columns=tfid_vectorizer.get_feature_names_out())
songs_data.reset_index(drop=True, inplace=True)
artists_df.reset_index(drop=True, inplace=True)
df.reset_index(drop=True, inplace=True)
songs_data_modified = pd.concat([songs_data, df, artists_df], axis=1)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\masam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Number of features related to artist names: 5000




Number of features (removing stopwords and using stemming): 5000


In [9]:
songs_data_modified.describe()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,aaj,aaja,aana,aashiqui,aay,aaya,abajo,abandon,abba,abbey,abc,abcdefu,abend,abr,absenc,absolut,abstract,abuso,abyss,acaba,acapulco,access,ace,acid,acido,acoust,acoustiqu,across,act,action,activ,adagio,adam,addict,ade,adentro,adeu,adhuri,adi,adio,ador,adrenalin,adult,advanc,adventur,aesthet,affair,affect,aficionado,afraid,africa,african,afro,afrobeat,afrod,afropop,afterglow,afterlif,aftermath,afternoon,age,agent,agoni,agora,agua,ahead,ahora,aid,aida,aim,ainda,air,airplan,akatsuki,aku,akuma,akustik,ala,alabama,aladdin,alan,alarm,albert,alberto,album,...,yera,yesudas,yiruma,yo,yoasobi,yoga,yokota,yonder,yonezu,york,you,young,your,youth,yu,yui,yuki,yuna,yung,yungblud,yuri,yusei,yusuf,yuvan,yves,zabo,zac,zach,zachary,zack,zaeden,zany,zara,zarcort,zatox,zayde,zeca,zedd,zeds,zeki,zeppelin,zero,zezo,zezé,zhanna,ziegler,zillertaler,zimmer,zimmerman,zinc,zion,zoe,zombie,zondaflex,zone,zoë,zs,zschech,zuar,zubin,zucker,zuckowski,zumbi,zé,álvaro,ángeles,árbol,étienne,ícaro,ñejo,ñengo,ólafur,ötzi,şam,şanışer,şehinşah,агафонов,александр,ансамбль,большого,бунчиков,валерий,владимир,группа,джанг,добронравов,загадка,захаров,зыков,оркестр,сергей,театра,цфасман,サザンオールスターズ,ヨルシカ,吳青峰,告五人,夜光性アミューズ,櫻坂46,高爾宣
count,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,...,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0
mean,33.198808,229144.4,0.085848,0.562166,0.634458,5.28353,-8.498994,0.636973,0.087442,0.328285,0.173415,0.216971,0.469474,122.058134,3.897426,0.000169,7.5e-05,7.8e-05,0.000173,6.9e-05,0.000107,0.000135,8.4e-05,8.1e-05,0.000134,0.000216,0.000143,7.1e-05,0.000133,0.000102,0.000111,0.000109,0.000149,7.5e-05,7e-05,0.000116,8.6e-05,0.000111,0.000396,8.3e-05,0.001728,0.000138,0.00012,0.001273,0.000167,0.000125,0.000252,0.000372,0.000353,0.000264,9.2e-05,0.000131,4e-05,0.000136,0.000173,0.000126,8.3e-05,0.000388,8e-05,0.00032,8.7e-05,0.000173,0.000115,0.0001,0.000166,0.000386,0.000238,0.000157,0.000255,0.000128,0.000137,0.000142,0.00015,0.000146,0.000134,0.000627,0.000103,0.00015,0.000299,0.000201,0.000139,0.000382,0.000193,0.000234,0.000136,0.00029,0.001128,0.000107,7.1e-05,0.000172,9.1e-05,0.000113,0.00023,0.000179,0.000277,0.000104,0.000109,8.9e-05,0.000143,0.002002,...,7.1e-05,0.000106,0.00073,0.000478,0.000345,0.000103,0.000205,0.000166,0.000102,0.000122,0.000322,0.001334,0.000109,0.000213,0.000142,0.000134,0.000134,0.000119,0.000457,0.000456,0.000257,0.000156,9.3e-05,0.000888,0.000196,0.000159,8.9e-05,0.000326,8.6e-05,0.000217,0.000246,0.000103,0.000234,0.00011,0.000101,0.000108,0.000638,0.000338,9.3e-05,0.000126,0.000181,0.000478,0.000256,6.9e-05,0.00015,6.6e-05,7e-05,0.00086,0.000263,0.000107,0.000152,0.000135,0.000453,0.000256,0.000127,9.1e-05,6e-05,7.7e-05,6.9e-05,5.9e-05,0.000184,0.00038,6.5e-05,0.000522,0.000117,8.4e-05,0.000129,8.6e-05,0.000284,9.9e-05,6.2e-05,0.000138,0.000295,0.000118,0.000242,0.000232,9.5e-05,0.00022,0.000155,6e-05,0.000199,0.0001,0.000214,0.000358,0.000106,0.000107,0.000317,7.7e-05,0.000138,0.000182,0.000199,6.8e-05,0.000113,0.000435,0.000111,0.000148,0.000178,0.000156,0.000111,7.5e-05
std,20.58064,112945.8,0.280141,0.176692,0.256606,3.559912,5.221518,0.480875,0.113278,0.338321,0.323849,0.194885,0.262864,30.117651,0.453437,0.010202,0.00768,0.007092,0.010678,0.006296,0.008488,0.009278,0.007551,0.007663,0.008489,0.011577,0.010603,0.005735,0.009534,0.008622,0.009117,0.008494,0.009487,0.006558,0.006898,0.00936,0.006946,0.009396,0.016314,0.00724,0.031885,0.009309,0.008353,0.022912,0.010534,0.008456,0.01113,0.015148,0.015836,0.011954,0.007791,0.009307,0.004558,0.01056,0.010714,0.010138,0.008254,0.014244,0.00732,0.013173,0.007961,0.00996,0.009739,0.008065,0.011333,0.015935,0.011765,0.009641,0.014129,0.009713,0.01051,0.010869,0.011213,0.011178,0.009084,0.019261,0.008323,0.010338,0.013345,0.011417,0.009398,0.015731,0.008839,0.013071,0.009107,0.013809,0.024885,0.008141,0.007711,0.011973,0.009175,0.009915,0.012421,0.010308,0.011734,0.007857,0.008474,0.006992,0.009099,0.031291,...,0.006323,0.008298,0.02696,0.017964,0.018583,0.008573,0.012034,0.009568,0.00851,0.009975,0.014502,0.028908,0.008273,0.01272,0.01184,0.011563,0.010274,0.010443,0.018994,0.020591,0.014456,0.012489,0.007854,0.020865,0.012214,0.012389,0.008265,0.015513,0.007775,0.012394,0.015275,0.009505,0.012757,0.009649,0.009692,0.00869,0.020193,0.015579,0.008425,0.009441,0.011319,0.018229,0.016007,0.006214,0.010288,0.0056,0.006585,0.021804,0.013914,0.009038,0.009461,0.009211,0.018237,0.012619,0.011047,0.007083,0.004689,0.007054,0.006614,0.004198,0.011174,0.016148,0.005945,0.017999,0.008839,0.007272,0.01125,0.007733,0.014049,0.00956,0.005189,0.009547,0.015274,0.010707,0.015214,0.014445,0.008207,0.012134,0.009551,0.005748,0.01137,0.008328,0.011876,0.016276,0.007514,0.008863,0.015006,0.007375,0.009995,0.011173,0.011505,0.006207,0.008807,0.020842,0.010556,0.011362,0.013352,0.012489,0.010556,0.007144
min,0.0,8586.0,0.0,0.0,0.0,0.0,-49.531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,173040.0,0.0,0.45,0.457,2.0,-10.32225,0.0,0.036,0.0171,0.0,0.0982,0.249,99.26275,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,33.0,213295.5,0.0,0.576,0.676,5.0,-7.185,1.0,0.0489,0.188,5.8e-05,0.132,0.457,122.013,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,49.0,264293.0,0.0,0.692,0.853,8.0,-5.108,1.0,0.0859,0.625,0.097625,0.279,0.682,140.077,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,100.0,5237295.0,1.0,0.985,1.0,11.0,4.532,1.0,0.965,0.996,1.0,1.0,0.995,243.372,5.0,1.0,1.0,1.0,1.0,0.681422,1.0,0.845085,1.0,1.0,0.815987,1.0,1.0,0.677029,1.0,1.0,1.0,0.803066,0.748652,0.828322,1.0,1.0,0.73862,1.0,1.0,0.730979,1.0,0.884804,0.795451,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.573308,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.846645,1.0,1.0,0.695257,1.0,1.0,0.867747,0.894991,1.0,1.0,1.0,1.0,1.0,1.0,0.729593,1.0,0.906577,1.0,1.0,1.0,1.0,1.0,0.778881,0.874631,0.885325,1.0,1.0,0.76688,1.0,1.0,1.0,1.0,1.0,1.0,0.870618,1.0,0.765303,1.0,0.844461,1.0,...,0.759957,0.787849,1.0,1.0,1.0,0.77761,0.707107,0.55167,0.707107,1.0,1.0,1.0,0.786861,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.585879,1.0,1.0,1.0,0.756137,0.704012,1.0,1.0,1.0,1.0,1.0,1.0,0.707107,0.698679,1.0,0.797407,0.707107,0.707107,1.0,1.0,0.631845,0.707107,0.649966,0.627197,1.0,0.7784,0.86915,0.734267,0.696911,0.780341,0.621594,1.0,0.699527,0.621506,0.707107,0.754074,0.368544,0.78904,0.708714,0.707107,1.0,0.758231,0.631182,1.0,0.707107,0.712109,1.0,0.552222,0.707107,0.846885,1.0,1.0,1.0,0.709775,1.0,1.0,0.651049,0.709721,0.704429,1.0,1.0,0.530588,0.736445,0.710904,0.740846,0.76595,1.0,0.690103,0.737919,0.731785,1.0,1.0,1.0,1.0,1.0,1.0,0.707107


In [64]:
# Using stratify might help because we have an imbalanced dataset
X_train, X_test, y_train, y_test = train_test_split(songs_data_modified, encoded_genres, test_size=0.2, 
                                                    stratify=encoded_genres, shuffle=True, random_state=100)
# Train model
ncc = NearestCentroid()
ncc.fit(X_train, y_train)
# Evaluate model
predictions = ncc.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1_weighted = f1_score(y_test, predictions, average='weighted')
print(f"Accuracy: {accuracy:.4f}")
print(f"F1-score: {f1_weighted:.4f}")
print(f"{(f1_weighted - base_f1_weighted)/base_f1_weighted*100:.2f}% better performance when compared to baseline ncc")

Accuracy: 0.2774
F1-score: 0.2698
154.29% better performance when compared to baseline ncc
