In [15]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
# Location of the source CSV; it is decoded as Latin-1 instead of pandas' UTF-8 default.
file_path = 'spotify-2023.csv'
df = pd.read_csv(
    filepath_or_buffer=file_path,
    encoding="latin1",
)

In [17]:
# Preview the first five rows to sanity-check the load and see which columns are available.
df.head()

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381703,43,...,125,B,Major,80,89,83,31,0,8,4
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716286,48,...,92,C#,Major,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140003974,94,...,138,F,Major,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840817,116,...,170,A,Major,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,1,2023,5,18,3133,50,303236322,84,...,144,A,Minor,65,23,80,14,63,11,6


In [18]:
# Two identifying columns followed by the audio attributes examined in this notebook.
key_columns = [
    "track_name",
    "artist(s)_name",
    "bpm",
    "danceability_%",
    "valence_%",
    "energy_%",
    "acousticness_%",
    "instrumentalness_%",
    "speechiness_%",
]
# Preview only the selected columns.
df.loc[:, key_columns].head()

Unnamed: 0,track_name,artist(s)_name,bpm,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",125,80,89,83,31,0,4
1,LALA,Myke Towers,92,71,61,74,7,0,4
2,vampire,Olivia Rodrigo,138,51,32,53,17,0,6
3,Cruel Summer,Taylor Swift,170,55,58,72,11,0,15
4,WHERE SHE GOES,Bad Bunny,144,65,23,80,14,63,6


In [19]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
# NOTE(review): `streams` is shown by df.head() but is absent from this summary, so it was
# presumably not parsed as a numeric dtype — confirm before using it in any computation.
df.describe()

Unnamed: 0,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,in_apple_playlists,in_apple_charts,in_deezer_charts,bpm,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
count,953.0,953.0,953.0,953.0,953.0,953.0,953.0,953.0,953.0,953.0,953.0,953.0,953.0,953.0,953.0,953.0,953.0
mean,1.556139,2018.238195,6.033578,13.930745,5200.124869,12.009444,67.812172,51.908709,2.666317,122.540399,66.96957,51.43127,64.279119,27.057712,1.581322,18.213012,10.131165
std,0.893044,11.116218,3.566435,9.201949,7897.60899,19.575992,86.441493,50.630241,6.035599,28.057802,14.63061,23.480632,16.550526,25.996077,8.4098,13.711223,9.912888
min,1.0,1930.0,1.0,1.0,31.0,0.0,0.0,0.0,0.0,65.0,23.0,4.0,9.0,0.0,0.0,3.0,2.0
25%,1.0,2020.0,3.0,6.0,875.0,0.0,13.0,7.0,0.0,100.0,57.0,32.0,53.0,6.0,0.0,10.0,4.0
50%,1.0,2022.0,6.0,13.0,2224.0,3.0,34.0,38.0,0.0,121.0,69.0,51.0,66.0,18.0,0.0,12.0,6.0
75%,2.0,2022.0,9.0,22.0,5542.0,16.0,88.0,87.0,2.0,140.0,78.0,70.0,77.0,43.0,0.0,24.0,11.0
max,8.0,2023.0,12.0,31.0,52898.0,147.0,672.0,275.0,58.0,206.0,96.0,97.0,97.0,97.0,91.0,97.0,64.0


In [20]:
# Count missing entries per selected column; the resulting Series is the cell's output.
print("Missing values:")
df.loc[:, key_columns].isna().sum()

Missing values:


track_name            0
artist(s)_name        0
bpm                   0
danceability_%        0
valence_%             0
energy_%              0
acousticness_%        0
instrumentalness_%    0
speechiness_%         0
dtype: int64

In [21]:
# Rows that are identical in every column.
print(f"Duplicate Rows: {df.duplicated().sum()}")
# Full-row equality does not catch a track that is listed more than once with differing
# metadata, nor different tracks that share a title. Both matter whenever rows are later
# looked up by `track_name`, so report them explicitly as well.
print(f"Repeated track names: {df['track_name'].duplicated().sum()}")
print(f"Repeated (track, artist) pairs: {df.duplicated(subset=['track_name', 'artist(s)_name']).sum()}")

Duplicate Rows: 0


In [22]:
# Numeric audio attributes that make up each track's feature vector.
features = [
    "bpm",
    "danceability_%",
    "valence_%",
    "energy_%",
    "acousticness_%",
    "instrumentalness_%",
    "speechiness_%",
]

In [23]:
# Standardise every feature column so that no single attribute dominates the similarity
# computation purely because of its scale (e.g. bpm versus the percentage columns).
scaler = StandardScaler().fit(df[features])
df_scaled = scaler.transform(df[features])
# Re-attach the column names only for this preview; `df_scaled` itself is left as returned
# by the scaler.
pd.DataFrame(data=df_scaled, columns=features).head()

Unnamed: 0,bpm,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,speechiness_%
0,0.087708,0.891096,1.600828,1.131729,0.151729,-0.188132,-0.618829
1,-1.089053,0.275624,0.40773,0.587654,-0.771972,-0.188132,-0.618829
2,0.55128,-1.092091,-0.827979,-0.681854,-0.387097,-0.188132,-0.416966
3,1.692382,-0.818548,0.279898,0.466749,-0.618022,-0.188132,0.49142
4,0.765237,-0.13469,-1.211475,0.950371,-0.502559,7.307062,-0.416966


In [24]:
# Pairwise cosine similarity between every pair of standardised feature vectors.
similarity_matrix = cosine_similarity(df_scaled)
# Label both axes with the track titles so rows/columns can be looked up by name.
# NOTE(review): titles are used as labels as-is; nothing here guarantees they are unique.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=df["track_name"],
    columns=df["track_name"],
)
similarity_df.head(n=10)

track_name,Seven (feat. Latto) (Explicit Ver.),LALA,vampire,Cruel Summer,WHERE SHE GOES,Sprinter,Ella Baila Sola,Columbia,fukumean,La Bebe - Remix,...,Privileged Rappers,The Astronaut,BackOutsideBoyz,Broke Boys,The Great War,My Mind & Me,Bigger Than The Whole Sky,A Veces (feat. Feid),En La De Ella,Alone
track_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Seven (feat. Latto) (Explicit Ver.),1.0,0.469782,-0.712783,0.006874,-0.124119,0.230075,0.774788,-0.240766,-0.240237,-0.17028,...,0.253117,-0.347852,-0.363861,-0.795652,0.181997,-0.579019,-0.657683,0.633279,0.689028,-0.35311
LALA,0.469782,1.0,-0.361807,-0.416064,-0.121963,-0.197456,-0.047361,0.298458,0.003581,-0.616812,...,-0.012772,0.239737,-0.389889,-0.286762,0.736862,-0.650823,-0.77728,0.844254,0.9019,0.587228
vampire,-0.712783,-0.361807,1.0,0.36657,-0.006297,-0.516328,-0.297706,0.115617,0.073201,-0.079755,...,-0.405279,0.627788,-0.041915,0.376168,0.025359,0.648048,0.689727,-0.58574,-0.643435,0.282168
Cruel Summer,0.006874,-0.416064,0.36657,1.0,0.015082,0.120478,0.323004,-0.585444,-0.159177,0.441335,...,0.198596,0.262203,0.167181,0.12202,-0.279427,-0.071991,0.088541,-0.388573,-0.390435,-0.489121
WHERE SHE GOES,-0.124119,-0.121963,-0.006297,0.015082,1.0,-0.143324,-0.110934,-0.018728,0.0274,-0.092415,...,-0.088844,0.114259,-0.090697,-0.008935,-0.114626,-0.05458,-0.000544,-0.190629,-0.116235,-0.047984
Sprinter,0.230075,-0.197456,-0.516328,0.120478,-0.143324,1.0,-0.010759,-0.589288,0.349517,0.843105,...,0.935893,-0.607931,0.812382,0.22685,-0.743668,-0.288885,-0.312862,0.239448,0.172418,-0.637782
Ella Baila Sola,0.774788,-0.047361,-0.297706,0.323004,-0.110934,-0.010759,1.0,-0.333781,-0.480427,-0.106807,...,-0.069374,-0.315012,-0.496534,-0.860213,0.027345,-0.090942,-0.088826,0.09964,0.106351,-0.584607
Columbia,-0.240766,0.298458,0.115617,-0.585444,-0.018728,-0.589288,-0.333781,1.0,0.323612,-0.676414,...,-0.547121,0.488704,-0.448957,0.059823,0.498641,0.235583,0.128283,-0.136607,0.130148,0.784002
fukumean,-0.240237,0.003581,0.073201,-0.159177,0.0274,0.349517,-0.480427,0.323612,1.0,0.237248,...,0.51119,0.280802,0.477172,0.466852,-0.410185,0.155343,-0.020381,-0.048528,0.165713,0.181251
La Bebe - Remix,-0.17028,-0.616812,-0.079755,0.441335,-0.092415,0.843105,-0.106807,-0.676414,0.237248,1.0,...,0.742468,-0.446278,0.896072,0.480956,-0.87911,0.027165,0.110207,-0.23733,-0.350584,-0.704725


In [25]:
# Titles to look up; each must match a `track_name` value exactly.
query_songs = [
    'What Was I Made For? [From The Motion Picture "Barbie"]',
    'I Wanna Be Yours',
    'Blinding Lights',
]

In [26]:
def top_similar_songs(query, similarity_df, top_n=10):
    """Display the ``top_n`` tracks most similar to ``query``.

    Parameters
    ----------
    query : str
        Track title to look up; it must label a row and a column of ``similarity_df``.
    similarity_df : pandas.DataFrame
        Square track-by-track similarity matrix whose rows and columns carry the
        track titles in the same order.
    top_n : int, default 10
        Maximum number of neighbours to show. Negative values are treated as zero.

    Returns
    -------
    None
        The ranking is rendered with ``display`` (provided by the notebook runtime) as a
        table with the columns ``Song`` and ``Similarity Score``, best match first.
        If ``query`` is unknown, a message is printed and nothing is displayed.

    Notes
    -----
    Track titles are not required to be unique. When several columns share the
    ``query`` label, the first one is used. The query's own entry is removed by
    position instead of assuming it sorts first, so another track with an equally
    high score is never dropped in its place.
    """
    if query not in similarity_df.index or query not in similarity_df.columns:
        print(f"Song '{query}' not found in the dataset.")
        return

    # Position of the first column with this label. Selecting by label would yield a
    # DataFrame instead of a Series whenever the title occurs more than once.
    query_pos = int((similarity_df.columns == query).argmax())

    ranked = (
        similarity_df.iloc[:, query_pos]
        .rename("Similarity Score")
        .rename_axis("Song")
        .reset_index()
        # Row labels are now 0..n-1, so this removes exactly the query's own row.
        .drop(index=query_pos, errors="ignore")
        # Stable sort keeps tied scores in dataset order, making the output deterministic.
        .sort_values("Similarity Score", ascending=False, kind="stable")
        .head(max(top_n, 0))
        .reset_index(drop=True)
    )
    display(ranked)

# Show a ranking for every requested title; top_n is spelled out so it visibly matches
# the "Top 10" in the heading.
for query in query_songs:
    print(f"\nTop 10 Most Similar Songs to '{query}':")
    top_similar_songs(query=query, similarity_df=similarity_df, top_n=10)


Top 10 Most Similar Songs to 'What Was I Made For? [From The Motion Picture "Barbie"]':


Unnamed: 0,Song,Similarity Score
0,Special,0.975896
1,Arcade,0.969704
2,San Lucas,0.966647
3,Nobody Gets Me,0.960892
4,Photograph,0.958649
5,It's Beginning To Look A Lot Like Christmas,0.957957
6,Sparks,0.952212
7,Heather,0.951792
8,Fingers Crossed,0.950047
9,Falling,0.949864



Top 10 Most Similar Songs to 'I Wanna Be Yours':


Unnamed: 0,Song,Similarity Score
0,Die For You,0.912415
1,vampire,0.897866
2,Satellite,0.890336
3,"Come Back Home - From ""Purple Hearts""",0.88965
4,Atlantis,0.870567
5,Shut up My Moms Calling,0.861723
6,Good Looking,0.847095
7,Car's Outside,0.838838
8,Me and Your Mama,0.835659
9,Angel Baby,0.822971



Top 10 Most Similar Songs to 'Blinding Lights':


Unnamed: 0,Song,Similarity Score
0,Wild Flower (with youjeen),0.980587
1,Chemical,0.978565
2,Curtains,0.97432
3,10 Things I Hate About You,0.974318
4,Unstoppable,0.967626
5,STAY (with Justin Bieber),0.955295
6,NEW MAGIC WAND,0.938111
7,Ghost,0.934299
8,Yellow,0.921913
9,ýýýýýýýýýýýýýýýýýýýýý,0.920734
