In [2]:
import pandas as pd
import numpy as np

In [9]:
# Load the track table from the `clean` data folder (relative path).
df = pd.read_csv(filepath_or_buffer='../data/clean/spotify_clean.csv')
df  # bare last expression -> rich display of the frame

Unnamed: 0,artists,name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genre
0,gen hoshino,Comedy,73,230666,False,0.676,0.4610,1,-6.746,0,0.1430,0.0322,0.000001,0.3580,0.7150,87.917,4,acoustic
1,ben woodward,Ghost - Acoustic,55,149610,False,0.420,0.1660,1,-17.235,1,0.0763,0.9240,0.000006,0.1010,0.2670,77.489,4,acoustic
2,ingrid michaelson;zayn,To Begin Again,57,210826,False,0.438,0.3590,0,-9.734,1,0.0557,0.2100,0.000000,0.1170,0.1200,76.332,4,acoustic
3,kina grannis,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.9050,0.000071,0.1320,0.1430,181.740,3,acoustic
4,chord overstreet,Hold On,82,198853,False,0.618,0.4430,2,-9.681,1,0.0526,0.4690,0.000000,0.0829,0.1670,119.949,4,acoustic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106943,rainy lullaby,Sleep My Little Boy,21,384999,False,0.172,0.2350,5,-16.393,1,0.0422,0.6400,0.928000,0.0863,0.0339,125.995,5,world-music
106944,rainy lullaby,Water Into Light,22,385000,False,0.174,0.1170,0,-18.318,0,0.0401,0.9940,0.976000,0.1050,0.0350,85.239,4,world-music
106945,cesária evora,Miss Perfumado,22,271466,False,0.629,0.3290,0,-10.895,0,0.0420,0.8670,0.000000,0.0839,0.7430,132.378,4,world-music
106946,michael w. smith,Friends,41,283893,False,0.587,0.5060,7,-10.889,1,0.0297,0.3810,0.000000,0.2700,0.4130,135.960,4,world-music


Split into a dataframe for clustering and a dataframe containing the metadata

In [5]:
# Show the distinct column labels of the loaded frame.
df.columns.unique()

Index(['artists', 'name', 'popularity', 'duration_ms', 'explicit',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature', 'genre'],
      dtype='object')

In [6]:
# Numeric audio descriptors used as the clustering inputs.
cluster_features = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo',
]

# Take an explicit copy: later cells add engineered columns to df_cluster, and
# assigning into a plain selection of `df` triggers SettingWithCopyWarning on
# pandas versions without Copy-on-Write.
df_cluster = df[cluster_features].copy()
df_cluster.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.676,0.461,-6.746,0.143,0.0322,1e-06,0.358,0.715,87.917
1,0.42,0.166,-17.235,0.0763,0.924,6e-06,0.101,0.267,77.489
2,0.438,0.359,-9.734,0.0557,0.21,0.0,0.117,0.12,76.332
3,0.266,0.0596,-18.515,0.0363,0.905,7.1e-05,0.132,0.143,181.74
4,0.618,0.443,-9.681,0.0526,0.469,0.0,0.0829,0.167,119.949


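These features are on very different scales (`loudness` ranges from about -50 to 5 and `tempo` from 0 to about 243, while the remaining features lie in [0, 1]), so they need to be scaled before clustering.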

# Engineer Features to Support Hypotheses

H1: Dance–Energy Index

Purpose:
Capture “movement intensity” — upbeat dance tracks vs calm ones.

In [11]:
# Dance-energy index: the plain average of `danceability` and `energy`.
# `.assign` returns a new frame instead of writing into df_cluster in place,
# so the cell is safe to re-run and never assigns into a selection of `df`
# (which would trigger SettingWithCopyWarning).
df_cluster = df_cluster.assign(
    dance_energy_index=(df["danceability"] + df["energy"]) / 2
)
df_cluster.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,dance_energy_index,scoustic_profile,acoustic_profile
0,0.676,0.461,-6.746,0.143,0.0322,1e-06,0.358,0.715,87.917,0.5685,0.016101,0.016101
1,0.42,0.166,-17.235,0.0763,0.924,6e-06,0.101,0.267,77.489,0.293,0.462003,0.462003
2,0.438,0.359,-9.734,0.0557,0.21,0.0,0.117,0.12,76.332,0.3985,0.105,0.105
3,0.266,0.0596,-18.515,0.0363,0.905,7.1e-05,0.132,0.143,181.74,0.1628,0.452535,0.452535
4,0.618,0.443,-9.681,0.0526,0.469,0.0,0.0829,0.167,119.949,0.5305,0.2345,0.2345


H2: Acoustic Profile 

Purpose:
Identify acoustic / lo-fi / instrumental-leaning tracks.

In [10]:
# Acoustic profile: the plain average of `acousticness` and `instrumentalness`.
# `.assign` returns a new frame instead of writing into df_cluster in place,
# so the cell is safe to re-run and never assigns into a selection of `df`
# (which would trigger SettingWithCopyWarning).
df_cluster = df_cluster.assign(
    acoustic_profile=(df["acousticness"] + df["instrumentalness"]) / 2
)
df_cluster

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,dance_energy_index,scoustic_profile,acoustic_profile
0,0.676,0.4610,-6.746,0.1430,0.0322,0.000001,0.3580,0.7150,87.917,0.5685,0.016101,0.016101
1,0.420,0.1660,-17.235,0.0763,0.9240,0.000006,0.1010,0.2670,77.489,0.2930,0.462003,0.462003
2,0.438,0.3590,-9.734,0.0557,0.2100,0.000000,0.1170,0.1200,76.332,0.3985,0.105000,0.105000
3,0.266,0.0596,-18.515,0.0363,0.9050,0.000071,0.1320,0.1430,181.740,0.1628,0.452535,0.452535
4,0.618,0.4430,-9.681,0.0526,0.4690,0.000000,0.0829,0.1670,119.949,0.5305,0.234500,0.234500
...,...,...,...,...,...,...,...,...,...,...,...,...
106943,0.172,0.2350,-16.393,0.0422,0.6400,0.928000,0.0863,0.0339,125.995,0.2035,0.784000,0.784000
106944,0.174,0.1170,-18.318,0.0401,0.9940,0.976000,0.1050,0.0350,85.239,0.1455,0.985000,0.985000
106945,0.629,0.3290,-10.895,0.0420,0.8670,0.000000,0.0839,0.7430,132.378,0.4790,0.433500,0.433500
106946,0.587,0.5060,-10.889,0.0297,0.3810,0.000000,0.2700,0.4130,135.960,0.5465,0.190500,0.190500


H4: Mood Index

Purpose:
Capture emotional feel — sad/calm vs happy/energetic.

In [13]:
# Mood index: the plain average of `valence` and `energy`.
# `.assign` returns a new frame instead of writing into df_cluster in place,
# so the cell is safe to re-run and never assigns into a selection of `df`
# (which would trigger SettingWithCopyWarning).
df_cluster = df_cluster.assign(
    mood_index=(df["valence"] + df["energy"]) / 2
)
df_cluster

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,dance_energy_index,scoustic_profile,acoustic_profile,mood_index
0,0.676,0.4610,-6.746,0.1430,0.0322,0.000001,0.3580,0.7150,87.917,0.5685,0.016101,0.016101,0.58800
1,0.420,0.1660,-17.235,0.0763,0.9240,0.000006,0.1010,0.2670,77.489,0.2930,0.462003,0.462003,0.21650
2,0.438,0.3590,-9.734,0.0557,0.2100,0.000000,0.1170,0.1200,76.332,0.3985,0.105000,0.105000,0.23950
3,0.266,0.0596,-18.515,0.0363,0.9050,0.000071,0.1320,0.1430,181.740,0.1628,0.452535,0.452535,0.10130
4,0.618,0.4430,-9.681,0.0526,0.4690,0.000000,0.0829,0.1670,119.949,0.5305,0.234500,0.234500,0.30500
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106943,0.172,0.2350,-16.393,0.0422,0.6400,0.928000,0.0863,0.0339,125.995,0.2035,0.784000,0.784000,0.13445
106944,0.174,0.1170,-18.318,0.0401,0.9940,0.976000,0.1050,0.0350,85.239,0.1455,0.985000,0.985000,0.07600
106945,0.629,0.3290,-10.895,0.0420,0.8670,0.000000,0.0839,0.7430,132.378,0.4790,0.433500,0.433500,0.53600
106946,0.587,0.5060,-10.889,0.0297,0.3810,0.000000,0.2700,0.4130,135.960,0.5465,0.190500,0.190500,0.45950


H2 and H3: Vocal Presence

Purpose:
Separate instrumental tracks from vocal / rap-heavy tracks.

In [16]:
# Vocal presence: the complement of `instrumentalness` (1 - instrumentalness).
# `.assign` returns a new frame instead of writing into df_cluster in place,
# so the cell is safe to re-run and never assigns into a selection of `df`
# (which would trigger SettingWithCopyWarning).
# NOTE(review): this is an exact linear transform of `instrumentalness`, which
# is also listed in `cluster_features`; the two columns carry the same
# information, so consider keeping only one of them as a clustering input.
df_cluster = df_cluster.assign(
    vocal_presence=1 - df["instrumentalness"]
)
df_cluster

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,dance_energy_index,scoustic_profile,acoustic_profile,mood_index,vocal_presence
0,0.676,0.4610,-6.746,0.1430,0.0322,0.000001,0.3580,0.7150,87.917,0.5685,0.016101,0.016101,0.58800,0.999999
1,0.420,0.1660,-17.235,0.0763,0.9240,0.000006,0.1010,0.2670,77.489,0.2930,0.462003,0.462003,0.21650,0.999994
2,0.438,0.3590,-9.734,0.0557,0.2100,0.000000,0.1170,0.1200,76.332,0.3985,0.105000,0.105000,0.23950,1.000000
3,0.266,0.0596,-18.515,0.0363,0.9050,0.000071,0.1320,0.1430,181.740,0.1628,0.452535,0.452535,0.10130,0.999929
4,0.618,0.4430,-9.681,0.0526,0.4690,0.000000,0.0829,0.1670,119.949,0.5305,0.234500,0.234500,0.30500,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106943,0.172,0.2350,-16.393,0.0422,0.6400,0.928000,0.0863,0.0339,125.995,0.2035,0.784000,0.784000,0.13445,0.072000
106944,0.174,0.1170,-18.318,0.0401,0.9940,0.976000,0.1050,0.0350,85.239,0.1455,0.985000,0.985000,0.07600,0.024000
106945,0.629,0.3290,-10.895,0.0420,0.8670,0.000000,0.0839,0.7430,132.378,0.4790,0.433500,0.433500,0.53600,1.000000
106946,0.587,0.5060,-10.889,0.0297,0.3810,0.000000,0.2700,0.4130,135.960,0.5465,0.190500,0.190500,0.45950,1.000000


We have engineered features that represent movement, mood, acoustic character, and vocal presence to support similarity-based clustering

In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) for every
# column currently in df_cluster.
df_cluster.describe()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,dance_energy_index,acoustic_profile,mood_index,vocal_presence
count,106948.0,106948.0,106948.0,106948.0,106948.0,106948.0,106948.0,106948.0,106948.0,106948.0,106948.0,106948.0,106948.0
mean,0.565018,0.642839,-8.30367,0.085466,0.31376,0.163338,0.215561,0.469294,122.373609,0.603928,0.238549,0.556066,0.836662
std,0.173769,0.252462,5.06507,0.107613,0.332777,0.315018,0.192968,0.259432,29.94317,0.161741,0.240558,0.202749,0.315018
min,0.0,0.0,-49.531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.454,0.471,-10.082,0.0359,0.0154,0.0,0.0982,0.254,99.79875,0.517,0.03257,0.4245,0.931475
50%,0.578,0.686,-7.056,0.049,0.168,5.8e-05,0.132,0.456,122.0795,0.639,0.169696,0.574,0.999942
75%,0.693,0.856,-5.019,0.085,0.596,0.068525,0.277,0.678,140.172,0.7215,0.3885,0.7095,1.0
max,0.985,1.0,4.532,0.965,0.996,1.0,1.0,0.995,243.372,0.978,0.9975,0.986,1.0


# Create dataframe of metadata

In [22]:
# Descriptive (non-feature) columns kept alongside the clustering frame.
metadata_cols = [
    'artists', 'name', 'popularity', 'duration_ms', 'explicit', 'genre',
]

# Explicit copy so df_metadata is an independent frame: any later column
# assignment on it will not trigger SettingWithCopyWarning, matching how
# a selection of `df` should be taken before it is modified.
df_metadata = df[metadata_cols].copy()
df_metadata.head()

Unnamed: 0,artists,name,popularity,duration_ms,explicit,genre
0,gen hoshino,Comedy,73,230666,False,acoustic
1,ben woodward,Ghost - Acoustic,55,149610,False,acoustic
2,ingrid michaelson;zayn,To Begin Again,57,210826,False,acoustic
3,kina grannis,Can't Help Falling In Love,71,201933,False,acoustic
4,chord overstreet,Hold On,82,198853,False,acoustic
