In [13]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path
import warnings
# NOTE(review): with no category argument this suppresses every warning for
# the rest of the session, including any pandas warnings raised by the cells
# below — consider narrowing it to the specific categories that are noisy.
warnings.filterwarnings("ignore")

In [14]:
# The project root is taken to be the parent of the current working directory
# (assumes the notebook is run from a first-level subfolder — TODO confirm).
PROJECT_ROOT = Path.cwd().parent

# Make project modules importable. The membership check keeps the cell
# idempotent: re-running it no longer appends duplicate sys.path entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

In [15]:
# Load the cleaned Spotify track table produced by the cleaning step.
clean_csv_path = f'{PROJECT_ROOT}/data/clean/spotify_clean.csv'
df = pd.read_csv(clean_csv_path)
df

Unnamed: 0,artists,name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genre,artist_primary
0,sam smith;kim petras,unholy,100,156943,False,0.7140,0.472,2,-7.375,1,0.0864,0.01300,0.000005,0.2660,0.2380,131.121,4,pop,sam smith
1,bizarrap;quevedo,"quevedo: bzrp music sessions, vol. 52",99,198937,False,0.6210,0.782,2,-5.548,1,0.0440,0.01250,0.033000,0.2300,0.5500,128.033,4,hip-hop,bizarrap
2,manuel turizo,la bachata,98,162637,False,0.8350,0.679,7,-5.329,0,0.0364,0.58300,0.000002,0.2180,0.8500,124.980,4,reggaeton,manuel turizo
3,david guetta;bebe rexha,i'm good,98,175238,True,0.5610,0.965,7,-3.673,0,0.0343,0.00383,0.000007,0.3710,0.3040,128.040,4,edm,david guetta
4,bad bunny;chencho corleone,me porto bonito,97,178567,True,0.9110,0.712,1,-5.105,0,0.0817,0.09010,0.000027,0.0933,0.4250,92.005,4,reggae,bad bunny
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77503,lil baby,in a minute,0,200470,True,0.6430,0.605,4,-5.968,0,0.2970,0.03360,0.000000,0.1120,0.1450,84.216,4,hip-hop,lil baby
77504,future;lil uzi vert,tic tac,0,189147,True,0.9420,0.566,11,-4.834,1,0.2260,0.07730,0.000000,0.1100,0.4790,132.858,4,hip-hop,future
77505,lil baby,all in,0,156032,True,0.8360,0.649,11,-6.442,1,0.3320,0.07160,0.000000,0.0943,0.2940,95.084,4,hip-hop,lil baby
77506,lil baby;gunna,drip too hard,0,145542,True,0.8970,0.662,1,-6.903,0,0.2920,0.08520,0.000000,0.5340,0.3900,112.509,4,hip-hop,lil baby


Split the data into one dataframe of features for clustering and one dataframe containing the track metadata.

In [16]:
# Show the distinct column labels available in the loaded frame.
column_labels = df.columns
column_labels.unique()

Index(['artists', 'name', 'popularity', 'duration_ms', 'explicit',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature', 'genre', 'artist_primary'],
      dtype='object')

In [17]:
# Raw audio descriptors that form the base feature set for clustering.
cluster_features = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo',
]

# Take an explicit copy so that the engineered columns added in later cells
# are written to an independent frame rather than to a selection of `df`
# (the pattern that triggers pandas' SettingWithCopyWarning).
df_cluster = df[cluster_features].copy()
df_cluster.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.714,0.472,-7.375,0.0864,0.013,5e-06,0.266,0.238,131.121
1,0.621,0.782,-5.548,0.044,0.0125,0.033,0.23,0.55,128.033
2,0.835,0.679,-5.329,0.0364,0.583,2e-06,0.218,0.85,124.98
3,0.561,0.965,-3.673,0.0343,0.00383,7e-06,0.371,0.304,128.04
4,0.911,0.712,-5.105,0.0817,0.0901,2.7e-05,0.0933,0.425,92.005


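These features are on very different scales (`loudness` is negative and `tempo` runs into the hundreds, while the other features lie between 0 and 1), so they need to be scaled before clustering.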

# Engineer Features to Support Hypotheses

H1: Dance–Energy Index

Purpose:
Capture “movement intensity” — upbeat dance tracks vs calm ones.

In [18]:
# Dance–energy index: the plain mean of danceability and energy.
# `.assign` returns a new frame instead of writing into a frame that was
# selected from `df`, so the cell is warning-free and safe to re-run.
df_cluster = df_cluster.assign(
    dance_energy_index=(df["danceability"] + df["energy"]) / 2
)
df_cluster.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,dance_energy_index
0,0.714,0.472,-7.375,0.0864,0.013,5e-06,0.266,0.238,131.121,0.593
1,0.621,0.782,-5.548,0.044,0.0125,0.033,0.23,0.55,128.033,0.7015
2,0.835,0.679,-5.329,0.0364,0.583,2e-06,0.218,0.85,124.98,0.757
3,0.561,0.965,-3.673,0.0343,0.00383,7e-06,0.371,0.304,128.04,0.763
4,0.911,0.712,-5.105,0.0817,0.0901,2.7e-05,0.0933,0.425,92.005,0.8115


H2: Acoustic Profile 

Purpose:
Identify acoustic / lo-fi / instrumental-leaning tracks.

In [19]:
# Acoustic profile: the plain mean of acousticness and instrumentalness.
# `.assign` returns a new frame instead of writing into a frame that was
# selected from `df`, so the cell is warning-free and safe to re-run.
df_cluster = df_cluster.assign(
    acoustic_profile=(df["acousticness"] + df["instrumentalness"]) / 2
)
df_cluster

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,dance_energy_index,acoustic_profile
0,0.7140,0.472,-7.375,0.0864,0.01300,0.000005,0.2660,0.2380,131.121,0.5930,0.006502
1,0.6210,0.782,-5.548,0.0440,0.01250,0.033000,0.2300,0.5500,128.033,0.7015,0.022750
2,0.8350,0.679,-5.329,0.0364,0.58300,0.000002,0.2180,0.8500,124.980,0.7570,0.291501
3,0.5610,0.965,-3.673,0.0343,0.00383,0.000007,0.3710,0.3040,128.040,0.7630,0.001919
4,0.9110,0.712,-5.105,0.0817,0.09010,0.000027,0.0933,0.4250,92.005,0.8115,0.045063
...,...,...,...,...,...,...,...,...,...,...,...
77503,0.6430,0.605,-5.968,0.2970,0.03360,0.000000,0.1120,0.1450,84.216,0.6240,0.016800
77504,0.9420,0.566,-4.834,0.2260,0.07730,0.000000,0.1100,0.4790,132.858,0.7540,0.038650
77505,0.8360,0.649,-6.442,0.3320,0.07160,0.000000,0.0943,0.2940,95.084,0.7425,0.035800
77506,0.8970,0.662,-6.903,0.2920,0.08520,0.000000,0.5340,0.3900,112.509,0.7795,0.042600


H4: Mood Index

Purpose:
Capture emotional feel — sad/calm vs happy/energetic.

In [20]:
# Mood index: the plain mean of valence and energy.
# `.assign` returns a new frame instead of writing into a frame that was
# selected from `df`, so the cell is warning-free and safe to re-run.
df_cluster = df_cluster.assign(
    mood_index=(df["valence"] + df["energy"]) / 2
)
df_cluster

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,dance_energy_index,acoustic_profile,mood_index
0,0.7140,0.472,-7.375,0.0864,0.01300,0.000005,0.2660,0.2380,131.121,0.5930,0.006502,0.35500
1,0.6210,0.782,-5.548,0.0440,0.01250,0.033000,0.2300,0.5500,128.033,0.7015,0.022750,0.66600
2,0.8350,0.679,-5.329,0.0364,0.58300,0.000002,0.2180,0.8500,124.980,0.7570,0.291501,0.76450
3,0.5610,0.965,-3.673,0.0343,0.00383,0.000007,0.3710,0.3040,128.040,0.7630,0.001919,0.63450
4,0.9110,0.712,-5.105,0.0817,0.09010,0.000027,0.0933,0.4250,92.005,0.8115,0.045063,0.56850
...,...,...,...,...,...,...,...,...,...,...,...,...
77503,0.6430,0.605,-5.968,0.2970,0.03360,0.000000,0.1120,0.1450,84.216,0.6240,0.016800,0.37500
77504,0.9420,0.566,-4.834,0.2260,0.07730,0.000000,0.1100,0.4790,132.858,0.7540,0.038650,0.52250
77505,0.8360,0.649,-6.442,0.3320,0.07160,0.000000,0.0943,0.2940,95.084,0.7425,0.035800,0.47150
77506,0.8970,0.662,-6.903,0.2920,0.08520,0.000000,0.5340,0.3900,112.509,0.7795,0.042600,0.52600


H2 and H3: Vocal Presence

Purpose:
Separate instrumental tracks from vocal / rap-heavy tracks.

In [21]:
# Vocal presence: the complement of instrumentalness (1 - instrumentalness).
# NOTE(review): this is an exact linear transform of `instrumentalness`, so
# keeping both columns gives that signal double weight in any distance-based
# clustering — confirm this is intended.
# `.assign` returns a new frame instead of writing into a frame that was
# selected from `df`, so the cell is warning-free and safe to re-run.
df_cluster = df_cluster.assign(
    vocal_presence=1 - df["instrumentalness"]
)
df_cluster

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,dance_energy_index,acoustic_profile,mood_index,vocal_presence
0,0.7140,0.472,-7.375,0.0864,0.01300,0.000005,0.2660,0.2380,131.121,0.5930,0.006502,0.35500,0.999995
1,0.6210,0.782,-5.548,0.0440,0.01250,0.033000,0.2300,0.5500,128.033,0.7015,0.022750,0.66600,0.967000
2,0.8350,0.679,-5.329,0.0364,0.58300,0.000002,0.2180,0.8500,124.980,0.7570,0.291501,0.76450,0.999998
3,0.5610,0.965,-3.673,0.0343,0.00383,0.000007,0.3710,0.3040,128.040,0.7630,0.001919,0.63450,0.999993
4,0.9110,0.712,-5.105,0.0817,0.09010,0.000027,0.0933,0.4250,92.005,0.8115,0.045063,0.56850,0.999973
...,...,...,...,...,...,...,...,...,...,...,...,...,...
77503,0.6430,0.605,-5.968,0.2970,0.03360,0.000000,0.1120,0.1450,84.216,0.6240,0.016800,0.37500,1.000000
77504,0.9420,0.566,-4.834,0.2260,0.07730,0.000000,0.1100,0.4790,132.858,0.7540,0.038650,0.52250,1.000000
77505,0.8360,0.649,-6.442,0.3320,0.07160,0.000000,0.0943,0.2940,95.084,0.7425,0.035800,0.47150,1.000000
77506,0.8970,0.662,-6.903,0.2920,0.08520,0.000000,0.5340,0.3900,112.509,0.7795,0.042600,0.52600,1.000000


We have engineered features that represent movement, mood, acoustic character, and vocal presence to support similarity-based clustering.

In [22]:
# Summary statistics for every raw and engineered clustering feature.
feature_summary = df_cluster.describe()
feature_summary

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,dance_energy_index,acoustic_profile,mood_index,vocal_presence
count,77508.0,77508.0,77508.0,77508.0,77508.0,77508.0,77508.0,77508.0,77508.0,77508.0,77508.0,77508.0,77508.0
mean,0.558231,0.633894,-8.619441,0.089621,0.332315,0.185649,0.218597,0.463641,122.076398,0.596063,0.258982,0.548767,0.814351
std,0.17799,0.259328,5.333961,0.118077,0.340301,0.332705,0.196464,0.264001,30.242541,0.166546,0.249551,0.207338,0.332705
min,0.0,0.0,-49.531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.445,0.453,-10.48825,0.0361,0.0163,0.0,0.0986,0.241,99.01,0.507,0.039901,0.413,0.846
50%,0.572,0.676,-7.273,0.0492,0.196,8.5e-05,0.133,0.449,122.002,0.632,0.2025,0.567,0.999915
75%,0.689,0.856,-5.141,0.0877,0.634,0.154,0.281,0.677,140.18425,0.7175,0.408501,0.7055,1.0
max,0.985,1.0,4.532,0.965,0.996,1.0,1.0,0.995,243.372,0.978,0.9975,0.986,1.0


# Create dataframe of metadata

In [23]:
# Descriptive (non-clustering) columns kept alongside the feature table.
metadata_cols = [
    'artists',
    'name',
    'popularity',
    'duration_ms',
    'explicit',
    'genre',
]

# Label-based selection of all rows for just the metadata columns.
df_metadata = df.loc[:, metadata_cols]
df_metadata.head()

Unnamed: 0,artists,name,popularity,duration_ms,explicit,genre
0,sam smith;kim petras,unholy,100,156943,False,pop
1,bizarrap;quevedo,"quevedo: bzrp music sessions, vol. 52",99,198937,False,hip-hop
2,manuel turizo,la bachata,98,162637,False,reggaeton
3,david guetta;bebe rexha,i'm good,98,175238,True,edm
4,bad bunny;chencho corleone,me porto bonito,97,178567,True,reggae


In [24]:
# Persist the clustering feature table for the downstream steps.
features_csv_path = PROJECT_ROOT / 'data' / 'engineered' / 'spotify_features.csv'

# Create the output folder first: `to_csv` does not create missing parent
# directories and would otherwise fail on a fresh checkout.
features_csv_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): index=False drops the row index, so rows in this file can only
# be matched back to the metadata by position — confirm that is sufficient.
df_cluster.to_csv(features_csv_path, index=False)