In [30]:
!pip install -r requirements.txt



In [31]:
import os
import numpy as np
import kagglehub
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import duckdb

### Load Data

In [3]:
# Download the Spotify tracks dataset via kagglehub and report the local directory.
dataset_handle = "maharshipandya/-spotify-tracks-dataset"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/maharshipandya/-spotify-tracks-dataset?dataset_version_number=1...


100%|██████████| 8.17M/8.17M [00:00<00:00, 13.9MB/s]

Extracting files...
Path to dataset files: /Users/jordanandrew/.cache/kagglehub/datasets/maharshipandya/-spotify-tracks-dataset/versions/1





In [4]:
# Read dataset.csv from the download directory into a DataFrame and preview
# the first rows.
csv_path = os.path.join(path, "dataset.csv")
raw_data = pd.read_csv(csv_path)
raw_data.head()


Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


### Clean Data ###

In [5]:
# Discard the 'Unnamed: 0' column read from the CSV; raw_data itself is left untouched.
df = raw_data.drop('Unnamed: 0', axis='columns')
df.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [6]:
# Count the missing values in every column before deciding what to drop.
null_counts = df.isna().sum()
print("Number of null values in each column:", null_counts, sep="\n")

Number of null values in each column:
track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64


In [7]:
# Show every row that has at least one missing value.
null_rows = df.loc[df.isna().any(axis='columns')]
null_rows


Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
65900,1kR4gIb7nGxHPI3D2ifs59,,,,0,0,False,0.501,0.583,7,-9.46,0,0.0605,0.69,0.00396,0.0747,0.734,138.391,4,k-pop


In [8]:
# Remove rows that are missing any of the identifying fields.
required_cols = ['track_id', 'artists', 'track_name']
df = df.dropna(subset=required_cols)

In [9]:
# Count rows that exactly repeat an earlier row (all columns compared).
exact_repeat_mask = df.duplicated()
duplicates = exact_repeat_mask.sum()
print(f"Number of duplicate rows: {duplicates}")

Number of duplicate rows: 450


In [10]:
# Drop rows that repeat an earlier row in every column (with no subset given,
# drop_duplicates compares all columns) and report the size before and after.
print("Checking and removing duplicate rows across all columns being identical.")
rows_before = len(df)
print(f"Number of rows: {rows_before}")
df = df.drop_duplicates()
rows_after = len(df)
print(f"Number of rows after removing duplicates: {rows_after}")

Checking and removing duplicate rows across all columns being identical.
Number of rows: 113999
Number of rows after removing duplicates: 113549


In [11]:
# Exact-row duplicates are gone, but the same track_id can still appear on
# several rows that differ in at least one other column.
print("Checking for duplicate track_ids.")
track_id_duplicates = df.duplicated(subset=['track_id'])
print(f"Number of duplicate track_ids: {track_id_duplicates.sum()}")

# Display every row whose track_id occurs more than once, ordered by id so the
# repeats sit next to each other (keep=False flags all occurrences, not only
# the second and later ones).
df[df.duplicated(subset=['track_id'], keep=False)].sort_values(by='track_id')


Checking for duplicate track_ids.
Number of duplicate track_ids: 23809


Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
15028,001APMDOl3qtx1526T11n1,Pink Sweat$;Kirby,New RnB,Better,0,176320,False,0.613,0.471,1,-6.644,0,0.1070,0.316000,0.000001,0.1170,0.406,143.064,4,chill
103211,001APMDOl3qtx1526T11n1,Pink Sweat$;Kirby,New RnB,Better,0,176320,False,0.613,0.471,1,-6.644,0,0.1070,0.316000,0.000001,0.1170,0.406,143.064,4,soul
85578,001YQlnDSduXd5LgBd66gT,Soda Stereo,Soda Stereo (Remastered),El Tiempo Es Dinero - Remasterizado 2007,38,177266,False,0.554,0.921,2,-4.589,1,0.0758,0.019400,0.088100,0.3290,0.700,183.571,1,punk-rock
100420,001YQlnDSduXd5LgBd66gT,Soda Stereo,Soda Stereo (Remastered),El Tiempo Es Dinero - Remasterizado 2007,38,177266,False,0.554,0.921,2,-4.589,1,0.0758,0.019400,0.088100,0.3290,0.700,183.571,1,ska
2106,003vvx7Niy0yvhvHt4a68B,The Killers,Hot Fuss,Mr. Brightside,86,222973,False,0.352,0.911,1,-5.230,1,0.0747,0.001210,0.000000,0.0995,0.236,148.033,4,alt-rock
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22326,7zv2vmZq8OjS54BxFzI2wM,Attila,Soundtrack to a Party (Bonus),Lets Start the Party,25,125859,True,0.592,0.932,1,-5.412,1,0.0558,0.000005,0.859000,0.0730,0.677,133.987,4,death-metal
72679,7zv2vmZq8OjS54BxFzI2wM,Attila,Soundtrack to a Party (Bonus),Lets Start the Party,25,125859,True,0.592,0.932,1,-5.412,1,0.0558,0.000005,0.859000,0.0730,0.677,133.987,4,metalcore
3100,7zwn1eykZtZ5LODrf7c0tS,The Neighbourhood,Hard To Imagine The Neighbourhood Ever Changing,You Get Me So High,83,153000,False,0.551,0.881,7,-6.099,0,0.0542,0.186000,0.079100,0.1520,0.387,88.036,4,alternative
2004,7zwn1eykZtZ5LODrf7c0tS,The Neighbourhood,Hard To Imagine The Neighbourhood Ever Changing,You Get Me So High,83,153000,False,0.551,0.881,7,-6.099,0,0.0542,0.186000,0.079100,0.1520,0.387,88.036,4,alt-rock


In [12]:
# Handle tracks that appear on several rows: keep a single row per track_id,
# taking the first value of every other column and gathering all of the
# track's genres into one list so a track can carry multiple genres.
def aggregate_genres(genres):
    """Return the distinct genres in ``genres`` as a list, in first-seen order.

    Args:
        genres: Iterable of hashable genre labels (for example the
            ``track_genre`` values of one ``track_id`` group).

    Returns:
        list: Each distinct label exactly once, ordered by first appearance.
    """
    # dict.fromkeys de-duplicates while preserving encounter order.
    # list(set(...)) would return the labels in an arbitrary order that can
    # change between kernel sessions, because string hashing is randomized
    # per process.
    return list(dict.fromkeys(genres))

# Order rows from most to least popular so that 'first' below takes each
# track's values from its most popular row.
df = df.sort_values("popularity", ascending=False)

# Every kept column except track_genre takes the first value seen per track_id.
# Dict insertion order determines the column order of the result; columns not
# listed here (such as album_name) are absent from the aggregated frame.
first_value_cols = [
    'explicit', 'popularity', 'danceability', 'energy', 'key', 'loudness',
    'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
    'valence', 'tempo', 'duration_ms', 'time_signature',
]
agg_spec = {
    'artists': 'first',
    'track_name': 'first',
    'track_genre': aggregate_genres,
}
agg_spec.update({col: 'first' for col in first_value_cols})
df = df.groupby('track_id').agg(agg_spec).reset_index()

# confirm that no track_id is repeated any more
track_id_duplicates = df.duplicated(subset=['track_id'])
print(f"Number of duplicate track_ids: {track_id_duplicates.sum()}")

# display the aggregated frame (one row per track_id, genres combined into lists)
df

Number of duplicate track_ids: 0


Unnamed: 0,track_id,artists,track_name,track_genre,explicit,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0000vdREvCVMxbQTkS888c,Rill,Lolly,[german],True,44,0.910,0.37400,8,-9.844,0,0.1990,0.075700,0.00301,0.1540,0.432,104.042,160725,4
1,000CC8EParg64OmTxVnZ0p,Glee Cast,It's All Coming Back To Me Now (Glee Cast Vers...,[club],False,47,0.269,0.51600,0,-7.361,1,0.0366,0.406000,0.00000,0.1170,0.341,178.174,322933,4
2,000Iz0K615UepwSJ5z2RE5,Paul Kalkbrenner;Pig&Dan,Böxig Leise - Pig & Dan Remix,[minimal-techno],False,22,0.686,0.56000,5,-13.264,0,0.0462,0.001140,0.18100,0.1110,0.108,119.997,515360,4
3,000RDCYioLteXcutOjeweY,Jordan Sandhu,Teeje Week,[hip-hop],False,62,0.679,0.77000,0,-3.537,1,0.1900,0.058300,0.00000,0.0825,0.839,161.721,190203,4
4,000qpdoc97IMTBvF8gwcpy,Paul Kalkbrenner,Tief,[minimal-techno],False,19,0.519,0.43100,6,-13.606,0,0.0291,0.000964,0.72000,0.0916,0.234,129.971,331240,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89735,7zxHiMmVLt4LGWpOMqOpUh,Haricharan;Gopi Sundar,"Aethu Kari Raavilum - From ""Bangalore Days""",[pop-film],False,56,0.766,0.38200,7,-11.464,0,0.0324,0.698000,0.00143,0.1570,0.672,119.992,325156,4
89736,7zxpdh3EqMq2JCkOI0EqcG,Piano Genie,"Two Worlds (From ""Tarzan"")",[disney],False,23,0.529,0.00879,10,-32.266,1,0.0587,0.996000,0.95900,0.0916,0.510,82.694,109573,4
89737,7zyYmIdjqqiX6kLryb7QBx,Eric Chou,以後別做朋友,[mandopop],False,61,0.423,0.36000,3,-9.458,1,0.0372,0.728000,0.00000,0.1050,0.291,130.576,260573,4
89738,7zybSU9tFO9HNlwmGF7stc,Stereoclip,Sunset Drive,[electronic],False,54,0.649,0.83400,10,-11.430,0,0.0397,0.268000,0.93200,0.0974,0.150,125.004,234300,4


In [18]:
# Look up one track_id that previously occupied several rows to confirm its genres were merged.
df[df['track_id'].eq('001APMDOl3qtx1526T11n1')]

Unnamed: 0,track_id,artists,track_name,track_genre,explicit,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
6,001APMDOl3qtx1526T11n1,Pink Sweat$;Kirby,Better,"[soul, chill]",False,0,0.613,0.471,1,-6.644,0,0.107,0.316,1e-06,0.117,0.406,143.064,176320,4


In [13]:
# More cleaning: cast the numeric columns to explicit dtypes.
target_dtypes = {
    'popularity': int,
    'duration_ms': int,
    'tempo': float,
    'key': int,
    'mode': int,
    'time_signature': int,
}
for col, dtype in target_dtypes.items():
    df[col] = df[col].astype(dtype)

In [14]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89740 entries, 0 to 89739
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          89740 non-null  object 
 1   artists           89740 non-null  object 
 2   track_name        89740 non-null  object 
 3   track_genre       89740 non-null  object 
 4   explicit          89740 non-null  bool   
 5   popularity        89740 non-null  int64  
 6   danceability      89740 non-null  float64
 7   energy            89740 non-null  float64
 8   key               89740 non-null  int64  
 9   loudness          89740 non-null  float64
 10  mode              89740 non-null  int64  
 11  speechiness       89740 non-null  float64
 12  acousticness      89740 non-null  float64
 13  instrumentalness  89740 non-null  float64
 14  liveness          89740 non-null  float64
 15  valence           89740 non-null  float64
 16  tempo             89740 non-null  float6

In [15]:
# The artists field can hold several names in one ';'-separated string;
# turn it into a list of whitespace-trimmed names.
artist_lists = df['artists'].str.split(';')
df['artists'] = artist_lists.map(lambda names: [name.strip() for name in names])
# Treat the first listed artist as the primary one (an assumption about the data).
df['primary_artist'] = df['artists'].map(lambda names: names[0])

In [16]:
# Add second and minute versions of the millisecond duration for easier reading.
# Values that cannot be parsed as numbers become NaN (errors='coerce').
df['duration_ms'] = pd.to_numeric(df['duration_ms'], errors='coerce')
df['duration_sec'] = df['duration_ms'].div(1000)
df['duration_min'] = df['duration_sec'].div(60)
df.head()

Unnamed: 0,track_id,artists,track_name,track_genre,explicit,popularity,danceability,energy,key,loudness,...,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,primary_artist,duration_sec,duration_min
0,0000vdREvCVMxbQTkS888c,[Rill],Lolly,[german],True,44,0.91,0.374,8,-9.844,...,0.0757,0.00301,0.154,0.432,104.042,160725,4,Rill,160.725,2.67875
1,000CC8EParg64OmTxVnZ0p,[Glee Cast],It's All Coming Back To Me Now (Glee Cast Vers...,[club],False,47,0.269,0.516,0,-7.361,...,0.406,0.0,0.117,0.341,178.174,322933,4,Glee Cast,322.933,5.382217
2,000Iz0K615UepwSJ5z2RE5,"[Paul Kalkbrenner, Pig&Dan]",Böxig Leise - Pig & Dan Remix,[minimal-techno],False,22,0.686,0.56,5,-13.264,...,0.00114,0.181,0.111,0.108,119.997,515360,4,Paul Kalkbrenner,515.36,8.589333
3,000RDCYioLteXcutOjeweY,[Jordan Sandhu],Teeje Week,[hip-hop],False,62,0.679,0.77,0,-3.537,...,0.0583,0.0,0.0825,0.839,161.721,190203,4,Jordan Sandhu,190.203,3.17005
4,000qpdoc97IMTBvF8gwcpy,[Paul Kalkbrenner],Tief,[minimal-techno],False,19,0.519,0.431,6,-13.606,...,0.000964,0.72,0.0916,0.234,129.971,331240,4,Paul Kalkbrenner,331.24,5.520667


In [18]:
# Encode explicit as an integer flag (1 = explicit, 0 = not), accepting real
# booleans or their string spellings; any value not in the mapping becomes 0.
explicit_codes = {True: 1, False: 0, 'True': 1, 'False': 0}
df['explicit'] = df['explicit'].map(explicit_codes).fillna(0).astype(int)
df.head()

Unnamed: 0,track_id,artists,track_name,track_genre,explicit,popularity,danceability,energy,key,loudness,...,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,primary_artist,duration_sec,duration_min
0,0000vdREvCVMxbQTkS888c,[Rill],Lolly,[german],1,44,0.91,0.374,8,-9.844,...,0.0757,0.00301,0.154,0.432,104.042,160725,4,Rill,160.725,2.67875
1,000CC8EParg64OmTxVnZ0p,[Glee Cast],It's All Coming Back To Me Now (Glee Cast Vers...,[club],0,47,0.269,0.516,0,-7.361,...,0.406,0.0,0.117,0.341,178.174,322933,4,Glee Cast,322.933,5.382217
2,000Iz0K615UepwSJ5z2RE5,"[Paul Kalkbrenner, Pig&Dan]",Böxig Leise - Pig & Dan Remix,[minimal-techno],0,22,0.686,0.56,5,-13.264,...,0.00114,0.181,0.111,0.108,119.997,515360,4,Paul Kalkbrenner,515.36,8.589333
3,000RDCYioLteXcutOjeweY,[Jordan Sandhu],Teeje Week,[hip-hop],0,62,0.679,0.77,0,-3.537,...,0.0583,0.0,0.0825,0.839,161.721,190203,4,Jordan Sandhu,190.203,3.17005
4,000qpdoc97IMTBvF8gwcpy,[Paul Kalkbrenner],Tief,[minimal-techno],0,19,0.519,0.431,6,-13.606,...,0.000964,0.72,0.0916,0.234,129.971,331240,4,Paul Kalkbrenner,331.24,5.520667


### Normalize Data ###

Create min-max-normalized versions of selected numeric columns for building the model later.

In [19]:
# Audio feature columns to include in the scaled feature set.
audio_cols = ['danceability','energy','speechiness','acousticness',
              'instrumentalness','liveness','valence']

# MinMaxScaler comes from the notebook's import cell, so it is not re-imported here.

# Work on a copy so the unscaled df stays available to later cells.
df_scaled = df.copy()

# Columns to rescale; every other column (duration_min included) keeps its
# original values in df_scaled.
scale_cols = audio_cols + ['loudness', 'tempo', 'duration_sec']

# Fit the scaler on the selected columns and overwrite them with the scaled values.
scaler = MinMaxScaler()
df_scaled[scale_cols] = scaler.fit_transform(df_scaled[scale_cols])
df_scaled


Unnamed: 0,track_id,artists,track_name,track_genre,explicit,popularity,danceability,energy,key,loudness,...,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,primary_artist,duration_sec,duration_min
0,0000vdREvCVMxbQTkS888c,[Rill],Lolly,[german],1,44,0.923858,0.37400,8,0.734088,...,0.076004,0.00301,0.1540,0.434171,0.427502,160725,4,Rill,0.029097,2.678750
1,000CC8EParg64OmTxVnZ0p,[Glee Cast],It's All Coming Back To Me Now (Glee Cast Vers...,[club],0,47,0.273096,0.51600,0,0.780016,...,0.407631,0.00000,0.1170,0.342714,0.732106,322933,4,Glee Cast,0.060119,5.382217
2,000Iz0K615UepwSJ5z2RE5,"[Paul Kalkbrenner, Pig&Dan]",Böxig Leise - Pig & Dan Remix,[minimal-techno],0,22,0.696447,0.56000,5,0.670828,...,0.001145,0.18100,0.1110,0.108543,0.493060,515360,4,Paul Kalkbrenner,0.096921,8.589333
3,000RDCYioLteXcutOjeweY,[Jordan Sandhu],Teeje Week,[hip-hop],0,62,0.689340,0.77000,0,0.850748,...,0.058534,0.00000,0.0825,0.843216,0.664501,190203,4,Jordan Sandhu,0.034735,3.170050
4,000qpdoc97IMTBvF8gwcpy,[Paul Kalkbrenner],Tief,[minimal-techno],0,19,0.526904,0.43100,6,0.664503,...,0.000968,0.72000,0.0916,0.235176,0.534043,331240,4,Paul Kalkbrenner,0.061708,5.520667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89735,7zxHiMmVLt4LGWpOMqOpUh,"[Haricharan, Gopi Sundar]","Aethu Kari Raavilum - From ""Bangalore Days""",[pop-film],0,56,0.777665,0.38200,7,0.704123,...,0.700803,0.00143,0.1570,0.675377,0.493039,325156,4,Haricharan,0.060545,5.419267
89736,7zxpdh3EqMq2JCkOI0EqcG,[Piano Genie],"Two Worlds (From ""Tarzan"")",[disney],0,23,0.537056,0.00879,10,0.319350,...,1.000000,0.95900,0.0916,0.512563,0.339784,109573,4,Piano Genie,0.019314,1.826217
89737,7zyYmIdjqqiX6kLryb7QBx,[Eric Chou],以後別做朋友,[mandopop],0,61,0.429442,0.36000,3,0.741228,...,0.730924,0.00000,0.1050,0.292462,0.536528,260573,4,Eric Chou,0.048193,4.342883
89738,7zybSU9tFO9HNlwmGF7stc,[Stereoclip],Sunset Drive,[electronic],0,54,0.658883,0.83400,10,0.704752,...,0.269076,0.93200,0.0974,0.150754,0.513633,234300,4,Stereoclip,0.043168,3.905000


In [None]:
# Get the top 20 most popular tracks.
# The table name `df` in the SQL refers to the pandas DataFrame of that name.
top_tracks_sql = """
    SELECT track_name, primary_artist, popularity
    FROM df
    ORDER BY popularity DESC
    LIMIT 20
"""
result = duckdb.query(top_tracks_sql).to_df()

print("Top 20 most popular tracks")
print(result)




Top 10 most popular tracks
                               track_name     primary_artist  popularity
0               Unholy (feat. Kim Petras)          Sam Smith         100
1   Quevedo: Bzrp Music Sessions, Vol. 52           Bizarrap          99
2                         I'm Good (Blue)       David Guetta          98
3                              La Bachata      Manuel Turizo          98
4                         Me Porto Bonito          Bad Bunny          97
5                        Tití Me Preguntó          Bad Bunny          97
6                     Under The Influence        Chris Brown          96
7                                  Efecto          Bad Bunny          96
8                         I Ain't Worried        OneRepublic          96
9                               As It Was       Harry Styles          95
10                          Ojitos Lindos          Bad Bunny          95
11                            Moscow Mule          Bad Bunny          94
12                      

In [None]:
# Select tracks with danceability greater than 0.9, most danceable first.
# The table name `df` in the SQL refers to the pandas DataFrame of that name.
high_danceability_sql = """
SELECT track_name, primary_artist, danceability
FROM df
WHERE danceability > 0.9
ORDER BY danceability DESC;
"""
result = duckdb.query(high_danceability_sql).to_df()

print("Tracks with danceability greater than 0.9")
print(result)

                                            track_name     primary_artist  \
0                                             Sol Clap            Quantic   
1                                        Medicaid Baby  That Girl Lay Lay   
2                                          Inspiration       Delano Smith   
3                                       Daily Routines    Oliver Schories   
4    Featuring Mixx Master Lee, Red Rum & J. Smoov ...    Mixx Master Lee   
..                                                 ...                ...   
969  Mad Mad World (feat. Sizzla Kalonji & Collie B...             Shaggy   
970                                 No Eres Tú, Soy Yo              Brray   
971                                            Hey Mor              Ozuna   
972                                  Itsy Bitsy Spider      Toddler Tunes   
973                                Leave The Lights On            Droplex   

     danceability  
0           0.985  
1           0.984  
2           0.9