In [2]:
import pandas as pd
import warnings
# NOTE(review): this filter silences every warning category for the rest of the
# session (deprecation notices included); consider narrowing it to the specific
# warning that was noisy.
warnings.filterwarnings('ignore')

In [54]:
# Read the raw Spotify tracks CSV, report its dimensions, then preview it.
csv_file = "../data/spotify_dataset.csv"
spotify_df = pd.read_csv(csv_file, sep=",")

n_rows, n_cols = spotify_df.shape
print(f"Dataset shape: {n_rows} Rows and {n_cols} Columns")

# First five rows (the default for head()).
spotify_df.head(5)

Dataset shape: 114000 Rows and 21 Columns


Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [43]:
# List the column labels of the loaded frame.
spotify_df.columns

Index(['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name',
       'popularity', 'duration_ms', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'track_genre'],
      dtype='object')

#### Columns:

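- **Unnamed: 0:** Row counter that matches the DataFrame index (0 to 113999), apparently a leftover index column from the CSV export; it carries no information about the track.
- **track_id:** Spotify ID for the track.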

- **artists:** Names of the track's performing artists (semicolon-separated if multiple).
- **album_name:** Name of the album containing the track.
- **track_name:** Name of the track.
- **popularity:** A measure of track popularity, ranging from 0 to 100.
- **duration_ms:** Track length in milliseconds.
- **explicit:** Indicates if the track has explicit lyrics (true = yes, false = no or unknown).
- **danceability:** Describes how suitable the track is for dancing, with values between 0.0 (least danceable) and 1.0 (most danceable).
- **energy:** A measure of intensity and activity in the track, ranging from 0.0 to 1.0.
- **key:**  The musical key of the track, mapped to standard Pitch Class notation (e.g., 0 = C, 1 = C♯/D♭).
- **loudness:** Overall loudness of the track in decibels (dB).
- **mode:** Indicates the modality (major or minor) of the track's melodic content (1 for major, 0 for minor).
- **speechiness:** Detects the presence of spoken words in the track, with values from 0.0 to 1.0.
- **acousticness:** A confidence measure of whether the track is acoustic (1.0 for high confidence).
- **instrumentalness:**  Predicts if the track contains no vocals, with values closer to 1.0 indicating no vocal content.
- **liveness:** Detects the presence of an audience in the recording, with values above 0.8 indicating a high likelihood of a live performance.
- **valence:** A measure from 0.0 to 1.0 describing the musical positiveness conveyed by the track.
- **tempo:** Estimated tempo of the track in beats per minute (BPM).
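- **time_signature:** Estimated time signature, nominally ranging from 3 to 7 (e.g., 3/4 to 7/4); note that the values in this dataset actually span 0 to 5 (see the `describe()` output below).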
- **track_genre:** The genre to which the track belongs.

In [44]:
# Print the dtype and non-null count of every column.
spotify_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  liveness          11

In [45]:
# Number of missing values in each column (isna is the same check as isnull).
spotify_df.isna().sum()

Unnamed: 0          0
track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64

We can see that the artists, album name and track name columns have one null value each. Are these null values in the same row? If so, we might have to drop that row eventually.

In [46]:
# Keep only the rows that have a missing value in at least one column.
has_missing = spotify_df.isna().any(axis="columns")
rows_with_nulls = spotify_df[has_missing]
print(rows_with_nulls)

       Unnamed: 0                track_id artists album_name track_name  \
65900       65900  1kR4gIb7nGxHPI3D2ifs59     NaN        NaN        NaN   

       popularity  duration_ms  explicit  danceability  energy  ...  loudness  \
65900           0            0     False         0.501   0.583  ...     -9.46   

       mode  speechiness  acousticness  instrumentalness  liveness  valence  \
65900     0       0.0605          0.69           0.00396    0.0747    0.734   

         tempo  time_signature  track_genre  
65900  138.391               4        k-pop  

[1 rows x 21 columns]


We can see that, in fact, it's a single row that contains null values for the aforementioned columns.

In [47]:
# Number of distinct values in each column.
spotify_df.nunique()

Unnamed: 0          114000
track_id             89741
artists              31437
album_name           46589
track_name           73608
popularity             101
duration_ms          50697
explicit                 2
danceability          1174
energy                2083
key                     12
loudness             19480
mode                     2
speechiness           1489
acousticness          5061
instrumentalness      5346
liveness              1722
valence               1790
tempo                45653
time_signature           5
track_genre            114
dtype: int64

In [48]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
spotify_df.describe()

Unnamed: 0.1,Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
count,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0
mean,56999.5,33.238535,228029.2,0.5668,0.641383,5.30914,-8.25896,0.637553,0.084652,0.31491,0.15605,0.213553,0.474068,122.147837,3.904035
std,32909.109681,22.305078,107297.7,0.173542,0.251529,3.559987,5.029337,0.480709,0.105732,0.332523,0.309555,0.190378,0.259261,29.978197,0.432621
min,0.0,0.0,0.0,0.0,0.0,0.0,-49.531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,28499.75,17.0,174066.0,0.456,0.472,2.0,-10.013,0.0,0.0359,0.0169,0.0,0.098,0.26,99.21875,4.0
50%,56999.5,35.0,212906.0,0.58,0.685,5.0,-7.004,1.0,0.0489,0.169,4.2e-05,0.132,0.464,122.017,4.0
75%,85499.25,50.0,261506.0,0.695,0.854,8.0,-5.003,1.0,0.0845,0.598,0.049,0.273,0.683,140.071,4.0
max,113999.0,100.0,5237295.0,0.985,1.0,11.0,4.532,1.0,0.965,0.996,1.0,1.0,0.995,243.372,5.0


In [49]:
# Summary (count, unique, top, freq) for the boolean and object columns.
spotify_df.describe(include=["bool", "object"])

Unnamed: 0,track_id,artists,album_name,track_name,explicit,track_genre
count,114000,113999,113999,113999,114000,114000
unique,89741,31437,46589,73608,2,114
top,6S3JlDAGk3uu3NtZbPnuhS,The Beatles,Alternative Christmas 2022,Run Rudolph Run,False,acoustic
freq,9,279,195,151,104253,1000


Why are there repeated songs in this dataset? Let's look more into that...

In [55]:
# Order the tracks by title and look at the first ten rows.
tracks_by_title = spotify_df.sort_values(by='track_name')
tracks_by_title.iloc[:10]

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
36750,36750,0fROT4kK5oTm8xO8PX6EJF,Rilès,!I'll Be Back!,!I'll Be Back!,52,178533,True,0.823,0.612,...,-7.767,1,0.248,0.168,0.0,0.109,0.688,142.959,4,french
92751,92751,1hH0t381PIXmUVWyG1Vj3p,Brian Hyland,The Bashful Blond,"""A"" You're Adorable",39,151680,False,0.615,0.375,...,-10.362,0,0.0319,0.482,0.0,0.111,0.922,110.72,4,rockabilly
66970,66970,1B45DvGMoFWdbAEUH2qliG,Little Apple Band,The Favorite Songs Of Sesame Street,"""C"" IS FOR COOKIE",32,84305,False,0.553,0.812,...,-5.542,1,0.0558,0.132,1e-05,0.0794,0.871,118.368,4,kids
66996,66996,73lXf5if6MWVWnsgXhK8bd,Little Apple Band,Sesame Street and Friends,"""C"" is for Cookie",8,86675,False,0.664,0.611,...,-8.687,1,0.0886,0.12,0.0,0.0408,0.758,118.443,4,kids
101161,101161,0jmz4aHEIBCRgrcV2xEkwB,Traditional;Sistine Chapel Choir;Massimo Palom...,Classical Christmas,"""Christe, Redemptor omnium""",0,289133,False,0.111,0.0568,...,-28.053,1,0.0551,0.99,0.697,0.11,0.0395,169.401,1,sleep
82584,82584,5Zx0Rrkn5RFBMD2PRxX3mI,Dillinger Four,C I V I L W A R,"""Contemplate This on the Tree of Woe.""",24,180706,False,0.565,0.977,...,-3.592,1,0.0546,0.0282,0.00132,0.433,0.387,106.478,4,power-pop
5980,5980,3ozivYJGJGq6TSzdy8m64X,Capcom Sound Team,デビル メイ クライ 3 オリジナル・サウンドトラック,"""DEVILS NEVER CRY""(スタッフロール)",55,319906,False,0.264,0.951,...,-7.356,1,0.146,0.000894,0.0442,0.127,0.159,149.99,4,anime
93440,93440,3KKk48f33mlB56F5L5nbJk,Nikolay Kopylov,Popular Opera Arias,"""Don Carlos"" Roderigo'S Death Aria",0,235547,False,0.167,0.332,...,-9.888,1,0.0359,0.992,0.191,0.113,0.0527,62.97,4,romance
93397,93397,3KKk48f33mlB56F5L5nbJk,Nikolay Kopylov,Popular Opera Arias,"""Don Carlos"" Roderigo'S Death Aria",0,235547,False,0.167,0.332,...,-9.888,1,0.0359,0.992,0.191,0.113,0.0527,62.97,4,romance
93895,93895,5OiONTndVC5YOMXg6VC5xs,Nikolay Kopylov,Popular Opera Arias,"""Eugene Onegin"" Ariozo Of Onegin",0,111800,False,0.443,0.514,...,-8.068,1,0.0924,0.988,0.00187,0.0918,0.244,100.752,4,romance


We can see two songs with the same title, one written in upper case ("C" IS FOR COOKIE) and the other in mixed case ("C" is for Cookie). Although they might be the same song, they come from different albums, so I will treat them as different tracks.

In [60]:
# Count the rows that repeat the track name, album name and artists of an
# earlier row; the first occurrence of each combination is not counted.
song_key = ['track_name', 'album_name', 'artists']
is_repeat = spotify_df.duplicated(subset=song_key)
is_repeat.sum()

24620

There are 24620 duplicated rows in the dataset, i.e. rows that repeat the track name, album name and artists of an earlier row.

In [63]:
# Count how many rows share each (track name, album name, artists) combination.
song_key = ['track_name', 'album_name', 'artists']
grouped_df = (
    spotify_df
    .groupby(song_key)
    .size()
    .reset_index(name='count')
)

# Most repeated songs first.
grouped_df.sort_values(by='count', ascending=False)

Unnamed: 0,track_name,album_name,artists,count
40094,Last Christmas,Alternative Christmas 2022,Jimmy Eat World,12
13391,Christmastime,Alternative Christmas 2022,The Smashing Pumpkins,10
34575,If We Try,Best 70s Rock Tunes,Don McLean,9
6458,Baby Blue - Remastered 2010,Straight Up (Remastered 2010 / Deluxe Edition),Badfinger,9
68796,Survive,Best 70s Rock Tunes,Jimmy Buffett,9
...,...,...,...,...
32921,I Get The Sweetest Feeling,I Get The Sweetest Feeling,Jackie Wilson,1
32920,I Get Looser,"Cafetorium Songs, Vol. 2",Koo Koo Kanga Roo,1
32919,I Get Loose,"Cafetorium Songs, Vol. 1",Koo Koo Kanga Roo,1
32918,I Get Lonely In A Hurry,I Get Lonely In A Hurry,George Jones,1


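There are 89379 unique songs in total (one fewer than 114000 − 24620 = 89380, because `groupby` leaves out the row whose track name, album name and artists are null). The one that repeats the most appears 12 times in the dataset.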

In [66]:
# For every (track name, album name, artists) combination, find the index label
# of the row with the highest popularity. Despite its name, popular_songs_df
# holds those index labels (a Series), not a DataFrame.
song_key = ['track_name', 'album_name', 'artists']
popular_songs_df = (
    spotify_df
    .groupby(song_key)['popularity']
    .idxmax()
)

# Pull the selected rows out of the full frame, keeping all original columns.
most_popular_songs = spotify_df.loc[popular_songs_df]

most_popular_songs

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
36750,36750,0fROT4kK5oTm8xO8PX6EJF,Rilès,!I'll Be Back!,!I'll Be Back!,52,178533,True,0.823,0.6120,...,-7.767,1,0.2480,0.168,0.000000,0.1090,0.6880,142.959,4,french
92751,92751,1hH0t381PIXmUVWyG1Vj3p,Brian Hyland,The Bashful Blond,"""A"" You're Adorable",39,151680,False,0.615,0.3750,...,-10.362,0,0.0319,0.482,0.000000,0.1110,0.9220,110.720,4,rockabilly
66970,66970,1B45DvGMoFWdbAEUH2qliG,Little Apple Band,The Favorite Songs Of Sesame Street,"""C"" IS FOR COOKIE",32,84305,False,0.553,0.8120,...,-5.542,1,0.0558,0.132,0.000010,0.0794,0.8710,118.368,4,kids
66996,66996,73lXf5if6MWVWnsgXhK8bd,Little Apple Band,Sesame Street and Friends,"""C"" is for Cookie",8,86675,False,0.664,0.6110,...,-8.687,1,0.0886,0.120,0.000000,0.0408,0.7580,118.443,4,kids
101161,101161,0jmz4aHEIBCRgrcV2xEkwB,Traditional;Sistine Chapel Choir;Massimo Palom...,Classical Christmas,"""Christe, Redemptor omnium""",0,289133,False,0.111,0.0568,...,-28.053,1,0.0551,0.990,0.697000,0.1100,0.0395,169.401,1,sleep
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65756,65756,2oVHb8wyg6oC2iNpGBNvx1,HEIZE,Hotel del Luna (Original Television Soundtrack...,내 맘을 볼 수 있나요,63,225785,False,0.397,0.1660,...,-10.678,1,0.0349,0.884,0.000000,0.1000,0.1310,134.708,4,k-pop
65859,65859,4kIpBfvK44bxqX7zo8K1oP,Gaho,ITAEWON CLASS (Original Television Soundtrack)...,시작,65,202440,False,0.591,0.8180,...,-3.532,1,0.0730,0.172,0.000000,0.1260,0.5740,108.107,4,k-pop
75903,75903,4mHc7LUlO3k6AXeFV2EiJK,Yiruma,Yiruma Official Album 'Piano Therapy' (The Ori...,약속 (Piano Solo),37,144533,False,0.504,0.1430,...,-18.713,1,0.0701,0.980,0.922000,0.1240,0.0584,148.786,4,new-age
79554,79554,0tQesiSZJQOdHeAC7r59us,GODA,One Punch Man (Original Soundtrack),원펀맨 Theme - Sad Theme,37,228000,False,0.542,0.3370,...,-16.088,1,0.0539,0.985,0.933000,0.1470,0.4430,159.951,4,piano


In [None]:
# TODO: corrplot