In [17]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datetime import datetime

# spotify_data

This dataset was extracted from the Spotify platform using the Python library "Spotipy", which gives users access to music data through Spotify's APIs. It contains about 1 million tracks released between 2000 and 2023, described by 19 features, with a total of 61,445 unique artists and 82 genres.

This cleaned data has been prepared and used for research purposes. Its significance lies in its potential to reveal patterns and to predict a song's popularity prior to its release. The dataset can be used to build various predictive models with machine-learning and deep-learning techniques.

https://www.kaggle.com/datasets/amitanshjoshi/spotify-1million-tracks

In [18]:
# Show every column when a DataFrame is rendered instead of eliding the middle ones.
pd.options.display.max_columns = None

In [19]:
# Load the reduced Spotify CSV (path is relative to the working directory)
# and preview its first five rows.
spotify = pd.read_csv(filepath_or_buffer='reduced_spotify.csv')
spotify.head(n=5)

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,589408,Wolfgang Amadeus Mozart,"Requiem, K. 626: VIII. Communio: Lux aeterna (...",7Hrce3SEZWI2WntmTas2Oe,0,2023,classical,0.241,0.288,10,-17.974,1,0.036,0.975,0.726,0.086,0.117,96.389,324000,4
1,1234899,Disko Punks,Punk No Rocker - Muzic Loverz Rock It! Club Remix,3NCWsfcGWPCku2QYmteNBl,0,2008,minimal-techno,0.756,0.656,7,-9.571,1,0.122,0.00287,0.352,0.28,0.304,129.982,340400,4
2,1325753,Robert Glasper,4eva,0vFGIXlsEajsNhGZXzKy3H,14,2009,soul,0.726,0.43,1,-14.665,1,0.226,0.455,0.00201,0.127,0.518,94.974,135933,4
3,275637,Rilès,Understood,4sTDuXa8uGDHwJdsXUqIBo,45,2017,french,0.839,0.629,3,-5.663,0,0.147,0.241,0.0,0.108,0.724,94.008,207772,4
4,337394,Ahmoudou Madassane,Zerzura Theme I,5wWACfP4ohUy8UST5slrag,23,2018,guitar,0.538,0.213,0,-12.785,1,0.0405,0.511,0.137,0.444,0.174,69.575,124902,4


In [20]:
# (number of rows, number of columns) of the loaded frame.
(len(spotify.index), len(spotify.columns))

(20000, 20)

In [21]:
# Count the missing values in each column.
spotify.isnull().sum(axis=0)

Unnamed: 0          0
artist_name         0
track_name          0
track_id            0
popularity          0
year                0
genre               0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
duration_ms         0
time_signature      0
dtype: int64

# 1. Preprocessing 

In [22]:
# Collect each artist's track names into a list (one row per artist);
# the row count of the result is the number of distinct artists.
grouped_df = (
    spotify
    .groupby('artist_name')['track_name']
    .agg(list)
    .reset_index()
)
grouped_df.shape

(12497, 2)

In [23]:
# Encode each artist as an integer id, numbered in order of first appearance.
# Hypothesis: tracks by a well-known artist may be more popular.
spotify['artist_label'] = spotify['artist_name'].factorize()[0]
spotify['artist_label'].describe()

count    20000.000000
mean      5249.164100
std       3578.421228
min          0.000000
25%       2097.000000
50%       4761.500000
75%       8160.250000
max      12496.000000
Name: artist_label, dtype: float64

In [24]:
# Hypothesis: tracks with shorter names may be more popular.
def count_words(text):
    """Return the number of whitespace-separated words in ``text``.

    Missing values (``None`` / ``NaN``, which is how pandas represents an
    empty cell) count as zero words instead of raising ``AttributeError``.
    Any other non-string value is converted with ``str`` before counting.
    """
    if pd.isna(text):
        return 0
    return len(str(text).split())

# Add 'word_count_track': the number of words in each track name,
# then summarise its distribution.
spotify['word_count_track'] = spotify['track_name'].map(count_words)
spotify['word_count_track'].describe()

count    20000.000000
mean         3.608300
std          2.671409
min          1.000000
25%          2.000000
50%          3.000000
75%          5.000000
max         32.000000
Name: word_count_track, dtype: float64

In [25]:
# Report how many distinct genres there are, then map each genre to an integer code.
print(spotify['genre'].nunique(dropna=True))
encoder = LabelEncoder()
spotify['genre_encoded'] = encoder.fit(spotify['genre']).transform(spotify['genre'])

82


In [26]:
# Age of each track in years, measured against the newest release year present
# in the data rather than the wall clock. Using datetime.today() made the
# feature (and anything trained on it) change whenever the notebook was re-run
# in a later calendar year.
reference_year = spotify['year'].max()
spotify['years_since_out'] = reference_year - spotify['year']
# Print the result
print(spotify[['year', 'years_since_out']].head())

   year  years_since_out
0  2023                0
1  2008               15
2  2009               14
3  2017                6
4  2018                5


In [27]:
# Drop the columns judged to add no value for modelling.
# NOTE(review): 'Unnamed: 0' looks like a leftover row index from an earlier export — confirm.
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because the columns have already been removed from `spotify`.
columns1 = ['track_id', 'Unnamed: 0']
spotify = spotify.drop(columns=columns1, errors='ignore')

In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
spotify.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,popularity,year,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,artist_label,word_count_track,genre_encoded,years_since_out
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,18.3695,2011.91215,0.537815,0.640338,5.3118,-8.97723,0.63185,0.092952,0.321783,0.251428,0.223342,0.45614,121.73055,248884.6,3.88395,5249.1641,3.6083,37.49675,11.08785
std,15.887913,6.798481,0.183878,0.270427,3.567901,5.682344,0.482314,0.127929,0.355666,0.364213,0.202409,0.2686,29.915336,154165.9,0.484659,3578.421228,2.671409,23.67033,6.798481
min,0.0,2000.0,0.0,2e-05,0.0,-53.613,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4440.0,0.0,0.0,1.0,0.0,0.0
25%,5.0,2006.0,0.415,0.457,2.0,-10.77525,0.0,0.0373,0.00657,1e-06,0.0983,0.227,99.0805,181299.0,4.0,2097.0,2.0,16.0,5.0
50%,15.0,2012.0,0.551,0.694,5.0,-7.461,1.0,0.0511,0.145,0.00152,0.134,0.439,122.025,224820.0,4.0,4761.5,3.0,35.0,11.0
75%,29.0,2018.0,0.676,0.872,9.0,-5.291,1.0,0.0893,0.642,0.603,0.292,0.674,139.98625,284988.2,4.0,8160.25,5.0,57.0,17.0
max,90.0,2023.0,0.979,1.0,11.0,2.276,1.0,0.968,0.996,0.998,0.996,0.991,247.337,6000061.0,5.0,12496.0,32.0,81.0,23.0


# 2. Feature creation

In [29]:
# Flag "party songs": tracks whose danceability, energy and liveness all exceed 0.5
# and whose loudness is below -10.
# NOTE(review): `loudness < -10` selects the *quieter* tracks; for a party song
# `> -10` may have been intended — confirm before relying on this feature.
is_party_song = (
    (spotify['danceability'] > 0.5)
    & (spotify['energy'] > 0.5)
    & (spotify['loudness'] < -10)
    & (spotify['liveness'] > 0.5)
)
# Cast the boolean mask straight to 0/1. Re-fitting the shared `encoder` here
# would throw away the genre classes it learned earlier, and LabelEncoder would
# encode an all-True mask as 0 rather than 1.
spotify['party_song'] = is_party_song.astype('int64')
spotify['party_song'].head()

0    0
1    0
2    0
3    0
4    0
Name: party_song, dtype: int64

In [30]:
# Sanity check: how many distinct values the party_song flag takes.
print(spotify['party_song'].nunique(dropna=True))

2


# 3. Split or cross validation

# 4. Train model and generate predictions

# 5. Hyperparameter tuning and selection of the best set of hyperparameters.

# 6. Model’s performance on a set of metrics.