In [2]:
import pandas as pd

# Relative path to the raw Spotify 2023 export.
DATA_PATH = "../data/raw/spotify-2023.csv"

# The file is decoded as latin1 (presumably it is not valid UTF-8 — confirm).
df = pd.read_csv(filepath_or_buffer=DATA_PATH, encoding="latin1")

# Preview the first five rows.
df.head(n=5)


Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381703,43,...,125,B,Major,80,89,83,31,0,8,4
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716286,48,...,92,C#,Major,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140003974,94,...,138,F,Major,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840817,116,...,170,A,Major,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,1,2023,5,18,3133,50,303236322,84,...,144,A,Minor,65,23,80,14,63,11,6


In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(953, 24)

In [4]:
# Count missing values per column, then list the ten columns with the most.
missing_per_column = df.isnull().sum()
missing_per_column.sort_values(ascending=False).head(10)


key                   95
in_shazam_charts      50
track_name             0
artist(s)_name         0
liveness_%             0
instrumentalness_%     0
acousticness_%         0
energy_%               0
valence_%              0
danceability_%         0
dtype: int64

In [5]:
# Summary of the `streams` column. For a non-numeric (object) column pandas
# reports count/unique/top/freq instead of mean/std/quantiles, so this doubles
# as a quick check of whether the column was parsed as numbers.
df["streams"].describe()


count           953
unique          949
top       723894473
freq              2
Name: streams, dtype: object

In [9]:
# Re-read the CSV into `df`, replacing the frame loaded earlier.
# Malformed lines (rows the parser cannot split into the expected fields) are
# still skipped, but with "warn" each skipped line is reported instead of
# vanishing silently, so unexpected row loss is visible in the cell output.
df = pd.read_csv(
    DATA_PATH,
    encoding="latin1",
    on_bad_lines="warn",
)


In [12]:
# Normalise `streams` to text and strip literal commas (no regex), so values
# such as "1,234" can be parsed.
streams_text = df["streams"].astype(str).str.replace(",", "", regex=False)

# Parse to numbers; anything that still is not numeric becomes NaN
# (errors="coerce") rather than raising.
df["streams"] = pd.to_numeric(streams_text, errors="coerce")


In [13]:
# Keep only the rows whose `streams` value is present (non-NaN).
has_streams = df["streams"].notna()
df = df.loc[has_streams]


In [14]:
import numpy as np

# Add the modelling target: log(1 + streams), appended as a new last column.
df = df.assign(log_streams=np.log1p(df["streams"]))


In [15]:
# A notebook cell renders only its last expression, so a bare `df.shape` on
# the first line would be evaluated and thrown away. Print it explicitly.
print(df.shape)

# Summary statistics for the raw and log-transformed stream counts.
df[["streams", "log_streams"]].describe()


Unnamed: 0,streams,log_streams
count,952.0,952.0
mean,514137400.0,19.506648
std,566856900.0,1.146289
min,2762.0,7.924072
25%,141636200.0,18.768772
50%,290530900.0,19.48722
75%,673869000.0,20.328546
max,3703895000.0,22.032651


In [7]:
# Show the raw rows whose `streams` value is not a number.
# This check reads the file again instead of using `df`: at this point in the
# notebook `streams` has already been coerced to numeric and the offending
# rows dropped, so filtering `df` would always return an empty frame when the
# notebook is run top to bottom.
raw_df = pd.read_csv(DATA_PATH, encoding="latin1")

# Same normalisation as the cleaning step: strip commas, then try to parse.
raw_streams_numeric = pd.to_numeric(
    raw_df["streams"].astype(str).str.replace(",", "", regex=False),
    errors="coerce",
)

# Rows that fail to parse are the ones the cleaning step turns into NaN.
raw_df[raw_streams_numeric.isna()]


Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
574,Love Grows (Where My Rosemary Goes),Edison Lighthouse,1,1970,1,1,2877,0,BPM110KeyAModeMajorDanceability53Valence75Ener...,16,...,110,A,Major,53,75,69,7,0,17,3


In [17]:
# Number of rows in which `streams` is still missing.
streams_missing = df["streams"].isna()
streams_missing.sum()


np.int64(0)

In [18]:
# Columns used as model inputs, in the order they are passed to the models.
FEATURES = [
    "danceability_%",
    "energy_%",
    "valence_%",
    "acousticness_%",
    "instrumentalness_%",
    "liveness_%",
    "speechiness_%",
    "bpm",
]


In [19]:
# Target: the log-transformed stream count.
y = df["log_streams"]

# Inputs: the FEATURES columns, in FEATURES order.
X = df.loc[:, FEATURES]


In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# the same on every run.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)


In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Fit an ordinary least-squares model on the training split
# (`fit` returns the fitted estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = model.predict(X_val)

# Root-mean-squared error on the validation split, in log1p(streams) units.
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
rmse


np.float64(1.0080565407443602)

In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Forest hyper-parameters gathered in one place; n_jobs=-1 uses all cores and
# the fixed random_state makes the fit repeatable.
rf_params = {
    "n_estimators": 300,
    "max_depth": 12,
    "min_samples_leaf": 5,
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestRegressor(**rf_params)

# Train on the training split, then predict the held-out rows.
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)

# Validation RMSE, in log1p(streams) units.
mse_rf = mean_squared_error(y_val, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)

rmse_rf


np.float64(1.0293253921517374)

In [23]:
import pandas as pd

# Label the fitted forest's importances with the feature names (same order
# as FEATURES), then rank them from most to least important.
importance = pd.Series(rf.feature_importances_, index=FEATURES)
importance = importance.sort_values(ascending=False)

importance


bpm                   0.192572
danceability_%        0.156255
valence_%             0.154271
acousticness_%        0.145105
energy_%              0.127102
liveness_%            0.116430
speechiness_%         0.101623
instrumentalness_%    0.006642
dtype: float64