In [181]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

In [182]:
df = pd.read_csv("../data/train_data_with_genre.csv")
df.head()

Unnamed: 0,track_id,track_name,artist_id,artist_name,popularity,album_name,genre,artist_genres,duration_ms,explicit,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,analysis_url,time_signature
0,2plbrEY59IikOBgBGLjaoe,Die With A Smile,1HY2Jd0NmPuamShAr6KMms,Lady Gaga,100,Die With A Smile,pop,"art pop,dance pop,pop",251667,False,...,-7.777,0,0.0304,0.308,0.0,0.122,0.535,157.969,https://api.spotify.com/v1/audio-analysis/2plb...,3
1,5R8dQOPq8haW94K7mgERlO,Poker Face,1HY2Jd0NmPuamShAr6KMms,Lady Gaga,76,The Fame,pop,"art pop,dance pop,pop",237200,False,...,-4.618,1,0.0787,0.119,2e-06,0.121,0.774,119.001,https://api.spotify.com/v1/audio-analysis/5R8d...,4
2,0SiywuOBRcynK0uKGWdCnn,Bad Romance,1HY2Jd0NmPuamShAr6KMms,Lady Gaga,82,The Fame Monster (Deluxe Edition),pop,"art pop,dance pop,pop",294573,True,...,-3.752,1,0.0363,0.00314,5.3e-05,0.0842,0.714,119.007,https://api.spotify.com/v1/audio-analysis/0Siy...,4
3,2x7MyWybabEz6Y6wvHuwGE,Just Dance,1HY2Jd0NmPuamShAr6KMms,Lady Gaga,74,The Fame,pop,"art pop,dance pop,pop",241933,False,...,-4.541,0,0.0311,0.0264,4.3e-05,0.181,0.745,118.99,https://api.spotify.com/v1/audio-analysis/2x7M...,4
4,11BKm0j4eYoCPPpCONAVwA,Bloody Mary,1HY2Jd0NmPuamShAr6KMms,Lady Gaga,73,Born This Way (Special Edition),pop,"art pop,dance pop,pop",244760,False,...,-6.365,0,0.0291,0.0107,2e-06,0.113,0.44,99.998,https://api.spotify.com/v1/audio-analysis/11BK...,4


In [183]:
df = df.sample(frac=1)

In [184]:
def bpm_to_tempo(bpm):
    if bpm <= 20:
        return 0  # Larghissimo
    elif 21 <= bpm <= 40:
        return 1  # Grave
    elif 41 <= bpm <= 50:
        return 2  # Lento
    elif 51 <= bpm <= 60:
        return 3  # Largo
    elif 61 <= bpm <= 66:
        return 4  # Larghetto
    elif 67 <= bpm <= 75:
        return 5  # Adagio
    elif 76 <= bpm <= 80:
        return 6  # Adagietto
    elif 81 <= bpm <= 100:
        return 7  # Andante
    elif 101 <= bpm <= 120:
        return 8  # Moderato
    elif 121 <= bpm <= 160:
        return 9  # Allegro
    elif 161 <= bpm <= 175:
        return 10  # Vivace
    elif 176 <= bpm <= 200:
        return 11  # Presto
    else:
        return 12  # Prestissimo

classical_tempo = [bpm_to_tempo(bpm) for bpm in df["tempo"]]
df["tempo"] = classical_tempo

# Multinomial Logistic work with both continuous and categorical data, so...

In [185]:
X = df.drop(columns=['track_id', 'track_name', 'artist_id', 'artist_name', 'popularity', 'key', 'mode',
       'album_name', 'genre', 'artist_genres', 'analysis_url', 'duration_ms'])
X.head()

Unnamed: 0,explicit,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
4572,False,0.526,0.214,-18.89,0.0358,0.754,0.782,0.334,0.203,8,4
7267,False,0.772,0.65,-7.449,0.0341,0.0047,0.733,0.146,0.197,12,4
1877,False,0.489,0.505,-8.022,0.117,0.579,0.000333,0.104,0.337,10,4
4108,False,0.302,0.0372,-28.311,0.0434,0.995,0.886,0.075,0.134,11,4
219,False,0.449,0.51,-9.075,0.0278,0.351,0.761,0.139,0.115,7,4


In [186]:
# keys = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]

In [187]:
X_encoded = pd.get_dummies(X, columns=["time_signature"])

In [188]:
X_encoded

Unnamed: 0,explicit,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature_0,time_signature_1,time_signature_3,time_signature_4,time_signature_5
4572,False,0.526,0.2140,-18.890,0.0358,0.7540,0.782000,0.3340,0.2030,8,False,False,False,True,False
7267,False,0.772,0.6500,-7.449,0.0341,0.0047,0.733000,0.1460,0.1970,12,False,False,False,True,False
1877,False,0.489,0.5050,-8.022,0.1170,0.5790,0.000333,0.1040,0.3370,10,False,False,False,True,False
4108,False,0.302,0.0372,-28.311,0.0434,0.9950,0.886000,0.0750,0.1340,11,False,False,False,True,False
219,False,0.449,0.5100,-9.075,0.0278,0.3510,0.761000,0.1390,0.1150,7,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3879,False,0.266,0.0169,-32.346,0.0376,0.9900,0.955000,0.0841,0.0791,5,False,False,False,True,False
59,False,0.619,0.5420,-8.681,0.0529,0.2880,0.000000,0.1510,0.3080,7,False,False,False,True,False
3384,False,0.417,0.0509,-19.939,0.0644,0.9660,0.368000,0.1130,0.2890,4,False,False,False,True,False
4254,False,0.346,0.0578,-22.644,0.0352,0.9320,0.928000,0.2110,0.4220,7,False,False,True,False,False


In [189]:
y = df["genre"]

# Scaler

In [190]:
continuous_cols = ['danceability', "energy", "loudness", "speechiness", "acousticness", "instrumentalness", "liveness", "valence"]
scaler = StandardScaler()
X_encoded[continuous_cols] = scaler.fit_transform(X_encoded[continuous_cols])

In [191]:
X_encoded

Unnamed: 0,explicit,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature_0,time_signature_1,time_signature_3,time_signature_4,time_signature_5
4572,False,-0.296038,-1.170551,-1.192988,-0.528968,0.992957,1.725042,1.229544,-1.025802,8,False,False,False,True,False
7267,False,1.059886,0.421484,0.390441,-0.549430,-1.054083,1.580973,-0.176379,-1.049249,12,False,False,False,True,False
1877,False,-0.499978,-0.107977,0.311139,0.448394,0.514868,-0.573211,-0.490468,-0.502149,10,False,False,False,True,False
4108,False,-1.530700,-1.816129,-2.496851,-0.437491,1.651354,2.030822,-0.707339,-1.295445,11,False,False,False,True,False
219,False,-0.720453,-0.089720,0.165404,-0.625260,-0.108013,1.663298,-0.228727,-1.369694,7,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3879,False,-1.729128,-1.890253,-3.055293,-0.507302,1.637695,2.233696,-0.639286,-1.509986,5,False,False,False,True,False
59,False,0.216567,0.027127,0.219933,-0.323144,-0.280125,-0.574190,-0.138987,-0.615477,7,False,False,False,True,False
3384,False,-0.896833,-1.766104,-1.338170,-0.184725,1.572128,0.507802,-0.423163,-0.689727,4,False,False,False,True,False
4254,False,-1.288177,-1.740909,-1.712540,-0.536190,1.479242,2.154310,0.309712,-0.169981,7,False,False,True,False,False


# split train test

In [192]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.25, stratify=y)


In [193]:
X_train.columns

Index(['explicit', 'danceability', 'energy', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature_0', 'time_signature_1', 'time_signature_3',
       'time_signature_4', 'time_signature_5'],
      dtype='object')

# Logistic model

In [194]:
lr = LogisticRegression(multi_class="multinomial", max_iter=10000)
lr.fit(X_train, y_train)

In [195]:
y_preds = lr.predict(X_test)
y_preds

array(['pop', 'blues', 'classical', ..., 'reggae', 'acoustic', 'reggae'],
      dtype=object)

In [196]:
metrics.accuracy_score(y_test, y_preds)

0.4963361016121153

# => Accuracy Multinomial logistic: 0.496