In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

In [4]:
df = pd.read_csv("../data/train_data_with_genre.csv")
df.head()

Unnamed: 0,track_id,track_name,artist_id,artist_name,popularity,album_name,genre,artist_genres,duration_ms,explicit,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,analysis_url,time_signature
0,2plbrEY59IikOBgBGLjaoe,Die With A Smile,1HY2Jd0NmPuamShAr6KMms,Lady Gaga,100,Die With A Smile,pop,"art pop,dance pop,pop",251667,False,...,-7.777,0,0.0304,0.308,0.0,0.122,0.535,157.969,https://api.spotify.com/v1/audio-analysis/2plb...,3
1,5R8dQOPq8haW94K7mgERlO,Poker Face,1HY2Jd0NmPuamShAr6KMms,Lady Gaga,76,The Fame,pop,"art pop,dance pop,pop",237200,False,...,-4.618,1,0.0787,0.119,2e-06,0.121,0.774,119.001,https://api.spotify.com/v1/audio-analysis/5R8d...,4
2,0SiywuOBRcynK0uKGWdCnn,Bad Romance,1HY2Jd0NmPuamShAr6KMms,Lady Gaga,82,The Fame Monster (Deluxe Edition),pop,"art pop,dance pop,pop",294573,True,...,-3.752,1,0.0363,0.00314,5.3e-05,0.0842,0.714,119.007,https://api.spotify.com/v1/audio-analysis/0Siy...,4
3,2x7MyWybabEz6Y6wvHuwGE,Just Dance,1HY2Jd0NmPuamShAr6KMms,Lady Gaga,74,The Fame,pop,"art pop,dance pop,pop",241933,False,...,-4.541,0,0.0311,0.0264,4.3e-05,0.181,0.745,118.99,https://api.spotify.com/v1/audio-analysis/2x7M...,4
4,11BKm0j4eYoCPPpCONAVwA,Bloody Mary,1HY2Jd0NmPuamShAr6KMms,Lady Gaga,73,Born This Way (Special Edition),pop,"art pop,dance pop,pop",244760,False,...,-6.365,0,0.0291,0.0107,2e-06,0.113,0.44,99.998,https://api.spotify.com/v1/audio-analysis/11BK...,4


In [132]:
df = df.sample(frac=1)

In [133]:
def bpm_to_tempo(bpm):
    if bpm <= 20:
        return 0  # Larghissimo
    elif 21 <= bpm <= 40:
        return 1  # Grave
    elif 41 <= bpm <= 50:
        return 2  # Lento
    elif 51 <= bpm <= 60:
        return 3  # Largo
    elif 61 <= bpm <= 66:
        return 4  # Larghetto
    elif 67 <= bpm <= 75:
        return 5  # Adagio
    elif 76 <= bpm <= 80:
        return 6  # Adagietto
    elif 81 <= bpm <= 100:
        return 7  # Andante
    elif 101 <= bpm <= 120:
        return 8  # Moderato
    elif 121 <= bpm <= 160:
        return 9  # Allegro
    elif 161 <= bpm <= 175:
        return 10  # Vivace
    elif 176 <= bpm <= 200:
        return 11  # Presto
    else:
        return 12  # Prestissimo

classical_tempo = [bpm_to_tempo(bpm) for bpm in df["tempo"]]
df["tempo"] = classical_tempo

# Multinomial Logistic work with both continuous and categorical data, so...

In [134]:
X = df.drop(columns=['track_id', 'track_name', 'artist_id', 'artist_name', 'popularity', 'key', 'mode',
       'album_name', 'genre', 'artist_genres', 'analysis_url', 'duration_ms'])
X.head()

Unnamed: 0,explicit,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
5915,False,0.639,0.791,-4.443,0.254,0.0275,0.0,0.362,0.933,0,4
5387,True,0.824,0.878,-5.02,0.273,0.169,0.0,0.369,0.645,0,4
3313,False,0.507,0.309,-14.429,0.0383,0.784,0.0,0.138,0.716,0,4
4653,False,0.563,0.741,-11.947,0.0486,0.602,0.00251,0.325,0.967,0,4
197,True,0.795,0.594,-6.2,0.0748,0.112,0.0,0.162,0.409,0,4


In [113]:
# keys = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]

In [135]:
X_encoded = pd.get_dummies(X, columns=["time_signature"])

In [136]:
X_encoded

Unnamed: 0,explicit,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature_0,time_signature_1,time_signature_3,time_signature_4,time_signature_5
5915,False,0.639,0.791,-4.443,0.2540,0.02750,0.000000,0.3620,0.9330,0,False,False,False,True,False
5387,True,0.824,0.878,-5.020,0.2730,0.16900,0.000000,0.3690,0.6450,0,False,False,False,True,False
3313,False,0.507,0.309,-14.429,0.0383,0.78400,0.000000,0.1380,0.7160,0,False,False,False,True,False
4653,False,0.563,0.741,-11.947,0.0486,0.60200,0.002510,0.3250,0.9670,0,False,False,False,True,False
197,True,0.795,0.594,-6.200,0.0748,0.11200,0.000000,0.1620,0.4090,0,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6822,False,0.297,0.634,-7.927,0.0335,0.00141,0.175000,0.2470,0.3310,0,False,False,False,True,False
1783,False,0.622,0.931,-4.660,0.0507,0.10600,0.000002,0.3280,0.5210,0,False,False,False,True,False
4828,False,0.659,0.779,-5.580,0.0608,0.26400,0.079200,0.0773,0.6300,0,False,False,False,True,False
4036,False,0.331,0.178,-22.341,0.0384,0.99500,0.899000,0.1070,0.0937,0,False,False,False,True,False


In [137]:
y = df["genre"]

# split train test

In [138]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)


# Logistic model

In [139]:
lr = LogisticRegression(multi_class="multinomial", max_iter=10000)
lr.fit(X_train, y_train)

In [140]:
y_preds = lr.predict(X_test)
y_preds

array(['acoustic', 'acoustic', 'rock', ..., 'reggae', 'pop', 'acoustic'],
      dtype=object)

In [141]:
metrics.accuracy_score(y_test, y_preds)

0.48851978505129456

# => Accuracy Multinomial logistic: 0.488 