In [2]:
# import libraries
import os
import json
import requests
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline
sns.set_theme(style="whitegrid")

In [4]:
# Load the track table and discard the 'Unnamed: 0' column that comes with the CSV.
df = pd.read_csv('data/tracks_1000+.csv', index_col=False).drop(columns=['Unnamed: 0'])

In [5]:
# Remove the identifier/text columns that are not used as model features.
# A non-mutating drop with errors="ignore" keeps this cell re-runnable: the
# original `del df[...]` statements raise KeyError on a second execution
# because the columns are already gone.
df = df.drop(columns=["track_id", "artist_name", "track_name"], errors="ignore")

### This version works with up to 1,000 samples per genre

In [22]:
# Keep at most SAMPLES_PER_GENRE rows for each genre.
SAMPLES_PER_GENRE = 1000

# `groupby().head()` selects the first rows of every group without going
# through `groupby().apply()`. Passing the grouping column into `apply` is
# deprecated in recent pandas; once it is excluded, the original
# `apply(...).reset_index(drop=True)` would silently lose the `genre` column.
# The stable sort reproduces the original layout: genres in sorted order,
# rows inside a genre in their original order.
result = (
    df.groupby("genre")
    .head(SAMPLES_PER_GENRE)
    .sort_values("genre", kind="stable")
    .reset_index(drop=True)
)
result

Unnamed: 0,popularity,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,86,blues,0.743,0.446,4,-7.807,0,0.0683,0.00390,0.1180,0.3370,0.2780,123.596,232107,4
1,86,blues,0.743,0.446,4,-7.807,0,0.0683,0.00390,0.1180,0.3370,0.2780,123.596,232107,4
2,86,blues,0.743,0.446,4,-7.807,0,0.0683,0.00390,0.1180,0.3370,0.2780,123.596,232107,4
3,80,blues,0.636,0.676,2,-3.442,1,0.0263,0.08070,0.0000,0.0831,0.2730,113.980,208760,4
4,80,blues,0.636,0.676,2,-3.442,1,0.0263,0.08070,0.0000,0.0831,0.2730,113.980,208760,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,37,techno,0.714,0.621,9,-7.005,1,0.0893,0.20100,0.0192,0.0568,0.7450,114.237,331040,4
9996,34,techno,0.803,0.960,9,-3.342,0,0.0429,0.01570,0.0242,0.0566,0.5470,125.990,231271,4
9997,49,techno,0.880,0.477,0,-10.947,1,0.0732,0.00143,0.6270,0.0922,0.2090,125.017,539520,4
9998,47,techno,0.626,0.665,7,-10.134,0,0.0574,0.00775,0.7560,0.0811,0.0327,125.009,240000,4


In [23]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the genre label and split features/targets into train and test.

# The encoder expects 2-D input, so keep the label as a one-column frame.
y_tmp = result[["genre"]]
X = result.drop(columns="genre")

# Version-agnostic encoding: the `sparse=False` argument and
# `get_feature_names()` used previously are no longer available in newer
# scikit-learn releases, and the replacement `get_feature_names_out()` prefixes
# names with the input column name, so stripping "x0_" would stop working.
# Using the default encoder, densifying with `.toarray()`, and reading the
# labels from `categories_` gives the same 0/1 float matrix and plain genre
# names on every release.
# NOTE(review): `encoder.transform` now returns a sparse matrix; densify it
# with `.toarray()` wherever a dense array is needed.
encoder = OneHotEncoder()
encoder.fit(y_tmp)
cols = [str(genre) for genre in encoder.categories_[0]]
display(len(cols))
y = pd.DataFrame(encoder.transform(y_tmp).toarray(), columns=cols)

# NOTE(review): the split is not stratified. Passing stratify=y_tmp["genre"]
# would preserve per-genre proportions but changes which rows land in each
# split, so it is intentionally left off here.
X_train_org, X_test_org, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



10

In [24]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

def scaler_pipeline():
    """Return a new, unfitted pipeline: constant (0) imputation, then RobustScaler."""
    fill_missing = SimpleImputer(strategy='constant', fill_value=0)
    robust_scale = RobustScaler()
    return make_pipeline(fill_missing, robust_scale)

def scaler_pipeline_pwr():
    """Return a new, unfitted pipeline: constant (0) imputation, RobustScaler,
    then a standardizing Yeo-Johnson PowerTransformer."""
    steps = (
        SimpleImputer(strategy='constant', fill_value=0),
        RobustScaler(),
        PowerTransformer(method='yeo-johnson', standardize=True),
    )
    return make_pipeline(*steps)

# Columns that are standard-scaled; anything else is passed through untouched.
scaled_columns = [
    "popularity", "danceability", "energy", "key", "loudness", "mode",
    "speechiness", "acousticness", "instrumentalness", "liveness",
    "valence", "tempo", "duration_ms", "time_signature",
]

ct = ColumnTransformer(
    transformers=[("stdscaled", StandardScaler(), scaled_columns)],
    remainder="passthrough",
)

# Fit the scaler on the training split only, then reuse it for the test split.
X_train = X_train_scaled = ct.fit_transform(X_train_org)
X_test = X_test_scaled = ct.transform(X_test_org)
feature_names = ct.get_feature_names_out()

In [25]:
# Display the output column names reported by the fitted ColumnTransformer.
feature_names

array(['stdscaled__popularity', 'stdscaled__danceability',
       'stdscaled__energy', 'stdscaled__key', 'stdscaled__loudness',
       'stdscaled__mode', 'stdscaled__speechiness',
       'stdscaled__acousticness', 'stdscaled__instrumentalness',
       'stdscaled__liveness', 'stdscaled__valence', 'stdscaled__tempo',
       'stdscaled__duration_ms', 'stdscaled__time_signature'],
      dtype=object)

In [26]:
# Pull out the one-hot 'rap' target column, then list all target columns.
y_train_rap = y_train["rap"]
y_train.columns

Index(['blues', 'classical', 'electronic', 'funk', 'jazz', 'metal', 'r&b',
       'rap', 'rock', 'techno'],
      dtype='object')

In [27]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [28]:
# Candidate models; each one is trained as a separate binary classifier per genre.
classifiers = [
    ('LogisticRegression', LogisticRegression(random_state=42)),
    ('KNeighborsClassifier', KNeighborsClassifier()),
    ('DecisionTreeClassifier', DecisionTreeClassifier(random_state=42)),
    ('RandomForestClassifier', RandomForestClassifier(random_state=42)),
    ('SVC', SVC(random_state=42))
]

# NOTE(review): this rebinds `result`, which earlier held the per-genre sample
# DataFrame, so the encoding/split cell cannot be re-run after this one without
# re-running the sampling cell first. Kept as-is for compatibility; consider
# giving this dict its own name.
result = {}
print("predicting each genre (one-vs-rest)!")
# NOTE(review): each target is a single one-hot column out of ten genres, so
# plain accuracy can look high even for a model that rarely predicts the
# positive class; consider a class-aware metric when comparing models.
for gen in y_train.columns:
    # Binary target for this genre; looked up once and shared by CV and the final fit.
    y_target = y_train[gen]
    for name, classifier in classifiers:
        # cross_val_score works on clones, so `classifier` itself is still
        # unfitted for this genre until the explicit fit below.
        scores = cross_val_score(classifier, X_train, y_target, cv=3, scoring='accuracy')
        # Summarise from the raw fold scores and round only for display;
        # averaging already-rounded scores could shift the reported mean.
        # float() keeps the printed dict free of numpy scalar reprs.
        _tmp = {
            "min": round(float(scores.min()), 2),
            "max": round(float(scores.max()), 2),
            "mean": round(float(scores.mean()), 2),
        }
        print(f"cross_val_score: [{gen}]->{name}: {_tmp}")
        classifier.fit(X_train, y_target)
    print()
    # Score the freshly fitted models for this genre on the held-out split.
    for name, classifier in classifiers:
        y_pred = classifier.predict(X_test)
        _score = accuracy_score(y_test[gen], y_pred)
        print(f"test-df: [{gen}]->{name}: {round(_score, 2)}")
    print()

predicitng Rap!
cross_val_score: [blues]->LogisticRegression: {'min': 0.9, 'max': 0.9, 'mean': 0.9}
cross_val_score: [blues]->KNeighborsClassifier: {'min': 0.89, 'max': 0.9, 'mean': 0.89}
cross_val_score: [blues]->DecisionTreeClassifier: {'min': 0.87, 'max': 0.88, 'mean': 0.87}
cross_val_score: [blues]->RandomForestClassifier: {'min': 0.9, 'max': 0.9, 'mean': 0.9}
cross_val_score: [blues]->SVC: {'min': 0.9, 'max': 0.9, 'mean': 0.9}

test-df: [blues]->LogisticRegression: 0.89
test-df: [blues]->KNeighborsClassifier: 0.89
test-df: [blues]->DecisionTreeClassifier: 0.88
test-df: [blues]->RandomForestClassifier: 0.9
test-df: [blues]->SVC: 0.9

cross_val_score: [classical]->LogisticRegression: {'min': 0.99, 'max': 0.99, 'mean': 0.99}
cross_val_score: [classical]->KNeighborsClassifier: {'min': 0.98, 'max': 0.99, 'mean': 0.99}
cross_val_score: [classical]->DecisionTreeClassifier: {'min': 0.98, 'max': 0.98, 'mean': 0.98}
cross_val_score: [classical]->RandomForestClassifier: {'min': 0.99, 'max': 

In [29]:
# NOTE(review): a bare name only makes the notebook show the function's
# repr/signature; this looks like leftover inspection — consider removing the cell.
accuracy_score

<function sklearn.metrics._classification.accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None)>