# Multiclass Classifier Soft Voting
- This notebook trains one model for each genre and consolidates them for prediction.
- Soft voting chooses the class with the greatest predicted probability as the outcome of the consolidation.

In [1]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [2]:
# import data
# Load the raw track table, keep only the first row seen for each track_id and
# discard the 'Unnamed: 0' column (presumably a row index written out when the
# CSV was saved -- confirm against the file).
df = (
    pd.read_csv('data/tracks_1000Unique+_with_duplicates.csv', index_col=False)
    .drop_duplicates(subset='track_id', keep='first')
    .drop(columns=['Unnamed: 0'])
)

In [3]:
# Remove identifier / bookkeeping columns that are not used as model features.
# `errors="ignore"` keeps this cell re-runnable: the original `del df[...]`
# statements raised KeyError on a second execution because the columns were
# already gone.
df = df.drop(
    columns=["track_id", "artist_name", "track_name", "duplicated_y", "duplicated_x"],
    errors="ignore",
)

# Make it 1000 for each category
# Keep at most the first 1000 rows of every genre, genres in sorted order and
# rows in their original relative order (mergesort is a stable sort).
# `groupby(...).head()` is used instead of `groupby(...).apply(lambda x: x.head(1000))`
# because newer pandas releases deprecate, and then stop, passing the grouping
# column into `apply`, which would silently drop "genre" from `result`.
result = (
    df.sort_values("genre", kind="mergesort")
      .groupby("genre")
      .head(1000)
      .reset_index(drop=True)
)
result

Unnamed: 0,genre,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,blues,77.0,0.274,0.348,5,-8.631,1,0.0293,0.547000,0.013300,0.3340,0.328,87.430,179693,3
1,blues,77.0,0.756,0.401,7,-10.702,0,0.0526,0.582000,0.011000,0.0541,0.514,101.954,199396,4
2,blues,74.0,0.581,0.687,6,-5.400,1,0.1050,0.229000,0.000000,0.1090,0.187,76.014,240600,4
3,blues,75.0,0.635,0.184,8,-10.785,1,0.0456,0.665000,0.000000,0.1460,0.177,128.424,240200,4
4,blues,75.0,0.477,0.433,11,-6.473,0,0.0247,0.689000,0.000195,0.1510,0.611,82.520,156653,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,techno,29.0,0.732,0.875,4,-5.842,0,0.0326,0.126000,0.000234,0.2200,0.933,132.987,214960,4
9996,techno,49.0,0.738,0.816,4,-6.304,0,0.0322,0.000877,0.785000,0.1340,0.884,129.998,198793,4
9997,techno,35.0,0.610,0.875,5,-5.455,0,0.0333,0.028300,0.908000,0.1140,0.873,144.999,309023,4
9998,techno,44.0,0.756,0.527,8,-11.861,1,0.0428,0.002010,0.509000,0.1120,0.248,119.996,416500,4


In [5]:
# prepare data for VotingClassifier
# `y` is kept as a single-column DataFrame because OneHotEncoder below needs 2-D input.
y = pd.DataFrame(result["genre"])
X = result.drop("genre", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# prepare data for OneVsAll Classifier
# `sparse_output` is the current name of the former `sparse` keyword
# (renamed in scikit-learn 1.2; the old spelling was removed in 1.4).
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(y)
# The "x0_" prefix only appears when the encoder is fitted on unnamed input.
# Fitted on the "genre" column the names come out as "genre_<name>" and are
# left unchanged by the replace.
cols = [x.replace("x0_", "") for x in encoder.get_feature_names_out()]
print('Number of classes:\t', len(cols))
# Reuse y's index so the one-hot rows stay paired with the rows of X even if
# `result` does not carry a default RangeIndex.
y_ohe = pd.DataFrame(encoder.transform(y), columns=cols, index=y.index)
# Same X, test_size and random_state as above, so both splits select the same rows.
X_train_ohe, X_test_ohe, y_train_ohe, y_test_ohe = train_test_split(X, y_ohe, test_size=0.2, random_state=42)

Number of classes:	 10




In [6]:
# scale data
# Every feature column is standardised; the transformer is fitted on the
# training split only and then applied to the matching test split.
scaled_columns = [
    "popularity", "danceability", "energy", "key", "loudness", "mode",
    "speechiness", "acousticness", "instrumentalness", "liveness",
    "valence", "tempo", "duration_ms", "time_signature",
]
ct = ColumnTransformer(
    transformers=[('stdscaled', StandardScaler(), scaled_columns)],
    remainder='passthrough',
)

# NOTE(review): X_train / X_test (and the *_ohe variants) are rebound to the
# transformer output below, so re-run the split cell before re-running this one.

# VotingClassifier
X_train_scaled = ct.fit_transform(X_train)
X_test_scaled = ct.transform(X_test)
X_train, X_test = X_train_scaled, X_test_scaled

# OneVsAll
# The same transformer object is re-fitted here, this time on the OneVsAll training split.
X_train_scaled_ohe = ct.fit_transform(X_train_ohe)
X_test_scaled_ohe = ct.transform(X_test_ohe)
X_train_ohe, X_test_ohe = X_train_scaled_ohe, X_test_scaled_ohe

In [9]:
# training of OnevsAll Classifiers for each genre
# The entries below are templates only: each (genre, model type) pair is
# trained on its own clone. Fitting the template objects directly would refit
# the same two instances for every genre, leaving every entry of
# `trained_models['classifier']` pointing at the model of the last genre.
classifiers = [
    ('RandomForestClassifier', RandomForestClassifier(random_state=42)),
    ('SVC', SVC(random_state=42))
]

# Parallel lists: position i describes one fitted binary classifier.
trained_models = {
    'name': [],
    'genre': [],
    'classifier': []
}

# Held-out accuracy per genre and model type: test_scores[genre][name].
test_scores = {}

# NOTE(review): each genre contributes roughly 1 in 10 rows, so a classifier
# that always predicts "not this genre" already scores about 0.90 accuracy;
# read the accuracies printed below with that baseline in mind.
for genre in y_train_ohe.columns:
    y_target = y_train_ohe[genre]
    fitted_for_genre = []
    for name, model_type in classifiers:
        # Unfitted copy with the same hyper-parameters as the template.
        classifier = clone(model_type)
        scores = [round(x,2) for x in cross_val_score(classifier, X_train_scaled_ohe, y_target, cv=3, scoring='accuracy')]
        _tmp = {"min":min(scores), "max":max(scores), "mean":round(np.mean(scores),2)}
        print(f"cross_val_score: [{genre}]->{name}: {_tmp}")
        classifier.fit(X_train_scaled_ohe, y_target)
        trained_models['name'].append(name)
        trained_models['genre'].append(genre)
        trained_models['classifier'].append(classifier)
        fitted_for_genre.append((name, classifier))

    # Evaluate the models that were just fitted for this genre on the test split.
    _genre_result = {}
    for name, classifier in fitted_for_genre:
        y_pred = classifier.predict(X_test_ohe)
        _score = accuracy_score(y_test_ohe[genre], y_pred)
        print(f"test-df: [{genre}] -> {name}: {round(_score, 2)}")
        _genre_result[name] = _score
    test_scores[genre] = _genre_result

cross_val_score: [genre_blues]->RandomForestClassifier: {'min': 0.91, 'max': 0.92, 'mean': 0.91}
cross_val_score: [genre_blues]->SVC: {'min': 0.9, 'max': 0.9, 'mean': 0.9}
test-df: [genre_blues] -> RandomForestClassifier: 0.91
test-df: [genre_blues] -> SVC: 0.9
cross_val_score: [genre_classical]->RandomForestClassifier: {'min': 0.98, 'max': 0.98, 'mean': 0.98}
cross_val_score: [genre_classical]->SVC: {'min': 0.98, 'max': 0.98, 'mean': 0.98}
test-df: [genre_classical] -> RandomForestClassifier: 0.98
test-df: [genre_classical] -> SVC: 0.98
cross_val_score: [genre_electronic]->RandomForestClassifier: {'min': 0.91, 'max': 0.91, 'mean': 0.91}
cross_val_score: [genre_electronic]->SVC: {'min': 0.9, 'max': 0.9, 'mean': 0.9}
test-df: [genre_electronic] -> RandomForestClassifier: 0.91
test-df: [genre_electronic] -> SVC: 0.9
cross_val_score: [genre_funk]->RandomForestClassifier: {'min': 0.92, 'max': 0.92, 'mean': 0.92}
cross_val_score: [genre_funk]->SVC: {'min': 0.9, 'max': 0.91, 'mean': 0.9}
tes

## Build VotingClassifier
- `trained_models` contains all models for each genre and model type
- they will be used to create `VotingClassifier`

In [10]:
# Collect one (label, estimator) pair per genre for the random-forest models.
# Entries are selected by model name instead of hard-coded positions
# (0, 2, 4, ...), so the cell keeps working if the list of model types in the
# training cell is reordered or extended.
_GENRE_PREFIX = "genre_"
rfc_estimators = [
    (genre[len(_GENRE_PREFIX):] if genre.startswith(_GENRE_PREFIX) else genre, model)
    for name, genre, model in zip(
        trained_models['name'], trained_models['genre'], trained_models['classifier']
    )
    if name == 'RandomForestClassifier'
]
assert rfc_estimators, "no RandomForestClassifier entries found in trained_models"

# NOTE(review): VotingClassifier.fit() trains fresh clones of these estimators
# on whatever labels it is given, so the per-genre fitted state is not what
# votes at prediction time -- confirm this matches the intended
# "one model per genre" consolidation.
vc_rfc = VotingClassifier(estimators=rfc_estimators, voting='soft')

In [11]:
# compute accuracy of VotingClassifier
# The labels are flattened to 1-D before fitting: passing the single-column
# DataFrame makes scikit-learn emit a DataConversionWarning ("column-vector y
# was passed when a 1d array was expected").
vc_rfc.fit(X_train, np.ravel(y_train))
y_pred = vc_rfc.predict(X_test)
print('VotingClassifier Accuracy:\t', accuracy_score(np.ravel(y_test), y_pred))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


VotingClassifier Accuracy:	 0.6255
