# Multiclass Classifier Soft Voting - Ensemble
- This notebook trains one model for each genre and consolidates them for prediction.
- Soft voting chooses the class with the greates prediction value as the outcome of the consolidation.

In [13]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [14]:
# import data
df = pd.read_csv('data/tracks_1000Unique+_with_duplicates.csv', index_col=False)
df.drop_duplicates('track_id', keep='first', inplace=True)
df.drop(columns={'Unnamed: 0'}, inplace=True)

In [15]:
del df["track_id"]; del df["artist_name"]; del df["track_name"]; del df["duplicated_y"]; del df["duplicated_x"];

# Make it 1000 for each category
result = df.groupby("genre").apply(lambda x: x.head(1000)).reset_index(drop=True)
result

Unnamed: 0,genre,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,blues,77.0,0.274,0.348,5,-8.631,1,0.0293,0.547000,0.013300,0.3340,0.328,87.430,179693,3
1,blues,77.0,0.756,0.401,7,-10.702,0,0.0526,0.582000,0.011000,0.0541,0.514,101.954,199396,4
2,blues,74.0,0.581,0.687,6,-5.400,1,0.1050,0.229000,0.000000,0.1090,0.187,76.014,240600,4
3,blues,75.0,0.635,0.184,8,-10.785,1,0.0456,0.665000,0.000000,0.1460,0.177,128.424,240200,4
4,blues,75.0,0.477,0.433,11,-6.473,0,0.0247,0.689000,0.000195,0.1510,0.611,82.520,156653,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,techno,29.0,0.732,0.875,4,-5.842,0,0.0326,0.126000,0.000234,0.2200,0.933,132.987,214960,4
9996,techno,49.0,0.738,0.816,4,-6.304,0,0.0322,0.000877,0.785000,0.1340,0.884,129.998,198793,4
9997,techno,35.0,0.610,0.875,5,-5.455,0,0.0333,0.028300,0.908000,0.1140,0.873,144.999,309023,4
9998,techno,44.0,0.756,0.527,8,-11.861,1,0.0428,0.002010,0.509000,0.1120,0.248,119.996,416500,4


In [16]:
# prepare data for VotingClassifier
y = pd.DataFrame(result["genre"])
X = result.drop("genre", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

# prepare data for OneVsAll Classifier
encoder = OneHotEncoder(sparse=False)
encoder.fit(y)
cols = [x.replace("x0_", "") for x in encoder.get_feature_names_out()]
print('Number of classes:\t', len(cols))
y_ohe = pd.DataFrame(encoder.transform(y), columns=cols)
X_train_ohe, X_test_ohe, y_train_ohe, y_test_ohe = train_test_split(X, y_ohe, test_size=0.2, random_state = 42)

Number of classes:	 10




In [17]:
# scale data
ct = ColumnTransformer([
        ('stdscaled', StandardScaler(), ["popularity","danceability","energy","key","loudness","mode","speechiness","acousticness","instrumentalness","liveness","valence","tempo","duration_ms","time_signature"]),
       ], remainder='passthrough')

# VotingClassifier
X_train_scaled = ct.fit_transform(X_train)
X_test_scaled  = ct.transform(X_test)
X_train = X_train_scaled
X_test  = X_test_scaled
# OneVsAll
X_train_scaled_ohe = ct.fit_transform(X_train_ohe)
X_test_scaled_ohe  = ct.transform(X_test_ohe)
X_train_ohe = X_train_scaled_ohe
X_test_ohe  = X_test_scaled_ohe

In [32]:
# training of OnevsAll Classifiers for each genre
classifiers = [
    ('KNNClassifier',     (KNeighborsClassifier())),
    ('RandomForestClassifier', (RandomForestClassifier(random_state=42, max_features='sqrt'))),
    ('DecisionTreeClassifier', (DecisionTreeClassifier(random_state=42))),
    ('SVC', (SVC(random_state=42)))
]

trained_models = {
    'name': [],
    'genre': [],
    'classifier': []
}

model_genre_accuracy = {
    'genre': [],
    'name': [],
    'test-accuracy': []
}

for genre in y_train_ohe.columns:
    for name, model_type in classifiers:
        classifier = model_type
        y_target = y_train_ohe[genre]
        scores = [round(x,2) for x in cross_val_score(classifier, X_train_scaled_ohe, y_train_ohe[genre], cv=3, scoring='accuracy')]
        _tmp = {"min":min(scores), "max":max(scores), "mean":round(np.mean(scores),2)}
        print(f"cross_val_score: [{genre}]->{name}: {_tmp}")
        classifier.fit(X_train_scaled_ohe, y_target)
        trained_models['name'].append(name)
        trained_models['genre'].append(genre)
        trained_models['classifier'].append(classifier)

    for name, classifier in classifiers:
        y_pred = classifier.predict(X_test_ohe)
        _score = accuracy_score(y_test_ohe[genre], y_pred)
        print(f"test-df: [{genre}] -> {name}: {round(_score, 2)}")
        model_genre_accuracy['name'].append(name)
        model_genre_accuracy['genre'].append(genre)
        model_genre_accuracy['test-accuracy'].append(round(_score, 2))

model_genre_accuracy_df = pd.DataFrame.from_dict(model_genre_accuracy)

cross_val_score: [genre_blues]->KNNClassifier: {'min': 0.89, 'max': 0.91, 'mean': 0.9}
cross_val_score: [genre_blues]->RandomForestClassifier: {'min': 0.91, 'max': 0.92, 'mean': 0.91}
cross_val_score: [genre_blues]->DecisionTreeClassifier: {'min': 0.87, 'max': 0.88, 'mean': 0.87}
cross_val_score: [genre_blues]->SVC: {'min': 0.9, 'max': 0.9, 'mean': 0.9}
test-df: [genre_blues] -> KNNClassifier: 0.89
test-df: [genre_blues] -> RandomForestClassifier: 0.91
test-df: [genre_blues] -> DecisionTreeClassifier: 0.88
test-df: [genre_blues] -> SVC: 0.9
cross_val_score: [genre_classical]->KNNClassifier: {'min': 0.97, 'max': 0.98, 'mean': 0.98}
cross_val_score: [genre_classical]->RandomForestClassifier: {'min': 0.98, 'max': 0.98, 'mean': 0.98}
cross_val_score: [genre_classical]->DecisionTreeClassifier: {'min': 0.96, 'max': 0.97, 'mean': 0.97}
cross_val_score: [genre_classical]->SVC: {'min': 0.98, 'max': 0.98, 'mean': 0.98}
test-df: [genre_classical] -> KNNClassifier: 0.98
test-df: [genre_classical] 

In [33]:
model_genre_accuracy_df

Unnamed: 0,genre,name,test-accuracy
0,genre_blues,KNNClassifier,0.89
1,genre_blues,RandomForestClassifier,0.91
2,genre_blues,DecisionTreeClassifier,0.88
3,genre_blues,SVC,0.9
4,genre_classical,KNNClassifier,0.98
5,genre_classical,RandomForestClassifier,0.98
6,genre_classical,DecisionTreeClassifier,0.97
7,genre_classical,SVC,0.98
8,genre_electronic,KNNClassifier,0.9
9,genre_electronic,RandomForestClassifier,0.91


## Build VotingClassifier
- `trained_models` contains all models for each genre and model type
- they will be used to create `VotingClassifier` for each model

In [78]:
models = { 0: 'knn', 1: 'rfc', 2: 'dtc', 3: 'svc'}

vc_dict = {}

for i in models:
    _vc = VotingClassifier(estimators=[
        ('blues', trained_models['classifier'][i + 0]),
        ('classical', trained_models['classifier'][i + 4]),
        ('electronic', trained_models['classifier'][i + 8]),
        ('funk', trained_models['classifier'][i + 12]),
        ('jazz', trained_models['classifier'][i + 16]),
        ('metal', trained_models['classifier'][i + 20]),
        ('r&b', trained_models['classifier'][i + 24]),
        ('rap', trained_models['classifier'][i + 28]),
        ('rock', trained_models['classifier'][i + 32]),
        ('techno', trained_models['classifier'][i + 36])
    ], voting='soft')

    vc_dict[models[i]] = _vc

In [79]:
vc_dict

{'knn': VotingClassifier(estimators=[('blues', KNeighborsClassifier()),
                              ('classical', KNeighborsClassifier()),
                              ('electronic', KNeighborsClassifier()),
                              ('funk', KNeighborsClassifier()),
                              ('jazz', KNeighborsClassifier()),
                              ('metal', KNeighborsClassifier()),
                              ('r&b', KNeighborsClassifier()),
                              ('rap', KNeighborsClassifier()),
                              ('rock', KNeighborsClassifier()),
                              ('techno', KNeighborsClassifier())],
                  voting='soft'),
 'rfc': VotingClassifier(estimators=[('blues', RandomForestClassifier(random_state=42)),
                              ('classical',
                               RandomForestClassifier(random_state=42)),
                              ('electronic',
                               RandomForestClassifier(

In [82]:
for vc in vc_dict:
    if vc == 'svc':
        break
    vc_dict[vc].fit(X_train, y_train)
    y_pred = vc_dict[vc].predict(X_test)
    print(vc, 'VotingClassifier Accuracy:\t', accuracy_score(y_test, y_pred))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


knn VotingClassifier Accuracy:	 0.4715


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


rfc VotingClassifier Accuracy:	 0.6255


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


dtc VotingClassifier Accuracy:	 0.483


### Hyperparameter Optimization
- Use the optimized parameter set for binary classifiers and recreate VotingClassifier
- [Hyperparameter Tuning: RandomForestClassifier](./hyperparameter-tuning-RFC.ipynb) (max_depth = 7, max_features = 8)
- [Hyperparameter Tuning: SuperVectorClassifier](./hyperparameter-tuning-SVC.ipynb)

In [85]:
classifiers_with_optimized_params = [
    ('KNNClassifier',     (KNeighborsClassifier())),
    ('RandomForestClassifier', (RandomForestClassifier(max_depth=7, max_features=8, random_state=42))),
    ('DecisionTreeClassifier', (DecisionTreeClassifier(max_depth=7, max_features=8, random_state=42))),
    ('SVC', (SVC(C=1.34, gamma=1, random_state=42)))
]

trained_models_optimized = {
    'name': [],
    'genre': [],
    'classifier': []
}

model_genre_accuracy_optimized = {
    'genre': [],
    'name': [],
    'test-accuracy': []
}

for genre in y_train_ohe.columns:
    for name, model_type in classifiers_with_optimized_params:
        classifier = model_type
        y_target = y_train_ohe[genre]
        scores = [round(x,2) for x in cross_val_score(classifier, X_train_scaled_ohe, y_train_ohe[genre], cv=3, scoring='accuracy')]
        _tmp = {"min":min(scores), "max":max(scores), "mean":round(np.mean(scores),2)}
        print(f"cross_val_score: [{genre}]->{name}: {_tmp}")
        classifier.fit(X_train_scaled_ohe, y_target)
        trained_models_optimized['name'].append(name)
        trained_models_optimized['genre'].append(genre)
        trained_models_optimized['classifier'].append(classifier)

    for name, classifier in classifiers:
        y_pred = classifier.predict(X_test_ohe)
        _score = accuracy_score(y_test_ohe[genre], y_pred)
        print(f"test-df: [{genre}] -> {name}: {round(_score, 2)}")
        model_genre_accuracy_optimized['name'].append(name)
        model_genre_accuracy_optimized['genre'].append(genre)
        model_genre_accuracy_optimized['test-accuracy'].append(round(_score, 2))

model_genre_accuracy_optimized = pd.DataFrame.from_dict(model_genre_accuracy)

cross_val_score: [genre_blues]->KNNClassifier: {'min': 0.89, 'max': 0.91, 'mean': 0.9}
cross_val_score: [genre_blues]->RandomForestClassifier: {'min': 0.91, 'max': 0.91, 'mean': 0.91}
cross_val_score: [genre_blues]->DecisionTreeClassifier: {'min': 0.89, 'max': 0.9, 'mean': 0.9}
cross_val_score: [genre_blues]->SVC: {'min': 0.91, 'max': 0.91, 'mean': 0.91}
test-df: [genre_blues] -> KNNClassifier: 0.83
test-df: [genre_blues] -> RandomForestClassifier: 0.83
test-df: [genre_blues] -> DecisionTreeClassifier: 0.81
test-df: [genre_blues] -> SVC: 0.84
cross_val_score: [genre_classical]->KNNClassifier: {'min': 0.97, 'max': 0.98, 'mean': 0.98}
cross_val_score: [genre_classical]->RandomForestClassifier: {'min': 0.98, 'max': 0.98, 'mean': 0.98}
cross_val_score: [genre_classical]->DecisionTreeClassifier: {'min': 0.97, 'max': 0.97, 'mean': 0.97}
cross_val_score: [genre_classical]->SVC: {'min': 0.92, 'max': 0.92, 'mean': 0.92}
test-df: [genre_classical] -> KNNClassifier: 0.83
test-df: [genre_classical

In [88]:
model_genre_accuracy['test-accuracy'] - model_genre_accuracy_optimized['test-accuracy']

0     0.0
1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
12    0.0
13    0.0
14    0.0
15    0.0
16    0.0
17    0.0
18    0.0
19    0.0
20    0.0
21    0.0
22    0.0
23    0.0
24    0.0
25    0.0
26    0.0
27    0.0
28    0.0
29    0.0
30    0.0
31    0.0
32    0.0
33    0.0
34    0.0
35    0.0
36    0.0
37    0.0
38    0.0
39    0.0
Name: test-accuracy, dtype: float64