# One vs All Classifier
- So far we have trained a model for genre and trained a multiclass predictor model. This notebook aims to create a constellation of (one vs. all) models.
- This notebook uses OneVsRestClassifier offered by SciKit Learn.

In [28]:
import pandas as pd
import time
import seaborn as sns
%matplotlib inline
sns.set_theme(style="whitegrid")

In [10]:
# import data
df = pd.read_csv('data/tracks_1000Unique+_with_duplicates.csv', index_col=False)
df.drop_duplicates('track_id', keep='first', inplace=True)
df.drop(columns={'Unnamed: 0'}, inplace=True)

In [11]:
del df["track_id"]; del df["artist_name"]; del df["track_name"]; del df["duplicated_y"]; del df["duplicated_x"];

In [12]:
# Make it 1000 for each category
result = df.groupby("genre").apply(lambda x: x.head(1000)).reset_index(drop=True)
result

Unnamed: 0,genre,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,blues,77.0,0.274,0.348,5,-8.631,1,0.0293,0.547000,0.013300,0.3340,0.328,87.430,179693,3
1,blues,77.0,0.756,0.401,7,-10.702,0,0.0526,0.582000,0.011000,0.0541,0.514,101.954,199396,4
2,blues,74.0,0.581,0.687,6,-5.400,1,0.1050,0.229000,0.000000,0.1090,0.187,76.014,240600,4
3,blues,75.0,0.635,0.184,8,-10.785,1,0.0456,0.665000,0.000000,0.1460,0.177,128.424,240200,4
4,blues,75.0,0.477,0.433,11,-6.473,0,0.0247,0.689000,0.000195,0.1510,0.611,82.520,156653,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,techno,29.0,0.732,0.875,4,-5.842,0,0.0326,0.126000,0.000234,0.2200,0.933,132.987,214960,4
9996,techno,49.0,0.738,0.816,4,-6.304,0,0.0322,0.000877,0.785000,0.1340,0.884,129.998,198793,4
9997,techno,35.0,0.610,0.875,5,-5.455,0,0.0333,0.028300,0.908000,0.1140,0.873,144.999,309023,4
9998,techno,44.0,0.756,0.527,8,-11.861,1,0.0428,0.002010,0.509000,0.1120,0.248,119.996,416500,4


In [13]:
from sklearn.model_selection import train_test_split

y = pd.DataFrame(result["genre"])
X = result.drop("genre", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42, stratify=y)

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

def scaler_pipeline():
    return make_pipeline(SimpleImputer(strategy='constant', fill_value=0), RobustScaler())

def scaler_pipeline_pwr():
    return make_pipeline(SimpleImputer(strategy='constant', fill_value=0), RobustScaler(), PowerTransformer(method='yeo-johnson', standardize=True))

ct = ColumnTransformer([
        ('stdscaled', StandardScaler(), ["popularity","danceability","energy","key","loudness","mode","speechiness","acousticness","instrumentalness","liveness","valence","tempo","duration_ms","time_signature"]),
       ], remainder='passthrough')

X_train_scaled = ct.fit_transform(X_train_org)
X_test_scaled  = ct.transform(X_test_org)
feature_names = ct.get_feature_names_out()
X_train = X_train_scaled
X_test  = X_test_scaled

### OneVsAll Classifier: RandomForestClassifier, SuperVectorClassifier
Note: OneVsRest and OneVsAll are used interchangeably.

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier

In [26]:
classifiers = [
    ('LogisticRegression', LogisticRegression(random_state=42)),
    ('KNeighborsClassifier', KNeighborsClassifier()),
    ('DecisionTreeClassifier', DecisionTreeClassifier(random_state=42)),
    ('RandomForestClassifier', RandomForestClassifier(random_state=42)),
    ('SVC', SVC(random_state=42))
]

for name, classifier in classifiers:
    _ovr_classifier = OneVsRestClassifier(classifier)
    _ovr_classifier.fit(X_train, y_train)
    y_pred = _ovr_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(name, " - accuracy:\t", round(accuracy, 2))

LogisticRegression  - accuracy:	 0.5
KNeighborsClassifier  - accuracy:	 0.47
DecisionTreeClassifier  - accuracy:	 0.44
RandomForestClassifier  - accuracy:	 0.62
SVC  - accuracy:	 0.56


## Hyperparameter Optimization

In [31]:
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV

### RanfomForestClassifier

In [32]:
rfc = RandomForestClassifier() # model to be optimized

# parameters to be optimized
params_grid = {
    'n_estimators': [10, 50, 100, 500, 1000],
    'max_features': range(1, X.shape[1]),
    'max_depth': range(1,8)
}

# tuning parameters
n_iter = 100
cv = KFold(3, random_state=42, shuffle=True)
n_jobs = -1
scoring = 'accuracy'

In [33]:
start = time.time()
print('Randomized Search for RFC')
random_search = RandomizedSearchCV(rfc, params_grid, n_iter=n_iter, cv=cv, n_jobs=n_jobs, scoring=scoring, random_state=42, verbose=3)
random_search.fit(X_train, y_train)
predictions = random_search.best_estimator_.predict(X_test)
test_accuracy = round(accuracy_score(y_test, predictions), 4) * 100
print(f'Test set accuracy:\t {test_accuracy}%.')
end = round(time.time() - start)
print('##########################################')
print('Total process duration:\t', end, ' seconds')

Randomized Search for RFC
Fitting 3 folds for each of 100 candidates, totalling 300 fits


  self.best_estimator_.fit(X, y, **fit_params)


Test set accuracy:	 56.25%.
##########################################
Total process duration:	 100  seconds


### SuperVectorClassifier

In [36]:
svc = SVC() # model to be optimized
# c, gamma are hyperparameters of SVM

# parameters to be optimized
params_grid = {
    'C': np.arange(0.01, 1.5, 0.01),
    'gamma': np.arange(1, 12, 1)
}

# tuning parameters
n_iter = 100
cv = KFold(3, random_state=42, shuffle=True)
n_jobs = -1
scoring = 'accuracy'

In [37]:
start = time.time()
print('Randomized Search for SVC')
random_search = RandomizedSearchCV(svc, params_grid, n_iter=n_iter, cv=cv, n_jobs=n_jobs, scoring=scoring, random_state=42, verbose=3)
random_search.fit(X_train, y_train)
predictions = random_search.best_estimator_.predict(X_test)
test_accuracy = round(accuracy_score(y_test, predictions), 4) * 100
print(f'Test set accuracy:\t {test_accuracy}%.')
end = round(time.time() - start)
print('##########################################')
print('Total process duration:\t', end, ' seconds')

Randomized Search for SVC
Fitting 3 folds for each of 100 candidates, totalling 300 fits


  y = column_or_1d(y, warn=True)


Test set accuracy:	 49.45%.
##########################################
Total process duration:	 202  seconds
