# Hyperparameter optimization

### Models that require no preprocessing
* Decision Tree
* Random Forest

In [11]:
import pandas as pd
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from ml_methods.data_prep import add_artists_as_features


songs = pd.read_csv('data/spotify_simplified.csv', index_col=[0])
songs_data = add_artists_as_features(songs, 2000)
songs_data = songs.drop(columns = ["track_id", "artists", "album_name", "track_name", "track_genre"])
genres = songs["track_genre"]

# Numerically encode the labels
label_encoder = LabelEncoder()
encoded_genres = label_encoder.fit_transform(genres)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(songs_data, encoded_genres, test_size=0.3,
                                                    stratify=encoded_genres, shuffle=True, random_state=100)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
model = DecisionTreeClassifier()
pipeline = Pipeline([
    ('estimator', model)])

parameters = {
    'estimator__max_depth':[None, 20, 50, 100],
    'estimator__min_samples_split' :[2, 10, 20],
    'estimator__min_samples_leaf':[1, 10, 20],
    'estimator__max_features':[None, 'log2', 'sqrt']
}

# create the grid search estimator
grid_search_estimator = GridSearchCV(pipeline, parameters, scoring='f1_weighted', cv=cv, n_jobs=-1)
# fit the grid search (= determine the optimal parameters)
grid_search_estimator.fit(X_train.values, y_train)

# Get the best parameters and best model
best_params = grid_search_estimator.best_params_
best_model = grid_search_estimator.best_estimator_

# Evaluate the best model on the test set
best_predictions = best_model.predict(X_test)
best_accuracy = accuracy_score(y_test, best_predictions)
best_f1_weighted = f1_score(y_test, best_predictions, average='weighted')
print("Optimised Parameters: {}".format(best_params))
print(f"Accuracy: {best_accuracy:.4f}")
print(f"F1: {best_f1_weighted:.4f}")

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Optimised Parameters: {'estimator__max_depth': 20, 'estimator__max_features': None, 'estimator__min_samples_leaf': 20, 'estimator__min_samples_split': 2}
Accuracy: 0.4479
F1: 0.4413




### Models that require preprocessing
* All other models used StandardScaler to normalise the data to unit variance and zero mean

In [6]:
import pandas as pd
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score

songs = pd.read_csv('data/spotify_simplified.csv', index_col=[0])
songs_data = songs.drop(columns = ["track_id", "artists", "album_name", "track_name", "track_genre"])
genres = songs["track_genre"]

# Numerically encode the labels
label_encoder = LabelEncoder()
encoded_genres = label_encoder.fit_transform(genres)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(songs_data, encoded_genres, test_size=0.3,
                                                    stratify=encoded_genres, shuffle=True, random_state=100)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
scaler = StandardScaler()
model = KNeighborsClassifier(weights='distance')
pipeline = Pipeline([('normalisation', scaler), ('estimator', model)])

parameters = {
    'estimator__n_neighbors':list(range(1, 30)),
    'estimator__algorithm':['auto', 'ball_tree', 'kd_tree', 'brute']
}
# create the grid search estimator
grid_search_estimator = GridSearchCV(pipeline, parameters, scoring='f1_weighted', cv=cv)
# fit the grid search (= determine the optimal parameters)
grid_search_estimator.fit(X_train, y_train)

# Get the best parameters and best model
best_params = grid_search_estimator.best_params_
best_model = grid_search_estimator.best_estimator_

# Evaluate the best model on the test set
best_predictions = best_model.predict(X_test)
best_accuracy = accuracy_score(y_test, best_predictions)
best_f1_weighted = f1_score(y_test, best_predictions, average='weighted')
print("Optimised Parameters: {}".format(best_params))
print(f"Accuracy: {best_accuracy:.4f}")
print(f"F1: {best_f1_weighted:.4f}")

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Optimised Parameters: {'estimator__algorithm': 'auto', 'estimator__n_neighbors': 20}
Accuracy: 0.4748
F1: 0.4639
