In [1]:
import pandas as pd
# Let pandas display up to 200 columns before eliding the middle of wide frames.
pd.set_option("display.max_columns", 200)

# Read the dataset from disk; the first CSV column becomes the row index.
songs = pd.read_csv(
    '../data/spotify_simplified.csv',
    index_col=[0],
)

In [None]:
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from data_prep import scale_data, add_artists_as_features

# Prepare features and labels, split the data, and fit the baseline MLP.

# One seed shared by the train/test split and the classifier, so that a fresh
# "Restart & Run All" reproduces the same split, the same weight
# initialisation and the same mini-batch order (and therefore the same scores).
RANDOM_SEED = 50

# Normalize data in 0-1 range (MinMaxScaler's default feature_range).
scaler = MinMaxScaler()
# The second argument (500) is forwarded to add_artists_as_features as-is;
# presumably it limits how many artists become features -- TODO confirm in data_prep.
with_authors = add_artists_as_features(songs, 500)
# NOTE(review): scale_data receives the full dataset *before* the split. If it
# fits `scaler` on that data, test-set statistics leak into training; confirm
# against data_prep and, if so, fit on X_train only.
songs_data = scale_data(with_authors, scaler)

# Numerically encode the labels (genre names -> integer class ids).
genres = songs["track_genre"]
label_encoder = LabelEncoder()
encoded_genres = label_encoder.fit_transform(genres)

# Hold out 20% of the rows for evaluation.
# Using stratify might help because we have an imbalanced dataset: it keeps the
# genre proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    songs_data,
    encoded_genres,
    test_size=0.2,
    stratify=encoded_genres,
    shuffle=True,
    random_state=RANDOM_SEED,
)

# Train model. random_state pins the otherwise random weight initialisation
# and batch sampling of the MLP; without it every run yields different scores.
mlp = MLPClassifier(max_iter=400, verbose=True, random_state=RANDOM_SEED)
mlp.fit(X_train, y_train)

In [None]:
# Score the fitted MLP on the held-out test split.
predictions = mlp.predict(X_test)

# Plain accuracy plus a support-weighted F1 (per-class F1 averaged by class frequency).
base_accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
base_f1_weighted = f1_score(y_true=y_test, y_pred=predictions, average='weighted')

# Emit the three-line report; sep="\n" puts each item on its own line.
print(
    "Baseline performance using an MLP",
    f"Accuracy: {base_accuracy}",
    f"F1-score: {base_f1_weighted}",
    sep="\n",
)

Baseline performance using an MLP
Accuracy: 0.6109315801203477
F1-score: 0.6066895937163028
