In [5]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load dataset using utils.load()
import utils

# Load the four FMA metadata tables through the project-local helper.
# NOTE(review): `genres` and `echonest` are not used in the visible part of
# this cell; presumably later cells rely on them -- confirm before removing.
tracks = utils.load('data/fma_metadata/tracks.csv')
genres = utils.load('data/fma_metadata/genres.csv')
features = utils.load('data/fma_metadata/features.csv')
echonest = utils.load('data/fma_metadata/echonest.csv')

# Build a boolean row mask selecting the "small" subset. The column is
# addressed with a two-level key ('set', 'subset').
# NOTE(review): `<= 'small'` presumably relies on utils.load() returning this
# column as an ordered categorical; on plain strings it would be a
# lexicographic comparison -- confirm against utils.load.
small = tracks['set', 'subset'] <= 'small'

# Extract features and labels for the masked rows.
# NOTE(review): assumes `features` and `tracks` share the same row index so
# the mask built from `tracks` aligns with `features` -- confirm.
X = features.loc[small, 'mfcc']  # Using only the columns under the 'mfcc' key
y = tracks.loc[small, ('track', 'genre_top')]  # Labels from ('track', 'genre_top')

# Stratified train/test split. `test_size_ratio` is the fraction of rows that
# goes to the TEST set, so 0.9 keeps only 10% of the rows for training.
test_size_ratio = 0.9  # Change to 0.3 for 70/30, etc.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size_ratio,
    stratify=y,       # keep the label proportions equal in both partitions
    random_state=42,  # fixed seed so the split is repeatable
)

# Report the size of each partition and the dimensions of the problem.
print(f"{y_train.size} training examples, {y_test.size} testing examples")
print(f"{X_train.shape[1]} features, {np.unique(y_train).size} classes")

# Standardize the features and replace missing values in one pass per split.
# The scaler is fitted on the training rows only and then reused, unchanged,
# on the test rows so that no test statistics leak into the preprocessing.
# np.nan_to_num runs on the scaled arrays, i.e. after standardization.
# NOTE(review): this ordering relies on StandardScaler accepting NaN inputs;
# confirm for the installed scikit-learn version.
scaler = StandardScaler()
X_train = np.nan_to_num(scaler.fit_transform(X_train))
X_test = np.nan_to_num(scaler.transform(X_test))

# Optional dimensionality reduction: only applied when the feature matrix has
# more than 200 columns; narrower inputs pass through untouched.
if X_train.shape[1] > 200:
    print("Applying PCA to reduce features...")
    # Seed PCA like the split and the models (random_state=42). PCA can pick
    # a randomized SVD solver, in which case an unseeded projection -- and
    # every downstream metric -- could change from run to run.
    pca = PCA(n_components=50, random_state=42)
    # Fit the projection on the training rows only, then reuse it for test.
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

# -------------------------- Model Training and Evaluation --------------------------

# Each classifier is constructed and fitted in a single expression (fit()
# returns the fitted estimator), then immediately scored on the held-out rows.

# 1. Logistic Regression
lr_model = LogisticRegression(
    max_iter=2000, solver='saga', random_state=42
).fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# 2. Support Vector Machine (SVM) with an RBF kernel
svm_model = SVC(
    kernel='rbf', C=1.0, gamma='scale', random_state=42
).fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# 3. Random Forest
rf_model = RandomForestClassifier(
    n_estimators=100, max_depth=10, random_state=42
).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# 4. K-Nearest Neighbors (KNN)
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

# -------------------------- Evaluation --------------------------

# Function to evaluate models
def evaluate_model(name, y_test, y_pred):
    """Print accuracy, macro-averaged F1 and the confusion matrix for a model.

    Args:
        name: Label printed in the report header.
        y_test: True labels.
        y_pred: Labels predicted by the model, aligned with ``y_test``.

    Returns:
        None. The report is written to standard output.
    """
    acc_value = accuracy_score(y_test, y_pred)
    macro_f1 = f1_score(y_test, y_pred, average='macro')

    summary_lines = (
        f"\n{name} Performance:",
        f"Accuracy: {acc_value:.4f}",
        f"F1-score: {macro_f1:.4f}",
    )
    for line in summary_lines:
        print(line)

    # The matrix is computed only after the scalar metrics have been printed.
    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", matrix)

# Report every model against the same held-out labels, in training order.
for model_label, predictions in (
    ("Logistic Regression", y_pred_lr),
    ("Support Vector Machine (SVM)", y_pred_svm),
    ("Random Forest", y_pred_rf),
    ("K-Nearest Neighbors (KNN)", y_pred_knn),
):
    evaluate_model(model_label, y_test, predictions)


800 training examples, 7200 testing examples
140 features, 8 classes

Logistic Regression Performance:
Accuracy: 0.3964
F1-score: 0.3941
Confusion Matrix:
 [[307 101  26 172  77  66  85  66]
 [ 97 228  83  58 158  94  95  87]
 [ 30  90 450  23  92  72  90  53]
 [149  30  12 467  28  73  96  45]
 [ 78 169 110  15 353  55  64  56]
 [ 85  91  93  96  38 376  70  51]
 [115  95 116  95  43 111 200 125]
 [ 34  54  70  43  37  45 144 473]]

Support Vector Machine (SVM) Performance:
Accuracy: 0.4744
F1-score: 0.4708
Confusion Matrix:
 [[360 132  24 215  55  25  57  32]
 [ 85 320  62  51 165  57  89  71]
 [ 15  90 522  20  77  59  71  46]
 [130  37   7 574  22  28  83  19]
 [ 45 133  98  22 492  27  41  42]
 [ 89  68 114 142  18 382  53  34]
 [105  85 105 126  53  69 236 121]
 [ 31  58  53  37  39  19 133 530]]

Random Forest Performance:
Accuracy: 0.4707
F1-score: 0.4574
Confusion Matrix:
 [[362  29  29 243  79  39  60  59]
 [100 207  90  53 215  81  77  77]
 [ 17  28 551  28 113  47  63  53]
