# Feature Analysis
### Cursory exploration of model performance when comparing the 140 MFCC features, the 8 Echonest audio features, and both combined.

## Setup
* Library Imports
* Data set and subset loading
* Function definitions 

In [15]:
# -------------------------- Library Imports --------------------------
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load dataset using utils.load()
# Non-standard library in root directory
import utils


# -------------------------- Data Loading --------------------------

# Read the four FMA metadata tables through the project-local loader.
tracks = utils.load('data/fma_metadata/tracks.csv')
genres = utils.load('data/fma_metadata/genres.csv')
features = utils.load('data/fma_metadata/features.csv')
echonest = utils.load('data/fma_metadata/echonest.csv')

# Keep only the tracks that appear in both the Features and the Echonest
# tables (inner join on the index), then put the columns in sorted order.
joined = features.join(echonest, how='inner')
features_all = joined.sort_index(axis=1)

# Boolean row masks selecting the predefined subsets.
# NOTE(review): the `<=` comparison presumably relies on ('set', 'subset')
# being an ordered categorical produced by utils.load, so that the 'medium'
# mask also includes the 'small' tracks - confirm against utils.load.
subset_column = tracks[('set', 'subset')]
small = subset_column <= 'small'
medium = subset_column <= 'medium'


# -------------------------- Function Definitions --------------------------

def preprocess_data(X, y, test_size_ratio=0.2, pca_threshold=200,
                    pca_components=50, random_state=42):
    """Split, standardise, impute and (when very wide) PCA-reduce a data set.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix (a DataFrame or ndarray).
    y : array-like of shape (n_samples,)
        Class labels; also used to stratify the train/test split.
    test_size_ratio : float, default 0.2
        Fraction of the samples held out for testing.
    pca_threshold : int, default 200
        PCA is applied only when the number of features exceeds this value.
    pca_components : int, default 50
        Number of principal components kept when PCA is applied.
    random_state : int, default 42
        Seed for the split and for PCA, so that re-running the notebook
        reproduces the same numbers.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the two ``X`` entries are
        ndarrays after scaling.

    Raises
    ------
    Exception
        Any error raised by scikit-learn (for example a ``ValueError`` when a
        class is too small to stratify) propagates to the caller. It is not
        caught here: returning ``None`` would only surface later as a
        misleading "cannot unpack" error at the call site.
    """
    # Stratified split so both partitions keep the class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size_ratio, stratify=y, random_state=random_state
    )

    print(f"{y_train.size} training examples, {y_test.size} testing examples")
    print(f"{X_train.shape[1]} features, {np.unique(y_train).size} classes")

    # Scale features to improve convergence. The scaler is fitted on the
    # training partition only, so no test-set statistics leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Handle missing values. StandardScaler ignores NaNs while fitting and
    # passes them through, so replacing them with 0 after scaling amounts to
    # imputing the training mean of that column.
    X_train = np.nan_to_num(X_train)
    X_test = np.nan_to_num(X_test)

    # Reduce dimensionality only for very wide feature sets.
    if X_train.shape[1] > pca_threshold:
        print("Applying PCA to reduce features...")
        pca = PCA(n_components=pca_components, random_state=random_state)
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)

    # Return processed test/train sets
    return (X_train, X_test, y_train, y_test)
    
def train_models(X_train, y_train, X_test, random_state=42):
    """Fit the four reference classifiers and predict on the test set.

    Parameters
    ----------
    X_train : array-like of shape (n_train, n_features)
        Preprocessed training features.
    y_train : array-like of shape (n_train,)
        Training labels.
    X_test : array-like of shape (n_test, n_features)
        Preprocessed test features.
    random_state : int, default 42
        Seed given to every estimator that accepts one. The MLP is seeded
        too, so that repeated runs produce the same predictions.

    Returns
    -------
    tuple
        ``(y_pred_lr, y_pred_svm, y_pred_knn, y_pred_mlp)`` - test-set
        predictions of logistic regression, RBF-kernel SVM, 5-nearest
        neighbours and the multilayer perceptron, in that order.

    Raises
    ------
    Exception
        Errors raised while fitting or predicting propagate to the caller.
        They are not caught here: returning ``None`` would only surface later
        as a misleading "cannot unpack" error at the call site.
    """
    # Order matters: callers unpack the result positionally.
    models = (
        # Logistic Regression
        LogisticRegression(max_iter=2000, solver='saga', random_state=random_state),
        # Support Vector Machine (SVM)
        SVC(kernel='rbf', C=1.0, gamma='scale', random_state=random_state),
        # K-Nearest Neighbors (KNN) - deterministic, takes no seed
        KNeighborsClassifier(n_neighbors=5),
        # Multilayer Perceptron: ten hidden layers of 100 units each
        MLPClassifier(hidden_layer_sizes=(100,) * 10, max_iter=2000,
                      random_state=random_state),
    )

    # scikit-learn's fit() returns the estimator, so fit and predict chain.
    return tuple(model.fit(X_train, y_train).predict(X_test) for model in models)
    
# Function to evaluate models
def evaluate_model(name, y_test, y_pred):
    """Print accuracy, macro-averaged F1 and the confusion matrix of a model.

    Parameters
    ----------
    name : str
        Label used in the report header.
    y_test : array-like
        True labels of the test set.
    y_pred : array-like
        Labels predicted for the test set.
    """
    accuracy = accuracy_score(y_test, y_pred)
    # Macro averaging weights every class equally, whatever its size.
    f1 = f1_score(y_test, y_pred, average='macro')

    report_lines = (
        f"\n{name} Performance:",
        f"Accuracy: {accuracy:.4f}",
        f"F1-score: {f1:.4f}",
    )
    for report_line in report_lines:
        print(report_line)

    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", matrix)

# Experiment 1 - MFCC features
This experiment serves as a baseline for comparison and simply reproduces the results of the original researchers.  
Here we use the small subset to keep the data set size roughly comparable between experiments.

In [16]:
# Inputs: only the MFCC columns, restricted to the small subset
X = features.loc[small, 'mfcc']
# Targets: the top-level genre of the same tracks
y = tracks.loc[small, ('track', 'genre_top')]

# -------------------------- Data Preprocessing --------------------------
X_train, X_test, y_train, y_test = preprocess_data(X, y)

# -------------------------- Model Training --------------------------
y_pred_lr, y_pred_svm, y_pred_knn, y_pred_mlp = train_models(X_train, y_train, X_test)

# -------------------------- Evaluation --------------------------
# Report every model in the same order in which it was trained.
for model_label, model_pred in (
    ("Logistic Regression (LR)", y_pred_lr),
    ("Support Vector Machine (SVM)", y_pred_svm),
    ("K-Nearest Neighbors (KNN)", y_pred_knn),
    ("Multilayer Perceptron (MLP)", y_pred_mlp),
):
    evaluate_model(model_label, y_test, model_pred)


6400 training examples, 1600 testing examples
140 features, 8 classes

Logistic Regression (LR) Performance:
Accuracy: 0.5081
F1-score: 0.4984
Confusion Matrix:
 [[ 99  14   1  38  12  12  11  13]
 [ 22  52  14  14  39  18  19  22]
 [  3   7 131   1  17  16  16   9]
 [ 25   9   5 128   4  12  12   5]
 [ 13  22  15   3 110  14   9  14]
 [ 10  13  18  17  10 116   8   8]
 [ 20  22  21  22  17  16  51  31]
 [  9   8  12   7   8  13  17 126]]

Support Vector Machine (SVM) Performance:
Accuracy: 0.5713
F1-score: 0.5691
Confusion Matrix:
 [[113  17   1  26  12   7  17   7]
 [ 23  89   9   9  36   4  20  10]
 [  3   8 140   1  16  11  15   6]
 [ 32   9   3 131   5   7  10   3]
 [ 16  23  13   1 123   3  12   9]
 [ 16   9  16  14   5 123  10   7]
 [ 18  19  18  22  15   9  68  31]
 [ 11  14  17   5   6   7  13 127]]

K-Nearest Neighbors (KNN) Performance:
Accuracy: 0.5012
F1-score: 0.4895
Confusion Matrix:
 [[ 86  11  13  45  10  17  14   4]
 [ 25  48  21  24  35  20  18   9]
 [  2   3 149   5

# Experiment 2 - Echonest Audio Features
Using the same models as before, we instead train them on the echonest audio features (danceability, energy, etc.).  
Here we switch to the medium subset, since far fewer tracks have Echonest data available.

In [17]:
# Inputs: only the Echonest audio features, restricted to the medium subset
X = features_all.loc[medium, ('echonest', 'audio_features')]
# Targets: the top-level genre of the medium-subset tracks
y = tracks.loc[medium, ('track', 'genre_top')]
# Keep just the rows (track ids) that are present in both X and y
X, y = X.align(y, join='inner', axis=0)

# -------------------------- Data Preprocessing --------------------------
X_train, X_test, y_train, y_test = preprocess_data(X, y)

# -------------------------- Model Training --------------------------
y_pred_lr, y_pred_svm, y_pred_knn, y_pred_mlp = train_models(X_train, y_train, X_test)

# -------------------------- Evaluation --------------------------
# Report every model in the same order in which it was trained.
for model_label, model_pred in (
    ("Logistic Regression (LR)", y_pred_lr),
    ("Support Vector Machine (SVM)", y_pred_svm),
    ("K-Nearest Neighbors (KNN)", y_pred_knn),
    ("Multilayer Perceptron (MLP)", y_pred_mlp),
):
    evaluate_model(model_label, y_test, model_pred)


4224 training examples, 1057 testing examples
8 features, 12 classes

Logistic Regression (LR) Performance:
Accuracy: 0.5904
F1-score: 0.3013
Confusion Matrix:
 [[  0   0   1   0   0   0   0   0   0   4   0   4]
 [  0  24   0   0   6   1   0   0   0   1   0   3]
 [  0   2 187   0   8   7   0   1   0   0   2  70]
 [  0   0   0   0   1   0   0   0   0   0   0   2]
 [  0   8   8   0  15   3   0   0   1   4   4  43]
 [  0   0  43   0   1  45   0   0   0   1   0  16]
 [  0   0   5   0   1   0   0   0   0   0   0   5]
 [  0   0   2   0   3   2   0   1   0   5   0   9]
 [  0   3   4   0   7   2   0   0   0   2   0  11]
 [  0   7   1   0   3   0   0   1   0  49   1   2]
 [  0   1   8   0   7   5   0   0   0   1   5  19]
 [  0   0  47   0   9   6   0   1   0   5   3 298]]

Support Vector Machine (SVM) Performance:
Accuracy: 0.6112
F1-score: 0.3156
Confusion Matrix:
 [[  0   0   1   0   0   1   0   2   0   2   0   3]
 [  0  26   1   0   6   0   0   0   0   1   0   1]
 [  0   2 195   0   8   6   

## Experiment 3 - MFCC + Echonest Audio Features
This experiment combines both feature sets. Note that the number of tracks is the same as in Experiment 2; we simply add the 140 MFCC columns, for 148 features in total.

In [None]:
# Inputs: Echonest audio features and MFCCs side by side (medium subset)
echonest_part = features_all.loc[medium, ('echonest', 'audio_features')]
mfcc_part = features_all.loc[medium, 'mfcc']
X = pd.concat([echonest_part, mfcc_part], axis=1)
# The MFCC column names are tuples while the Echonest ones are strings, so
# convert every column name to a string to get one uniform type.
X.columns = X.columns.astype(str)
# Targets: the top-level genre of the medium-subset tracks
y = tracks.loc[medium, ('track', 'genre_top')]
# Keep just the rows (track ids) that are present in both X and y
X, y = X.align(y, join='inner', axis=0)

# -------------------------- Data Preprocessing --------------------------
X_train, X_test, y_train, y_test = preprocess_data(X, y)

# -------------------------- Model Training --------------------------
y_pred_lr, y_pred_svm, y_pred_knn, y_pred_mlp = train_models(X_train, y_train, X_test)

# -------------------------- Evaluation --------------------------
# Report every model in the same order in which it was trained.
for model_label, model_pred in (
    ("Logistic Regression (LR)", y_pred_lr),
    ("Support Vector Machine (SVM)", y_pred_svm),
    ("K-Nearest Neighbors (KNN)", y_pred_knn),
    ("Multilayer Perceptron (MLP)", y_pred_mlp),
):
    evaluate_model(model_label, y_test, model_pred)


4224 training examples, 1057 testing examples
148 features, 12 classes





Logistic Regression (LR) Performance:
Accuracy: 0.7086
F1-score: 0.5405
Confusion Matrix:
 [[  2   0   1   0   0   1   0   0   1   0   0   4]
 [  0  27   1   1   1   0   0   0   3   0   0   2]
 [  0   0 204   0   5  25   3   1   7   1   5  26]
 [  0   0   1   1   0   0   0   0   0   0   0   1]
 [  0   1   6   1  43   1   0   1   2   1   3  27]
 [  2   0  27   0   1  60   2   0   1   0   1  12]
 [  0   0   2   0   0   0   3   0   1   0   1   4]
 [  1   1   0   0   3   2   0  14   0   0   1   0]
 [  1   3   5   0   5   0   0   0   9   0   0   6]
 [  0   1   1   0   2   0   0   0   0  59   0   1]
 [  1   0   7   0   7   3   0   0   0   0   9  19]
 [  0   2  22   0   9   8   1   6   1   1   1 318]]

Support Vector Machine (SVM) Performance:
Accuracy: 0.7493
F1-score: 0.5083
Confusion Matrix:
 [[  1   0   2   0   0   0   0   0   0   0   0   6]
 [  0  30   1   0   1   0   0   0   1   0   0   2]
 [  0   0 236   0   6   7   0   0   0   0   0  28]
 [  0   0   0   0   1   0   0   0   0   0   0 

### Quick script to get the class labels in confusion-matrix row/column order

In [29]:
# confusion_matrix orders its rows/columns by the sorted union of the labels
# found in y_test and y_pred_mlp. Build the label list from both arrays and
# pass it explicitly, so the printed order is guaranteed to match `cm` even
# if a class is predicted that never occurs in the test split.
labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred_mlp)]))
cm = confusion_matrix(y_test, y_pred_mlp, labels=labels)

# Print out the columns (class labels)
print("Labels: (In order):\n", labels)

Labels: (In order):
 ['Blues' 'Classical' 'Electronic' 'Experimental' 'Folk' 'Hip-Hop'
 'Instrumental' 'International' 'Jazz' 'Old-Time / Historic' 'Pop' 'Rock']
