# TASK 3 Classification



In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score, \
    confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score, train_test_split

In [2]:
# Directories holding the per-file feature archives and label archives.
features_dir = 'audio_features'
labels_dir = 'labels'

# Names of all annotated sound classes (keys of the per-file label archives).
categories = [
    'Airplane', 'Alarm', 'Beep/Bleep', 'Bell', 'Bicycle', 'Bird Chirp', 'Bus', 'Car', 'Cat Meow',
    'Chainsaw', 'Clapping', 'Cough', 'Cow Moo', 'Cowbell', 'Crying', 'Dog Bark', 'Doorbell', 'Drip',
    'Drums', 'Fire', 'Footsteps', 'Guitar', 'Hammer', 'Helicopter', 'Hiccup', 'Horn Honk', 'Horse Neigh',
    'Insect Buzz', 'Jackhammer', 'Laughter', 'Lawn Mower', 'Motorcycle', 'Piano', 'Pig Oink', 'Power Drill',
    'Power Saw', 'Rain', 'Rooster Crow', 'Saxophone', 'Sewing Machine', 'Sheep/Goat Bleat', 'Ship/Boat',
    'Shout', 'Singing', 'Siren', 'Sneeze', 'Snoring', 'Speech', 'Stream/River', 'Thunder', 'Train', 'Truck',
    'Trumpet', 'Vacuum Cleaner', 'Violin', 'Washing Machine', 'Waves', 'Wind',
]

# Metadata table; its "filename" column lists the files to load.
metadata = pd.read_csv('metadata.csv')
train_files = metadata["filename"]

print(f"Shape of metadata: {metadata.shape}")
print(f"Amount of categories: {len(categories)}")

Shape of metadata: (8230, 12)
Amount of categories: 58


## Aggregating multiple annotations of the same class within a frame

In [3]:
def aggregate_labels(file_labels, rng=None):
    """
    Collapse the multiple annotations of one class into a single label per frame.

    For every frame (one entry of ``file_labels``):

    * the annotations sum to zero      -> label 0
    * every annotation is non-zero     -> label 1
    * otherwise (annotators disagree)  -> one of the frame's annotations,
      drawn uniformly at random

    :param file_labels: iterable of per-frame annotation sequences (list of lists)
    :param rng: optional ``numpy.random.Generator`` used for the random draw in the
        disagreement case. Pass a seeded generator to get reproducible labels.
        When ``None`` (default) the global ``np.random`` state is used.
    :return: list with one single-element list per frame, e.g. ``[[0], [1], ...]``
    """
    # Pick the sampling function once; the default keeps using the global RNG.
    choose = np.random.choice if rng is None else rng.choice

    __y = []
    for frame_labels in file_labels:
        if sum(frame_labels) == 0:
            __y.append([0])
        elif np.count_nonzero(frame_labels) == len(frame_labels):
            __y.append([1])
        else:  # The annotators don't agree on the label
            __y.append([choose(frame_labels)])
    return __y

## Reading the files

In [4]:
import itertools


def read_files(file_names, num_to_read=1000):
    """
    Load frame-level features and aggregated labels for a list of audio files.

    For each of the first ``num_to_read`` names, the part of the name before the
    first '.' is used to locate ``<features_dir>/<stem>.npz`` (array "embeddings")
    and ``<labels_dir>/<stem>_labels.npz`` (one array per category). Files whose
    feature archive does not exist are skipped silently.

    :param file_names: sliceable sequence of file names
    :param num_to_read: maximum number of file names to process
    :return: tuple ``(X_train, Y_train)`` where ``X_train`` is the concatenation of
        all loaded "embeddings" arrays and ``Y_train`` maps every category name to
        a flat list of aggregated labels, in the same file order
    :raises ValueError: from ``np.concatenate`` when no feature file was found at all
    """
    X_train = []
    Y_train = {c: [] for c in categories}
    for f in file_names[:num_to_read]:  # we are not loading the entire dataset due to processing time
        stem = f.split('.')[0]
        features_path = os.path.join(features_dir, stem + '.npz')
        if not os.path.exists(features_path):
            continue
        # np.load on an .npz returns an archive object holding an open file handle;
        # the context manager closes it instead of leaking one handle per file.
        with np.load(features_path) as feature_archive:
            X_train.append(feature_archive["embeddings"])
        with np.load(os.path.join(labels_dir, stem + '_labels.npz')) as y:
            for c in categories:
                _y = aggregate_labels(y[c])
                # aggregate_labels yields one single-element list per frame; flatten it.
                Y_train[c].extend(itertools.chain.from_iterable(_y))
    X_train = np.concatenate(X_train)
    return X_train, Y_train


In [5]:
# Load features and labels for every file listed in the metadata
X_train, Y_train = read_files(train_files, num_to_read=len(train_files))
print(
    f"Shape of X_train: {X_train.shape}",
    f"Shape of Y_train: {len(Y_train['Wind'])}",
    sep="\n",
)

Shape of X_train: (1538577, 768)
Shape of Y_train: 1538577


## Baseline Classifier

In [6]:
class Baseline_classifier():
    """Majority-class baseline: always predicts the most frequent training label."""

    def __init__(self):
        # Set by fit(); None means the classifier has not been fitted yet.
        self.majority_class = None

    def fit(self, x_train, y_train):
        '''Remember the most common label of the training set.

        x_train is a numpy array of features with shape NxD, where N is the number of datapoints and D the feature dimension
        (it is not used by this baseline).
        y_train holds the non-negative integer labels, either flat (N,) or as a column (Nx1).
        Returns the fitted classifier itself.
        '''
        # np.bincount only accepts 1-D input, so flatten an Nx1 column first.
        labels = np.asarray(y_train).ravel()
        # choose whatever is the most common label
        self.majority_class = np.argmax(np.bincount(labels))
        return self

    def predict(self, x):
        '''Predict the majority class for every row of x.

        x is a numpy array of features with shape NxD, where N is the number of datapoints and D the feature dimension
        The function returns a numpy array of shape (N,) filled with the majority label learned in fit().
        Raises RuntimeError when called before fit().
        '''
        if self.majority_class is None:
            raise RuntimeError("Baseline_classifier.predict() called before fit()")
        # every datapoint gets the same prediction, regardless of its features
        return np.full(x.shape[0], self.majority_class)


In [7]:

# Binary task: does a frame contain wind or not?
wind_x = X_train
wind_y = np.array(Y_train['Wind'])

# Fit the baseline on all frames and predict the same frames again
baseline = Baseline_classifier()
baseline.fit(wind_x, wind_y.astype(int))
y_train_pred = baseline.predict(wind_x)

# Share of frames whose prediction equals the true label
train_fraction_correct = (y_train_pred == wind_y).mean()

print(f"Training Fraction Correct: {train_fraction_correct:.2f}")

Training Fraction Correct: 0.93


### The data is very imbalanced: always predicting the majority class is already correct for 93% of the frames
## Plotting decision boundaries (for wind)

In [8]:
# # Plotting function
# def plot_decision_boundary(knn, X, y, title, highlight_point=None):
#     h = 0.1
#     x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
#     y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
#     xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
#                          np.arange(y_min, y_max, h))
#     Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
#     Z = Z.reshape(xx.shape)
#
#     plt.figure(figsize=(5, 3))
#     plt.contourf(xx, yy, Z, cmap='coolwarm', alpha=0.3)
#     plt.scatter(X[:, 0], X[:, 1], c=y, cmap='bwr', edgecolor='k', s=100, label='Train data')
#     if highlight_point is not None:
#         plt.scatter(highlight_point[0][0], highlight_point[0][1], color='gold', edgecolor='k',
#                     marker='*', s=250, label='Test point (not in train)')
#     plt.title(title)
#     plt.xlabel("Feature 1")
#     plt.ylabel("Feature 2")
#     plt.legend()
#     plt.grid(True)
#     plt.show()

In [9]:

# # Plotting decision boundaries
#
# knn = KNeighborsClassifier(n_neighbors=1)
# knn.fit(wind_x, wind_y)
#
# plot_decision_boundary(baseline, wind_x, wind_y, "Baseline")

## Use train-test split and create confusion matrix

In [10]:
# # Split data
# X_train_wind, X_test_wind, y_train_wind, y_test_wind = train_test_split(
#     wind_x, wind_y, test_size=0.3, random_state=42
# )
#
# # Train model
# knn = KNeighborsClassifier(n_neighbors=1)
# knn.fit(X_train_wind, y_train_wind)
#
# # Predictions
# y_train_pred = knn.predict(X_train_wind)
# y_test_pred = knn.predict(X_test_wind)
#
# # Create subplots
# fig, axes = plt.subplots(1, 2, figsize=(8, 3))  # Side by side, smaller overall size
#
# # Train confusion matrix
# cm_train = confusion_matrix(y_train_wind, y_train_pred)
# disp_train = ConfusionMatrixDisplay(confusion_matrix=cm_train, display_labels=knn.classes_)
# disp_train.plot(cmap='Blues', ax=axes[0], colorbar=False)
# axes[0].set_title("Confusion Matrix - Train Set")
#
# # Test confusion matrix
# cm_test = confusion_matrix(y_test_wind, y_test_pred)
# disp_test = ConfusionMatrixDisplay(confusion_matrix=cm_test, display_labels=knn.classes_)
# disp_test.plot(cmap='Blues', ax=axes[1], colorbar=False)
# axes[1].set_title("Confusion Matrix - Test Set")
#
# plt.tight_layout()
# plt.show()


In [11]:
# Shuffle the unique file names ONCE with a fixed seed, then cut the result into
# consecutive train / validation / test parts. Splitting on file names (not on
# frames) keeps all frames of one recording inside a single split.
shuffled_files = metadata.sample(len(metadata), random_state=42)["filename"].unique()

# Boundaries are derived from the number of unique files, so the proportions
# stay correct even if a file name occurs in more than one metadata row.
n_unique_files = len(shuffled_files)
train_end = int(n_unique_files * 0.8)
val_end = int(n_unique_files * 0.9)

train_files = shuffled_files[:train_end]
validation_files = shuffled_files[train_end:val_end]
test_files = shuffled_files[val_end:]
# So final split: 80% train, 10% val, 10% test

print(f"Train files: {len(train_files)}")
print(f"Validation files: {len(validation_files)}")
print(f"Test files: {len(test_files)}")


Train files: 6584
Validation files: 823
Test files: 823


In [12]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, balanced_accuracy_score
)

# y_true: ground truth labels
# y_pred: predicted class labels (e.g., 0 or 1)
# y_scores: predicted probabilities or decision function scores

# accuracy      = accuracy_score(wind_y_test, y_test_pred)
# precision     = precision_score(wind_y_test, y_test_pred, zero_division=0)
# recall        = recall_score(wind_y_test, y_test_pred)
# f1            = f1_score(wind_y_test, y_test_pred)
# roc_auc       = roc_auc_score(wind_y_test, y_test_pred)
# pr_auc        = average_precision_score(wind_y_test, y_test_pred)
# weighted_acc  = balanced_accuracy_score(wind_y_test, y_test_pred)

# print(f"Accuracy:         {accuracy:.3f}")
# print(f"Weighted Accuracy:{weighted_acc:.3f}")
# print(f"Precision:        {precision:.3f}")
# print(f"Recall:           {recall:.3f}")
# print(f"F1 Score:         {f1:.3f}")
# print(f"ROC AUC:          {roc_auc:.3f}")
# print(f"PR AUC:           {pr_auc:.3f}")


## Training Classifiers

In [13]:
X_train, Y_train = read_files(train_files, len(train_files)) # 80%
X_val, Y_val = read_files(validation_files, len(validation_files)) # 10%
X_test, Y_test = read_files(test_files, len(test_files)) # 10%

# X_train, Y_train = read_files(train_files, 500)
# X_val, Y_val = read_files(validation_files, 200)
# X_test, Y_test = read_files(test_files, 200)

#subsampling the training data to reduce run time
# TODO: HERE YOU CAN CHANGE SAMPLE SIZE
sample_size = len(X_train)
# Seeded generator so the drawn (and shuffled) subset is identical on every run;
# 42 matches the random_state used elsewhere in this notebook.
subsample_rng = np.random.default_rng(42)
indices = subsample_rng.choice(len(X_train), size=sample_size, replace=False)

X_train = X_train[indices]
for c in categories:
    # Array fancy-indexing applies the same row selection as for X_train without
    # a Python-level loop over every frame.
    Y_train[c] = np.asarray(Y_train[c])[indices]

# This is still wind
wind_x_train, wind_y_train = X_train, np.array(Y_train['Wind'])
wind_x_test, wind_y_test = X_test, np.array(Y_test['Wind'])
wind_x_val, wind_y_val = X_val, np.array(Y_val['Wind'])

In [14]:
# from sklearn.tree import DecisionTreeClassifier
#
# # Define the model
# dt = DecisionTreeClassifier(random_state=42)

In [15]:
# param_grid = {
#     'max_depth': [3, 5, 10, None],
#     'min_samples_split': [2, 5, 10],
#     'criterion': ['gini', 'entropy']
# }
# best_score = 0
# best_params = None
# best_model = None
#
# # Generate all combinations of hyperparameters
# for max_depth, min_samples_split, criterion in itertools.product(
#     param_grid['max_depth'],
#     param_grid['min_samples_split'],
#     param_grid['criterion']
# ):
#     model = DecisionTreeClassifier(max_depth=max_depth,min_samples_split=min_samples_split,criterion=criterion,random_state=42)
#     model.fit(wind_x_train, wind_y_train)
#
#     y_val_pred = model.predict(wind_x_val)
#     score = balanced_accuracy_score(wind_y_val, y_val_pred)
#     print(f"Params: max_depth={max_depth}, min_samples_split={min_samples_split}, criterion={criterion} --> Accuracy: {score:.4f}")
#
#     if score > best_score:
#         best_score = score
#         best_params = {
#             'max_depth': max_depth,
#             'min_samples_split': min_samples_split,
#             'criterion': criterion
#         }
#         best_model = model
# print("\nBest Parameters:")
# print(best_params)
# print(f"Best Validation Accuracy: {best_score:.4f}")


## Multilabel classification (what we need)

In [16]:
# select all classes (same names, same order as `categories`)
selected_classes = list(categories)

# selected_classes = ['Alarm', 'Wind', 'Dog Bark']


def _label_matrix(labels_by_class, classes):
    """Stack the label lists of `classes` as columns: one row per frame, one column per class."""
    return np.array([labels_by_class[name] for name in classes]).T


y_train = _label_matrix(Y_train, selected_classes)
y_val = _label_matrix(Y_val, selected_classes)
y_test = _label_matrix(Y_test, selected_classes)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import balanced_accuracy_score, f1_score, accuracy_score #, classification_report

X = X_train
Y = y_train

# Fit classifier: one independent decision tree per label column
# NOTE(review): n_jobs=30 is hard-coded; adjust it to the cores available on your machine.
base_clf = DecisionTreeClassifier(random_state=42)
br_clf = MultiOutputClassifier(base_clf, n_jobs=30)
br_clf.fit(X, Y)

# Predict
Y_pred = br_clf.predict(X_val)

# Compute balanced accuracy per label, then average over labels
n_labels = y_val.shape[1]
balanced_accuracies = [balanced_accuracy_score(y_val[:, i], Y_pred[:, i]) for i in range(n_labels)]
balanced_accuracy_macro = np.mean(balanced_accuracies)
print("Macro-Averaged Balanced Accuracy:", balanced_accuracy_macro)

# Classification report
# print(classification_report(y_val, Y_pred, target_names=selected_classes, zero_division=0))

# Macro F1 (mean of the per-label binary F1 scores)
f1_scores = [f1_score(y_val[:, i], Y_pred[:, i]) for i in range(n_labels)]
print("Macro-Averaged F1 Score:", np.mean(f1_scores))

# Equal weight to each class, good when imbalanced
macro_f1 = f1_score(y_val, Y_pred, average='macro')
print("Macro-Averaged F1 Score:", macro_f1)

# Aggregates total TP, FP, and FN across all classes
micro_f1 = f1_score(y_val, Y_pred, average='micro')
print("Micro-Averaged F1 Score:", micro_f1)

# all labels for a sample must match exactly
subset_acc = accuracy_score(y_val, Y_pred)
print("Subset Accuracy (Exact Match Ratio):", subset_acc)

# Per class F1 score
per_class_f1 = f1_score(y_val, Y_pred, average=None)
for cls, score in zip(selected_classes, per_class_f1):
    print(f"{cls}: F1 Score = {score:.3f}")

# ---------------------------------------
import joblib
from datetime import datetime as dt

# Read the clock ONCE: taking hour and minute from two separate now() calls can
# straddle a minute/hour boundary and produce a time that never existed.
saved_at = dt.now()
minute = saved_at.minute
hour = saved_at.hour

# The file name carries only hour and minute, so a later run at the same time of
# day overwrites the earlier file.
joblib.dump(br_clf, f'MultiOutputClassifier{hour}-{minute}.joblib')

## Export/Import model and test on test set

In [24]:
# Load the model
# NOTE(review): joblib.load unpickles the file and can therefore execute arbitrary
# code - only load model files you created yourself.
# NOTE(review): this file name does not follow the 'MultiOutputClassifier{hour}-{minute}.joblib'
# pattern used when saving above - confirm it is the model you intend to test.
br_clf_loaded = joblib.load('MultiOutputClassifier2.joblib')

# Use it to predict on the test set
Y_test_pred = br_clf_loaded.predict(X_test)

# Evaluate performance on the test set
from sklearn.metrics import classification_report, balanced_accuracy_score, f1_score

# Must list the classes the loaded model was trained on, in the same order,
# because prediction columns are matched to these names by position
selected_classes = ['Alarm', 'Wind', 'Dog Bark']

# Fail early with a clear message when the loaded model predicts a different
# number of labels than listed above (e.g. a model trained on all classes).
n_outputs = Y_test_pred.shape[1] if Y_test_pred.ndim > 1 else 1
if n_outputs != len(selected_classes):
    raise ValueError(
        f"Loaded model predicts {n_outputs} label(s) but selected_classes has "
        f"{len(selected_classes)}; use the class list the model was trained on."
    )

# True labels
y_test = np.array([Y_test[cls] for cls in selected_classes]).T

# Report
print(classification_report(y_test, Y_test_pred, target_names=selected_classes, zero_division=0))
balanced_accuracies = [balanced_accuracy_score(y_test[:, i], Y_test_pred[:, i]) for i in range(y_test.shape[1])]
print("Macro-Averaged Balanced Accuracy (Test):", np.mean(balanced_accuracies))
print("Macro-Averaged F1 Score (Test):",
      np.mean([f1_score(y_test[:, i], Y_test_pred[:, i]) for i in range(y_test.shape[1])]))

              precision    recall  f1-score   support

       Alarm       0.10      0.09      0.09       393
        Wind       0.21      0.15      0.18      3120
    Dog Bark       0.41      0.52      0.46       547

   micro avg       0.24      0.20      0.22      4060
   macro avg       0.24      0.25      0.24      4060
weighted avg       0.22      0.20      0.21      4060
 samples avg       0.02      0.02      0.02      4060

Macro-Averaged Balanced Accuracy (Test): 0.6148565158349437
Macro-Averaged F1 Score (Test): 0.24322226566988792


In [26]:
# from sklearn.metrics import classification_report
# n_labels = y_val.shape[1]
# balanced_accuracies = []
#
# for i in range(n_labels):
#     score = balanced_accuracy_score(y_val[:, i], Y_pred[:, i])
#     balanced_accuracies.append(score)
#
# # Macro-average across labels
# balanced_accuracy_macro = np.mean(balanced_accuracies)
#
# print("Macro-Averaged Balanced Accuracy:", balanced_accuracy_macro)
#
#
# print(classification_report(y_val, Y_pred, target_names=selected_classes, zero_division=0))
# f1_scores = [f1_score(y_val[:, i], Y_pred[:, i]) for i in range(n_labels)]
# print("Macro-Averaged F1 Score:", np.mean(f1_scores))

Macro-Averaged Balanced Accuracy: 0.709200995571798




ValueError: Classification metrics can't handle a mix of multilabel-indicator and multiclass-multioutput targets

## Using GPU (not working)

In [None]:
# from sklearn.multioutput import MultiOutputClassifier
# from xgboost import XGBClassifier
#
#
# for i in range(y_train.shape[1]):
#     vals = np.unique(y_train[:, i])
#     if not np.all(np.isin(vals, [0, 1])):
#         print(f"Unexpected values in column {i}: {vals}")

In [20]:
# Sanitize labels
# y_train = y_train.astype(np.int32)
# y_val = y_val.astype(np.int32)
#
# # Define GPU-enabled base classifier
# xgb_clf = XGBClassifier(tree_method='hist', eval_metric='logloss', verbosity=1, device='cuda')
#
# # Wrap in MultiOutputClassifier
# multi_clf = MultiOutputClassifier(xgb_clf)
#
# # Fit
# multi_clf.fit(X_train, y_train)
#
# # Predict and evaluate
# y_pred = multi_clf.predict(X_val)


In [21]:
# n_labels = y_val.shape[1]
# balanced_accuracies = []
#
# for i in range(n_labels):
#     score = balanced_accuracy_score(y_val[:, i], Y_pred[:, i])
#     balanced_accuracies.append(score)
#
# # Macro-average across labels
# balanced_accuracy_macro = np.mean(balanced_accuracies)
#
# print("Macro-Averaged Balanced Accuracy:", balanced_accuracy_macro)
#
#
# print(classification_report(y_val, Y_pred, target_names=selected_classes))
# f1_scores = [f1_score(y_val[:, i], Y_pred[:, i]) for i in range(n_labels)]
# print("Macro-Averaged F1 Score:", np.mean(f1_scores))