In [47]:
import pandas as pd
import seaborn as sns
from glob import glob
import json
import random

from IPython.display import Audio, display
import sys
import os

import torch

# Make the directory two levels above this notebook importable. The membership
# check keeps sys.path free of duplicate entries when the cell is re-run in the
# same kernel.
PROJECT_ROOT = os.path.abspath('../../')
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# The previous version of this cell also called
# sns.color_palette("Blues", as_cmap=True) and discarded the returned colormap;
# that call changed no plotting state, so it has been dropped. The plots below
# select their colormap explicitly.

# Names of the log sub-folders holding the 2-class and the 11-class training runs.
UNKNOWN_PROJECT = '2_classes_run_optimal_configs_Conformer'
KNOWN_PROJECT = '11_classes_run_optimal_configs_Conformer'

In [48]:
# Collect the run folders of the 2-class project, ordered by run number.
log_dir_2 = os.path.join("..", "..", "logs", UNKNOWN_PROJECT)
# Always (re)bind the list first: a missing folder must not leave `run_dirs_2`
# undefined, or silently pointing at results from an earlier execution.
run_dirs_2 = []
if not os.path.exists(log_dir_2):
    print(f"Folder '{log_dir_2}' does not exist.")
else:
    run_dirs_2 = sorted(
        (
            path for path in glob(os.path.join(log_dir_2, 'run_*'))
            # Ignore entries such as 'run_final' whose suffix is not a number;
            # they would otherwise crash the integer sort key.
            if os.path.basename(path).split('_')[1].isdecimal()
        ),
        key=lambda path: int(os.path.basename(path).split('_')[1]),
    )
    print(f"Found {len(run_dirs_2)} runs for unknown.")

Found 5 runs for unknown.


In [49]:
# Collect the run folders of the 11-class project, ordered by run number.
log_dir_11 = os.path.join("..", "..", "logs", KNOWN_PROJECT)
# Always (re)bind the list first: a missing folder must not leave `run_dirs_11`
# undefined, or silently pointing at results from an earlier execution.
run_dirs_11 = []
if not os.path.exists(log_dir_11):
    print(f"Folder '{log_dir_11}' does not exist.")
else:
    run_dirs_11 = sorted(
        (
            path for path in glob(os.path.join(log_dir_11, 'run_*'))
            # Ignore entries such as 'run_final' whose suffix is not a number;
            # they would otherwise crash the integer sort key.
            if os.path.basename(path).split('_')[1].isdecimal()
        ),
        key=lambda path: int(os.path.basename(path).split('_')[1]),
    )
    print(f"Found {len(run_dirs_11)} runs for known.")

Found 5 runs for known.


In [50]:
def get_results(run_dirs, skip=1):
    """Collect the best validation accuracy reached by each run.

    Args:
        run_dirs: Sequence of run folder paths whose base name has the form
            ``run_<number>``.
        skip: Non-negative number of leading entries of ``run_dirs`` to ignore.
            Defaults to 1, which reproduces the previous hard-coded
            ``run_dirs[1:]``. NOTE(review): why the first run is excluded is
            not visible in this notebook - confirm it is intentional, or pass
            ``skip=0`` to include every run.

    Returns:
        pandas.DataFrame with the columns 'Run' (run number as int) and
        'Max Validation Accuracy' (maximum of the 'acc/val' column), one row
        per considered run that contains a 'validation_metrics.csv' file.
        Runs without that file are left out.

    Raises:
        KeyError: If a metrics file has no 'acc/val' column.
    """
    max_accuracies = []
    for run_dir in run_dirs[skip:]:
        val_csv = os.path.join(run_dir, 'validation_metrics.csv')
        if not os.path.exists(val_csv):
            # Runs that never wrote validation metrics are skipped.
            continue
        metrics = pd.read_csv(val_csv)
        run_num = int(os.path.basename(run_dir).split('_')[1])
        max_accuracies.append((run_num, metrics['acc/val'].max()))

    return pd.DataFrame(max_accuracies, columns=['Run', 'Max Validation Accuracy'])

In [51]:
# Per-run peak validation accuracy of the 11-class project, followed by its
# descriptive statistics.
results_11 = get_results(run_dirs_11)
summary_11 = results_11['Max Validation Accuracy'].describe()
print("\nStatistical Summary of 11 class:")
print(summary_11)


Statistical Summary of 11 class:
count    4.000000
mean     0.863628
std      0.007673
min      0.852829
25%      0.861143
50%      0.865635
75%      0.868119
max      0.870413
Name: Max Validation Accuracy, dtype: float64


In [52]:
# Per-run peak validation accuracy of the 2-class project, followed by its
# descriptive statistics.
results_2 = get_results(run_dirs_2)
summary_2 = results_2['Max Validation Accuracy'].describe()
print("\nStatistical Summary of 2 class:")
print(summary_2)


Statistical Summary of 2 class:
count    4.000000
mean     0.915679
std      0.005098
min      0.908293
25%      0.914546
50%      0.917288
75%      0.918422
max      0.919848
Name: Max Validation Accuracy, dtype: float64


In [53]:
# Pick the row of the 2-class results with the highest peak validation accuracy.
best_row_label = results_2['Max Validation Accuracy'].idxmax()
best_run = results_2.loc[best_row_label]

print("Best Performing Run:")
print(best_run)

Best Performing Run:
Run                        3.000000
Max Validation Accuracy    0.919848
Name: 1, dtype: float64


In [54]:
# Pick the row of the 11-class results with the highest peak validation accuracy.
# Note that this rebinds `best_run`, replacing the 2-class value.
best_row_label = results_11['Max Validation Accuracy'].idxmax()
best_run = results_11.loc[best_row_label]

print("Best Performing Run:")
print(best_run)

Best Performing Run:
Run                        3.000000
Max Validation Accuracy    0.870413
Name: 1, dtype: float64


In [55]:
# Run numbers whose config and checkpoint are loaded below. Both are set by
# hand to 3, the run reported by each "Best Performing Run" output; update them
# if the results change.
UNKNOWN_RUN = 3
UKNOWN_RUN = UNKNOWN_RUN  # misspelled original name, kept as an alias for existing references
KNOWN_RUN = 3

LOGS_DIR = '../../logs'

# Config file and best checkpoint of the selected 2-class run.
CONFIG_UNKNOWN = f'{LOGS_DIR}/{UNKNOWN_PROJECT}/run_{UNKNOWN_RUN}/config.json'
MODEL_PATH_UNKNOWN = f"{LOGS_DIR}/{UNKNOWN_PROJECT}/run_{UNKNOWN_RUN}/best.pth"
# Config file and best checkpoint of the selected 11-class run.
CONFIG_KNOWN = f'{LOGS_DIR}/{KNOWN_PROJECT}/run_{KNOWN_RUN}/config.json'
MODEL_PATH_KNOWN = f"{LOGS_DIR}/{KNOWN_PROJECT}/run_{KNOWN_RUN}/best.pth"

In [56]:
# Use the GPU for inference when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Also make the directory one level above this notebook importable.
sys.path.append(os.path.abspath('../'))

# Read the JSON configuration stored with the selected 11-class run.
# NOTE(review): only CONFIG_KNOWN is loaded; CONFIG_UNKNOWN is defined but never
# read in the visible cells - confirm both models are meant to share this config.
with open(CONFIG_KNOWN) as f:
    cfg = json.load(f)

class Config:
    """Attribute-style wrapper around a (possibly nested) dictionary.

    Every key of ``dictionary`` becomes an attribute of the instance. Values
    that are themselves dictionaries are wrapped recursively in ``Config``;
    all other values (including lists) are stored unchanged.
    """

    def __init__(self, dictionary):
        for name, value in dictionary.items():
            wrapped = Config(value) if isinstance(value, dict) else value
            setattr(self, name, wrapped)

# Wrap the loaded dict so settings can be read as attributes (cfg.data.root, ...).
# NOTE(review): this rebinds `cfg` from a dict to a Config, so re-running this
# statement without reloading the JSON fails (Config has no .items()).
cfg = Config(cfg)
# Point the dataset root at the data folder two levels above this notebook.
cfg.data.root = os.path.join('..', '..', 'data')

# Override the stored settings so that both flags are enabled for evaluation.
cfg.data.unknown_commands_included = True
cfg.data.silence_included = True

In [57]:
from dataset.dataset import SpeechCommandsDataset

# Build the dataset used for every evaluation below.
# NOTE(review): the variable is named `test_dataset` but the requested split is
# 'validation' - confirm which split is intended.
test_dataset = SpeechCommandsDataset(
    root_dir=cfg.data.root,
    cfg=cfg,
    mode='validation'
)
# Show the command names the dataset exposes as targets.
print(test_dataset.target_commands)

num_classes:  12
Class balance in validation data:
  down: 264
  go: 260
  left: 247
  no: 270
  off: 256
  on: 257
  right: 256
  stop: 246
  up: 260
  yes: 261
  _silence_: 39
  _unknown_: 4221
  unknown percentage: 61.74%
['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', '_silence_']


In [58]:
import torch
from modeling.model import build_model

# Build the model for the 11-class project (second argument is presumably the
# number of output classes - confirm against build_model).
model_known = build_model(cfg, 11)

# Restore the best checkpoint onto the CPU.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True if the installed torch supports it).
state_dict = torch.load(MODEL_PATH_KNOWN, map_location=torch.device('cpu'))
model_known.load_state_dict(state_dict)

# Build the model for the 2-class project.
# NOTE(review): it is built from the same `cfg` as the 11-class model;
# CONFIG_UNKNOWN is never read in the visible cells - confirm the two runs share
# the same architecture settings.
model_unknown = build_model(cfg, 2)

# `state_dict` is reused here and now holds the 2-class weights.
state_dict = torch.load(MODEL_PATH_UNKNOWN, map_location=torch.device('cpu'))
model_unknown.load_state_dict(state_dict)

class EnsembleStrategy(torch.nn.Module):
    """Gate a known-class classifier with a separate unknown detector.

    For every sample, column 1 of ``model_unknown``'s output is compared with
    ``threshold``:

    * above the threshold: all known-class scores are replaced by zeros and the
      unknown score is appended as the last column;
    * otherwise: the known-class scores are kept and a zero is appended.

    The result therefore has one more column than ``model_known``'s output, the
    extra (last) column standing for the unknown class.

    Unlike the previous version, which only inspected ``output_unknown[0][1]``,
    the decision is taken independently for each row, so batches of any size
    are supported. Results for a single-sample batch are unchanged.

    NOTE(review): if the wrapped models return raw logits rather than
    probabilities, 0.5 is not a probability cut-off - confirm the intended
    scale of the unknown score.

    Args:
        model_unknown: Module whose output has shape (batch, >=2).
        model_known: Module whose output has shape (batch, num_known_classes).
        threshold: Score above which a sample is labelled unknown
            (default 0.5, the previously hard-coded value).
    """

    def __init__(self, model_unknown, model_known, threshold=0.5):
        super().__init__()
        self.model_unknown = model_unknown
        self.model_known = model_known
        self.threshold = threshold

    def forward(self, x):
        """Return the combined scores with shape (batch, num_known_classes + 1)."""
        output_unknown = self.model_unknown(x)
        output_known = self.model_known(x)

        # Keep a trailing dimension so the mask broadcasts over the known classes.
        unknown_score = output_unknown[:, 1:2]
        is_unknown = unknown_score > self.threshold

        known_part = torch.where(is_unknown, torch.zeros_like(output_known), output_known)
        unknown_part = torch.where(is_unknown, unknown_score, torch.zeros_like(unknown_score))
        return torch.cat((known_part, unknown_part), dim=1)
# Combine both networks into the ensemble evaluated in the cells below.
model = EnsembleStrategy(model_unknown, model_known)
# Switch the ensemble and its sub-models to evaluation mode.
model.eval()

print("Model loaded successfully!")

Model loaded successfully!


In [63]:
import torch
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import numpy as np
import matplotlib.pyplot as plt

def evaluate_and_plot_confusion_matrix(model, test_dataset, save_path='../../charts/conformer.png'):
    """Evaluate ``model`` sample by sample and plot a row-normalised confusion matrix.

    The model is moved to the module-level ``device`` and put in evaluation
    mode. The overall accuracy is printed, and the figure is shown and, unless
    ``save_path`` is empty, written to disk at 600 dpi.

    Args:
        model: Module returning per-class scores of shape (1, num_classes) for
            a single-sample batch.
        test_dataset: Indexable dataset yielding ``(data, label)`` pairs and
            exposing ``label_mapping`` (class name -> integer label).
        save_path: Destination of the saved figure; missing parent directories
            are created. Pass ``None`` or ``''`` to skip saving. Defaults to
            the previously hard-coded location.
    """
    model.to(device)
    model.eval()

    true_labels = []
    predicted_labels = []

    # One no_grad context around the whole loop instead of one per sample.
    with torch.no_grad():
        for idx in range(len(test_dataset)):
            data, true_label = test_dataset[idx]
            # as_tensor accepts arrays and tensors alike, avoiding the copy
            # warning torch.tensor() emits when `data` is already a tensor.
            data_tensor = torch.as_tensor(data, dtype=torch.float32).unsqueeze(0).to(device)
            output = model(data_tensor)

            true_labels.append(true_label)
            predicted_labels.append(torch.argmax(output, dim=1).item())

    accuracy = np.mean(np.array(true_labels) == np.array(predicted_labels))
    print(f"Accuracy: {accuracy:.3f}")

    # Derive the class ids and their display names from the same mapping, and
    # pass the ids to confusion_matrix explicitly. This keeps tick labels
    # aligned with the matrix rows even when a class never occurs in the data
    # or when the mapping is not stored in label order. If several names share
    # an id, the first one wins.
    name_by_id = {}
    for name, class_id in test_dataset.label_mapping.items():
        name_by_id.setdefault(class_id, name)
    class_ids = sorted(name_by_id)
    class_names = [name_by_id[class_id] for class_id in class_ids]

    cm = confusion_matrix(true_labels, predicted_labels, labels=class_ids, normalize='true')
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)

    # A single figure only: the extra plt.figure() call used to leave an empty
    # figure behind on every invocation.
    fig, ax = plt.subplots(figsize=(8, 6))
    disp.plot(cmap=plt.cm.Blues, xticks_rotation='vertical', values_format=".2f", ax=ax)
    ax.grid(False)
    ax.set_title("Confusion Matrix")
    fig.tight_layout()
    if save_path:
        os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
        fig.savefig(save_path, dpi=600)
    plt.show()

In [None]:
# Evaluate the ensemble on the whole dataset, print its accuracy, and show and save the confusion matrix.
evaluate_and_plot_confusion_matrix(model, test_dataset)

In [43]:
def evaluate_and_display_wrong_predictions(model, test_dataset, n=5, target_label=10):
    """Show up to ``n`` randomly chosen misclassified samples, with audio playback.

    The dataset is visited in random order (not seeded here, so the selection
    differs between calls). For each misclassified sample the index, the true
    and predicted class names and the prediction certainty are printed, and an
    audio widget is displayed.

    The model is moved to the module-level ``device`` and put in evaluation
    mode.

    Args:
        model: Module returning per-class scores of shape (1, num_classes) for
            a single-sample batch.
        test_dataset: Indexable dataset yielding ``(data, label)`` pairs and
            exposing ``label_mapping`` (class name -> integer label) and
            ``get_waveform(idx)``.
        n: Maximum number of wrong predictions to show.
        target_label: Only samples with this true label are inspected. Defaults
            to 10, the previously hard-coded value (printed as '_silence_' in
            the saved output). Pass ``None`` to inspect every class.

    Note:
        The certainty is the softmax over the model's output scores.
        NOTE(review): for the ensemble these scores contain inserted zeros, so
        the value is not a calibrated probability - confirm it is interpreted
        accordingly.
    """
    model.to(device)
    model.eval()

    indices = list(range(len(test_dataset)))
    random.shuffle(indices)

    def label_name(label):
        # First class name mapped to `label`; empty string if there is none.
        return next((k for k, v in test_dataset.label_mapping.items() if v == label), '')

    wrong_predictions = []

    with torch.no_grad():
        for idx in indices:
            # Checked up front so that n <= 0 collects nothing and the scan
            # stops as soon as enough examples were found.
            if len(wrong_predictions) >= n:
                break

            data, true_label = test_dataset[idx]
            if target_label is not None and true_label != target_label:
                continue
            data_tensor = torch.as_tensor(data, dtype=torch.float32).unsqueeze(0).to(device)

            output = model(data_tensor)
            probabilities = torch.softmax(output, dim=1)
            predicted_label = torch.argmax(output, dim=1).item()
            prediction_certainty = probabilities[0, predicted_label].item()

            if predicted_label != true_label:
                wrong_predictions.append((idx, true_label, predicted_label, prediction_certainty))

    for idx, true_label, predicted_label, prediction_certainty in wrong_predictions:
        print(f"Sample Index: {idx}")
        print(f"{true_label}, {predicted_label}, {prediction_certainty}")

        true_label_name = label_name(true_label)
        predicted_label_name = label_name(predicted_label)
        print(f"True Label: {true_label_name}, Predicted Label: {predicted_label_name}")
        print(f"Prediction Certainty: {prediction_certainty:.2f}")

        waveform, _ = test_dataset.get_waveform(idx)
        display(Audio(waveform, rate=cfg.data.sample_rate))
        print("-" * 50)

In [44]:
# List up to five (the default n) misclassified samples of true label 10, each with an audio player.
evaluate_and_display_wrong_predictions(model, test_dataset)


Sample Index: 6800
10, 6, 0.509315013885498
True Label: _silence_, Predicted Label: on
Prediction Certainty: 0.51


--------------------------------------------------
Sample Index: 6802
10, 9, 0.7085205912590027
True Label: _silence_, Predicted Label: go
Prediction Certainty: 0.71


--------------------------------------------------
Sample Index: 6811
10, 11, 0.4244641065597534
True Label: _silence_, Predicted Label: _unknown_
Prediction Certainty: 0.42


--------------------------------------------------
Sample Index: 6830
10, 11, 0.18869343400001526
True Label: _silence_, Predicted Label: _unknown_
Prediction Certainty: 0.19


--------------------------------------------------
