In [None]:
import torch
import subprocess
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import torch.optim as optim
import numpy as np

from torch.utils.data import TensorDataset, DataLoader
from typing import List
from dataclasses import dataclass
from scipy.io import loadmat
from tqdm import tqdm
from torch import nn

In [None]:
class Net(nn.Sequential):
    """Fully connected classifier assembled as a flat ``nn.Sequential``.

    Layer order (the positions matter because ``state_dict`` keys are the
    positional indices of the modules):

    * first hidden block: Linear -> BatchNorm1d -> Dropout (only inserted
      when ``dropout > 0``) -> activation
    * each further hidden block: Linear -> BatchNorm1d -> Dropout ->
      activation (here the Dropout module is always inserted, even when
      ``dropout == 0.0``)
    * output layer: Linear(hidden_dim[-1], out_dim)

    Args:
        input_dim: size of the input feature vector.
        hidden_dim: sequence with the width of every hidden layer; must
            contain at least one entry.
        activation: module placed at the end of every hidden block. The very
            same instance is reused in all blocks.
        out_dim: number of output units.
        dropout: probability passed to the ``nn.Dropout`` modules.

    Raises:
        AssertionError: if ``hidden_dim`` is empty.
    """

    def __init__(
        self,
        input_dim=85,
        hidden_dim=[
            4,
        ],
        activation=nn.ReLU(),
        out_dim=10,
        dropout=0.0,
    ):
        assert len(hidden_dim) > 0, "at least one hidden layer"

        # input -> first hidden block
        first_width = hidden_dim[0]
        modules = [nn.Linear(input_dim, first_width), nn.BatchNorm1d(first_width)]
        if dropout > 0.0:
            modules.append(nn.Dropout(dropout))
        modules.append(activation)

        # remaining hidden blocks, built from consecutive width pairs
        for fan_in, fan_out in zip(hidden_dim[:-1], hidden_dim[1:]):
            modules += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.Dropout(dropout),
                activation,
            ]

        # output projection
        modules.append(nn.Linear(hidden_dim[-1], out_dim))
        super().__init__(*modules)

In [None]:
def train_epoch(network, loss_fn, dataloader, optimizer, device="cpu"):
    """Run one optimisation pass over ``dataloader``.

    Args:
        network: model to train; it is moved to ``device`` and switched to
            train mode.
        loss_fn: callable invoked as ``loss_fn(outputs, targets)``.
        dataloader: iterable yielding ``(inputs, targets)`` batches.
        optimizer: optimizer holding the parameters of ``network``.
        device: device the model and the batches are moved to.

    Returns:
        ``(average_loss, accuracy)``: the mean of the per-batch loss values
        and the fraction of samples whose highest-scoring output matches the
        target.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no batches.
    """
    network.to(device)
    network.train()

    running_loss = 0.0
    batches_seen = 0
    hits = 0.0
    samples_seen = 0

    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)

        # standard step: clear old gradients, forward, backward, update
        optimizer.zero_grad()
        logits = network(inputs)
        batch_loss = loss_fn(logits, targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        batches_seen += 1

        # predicted class = index of the largest output along dim 1
        hits += (logits.max(dim=1).indices == targets).sum().item()
        samples_seen += targets.size(0)

    return running_loss / batches_seen, hits / samples_seen

In [None]:
def validate_epoch(network, loss_fn, dataloader, device="cpu"):
    """Evaluate ``network`` on ``dataloader`` without updating it.

    Args:
        network: model to evaluate; it is moved to ``device`` and switched to
            eval mode.
        loss_fn: callable invoked as ``loss_fn(outputs, targets)``.
        dataloader: iterable yielding ``(inputs, targets)`` batches.
        device: device the model and the batches are moved to.

    Returns:
        ``(average_loss, accuracy)``: the mean of the per-batch loss values
        and the fraction of samples whose highest-scoring output matches the
        target.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no batches.
    """
    network.to(device)
    network.eval()

    loss_total = 0.0
    batch_count = 0
    correct_total = 0.0
    sample_count = 0

    # gradients are not needed for evaluation
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)

            logits = network(inputs)
            loss_total += loss_fn(logits, targets).item()
            batch_count += 1

            # predicted class = index of the largest output along dim 1
            correct_total += (logits.max(dim=1).indices == targets).sum().item()
            sample_count += targets.size(0)

    return loss_total / batch_count, correct_total / sample_count

In [None]:
def train(
    network,
    loss_fn,
    train_dataloader,
    val_dataloader,
    optimizer,
    num_epochs,
    device="cpu",
):
    """Train ``network`` for ``num_epochs`` epochs, validating after each one.

    Every epoch runs ``train_epoch`` on ``train_dataloader`` followed by
    ``validate_epoch`` on ``val_dataloader``; the latest metrics are shown as
    the postfix of a tqdm progress bar.

    Args:
        network: model to train.
        loss_fn: loss callable forwarded to both epoch functions.
        train_dataloader: batches used for the optimisation pass.
        val_dataloader: batches used for the evaluation pass.
        optimizer: optimizer forwarded to ``train_epoch``.
        num_epochs: number of epochs to run.
        device: device forwarded to both epoch functions.

    Returns:
        ``(train_losses, val_losses, train_acc, val_acc)``: four lists with
        one entry per epoch.
    """
    history = {"train_loss": [], "val_loss": [], "train_acc": [], "val_acc": []}

    epochs = tqdm(range(num_epochs), desc="Training Progress")
    for _ in epochs:
        # optimisation pass
        epoch_train_loss, epoch_train_acc = train_epoch(
            network, loss_fn, train_dataloader, optimizer, device=device
        )
        history["train_loss"].append(epoch_train_loss)
        history["train_acc"].append(epoch_train_acc)

        # evaluation pass
        epoch_val_loss, epoch_val_acc = validate_epoch(
            network, loss_fn, val_dataloader, device=device
        )
        history["val_loss"].append(epoch_val_loss)
        history["val_acc"].append(epoch_val_acc)

        # show the latest metrics next to the progress bar
        epochs.set_postfix(
            {
                "Train Loss": epoch_train_loss,
                "Val Loss": epoch_val_loss,
                "Train Acc": epoch_train_acc,
                "Val Acc": epoch_val_acc,
            }
        )

    return (
        history["train_loss"],
        history["val_loss"],
        history["train_acc"],
        history["val_acc"],
    )

Primero cargamos los datos de entrenamiento y de validación, los normalizamos y creamos los datasets correspondientes.

In [None]:
# load the MATLAB files holding the training and validation data
dataset_train = loadmat("training_data.mat")
dataset_validation = loadmat("validation_data.mat")

# extract the feature matrices and flatten the label arrays to 1-D
x_train = dataset_train["features_train"]
y_train = dataset_train["labels_train_int"].flatten()
x_val = dataset_validation["features_validation"]
y_val = dataset_validation["labels_validation_int"].flatten()

# per-feature statistics are computed on the training set only and reused for
# every other split, so all data goes through exactly the same transform
x_train_mean = x_train.mean(axis=0, keepdims=True)
x_train_std = x_train.std(axis=0, keepdims=True)

# a feature that is constant over the training set has std == 0; dividing by it
# would fill that column with NaN/inf, so such columns are scaled by 1 instead
x_train_std[x_train_std == 0] = 1.0

x_train_norm = (x_train - x_train_mean) / x_train_std
x_val_norm = (x_val - x_train_mean) / x_train_std

# convert the arrays to tensors (float32 features, int64 class labels)
x_train_tensor = torch.tensor(x_train_norm, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

x_val_tensor = torch.tensor(x_val_norm, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

# create TensorDataset
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
val_dataset = TensorDataset(x_val_tensor, y_val_tensor)

In [None]:
@dataclass
class NetParams:
    """Hyper-parameters of one training run, consumed by ``create_net``."""
    # learning rate handed to the SGD optimizer
    learning_rate: float
    # batch size used by both the training and the validation DataLoader
    batch_size: int
    # number of epochs passed to train()
    num_epochs: int
    # hidden layer widths, forwarded to Net(hidden_dim=...)
    hidden_dim: List[int]
    # activation module, forwarded to Net(activation=...)
    activation: nn.Module
    # dropout probability, forwarded to Net(dropout=...)
    dropout: float


def create_net(net_params: NetParams):
    """Build, train and report on a ``Net`` for one hyper-parameter set.

    The network is trained on the module-level ``train_dataset`` and
    evaluated on ``val_dataset`` after every epoch, on the CPU, using SGD
    with momentum 0.85 and cross-entropy loss. The best and the last
    validation accuracy are printed and two figures are shown (loss and
    accuracy per epoch). The trained network itself is neither returned nor
    saved.

    Args:
        net_params: hyper-parameters of the run.

    Returns:
        The validation accuracy of the final epoch, as a fraction.
    """
    # define network
    net = Net(
        hidden_dim=net_params.hidden_dim,
        activation=net_params.activation,
        dropout=net_params.dropout,
    )

    # create the optimizer and loss function
    optimizer = optim.SGD(net.parameters(), lr=net_params.learning_rate, momentum=0.85)
    loss_fn = nn.CrossEntropyLoss()

    # create the data loaders (only the training data is shuffled)
    train_dataloader = DataLoader(
        train_dataset, batch_size=net_params.batch_size, shuffle=True
    )
    val_dataloader = DataLoader(
        val_dataset, batch_size=net_params.batch_size, shuffle=False
    )

    # train the net
    train_losses, val_losses, train_acc, val_acc = train(
        net,
        loss_fn,
        train_dataloader,
        val_dataloader,
        optimizer,
        net_params.num_epochs,
        device="cpu",
    )

    # epochs are reported 1-based; index() returns the first epoch that
    # reached the best accuracy
    best_val_acc = max(val_acc)
    best_epoch = val_acc.index(best_val_acc) + 1
    print(f"Best validation accuracy: {best_val_acc * 100:.2f}% in epoch {best_epoch}")

    print(f"Last validation accuracy: {val_acc[-1] * 100:.2f}%")

    # plot the results; each figure gets its own explicit axes so the curves
    # can never be drawn onto a figure left over from earlier code
    loss_fig, loss_ax = plt.subplots()
    loss_ax.plot(train_losses, label="train loss")
    loss_ax.plot(val_losses, label="val_loss")
    loss_ax.legend()
    loss_ax.set_xlabel("epoch")
    loss_ax.set_ylabel("Loss")
    plt.show()

    acc_fig, acc_ax = plt.subplots()
    acc_ax.plot(train_acc, label="train acc")
    acc_ax.plot(val_acc, label="val acc")
    acc_ax.legend()
    acc_ax.set_xlabel("epoch")
    acc_ax.set_ylabel("accuracy")
    plt.show()

    return val_acc[-1]

Creamos nuestra primera red con las siguientes características:
- Optimizador: SGD con momentum ('sgdm')
- Tasa de aprendizaje inicial: 0.01
- Número de épocas: 10
- Tamaño del batch: 512

In [None]:
# running list of (final validation accuracy, NetParams) pairs, one per experiment
net_accuracies = []

# first configuration: one hidden layer of 4 units, ReLU, no dropout,
# lr=0.01, batch size 512, 10 epochs
net_params = NetParams(
    learning_rate=0.01,
    batch_size=512,
    num_epochs=10,
    hidden_dim=[4],
    activation=nn.ReLU(),
    dropout=0.0,
)

# create_net trains the network, prints/plots its metrics and returns the
# validation accuracy of the last epoch
val_accuracy = create_net(net_params)
net_accuracies.append((val_accuracy, net_params))

In [None]:
# same as the first configuration except for the learning rate, raised from 0.01 to 10
# NOTE(review): presumably a deliberately oversized step to show unstable training — confirm
net_params = NetParams(
    learning_rate=10,
    batch_size=512,
    num_epochs=10,
    hidden_dim=[4],
    activation=nn.ReLU(),
    dropout=0.0,
)

# train, report and record the final validation accuracy of this run
val_accuracy = create_net(net_params)
net_accuracies.append((val_accuracy, net_params))

In [None]:
# same as the first configuration but trained for 100 epochs instead of 10
net_params = NetParams(
    learning_rate=0.01,
    batch_size=512,
    num_epochs=100,
    hidden_dim=[4],
    activation=nn.ReLU(),
    dropout=0.0,
)

# train, report and record the final validation accuracy of this run
val_accuracy = create_net(net_params)
net_accuracies.append((val_accuracy, net_params))

In [None]:
# same single 4-unit hidden layer, lr=0.01 and batch size 512, trained for 70 epochs
net_params = NetParams(
    learning_rate=0.01,
    batch_size=512,
    num_epochs=70,
    hidden_dim=[4],
    activation=nn.ReLU(),
    dropout=0.0,
)

# train, report and record the final validation accuracy of this run
val_accuracy = create_net(net_params)
net_accuracies.append((val_accuracy, net_params))

In [None]:
# 70 epochs with a smaller batch size (128 instead of 512)
net_params = NetParams(
    learning_rate=0.01,
    batch_size=128,
    num_epochs=70,
    hidden_dim=[4],
    activation=nn.ReLU(),
    dropout=0.0,
)

# train, report and record the final validation accuracy of this run
val_accuracy = create_net(net_params)
net_accuracies.append((val_accuracy, net_params))

In [None]:
# 70 epochs with a larger batch size (1024 instead of 512)
net_params = NetParams(
    learning_rate=0.01,
    batch_size=1024,
    num_epochs=70,
    hidden_dim=[4],
    activation=nn.ReLU(),
    dropout=0.0,
)

# train, report and record the final validation accuracy of this run
val_accuracy = create_net(net_params)
net_accuracies.append((val_accuracy, net_params))

In [None]:
# batch size 1024 again, now trained for 150 epochs instead of 70
net_params = NetParams(
    learning_rate=0.01,
    batch_size=1024,
    num_epochs=150,
    hidden_dim=[4],
    activation=nn.ReLU(),
    dropout=0.0,
)

# train, report and record the final validation accuracy of this run
val_accuracy = create_net(net_params)
net_accuracies.append((val_accuracy, net_params))

In [None]:
# two hidden layers of 4 units each; lr=0.01, batch size 512, 100 epochs
net_params = NetParams(
    learning_rate=0.01,
    batch_size=512,
    num_epochs=100,
    hidden_dim=[4, 4],
    activation=nn.ReLU(),
    dropout=0.0,
)

# train, report and record the final validation accuracy of this run
val_accuracy = create_net(net_params)
net_accuracies.append((val_accuracy, net_params))

In [None]:
# two hidden layers widened to 128 units each; lr=0.01, batch size 512, 100 epochs
net_params = NetParams(
    learning_rate=0.01,
    batch_size=512,
    num_epochs=100,
    hidden_dim=[128, 128],
    activation=nn.ReLU(),
    dropout=0.0,
)

# train, report and record the final validation accuracy of this run
val_accuracy = create_net(net_params)
net_accuracies.append((val_accuracy, net_params))

In [None]:
# same 128-128 architecture, now with dropout probability 0.25
net_params = NetParams(
    learning_rate=0.01,
    batch_size=512,
    num_epochs=100,
    hidden_dim=[128, 128],
    activation=nn.ReLU(),
    dropout=0.25,
)

# train, report and record the final validation accuracy of this run
val_accuracy = create_net(net_params)
net_accuracies.append((val_accuracy, net_params))

In [None]:
# print the nets by accuracy order and their parameters
# Sort on the accuracy alone: comparing the (accuracy, NetParams) tuples directly
# falls back to comparing the NetParams objects whenever two accuracies tie, and
# NetParams defines no ordering, so a plain sort() would raise TypeError.
net_accuracies.sort(key=lambda entry: entry[0], reverse=True)
for val_accuracy, net_params in net_accuracies:
    print(f"Validation accuracy: {val_accuracy * 100:.2f}%")
    print(net_params)
    print()

Para probar el modelo, dado que la red se ha entrenado con los vectores de features calculados en MATLAB, calculo también en MATLAB los vectores de features de los audios de prueba, ejecutando desde Python un script de MATLAB que los extrae.

In [None]:
MATLAB_PATH = "/Applications/MATLAB_R2024a.app/bin/matlab"  # path to MATLAB executable(could be different on your machine)
AUDIO_PATH = "../segmented_digits/"  # path to new audios not included in the training or the validation set

# Hyper-parameters of the checkpoint being evaluated. Only hidden_dim, dropout and
# activation are used below (to rebuild the architecture); learning_rate,
# batch_size and num_epochs are not referenced anywhere in this cell.
learning_rate = 0.02
batch_size = 64
num_epochs = 150
dropout = 0.15
hidden_dim = [128, 128, 128]
activation = nn.ELU()

# NOTE(review): presumably this MATLAB call is what produces test_data.mat — confirm
# # extract features from the new audio files
# command = f"{MATLAB_PATH} -batch \"extract_features('{AUDIO_PATH}')\""
# subprocess.run(command, shell=True)

# load the new data (the file is read once and both arrays are taken from it)
test_data = loadmat("test_data.mat")
descriptors = test_data["descriptor"]
ground_truths = test_data["groundTruth"]
# 1-D copy of the labels, used for the metrics below
y_test = ground_truths.flatten()

# normalize the data with the statistics of the training set
descriptors = (descriptors - x_train_mean) / x_train_std

testset = TensorDataset(
    torch.tensor(descriptors, dtype=torch.float32),
    torch.tensor(ground_truths, dtype=torch.uint8),
)
testloader = DataLoader(testset, batch_size=1, shuffle=False)

# NOTE(review): no visible cell writes model.pth; it must already exist on disk.
# torch.load unpickles the file, so only load checkpoints from a trusted source.
# map_location lets a checkpoint that was saved from a GPU load on a CPU-only machine.
net = Net(hidden_dim=hidden_dim, dropout=dropout, activation=activation)
net.load_state_dict(torch.load("model.pth", map_location="cpu"))
net.eval()

predictions = []  # predicted class index per test sample
confidences = []  # softmax probability of the predicted class per test sample

with torch.no_grad():
    for features, _ in testloader:
        outputs = net(features)
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.tolist())

        probabilities = torch.nn.functional.softmax(outputs, dim=1)
        confidences.extend(probabilities.max(dim=1).values.tolist())

# comparing arrays of different length would not give a per-sample result
assert len(predictions) == y_test.shape[0], "one prediction per test sample expected"
accuracy = np.mean(y_test == np.asarray(predictions))
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# confusion matrix

class_names = [
    "cero",
    "uno",
    "dos",
    "tres",
    "cuatro",
    "cinco",
    "seis",
    "siete",
    "ocho",
    "nueve",
]

# pin the label set so the matrix is always len(class_names) x len(class_names),
# even when some class appears neither in the test set nor in the predictions;
# otherwise the number of display labels would not match the matrix size
confusion_matrix = metrics.confusion_matrix(
    y_test, predictions, labels=list(range(len(class_names)))
)
cm_display = metrics.ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix,
    display_labels=class_names,
)

cm_display.plot()
cm_display.figure_.set_size_inches(10, 8)
plt.title("Confusion Matrix")
plt.show()