<a href="https://colab.research.google.com/github/bigirimanainnocent12/Reseaux_de_neurones_2D/blob/main/Untitled30.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Objectif du Projet**

Le dataset SPEECHCOMMANDS contient des fichiers audio associés à des commandes
vocales, ainsi que les transcriptions correspondantes. Ce projet s’appuie sur un tutoriel officiel disponible ici: speech_command_classification_with_torchaudio_tutorial.
Dans ce tutoriel, un réseau de neurones convolutionnel 1D est utilisé pour associer les données audio aux transcriptions. Cependant, il est courant d’effectuer la classification audio dans le domaine temps-fréquence, à partir d’un spectrogramme. L’objectif du projet est donc de concevoir un réseau de neurones convolutionnel 2D capable de travailler sur les spectrogrammes, tout en cherchant à optimiser ses performances. Pour ce faire, vous utiliserez la librairie torchaudio et la transformation MelSpectrogram pour générer les spectrogrammes à partir des fichiers audio.

# **Importation des librairies nécessaires**

In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchaudio.datasets import SPEECHCOMMANDS
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB
from torch.utils.data import DataLoader
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np

# **Définition des sous-ensembles d'entraînement, de validation et de test**

In [2]:
class SubsetSC(SPEECHCOMMANDS):
    """SPEECHCOMMANDS restricted to one of its predefined splits.

    The dataset is rooted at the current directory and downloaded if needed.
    ``subset`` selects which files are kept:

    * ``"validation"`` -- the files listed in ``validation_list.txt``
    * ``"testing"``    -- the files listed in ``testing_list.txt``
    * ``"training"``   -- every file that appears in neither of those lists
    * any other value (including ``None``) -- the file list is left untouched
    """

    def __init__(self, subset: str = None):
        super().__init__("./", download=True)

        def load_list(filename):
            # Each line of a split file is a path relative to the dataset
            # root; turn it into a normalised path under that root.
            entries = []
            with open(os.path.join(self._path, filename)) as fileobj:
                for line in fileobj:
                    entries.append(os.path.normpath(os.path.join(self._path, line.strip())))
            return entries

        if subset == "training":
            # Training data is defined by exclusion: drop everything that is
            # reserved for validation or testing.
            held_out = set(load_list("validation_list.txt"))
            held_out.update(load_list("testing_list.txt"))
            self._walker = [path for path in self._walker if path not in held_out]
        elif subset in ("validation", "testing"):
            self._walker = load_list(subset + "_list.txt")

# **Prétraitement : Transformation en spectrogrammes**

In [3]:
class AudioTransform:
    """Callable that turns a waveform into a dB-scaled mel spectrogram.

    Args:
        sample_rate: Forwarded to ``MelSpectrogram`` as ``sample_rate``.
        n_fft: Forwarded to ``MelSpectrogram`` as ``n_fft``.
        hop_length: Forwarded to ``MelSpectrogram`` as ``hop_length``.
        n_mels: Forwarded to ``MelSpectrogram`` as ``n_mels``.
        top_db: Forwarded to ``AmplitudeToDB`` as ``top_db``.

    The defaults reproduce the configuration that was previously hard-coded,
    so ``AudioTransform()`` behaves exactly as before.
    """

    def __init__(self, sample_rate=16000, n_fft=1024, hop_length=512, n_mels=64, top_db=80):
        self.mel_spectrogram = MelSpectrogram(
            sample_rate=sample_rate, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
        )
        self.db_transform = AmplitudeToDB(top_db=top_db)

    def __call__(self, waveform):
        """Apply the mel transform followed by the dB conversion to ``waveform``."""
        mel = self.mel_spectrogram(waveform)
        return self.db_transform(mel)

# **Padding des spectrogrammes pour les rendre de taille fixe**

In [4]:
def pad_spectrogram(spectrogram, max_length=128, pad_value=0.0):
    """Force the last (time) axis of ``spectrogram`` to exactly ``max_length``.

    Shorter inputs are right-padded with ``pad_value``; longer inputs are
    truncated; inputs that already have the right length are returned as-is.
    Only the last axis is touched, so the tensor may have any number of
    leading dimensions (e.g. ``[n_mels, T]``, ``[1, n_mels, T]`` or
    ``[B, 1, n_mels, T]``).

    Args:
        spectrogram: Tensor whose last axis is to be resized.
        max_length: Target size of the last axis.
        pad_value: Constant used to fill the padded region. Defaults to 0.0,
            which matches the previous behaviour.

    Returns:
        A tensor with the same leading dimensions and a last axis of size
        ``max_length``.
    """
    n_frames = spectrogram.size(-1)
    if n_frames < max_length:
        # A 2-tuple pads only the last axis: (left, right).
        return torch.nn.functional.pad(
            spectrogram, (0, max_length - n_frames), value=pad_value
        )
    if n_frames > max_length:
        return spectrogram[..., :max_length]
    return spectrogram

# **Fonction de gestion des batchs**

In [5]:
def collate_fn(batch):
    """Collate dataset items into a spectrogram batch and a label-index tensor.

    Each item of ``batch`` is unpacked as a 5-tuple of which only the first
    element (the waveform) and the third element (the label) are used. The
    waveform goes through the module-level ``transform`` and
    ``pad_spectrogram``; the label is mapped through the module-level
    ``label_to_index``.

    Returns:
        ``(spectrograms, targets)`` where ``spectrograms`` is the stacked
        padded transforms and ``targets`` is a tensor of label indices.
    """
    pairs = []
    for waveform, _, label, _, _ in batch:
        pairs.append((transform(waveform), label_to_index[label]))

    spectrograms, targets = zip(*pairs)

    # Padding gives every item the same size so they can be stacked.
    stacked = torch.stack([pad_spectrogram(spec) for spec in spectrograms])
    return stacked, torch.tensor(targets)

# **Modèle CNN 2D**

In [6]:
class ConvNet2D(nn.Module):
    """Two-layer 2D CNN classifier for single-channel spectrograms.

    Architecture: conv(1->32) -> ReLU -> maxpool(2) -> conv(32->64) -> ReLU
    -> maxpool(2) -> flatten -> linear(256) -> ReLU -> linear(num_classes).

    Args:
        num_classes: Number of output logits.
        n_mels: Height of the input spectrogram. Defaults to 64.
        n_frames: Width of the input spectrogram. Defaults to 128.

    The defaults correspond to an input of shape ``[batch, 1, 64, 128]`` and
    give the same ``flatten_size`` (64 * 16 * 32) that was previously
    hard-coded.
    """

    def __init__(self, num_classes, n_mels=64, n_frames=128):
        super().__init__()
        # 3x3 convolutions with padding=1 and stride=1 keep the spatial size.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        # Each pooling step halves both spatial dimensions (rounding down).
        self.pool = nn.MaxPool2d(2, 2)

        # Two pooling steps shrink each spatial dimension by a factor of 4;
        # conv2 leaves 64 channels.
        self.flatten_size = 64 * (n_mels // 4) * (n_frames // 4)

        self.fc1 = nn.Linear(self.flatten_size, 256)
        self.fc2 = nn.Linear(256, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return class logits of shape ``[batch, num_classes]``.

        ``x`` is expected to be ``[batch, 1, n_mels, n_frames]`` with the
        sizes given at construction time.
        """
        x = self.pool(self.relu(self.conv1(x)))  # [batch, 32, n_mels/2, n_frames/2]
        x = self.pool(self.relu(self.conv2(x)))  # [batch, 64, n_mels/4, n_frames/4]
        x = x.flatten(1)                         # [batch, flatten_size]
        x = self.relu(self.fc1(x))
        return self.fc2(x)

# **Chargement des données**

In [7]:
# Build the training and testing splits (downloads the dataset on first use).
train_set = SubsetSC("training")
test_set = SubsetSC("testing")

transform = AudioTransform()

# Collect the label vocabulary from the file paths rather than by iterating
# over the dataset: iterating decodes every audio file just to read its label.
# NOTE(review): this relies on the SPEECHCOMMANDS layout <root>/<label>/<file>.wav,
# where the label is the name of the directory containing the clip -- confirm
# against the installed torchaudio version.
labels = sorted({os.path.basename(os.path.dirname(path)) for path in train_set._walker})
label_to_index = {label: i for i, label in enumerate(labels)}

# Only the training data is shuffled; the test order is kept fixed.
train_loader = DataLoader(train_set, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_set, batch_size=32, shuffle=False, collate_fn=collate_fn)

100%|██████████| 2.26G/2.26G [00:28<00:00, 85.4MB/s]


# **Initialisation du modèle, de la perte et de l'optimiseur**

In [8]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# One output logit per label found in the training data.
num_classes = len(labels)
model = ConvNet2D(num_classes)
model = model.to(device)

# Cross-entropy on the raw logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# **Variables pour suivre les performances**

In [9]:
# Per-epoch histories; each name is bound to its own empty list.
train_losses, test_losses, test_accuracies = [], [], []

# **Boucle d'entraînement**

In [11]:
# Train for 2 epochs (increase the range for longer training).
for epoch in range(2):
    model.train()
    running_loss = 0.0

    for spectrograms, targets in train_loader:
        # Move the batch to the same device as the model.
        spectrograms = spectrograms.to(device)
        targets = targets.to(device)

        # Forward pass and loss.
        outputs = model(spectrograms)
        loss = criterion(outputs, targets)

        # Clear stale gradients, backpropagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Record the mean of the per-batch losses for this epoch.
    epoch_loss = running_loss / len(train_loader)
    train_losses.append(epoch_loss)
    print(f"Epoch {epoch + 1}, Loss: {train_losses[-1]:.4f}")

Epoch 1, Loss: 0.7332
Epoch 2, Loss: 0.5609


# **Évaluation sur le jeu de test**

In [12]:
# Top-1 accuracy of the trained model over the whole test loader.
model.eval()
total = 0
correct = 0

with torch.no_grad():  # gradients are not needed for evaluation
    for spectrograms, targets in test_loader:
        spectrograms = spectrograms.to(device)
        targets = targets.to(device)

        outputs = model(spectrograms)
        # Predicted class = index of the largest logit in each row.
        predicted = outputs.argmax(dim=1)

        total += targets.size(0)
        correct += (predicted == targets).sum().item()

print(f"Test Accuracy: {100 * correct / total:.2f}%")

Test Accuracy: 79.32%
