In [59]:
import os
import torch
import torchaudio
from torch.utils.data import DataLoader, Dataset
from torchaudio.datasets import SPEECHCOMMANDS
import torchaudio.transforms as transforms
import torchvision.models as models
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as Func

In [18]:
# Define the dataset path
dataset_path = "./data/speech_commands"

# Ensure the directory exists
os.makedirs(dataset_path, exist_ok=True)

# Download and extract the dataset
torchaudio.datasets.SPEECHCOMMANDS(root=dataset_path, download=True)

<torchaudio.datasets.speechcommands.SPEECHCOMMANDS at 0x79653a4e0820>

In [61]:
class SubsetSC(SPEECHCOMMANDS):
    def __init__(self, subset: str = None, classes=None):
        super().__init__("./data/speech_commands", download=False)
        
        def load_list(filename):
            with open(filename) as f:
                return [os.path.join(self._path, line.strip()) for line in f]
        
        if subset == "validation":
            self._walker = load_list(self._path + "/validation_list.txt")
        elif subset == "testing":
            self._walker = load_list(self._path + "/testing_list.txt")
        elif subset == "training":
            excludes = load_list(self._path + "/validation_list.txt") + load_list(self._path + "/testing_list.txt")
            excludes = set(excludes)
            self._walker = [w for w in self._walker if w not in excludes]
        
        # Select only "yes" and "no" classes
        self.classes = classes if classes else ['yes', 'no']

    def __getitem__(self, index):
        waveform, sample_rate, label, *_ = super().__getitem__(index)
        
        # Ensure the label is in 'yes' or 'no'
        if label not in self.classes:
            return None  # Skip if it's not in 'yes' or 'no'
        
        # Pad or truncate the waveform
        waveform = self.pad_or_truncate_waveform(waveform)
        
        # Convert label to index
        label_idx = self.classes.index(label)
        
        return waveform, label_idx

    # Pad or truncate waveforms to a fixed length (e.g., 16000 for 1 second at 16 kHz)
    # Different audio files have differnet lengths
    # Therefore it is necessary to convert every file to same dimension
    def pad_or_truncate_waveform(self, waveform, target_length=16000):
        length = waveform.shape[-1] # length of number of samples
        if length > target_length:
            waveform = waveform[:, :target_length]  # Truncate
        elif length < target_length:
            padding = target_length - length
            waveform = Func.pad(waveform, (0, padding))  # Pad with zeros
        return waveform

In [37]:
# Reload the dataset
train_set = SubsetSC("training", classes=['yes', 'no'])
val_set = SubsetSC("validation", classes=['yes', 'no'])
test_set = SubsetSC("testing", classes=['yes', 'no'])

# Remove None items
train_set = [item for item in train_set if item]
val_set = [item for item in val_set if item]
test_set = [item for item in test_set if item]

train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
val_loader = DataLoader(val_set, batch_size=64, shuffle=False)
test_loader = DataLoader(test_set, batch_size=64, shuffle=False)

In [62]:
mel_spectrogram = transforms.MelSpectrogram(sample_rate=16000, # sample rate
                                            n_mels=128, # number of Mel filter banks to apply
                                            n_fft=1024, # Window size of FFT
                                            hop_length=512) # # The number of samples between successive frames

def transform_waveform(waveform):
    mel_spectrogram.to(waveform.device)
    return mel_spectrogram(waveform).log2()

In [63]:
# Load the pre-trained ResNet model
model = models.resnet18(pretrained=True)

# Modify the first convolution layer to accept 1-channel input
model.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)

model = model.to(device)



In [53]:
# Loss function
criterion = nn.CrossEntropyLoss()

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# ModelV0

In [55]:
# Training loop
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode
        running_loss = 0.0
        correct = 0
        total = 0

        # Iterate over the training data
        for waveforms, labels in train_loader:
            waveforms, labels = waveforms.to(device), labels.to(device)

            # Pad or truncate the waveforms
            waveforms = torch.stack([pad_or_truncate_waveform(w) for w in waveforms])

            # Transform the waveforms to Mel Spectrograms
            mel_specs = torch.stack([transform_waveform(w) for w in waveforms])

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(mel_specs)

            # Calculate loss
            loss = criterion(outputs, labels)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            # Track loss and accuracy
            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        # Validation step
        val_loss, val_accuracy = evaluate_model(model, val_loader, criterion)
        
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, "
              f"Accuracy: {100 * correct / total:.2f}%, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%")
    
    print("Training complete!")

In [56]:
# Evaluation function for validation/testing
def evaluate_model(model, loader, criterion):
    model.eval()  # Set model to evaluation mode
    correct = 0
    total = 0
    val_loss = 0.0
    with torch.no_grad():
        for waveforms, labels in loader:
            waveforms, labels = waveforms.to(device), labels.to(device)

            # Pad or truncate the waveforms
            waveforms = torch.stack([pad_or_truncate_waveform(w) for w in waveforms])

            # Transform the waveforms to Mel Spectrograms
            mel_specs = torch.stack([transform_waveform(w) for w in waveforms])

            # Forward pass
            outputs = model(mel_specs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Track accuracy
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    return val_loss / len(loader), accuracy

In [57]:
# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=20)

Epoch [1/10], Loss: nan, Accuracy: 50.64%, Val Loss: nan, Val Accuracy: 49.44%
Epoch [2/10], Loss: nan, Accuracy: 50.64%, Val Loss: nan, Val Accuracy: 49.44%
Epoch [3/10], Loss: nan, Accuracy: 50.64%, Val Loss: nan, Val Accuracy: 49.44%
Epoch [4/10], Loss: nan, Accuracy: 50.64%, Val Loss: nan, Val Accuracy: 49.44%
Epoch [5/10], Loss: nan, Accuracy: 50.64%, Val Loss: nan, Val Accuracy: 49.44%
Epoch [6/10], Loss: nan, Accuracy: 50.64%, Val Loss: nan, Val Accuracy: 49.44%
Epoch [7/10], Loss: nan, Accuracy: 50.64%, Val Loss: nan, Val Accuracy: 49.44%
Epoch [8/10], Loss: nan, Accuracy: 50.64%, Val Loss: nan, Val Accuracy: 49.44%
Epoch [9/10], Loss: nan, Accuracy: 50.64%, Val Loss: nan, Val Accuracy: 49.44%
Epoch [10/10], Loss: nan, Accuracy: 50.64%, Val Loss: nan, Val Accuracy: 49.44%
Training complete!


In [58]:
# Evaluate on the test set
test_loss, test_accuracy = evaluate_model(model, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")

Test Loss: nan, Test Accuracy: 50.85%


In [64]:
torch.save(model, 'ModelV0.pth')