In [None]:
# # Download and Unzip VC-PRG-1_5.zip
# !wget http://cmp.felk.cvut.cz/data/audio_vc/audio/VC-PRG-1_5.zip
# !unzip VC-PRG-1_5.zip
# !rm VC-PRG-1_5.zip

In [None]:
# # Download and Unzip VC-PRG-6.zip
# !wget http://cmp.felk.cvut.cz/data/audio_vc/audio/VC-PRG-6.zip
# !unzip VC-PRG-6.zip
# !rm VC-PRG-6.zip

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import torchaudio
from torchaudio.transforms import Resample, MelSpectrogram

import numpy as np

import os
import glob
import wget

import matplotlib.pyplot as plt
%matplotlib  inline

In [2]:
TRAIN_AUDIO_FOLDER = "../VC-PRG-1_5/"
TEST_AUDIO_FOLDER = "../VC-PRG-6/"

SAMPLE_RATE = 22050
NUM_SAMPLES = 22050

BATCH_SIZE = 10
NUM_EPOCHS = 50
LEARNING_RATE = 1e-4

In [3]:
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [5]:
path = TRAIN_AUDIO_FOLDER

class VehicleDataset(Dataset):
    def __init__(self, folder_path, device, target_sample_rate, num_samples):
        self.audio_files = sorted(glob.glob(path + "*.wav"))
        self.label_files = sorted(glob.glob(path + "*.txt"))
        self.device = device
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples
    
    def __len__(self):
        return len(self.audio_files)

    def __getitem__(self, item):
        label = self.__get_label(item)
        waveform, sample_rate = self.__load_audio(item)
        waveform = waveform.to(self.device)
        waveform = self.__resample_if_necessary(waveform, sample_rate)
        waveform = self.__mix_down_if_necessary(waveform)
        waveform = self.__cut_if_necessary(waveform)
        waveform = self.__right_pad_if_necessary(waveform)

        mel_spectrogram = self.__get_mel_spectrogram(waveform)
        
        return mel_spectrogram, label

    def __get_label(self, item):
        label = 0
        with open(self.label_files[item], 'r') as f:
            content = f.readlines()
            if -1 in content:
                return label
            else:
                label = len(content)
        return label
    
    def __load_audio(self, item):
        waveform, sample_rate = torchaudio.load(self.audio_files[item])
        return waveform, sample_rate
    
    def __resample_if_necessary(self, waveform, sample_rate):
        if sample_rate != self.target_sample_rate:
            resampler = Resample(sample_rate, self.target_sample_rate).to(self.device)
            waveform = resampler(waveform)
        return waveform

    def __mix_down_if_necessary(self, waveform):
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        return waveform
    
    def __cut_if_necessary(self, waveform):
        if waveform.shape[1] > self.num_samples:
            waveform = waveform[:, :self.num_samples]
        return waveform
    
    def __right_pad_if_necessary(self, waveform):
        if waveform.shape[1] < self.num_samples:
            num_missing_samples = self.num_samples - waveform.shape[1]
            waveform = F.pad(waveform, (0, num_missing_samples))
        return waveform
    
    def __get_mel_spectrogram(self, waveform):
        mel_spec_transformer = MelSpectrogram(
            sample_rate=self.target_sample_rate,
            n_fft=1024,
            win_length=None,
            hop_length=512,
            n_mels=64
        ).to(self.device)
        mel_spec = mel_spec_transformer(waveform)
        return mel_spec


vcd  = VehicleDataset(path, device, SAMPLE_RATE, NUM_SAMPLES)

print("Feature shape: ", vcd[0][0].shape)
print("Label: ", vcd[0][1])

Feature shape:  torch.Size([1, 64, 44])
Label:  3


In [6]:
class CNNNetwork(nn.Module):
    def __init__(self):
        super().__init__()

        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.BatchNorm2d(32),
        )

        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=128, kernel_size=3, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.BatchNorm2d(128),
        )

        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.BatchNorm2d(128),
        )

        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.BatchNorm2d(128),
        )

        self.flatten = nn.Flatten()

        self.linear = nn.Linear(in_features=128 * 5 * 4, out_features=10)

        self.dropout = nn.Dropout(0.25)

        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, input_data):
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        x = self.dropout(x)
        logits = self.linear(x)
        predictions = self.softmax(logits)
        return predictions

In [7]:
train_data = vcd
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE)

model = CNNNetwork().to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

train_loss = list()
train_acc = list()

for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    correct = 0.0
    total = 0.0
    for i, data in enumerate(train_loader, 0):
        features, labels = data
        features, labels = features.to(device), labels.to(device)
        print(features.shape, labels.shape)

        labels = nn.functional.one_hot(labels, num_classes=10).float()

        predictions = model(features)
        loss = loss_fn(predictions, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        total += labels.size(0)
        correct += (torch.argmax(predictions, 1) == torch.argmax(labels, 1)).sum().item()
        if i % 10 == 0:
            print(f"Epoch [{epoch + 1} / {NUM_EPOCHS}] loss: {running_loss / 10:.3f}")
    
    train_loss.append(running_loss / len(train_loader))
    train_acc.append(correct / total * 100.0)

torch.save(model.state_dict(), "vcd_cnn_model.pth")
print("Finishshed Training")

torch.Size([10, 1, 64, 44]) torch.Size([10])


KeyboardInterrupt: 

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
ax1.plot(train_acc, '-o')
ax1.set_title = 'Train Accuracy'
ax1.set_xlabel = 'Epoch'
ax1.set_ylabel = 'Accuracy'

ax2.plot(train_loss, '-o')
ax2.set_title = 'Train Loss'
ax2.set_xlabel = 'Epoch'
ax2.set_ylabel = 'Loss'
plt.show()

In [None]:
path = TEST_AUDIO_FOLDER
test_data  = VehicleDataset(path, device, SAMPLE_RATE, NUM_SAMPLES) 
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

correct = 0
total = 0

model.eval()
with torch.no_grad():
    for data in test_loader:
        features, labels = data
        features, labels = features.to(device), labels.to(device)
        
        predictions = model(features)
        _, predictions = torch.max(predictions, 1)
        
        total += labels.size(0)
        correct += (predictions == labels).sum().item()
print(f'Accuracy: {100 * correct // total} %')