In [1]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
import os
from UrbanSoundDataset import UrbanSoundDataset
from FeedForward import CNNNetwork
import torchaudio
from torch.utils.data import DataLoader
from torch import nn
from datetime import datetime

  torchaudio.set_audio_backend("soundfile")


In [3]:
ANNOTATION_FILE = "./data/UrbanSound8K/metadata/UrbanSound8K.csv"
AUDIO_DIR = os.path.join("data", "Training", "audio")
SAMPLE_RATE = 22050
NUM_SAMPLES = 22050
BATCH_SIZE = 128
EPOCHS = 10
LEARNING_RATE = 0.001

In [4]:
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=1024,
    hop_length=512,
    n_mels=64
)

In [5]:
usd = UrbanSoundDataset(ANNOTATION_FILE, AUDIO_DIR, mel_spectrogram, SAMPLE_RATE, NUM_SAMPLES, device)
cnn = CNNNetwork().to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn.parameters(), lr=LEARNING_RATE)

In [6]:
from torchsummary import summary
summary(cnn, input_size=(1, 64, 44))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 66, 46]             160
              ReLU-2           [-1, 16, 66, 46]               0
         MaxPool2d-3           [-1, 16, 33, 23]               0
            Conv2d-4           [-1, 32, 35, 25]           4,640
              ReLU-5           [-1, 32, 35, 25]               0
         MaxPool2d-6           [-1, 32, 17, 12]               0
            Conv2d-7           [-1, 64, 19, 14]          18,496
              ReLU-8           [-1, 64, 19, 14]               0
         MaxPool2d-9             [-1, 64, 9, 7]               0
           Conv2d-10           [-1, 128, 11, 9]          73,856
             ReLU-11           [-1, 128, 11, 9]               0
        MaxPool2d-12            [-1, 128, 5, 4]               0
          Flatten-13                 [-1, 2560]               0
           Linear-14                   

In [7]:
def create_data_loader(data, batch_size):
    dataloader = DataLoader(data, batch_size=batch_size)
    return dataloader

In [8]:
train_dataloader = create_data_loader(usd, BATCH_SIZE)

In [9]:
def train_one_epoch(model, data_loader, loss_function, optimizer, device):
    acc = 0
    for inputs, targets in data_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        # calculate loss
        predictions = model(inputs)
        loss = loss_function(predictions, targets)
        # backpropagate error and update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # calculate accuracy
        acc += (predictions.argmax(1) == targets).sum().item()
    return acc / len(data_loader.dataset)

In [10]:
def train(model, train_data, loss_function, optimizer, device):
    tac = []
    i = 0
    patience = 0
    scr = 0
    start_time = datetime.now()
    while True:
        i+=1
        print(f"\nEpoch : {i:4} | ", end=" ")
        train_acc = train_one_epoch(model, train_data, loss_function, optimizer, device)
        tac.append(train_acc)

        print(f"train acc : {train_acc:.4f} | patience : {patience} | best acc : {scr:.4f}", end=" ") 
        if train_acc > scr:
            scr = train_acc
            patience = 0
            torch.save(model.state_dict(), "models/cnn.pth")
            log = {
            "train_acc": tac,
            }
            torch.save(log, "models/logs.pth")
        else:
            patience +=1

        if patience >= 5:
            break
    end_time = datetime.now()
    print(f"\nTraining completed in {(end_time-start_time).seconds} seconds")

In [11]:
train(cnn, train_dataloader, loss_function, optimizer, device)


Epoch :    1 |  

LibsndfileError: Error opening 'data\\Training\\audio\\fold5\\100032-3-0-0.wav': System error.