## Code for pytorch for audio
This jupyter notebook is used to store the code in pytorch for audio and music processing. Here is the original link:

https://www.youtube.com/watch?v=gp2wZqDoJ1Y&list=PL-wATfeyAMNoirN4idjev6aRu8ISZYVWm&index=1

In [1]:
## 1. Train a feed forwarrd network
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

BATCH_SIZE = 128
EPOCHS = 10
LEARNING_RATE = .001

class FeedForwardNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.dense_layers = nn.Sequential(
            nn.Linear(28*28, 256),
            nn.ReLU(),
            nn.Linear(256, 10)

        )
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, input_data):
        flattened_data = self.flatten(input_data)
        logits = self.dense_layers(flattened_data)
        predictions = self.softmax(logits)
        return predictions
    

# download datasets MNIST
def download_mnist_datasets():
    train_data = datasets.MNIST(
        root="D:\Coding\Tutorial_DL_Acoustic_Signal\DATA\MNIST",
        download=True,
        train=True,
        transform = ToTensor()
    )
    validation_data = datasets.MNIST(
    root="D:\Coding\Tutorial_DL_Acoustic_Signal\DATA\MNIST",
    download=True,
    train=False,
    transform = ToTensor()
    )
    return train_data, validation_data

def train_one_epoch(model, data_loader, loss_fn, optimizer, device):
    for inputs, targets in data_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        # calculate loss
        predictions = model(inputs)
        loss = loss_fn(predictions, targets)

        # backpropogate loss and update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Loss: {loss.item()}")


def train(model, data_loader, loss_fn, optimizer, device, epochs):
    for i in range(epochs):
        print(f"Epoch {i+1}")
        train_one_epoch(model, data_loader, loss_fn, optimizer, device)
        print("-----------------------")
    print("Training is done")
    

if __name__ == "__main__":
    # download MNIST dataset
    train_data, _ = download_mnist_datasets()
    print("MNIST dataset downloaded")

    # create a data loader for the train set
    train_data_loader = DataLoader(train_data, batch_size=BATCH_SIZE)

    # build model
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"Using {device} device")   
    feed_forward_net = FeedForwardNet().to(device)

    # instantiate loss function + optimizerr
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(feed_forward_net.parameters(), lr=LEARNING_RATE)

    # train model
    train(feed_forward_net, train_data_loader, loss_fn, optimizer, device, EPOCHS)

    # Store the model
    torch.save(feed_forward_net.state_dict(), "feedforwardnet.pth")
    print("Model trained and stored at feedforwardnet.pth")

    

MNIST dataset downloaded
Using cuda device
Epoch 1
Loss: 1.5076099634170532
-----------------------
Epoch 2
Loss: 1.4944872856140137
-----------------------
Epoch 3
Loss: 1.486358642578125
-----------------------
Epoch 4
Loss: 1.4830163717269897
-----------------------
Epoch 5
Loss: 1.4748579263687134
-----------------------
Epoch 6
Loss: 1.4730068445205688
-----------------------
Epoch 7
Loss: 1.4728569984436035
-----------------------
Epoch 8
Loss: 1.4737215042114258
-----------------------
Epoch 9
Loss: 1.473598599433899
-----------------------
Epoch 10
Loss: 1.4722856283187866
-----------------------
Training is done
Model trained and stored at feedforwardnet.pth


In [2]:
## 2. Making predictions/Inference
import torch

class_mapping = [
    "0",
    "1",
    "2",
    "3",
    "4",
    "5",
    "6",
    "7",
    "8",
    "9",
]

def predict(model, input, target, class_mapping):
    model.eval()
    with torch.no_grad():
        predictions = model(input)
        # Tensor (1,10) -> [[0.1, 0.01,...,.0.6]]
        predicted_index = predictions[0].argmax(0)
        predicted = class_mapping[predicted_index]
        expected = class_mapping[target]
    return predicted, expected



if __name__ == "__main__":
    # load back the model
    feed_forward_net = FeedForwardNet()
    state_dict = torch.load("feedforwardnet.pth")
    feed_forward_net.load_state_dict(state_dict)

    # load MNIST validation dataset
    _, validation_data = download_mnist_datasets()

    # get a sample from the validation dataset for inference
    input, target = validation_data[0][0], validation_data[0][1]

    # make an inference
    predicted, expected = predict(feed_forward_net, input, target, class_mapping)

    print(f"Predicted: '{predicted}', expected :'{expected}'")

Predicted: '7', expected :'7'


# CNN for Classification of Urbansound 8k

In [9]:
## Custome Datasets Urbandsound8k dataset
## Transform the orginial signal into MelSpectrogram
## Preprocessing audio with different durations
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
import os
import torch

class UrbanSoundDataset(Dataset):
    def __init__(self, annotations_file, audio_dir, transformation, target_sample_rate, num_samples,device):
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.device = device
        self.transformation = transformation.to(self.device)
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)
        signal = signal.to(self.device)
        # signal -> (num_channels, samples) -> (2, 16000) 
        signal = self._resample_if_necessary(signal, sr)
        # signal -> (1, 16000)
        signal = self._mix_down_if_necessary(signal)
        signal = self._cut_if_necessary(signal)
        signal = self._right_pad_if_necessary(signal)
        signal = self.transformation(signal)
        return signal, label
    
    def _cut_if_necessary(self, signal):
        # signal -> Tensor -> (1, num_samples) -> (1, 50000) -> (0, 22050)
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal
    
    def _right_pad_if_necessary(self, signal):
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:
            num_missing_samples = self.num_samples - length_signal
            last_dim_padding = (0, num_missing_samples)
            # (1, num_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal


    def _resample_if_necessary(self, signal, sr):
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate).to(self.device)
            signal = resampler(signal) 
        return signal
    
    def _mix_down_if_necessary(self, signal):
        if signal.shape[0] > 1:  # (2, 1000)
            signal = torch.mean(signal,dim=0, keepdim=True)
        return signal
    
    def _get_audio_sample_path(self, index):
        fold = f"fold{self.annotations.iloc[index, 5]}"
        path = os.path.join(self.audio_dir, fold, self.annotations.iloc[index, 0])
        return path
    
    def _get_audio_sample_label(self, index):
        return self.annotations.iloc[index, 6]
    
    
if __name__ == "__main__":
    ANNOTATIONS_FILE = r"D:\Coding\Tutorial_DL_Acoustic_Signal\DATA\UrbanSound8K\UrbanSound8K\metadata\UrbanSound8K.csv"
    AUDIO_DIR = r"D:\Coding\Tutorial_DL_Acoustic_Signal\DATA\UrbanSound8K\UrbanSound8K\audio"
    SAMPLE_RATE = 22050
    NUM_SAMPLES = 22050
    # The NUM_SAMPLES = SAMPLE_RATE  ->   1s Duration
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"Using device {device}")
    
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate = SAMPLE_RATE,
        n_fft = 1024,
        hop_length = 512,
        n_mels = 64
    )
    # mel_spectrogram(signal)
    

    usd = UrbanSoundDataset(ANNOTATIONS_FILE, AUDIO_DIR, mel_spectrogram, SAMPLE_RATE, NUM_SAMPLES,device)


    print(f"There are {len(usd)} samples in the dataset.")

    signal, label = usd[0]

print(signal.shape)
        

Using device cuda
There are 8732 samples in the dataset.
torch.Size([1, 64, 44])


After pre-processing of the Urbandataset, the next step is to set a suitable model like CNN to classify the sound


In [11]:
from torch import nn
from torchsummary import summary
class CNNNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        # Vgg network
        # 4 conv blocks / flatten/ Linear / softmax
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=1,padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1,padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1,padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1,padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.flatten = nn.Flatten()
        self.linear = nn.Linear(128*5*4, 10)
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, input_data):
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        logits = self.linear(x)
        predictions = self.softmax(logits)
        return predictions
    

if __name__ == "__main__":
    cnn = CNNNetwork()
    summary(cnn.cuda(), (1, 64, 44))
    
    


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 66, 46]             160
              ReLU-2           [-1, 16, 66, 46]               0
         MaxPool2d-3           [-1, 16, 33, 23]               0
            Conv2d-4           [-1, 32, 35, 25]           4,640
              ReLU-5           [-1, 32, 35, 25]               0
         MaxPool2d-6           [-1, 32, 17, 12]               0
            Conv2d-7           [-1, 64, 19, 14]          18,496
              ReLU-8           [-1, 64, 19, 14]               0
         MaxPool2d-9             [-1, 64, 9, 7]               0
           Conv2d-10           [-1, 128, 11, 9]          73,856
             ReLU-11           [-1, 128, 11, 9]               0
        MaxPool2d-12            [-1, 128, 5, 4]               0
          Flatten-13                 [-1, 2560]               0
           Linear-14                   

In [21]:
## Train the model
import torch
import torchaudio
from torch import nn
from torch.utils.data import DataLoader

BATCH_SIZE = 1024
EPOCHS = 40
LEARNING_RATE = .0005
ANNOTATIONS_FILE = r"D:\Coding\Tutorial_DL_Acoustic_Signal\DATA\UrbanSound8K\UrbanSound8K\metadata\UrbanSound8K.csv"
AUDIO_DIR = r"D:\Coding\Tutorial_DL_Acoustic_Signal\DATA\UrbanSound8K\UrbanSound8K\audio"
SAMPLE_RATE = 22050
NUM_SAMPLES = 22050


def create_data_loader(train_data, batch_size):
    train_dataloader = DataLoader(train_data, batch_size=batch_size)
    return train_dataloader

def train_one_epoch(model, data_loader, loss_fn, optimizer, device):
    for inputs, targets in data_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        # calculate loss
        predictions = model(inputs)
        loss = loss_fn(predictions, targets)

        # backpropogate loss and update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Loss: {loss.item()}")


def train(model, data_loader, loss_fn, optimizer, device, epochs):
    for i in range(epochs):
        print(f"Epoch {i+1}")
        train_one_epoch(model, data_loader, loss_fn, optimizer, device)
        print("-----------------------")
    print("Training is done")
    

if __name__ == "__main__":
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"Using {device}")

    # instantiating our dataset object and crreate data loader
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate = SAMPLE_RATE,
        n_fft = 1024,
        hop_length = 512,
        n_mels = 64
    )
    
    usd = UrbanSoundDataset(ANNOTATIONS_FILE, AUDIO_DIR, mel_spectrogram, SAMPLE_RATE, NUM_SAMPLES,device)
    
    train_dataloader = create_data_loader(usd, BATCH_SIZE)

    # build model
    cnn = CNNNetwork().to(device)
    print(cnn)

    # instantiate loss function + optimizerr
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(cnn.parameters(), lr=LEARNING_RATE)

    # train model
    train(cnn, train_dataloader, loss_fn, optimizer, device, EPOCHS)

    # Store the model
    torch.save(cnn.state_dict(), "cnn.pth")
    print("Model trained and stored at cnn.pth")

    

Using cuda
CNNNetwork(
  (conv1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv4): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear): Linear(in_features=2560, out_features=10, bias=True)
  (softmax): Softmax(dim=1)
)
Epoch 1
Loss: 2.208237409591

In [35]:
## 2. Making predictions/Inference
import torch
import torchaudio

class_mapping = [
    "air_conditioner",
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "jackhammer",
    "siren",
    "street_music",
]

def predict(model, input, target, class_mapping):
    model.eval()
    with torch.no_grad():
        predictions = model(input)
        # Tensor (1,10) -> [[0.1, 0.01,...,.0.6]]
        predicted_index = predictions[0].argmax(0)
        predicted = class_mapping[predicted_index]
        expected = class_mapping[target]
    return predicted, expected



if __name__ == "__main__":
    # load back the model
    cnn = CNNNetwork()
    state_dict = torch.load("cnnnet.pth")
    cnn.load_state_dict(state_dict)

    # load Urban sound dataset
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate = SAMPLE_RATE,
        n_fft = 1024,
        hop_length = 512,
        n_mels = 64
    )
    
    usd = UrbanSoundDataset(ANNOTATIONS_FILE, AUDIO_DIR, mel_spectrogram, SAMPLE_RATE, NUM_SAMPLES, "cpu")
    
    # get a sample from the urban sound dataset for inference
    input, target = usd[0][0], usd[0][1]
    input.unsqueeze_(0)
    # make an inference
    predicted, expected = predict(cnn, input, target, class_mapping)

    print(f"Predicted: '{predicted}', expected :'{expected}'")

Predicted: 'dog_bark', expected :'dog_bark'


tensor([[[8.1443e-04, 2.1588e-04, 9.1436e-04,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [2.4870e-03, 1.5724e-03, 4.2855e-04,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [2.9685e-03, 6.0449e-03, 4.1659e-03,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         ...,
         [1.7061e-04, 5.7996e-02, 7.9969e-01,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [2.1515e-04, 1.5781e-02, 3.7936e-01,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [3.0015e-04, 1.4416e-02, 3.1233e-01,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00]]])