# Imports

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pydub
import os 
import librosa
import soundfile as sf
import torch
import torchaudio
from torchaudio import transforms
import random
from torch.utils.data import DataLoader, Dataset, random_split
import torch.nn.functional as F
from torch.nn import init
import torch.nn as nn

In [4]:
# Load the ESC-50 metadata table (columns shown below: filename, fold, target,
# category, esc10, src_file, take) and preview the first ten rows.
df = pd.read_csv('../Dataset/ESC-50-master/meta/esc50.csv')
df.head(10)


Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A
5,1-101296-B-19.wav,1,19,thunderstorm,False,101296,B
6,1-101336-A-30.wav,1,30,door_wood_knock,False,101336,A
7,1-101404-A-34.wav,1,34,can_opening,False,101404,A
8,1-103298-A-9.wav,1,9,crow,False,103298,A
9,1-103995-A-30.wav,1,30,door_wood_knock,False,103995,A


In [4]:
# NOTE: exploratory scratch code, kept fully commented out. It loads a single
# ESC-50 recording with librosa (native sample rate) for the splitting
# experiment in the following cell.
# audio_dir = r'../../Dataset/ESC-50-master/audio'

# out_dir = r'../../Dataset/ESC-50-master/split_audio'
# #os.makedirs(out_dir, exist_ok=True)

# audio_file = os.path.join(audio_dir, '3-155556-A-31.wav')

# wave, sr = librosa.load(audio_file, sr=None)  

In [5]:
# NOTE: exploratory scratch code, kept fully commented out. It cuts the waveform
# loaded in the previous (also commented-out) cell into fixed-length segments
# and writes each segment to its own .wav file.
# frame_length = 500/1000 #the length of the frame, I set it as 500ms for testing
# segment_length = int(sr * frame_length)
# #print(f"sr {sr}")
# #print(segment_length)
# num_sections = int(np.ceil(len(wave) / segment_length))  #the number of sections after splitting
# #print(num_sections)
# split = []    
# for s in range(0, len(wave), segment_length):
#     t = wave[s: s + segment_length]
#     split.append(t)

# recording_name = os.path.basename(audio_file[:-4])
# for i, segment in enumerate(split):
#     out_file = f"{recording_name}_{i}.wav"
#     sf.write(os.path.join(out_dir, out_file), segment, sr)

START HERE

In [5]:
class AudioUtil():
    """Stateless helpers for loading, standardising and augmenting audio clips.

    All methods are static. Audio is passed around as a ``(sig, sr)`` tuple in
    which ``sig`` is a 2-D tensor indexed as ``[channel, sample]`` and ``sr`` is
    the sample rate in Hz.
    """

    @staticmethod
    def open(audio_file):
        """Load an audio file with torchaudio and return ``(sig, sr)``."""
        sig, sr = torchaudio.load(audio_file)
        return (sig, sr)

    # ----------------------------
    # Standardizing sample rate (the dataset class below uses 44100Hz)
    # ----------------------------
    @staticmethod
    def resample(audio, srate):
        """Convert ``audio`` to sample rate ``srate``.

        Returns the input tuple unchanged when the rate already matches,
        otherwise a new ``(resampled_sig, srate)`` tuple.
        """
        sig, sr = audio
        if sr == srate:
            return audio

        # Resample works along the last (time) dimension, so every channel can
        # be converted in a single call instead of channel by channel.
        resig = torchaudio.transforms.Resample(sr, srate)(sig)
        return (resig, srate)

    # ----------------------------
    # Some audios are mono, some are stereo. We need everything to have the same dimensions.
    # Thus, we either keep only the leading channel(s) or duplicate the existing ones.
    # ----------------------------
    @staticmethod
    def rechannel(audio, channel):
        """Return ``audio`` with exactly ``channel`` channels.

        * Same channel count: the input tuple is returned unchanged.
        * Too many channels: the first ``channel`` channels are kept.
        * Too few channels: the existing channels are repeated in order until
          ``channel`` rows are available (mono -> stereo duplicates the single
          channel).
        """
        sig, sr = audio
        n_current = sig.shape[0]
        if n_current == channel:
            return audio

        if n_current > channel:
            resig = sig[:channel, :]
        else:
            # Ceil-divide so that tiling always yields at least `channel` rows,
            # then trim to the exact count requested.
            reps = -(-channel // n_current)
            resig = sig.repeat(reps, 1)[:channel, :]

        return (resig, sr)

    # ----------------------------
    # Standardize the length of the audio - that is, either pad or truncate the audio
    # ----------------------------
    @staticmethod
    def resize_aud(audio, ms):
        """Truncate or zero-pad ``audio`` to exactly ``ms`` milliseconds.

        Shorter signals are padded with zeros split randomly between the start
        and the end (uses Python's ``random`` module).
        """
        sig, sr = audio
        no_rows, sig_len = sig.shape
        # Multiply before dividing: `sr // 1000 * ms` discards the sub-kHz part
        # of the rate (44100 Hz * 5000 ms would give 220000 instead of 220500).
        max_len = sr * ms // 1000

        # Truncate
        if sig_len > max_len:
            sig = sig[:, :max_len]
        # Padding
        elif sig_len < max_len:
            # Length of the paddings at the start and end of the signal
            len_start = random.randint(0, max_len - sig_len)
            len_end = max_len - len_start - sig_len

            # new_zeros keeps the padding on the same dtype/device as the signal
            pad_start = sig.new_zeros((no_rows, len_start))
            pad_end = sig.new_zeros((no_rows, len_end))

            sig = torch.cat((pad_start, sig, pad_end), 1)

        return (sig, sr)

    # ----------------------------
    # Refer to textbox_1 for the reasoning of this method
    # ----------------------------
    @staticmethod
    def time_shift(aud, shift_limit):
        """Circularly shift the signal in time by a random amount.

        The shift is a random fraction (up to ``shift_limit``) of the signal
        length; samples pushed past the end wrap around to the start.
        """
        sig, sr = aud
        _, sig_len = sig.shape
        shift_amt = int(random.random() * shift_limit * sig_len)
        # Roll along the time axis only. Without `dims` torch.roll flattens the
        # tensor first, which would push samples from one channel into the next.
        return (sig.roll(shift_amt, dims=1), sr)

    # ----------------------------
    # Generating Spectrogram
    # ----------------------------
    @staticmethod
    def spectro_gram(audio, n_mels=64, n_fft=1024, hop_len=None, top_db=40):
        """Compute a mel spectrogram in decibels.

        Args:
            audio: ``(sig, sr)`` tuple.
            n_mels, n_fft, hop_len: forwarded to ``MelSpectrogram``.
            top_db: forwarded to ``AmplitudeToDB``; defaults to the 40 dB the
                notebook has used so far (80 is a candidate to try).

        Returns:
            Tensor of shape ``[channel, n_mels, time]``.
        """
        sig, sr = audio
        spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)
        # shape of spec is [channel (mono or stereo etc), n_mels, time]
        spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
        return spec

    # ----------------------------
    # Augment the Spectrogram by masking out some sections of it in both the frequency
    # dimension (ie. horizontal bars) and the time dimension (vertical bars) to prevent
    # overfitting and to help the model generalise better. The masked sections are
    # replaced with the mean value.
    # ----------------------------
    @staticmethod
    def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
        """Apply frequency and time masking to a ``[channel, n_mels, time]`` spectrogram.

        Each mask is at most ``max_mask_pct`` of the corresponding axis and is
        filled with the mean of the unmasked input spectrogram.
        """
        _, n_mels, n_steps = spec.shape
        mask_value = spec.mean()
        aug_spec = spec

        freq_mask_param = max_mask_pct * n_mels
        for _ in range(n_freq_masks):
            aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)

        time_mask_param = max_mask_pct * n_steps
        for _ in range(n_time_masks):
            aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)

        return aug_spec
    

                

Textbox_1: We may also need to augment the raw audio. "Natural sounds" such as traffic, sea waves, or dog barks usually have no particular order, so the audio can simply wrap around when it is shifted. For human speech and similar sounds, on the other hand, the order matters, so the gap left by a shift would instead be filled with silence. Possible augmentations include time shift, pitch shift, time stretch, and added noise. We will try the first option: time shift.

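Next, we build a custom Dataset that turns each audio file into an augmented mel spectrogram (it is wrapped in a DataLoader further below).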

In [6]:
class SoundDS(Dataset):
  """Dataset yielding ``(augmented mel spectrogram, class id)`` pairs.

  Args:
    df: metadata frame with at least the columns ``filename`` and ``target``.
      Rows are addressed by position, so a filtered frame (e.g. one fold
      removed) works without resetting its index.
    path: directory that contains the audio files.
  """

  def __init__(self, df, path):
    self.df = df
    self.path = str(path)
    self.duration = 5000 #our audio is 5 seconds
    self.sr = 44100
    self.channel = 2
    self.shift_pct = 0.4

  def __len__(self):
    return len(self.df)

  def __shape__(self):
    # Non-standard helper kept for existing callers: shape of the metadata frame.
    return self.df.shape

  def __getitem__(self, index):
    # Positional lookup: DataLoader and random_split hand out positions
    # 0..len-1, which only coincide with index labels on an unfiltered frame.
    file = os.path.join(self.path, self.df['filename'].iloc[index])
    class_id = self.df['target'].iloc[index] #the index of the label aka target

    # Pipeline: load -> fix channel count -> fix sample rate -> fix length
    # -> random time shift -> mel spectrogram (dB) -> frequency/time masking
    audio = AudioUtil.open(file)
    rechannel = AudioUtil.rechannel(audio, self.channel)
    resamp = AudioUtil.resample(rechannel, self.sr)
    padded = AudioUtil.resize_aud(resamp, self.duration)
    shifted = AudioUtil.time_shift(padded, self.shift_pct)
    sgram = AudioUtil.spectro_gram(shifted, n_mels=64, n_fft=1024, hop_len=None)
    # Result has shape (num_channels, Mel freq_bands, time_steps)
    aug_sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)
    return aug_sgram, class_id


In [7]:
# Smoke test: build the dataset and fetch one (spectrogram, label) pair.
data_path = r'../Dataset/ESC-50-master/audio/'
# NOTE(review): df_train (every fold except fold 1) is computed but not used
# below - the dataset is built from the full `df`. Confirm whether that is intended.
df_train = df[df.fold != (1)]
data = SoundDS(df, data_path)
data[0]
# test = data.__getsgram__()

(tensor([[[-1.6471, -1.6471, -1.6471,  ..., -1.6471, -1.6471, -1.6471],
          [-1.6471, -1.6471, -1.6471,  ..., -1.6471, -1.6471, -1.6471],
          [-1.6471, -1.6471, -1.6471,  ..., -1.6471, -1.6471, -1.6471],
          ...,
          [-1.6471, -1.6471, -1.6471,  ..., -1.6471, -1.6471, -1.6471],
          [-1.6471, -1.6471, -1.6471,  ..., -1.6471, -1.6471, -1.6471],
          [-1.6471, -1.6471, -1.6471,  ..., -1.6471, -1.6471, -1.6471]],
 
         [[-1.6471, -1.6471, -1.6471,  ..., -1.6471, -1.6471, -1.6471],
          [-1.6471, -1.6471, -1.6471,  ..., -1.6471, -1.6471, -1.6471],
          [-1.6471, -1.6471, -1.6471,  ..., -1.6471, -1.6471, -1.6471],
          ...,
          [-1.6471, -1.6471, -1.6471,  ..., -1.6471, -1.6471, -1.6471],
          [-1.6471, -1.6471, -1.6471,  ..., -1.6471, -1.6471, -1.6471],
          [-1.6471, -1.6471, -1.6471,  ..., -1.6471, -1.6471, -1.6471]]]),
 0)

Splitting for SVM

In [10]:
data_path = r'../Dataset/ESC-50-master/audio/'
save_path = r'../Preprocessed Dataset/'

# random_split draws from torch's global RNG and the waveform augmentations use
# Python's `random`, so seeding numpy alone does not make the splits repeatable.
np.random.seed(138)
random.seed(138)
torch.manual_seed(138)

# np.save fails when the target directory is missing
os.makedirs(save_path, exist_ok=True)

data = SoundDS(df, data_path)


def subset_to_object_array(subset):
    """Materialise a dataset/Subset as an ``(N, 2)`` object array.

    Column 0 holds the spectrogram tensor and column 1 the class id. Building
    the array explicitly avoids NumPy's implicit ragged-sequence conversion,
    which is deprecated (and an error on recent NumPy versions).
    """
    packed = np.empty((len(subset), 2), dtype=object)
    for row in range(len(subset)):
        spec, label = subset[row]
        packed[row, 0] = spec
        packed[row, 1] = label
    return packed


# Random split of 80:10:10 between training, validation and testing.
# The sizes do not change between repetitions, so compute them once.
num_items = len(data)
num_train = round(num_items * 0.8)
num_val = num_items - num_train
num_test = round(num_val * 0.5)
num_val_2 = num_val - num_test

for i in range(5):
    # Each repetition draws a fresh random split and exports it under suffix i+1
    train_ds, val = random_split(data, [num_train, num_val])
    val_ds, test_ds = random_split(val, [num_val_2, num_test])
    print(type(train_ds), len(train_ds))

    np.save(save_path + f'X_train_{i+1}.npy', subset_to_object_array(train_ds))
    np.save(save_path + f'X_test_{i+1}.npy', subset_to_object_array(test_ds))
    np.save(save_path + f'X_val_{i+1}.npy', subset_to_object_array(val_ds))

<class 'torch.utils.data.dataset.Subset'> 1600


  arr = np.asanyarray(arr)
  arr = np.asanyarray(arr)


<class 'torch.utils.data.dataset.Subset'> 1600
<class 'torch.utils.data.dataset.Subset'> 1600
<class 'torch.utils.data.dataset.Subset'> 1600
<class 'torch.utils.data.dataset.Subset'> 1600


In [11]:
# NOTE: scratch check, kept commented out - reloads one exported split as a
# two-column (Tensor, Target) frame.
# NOTE(review): the path here ('../../Preprocessed Data/') differs from the
# save_path used by the export cell ('../Preprocessed Dataset/') - confirm
# before re-enabling.
# test = np.load(r'../../Preprocessed Data/X_train_1.npy', allow_pickle=True)
# testdf = pd.DataFrame(test, columns = ["Tensor", "Target"])
# testdf.Tensor[0].shape



Trying out a CNN model found online

In [12]:
# Wrap the full metadata table in the spectrogram dataset
myds = SoundDS(df, data_path)

# Random 80:20 partition into training and validation subsets
num_items = len(myds)
num_train = round(0.8 * num_items)
num_val = num_items - num_train
train_ds, val_ds = random_split(myds, [num_train, num_val])

# Batched loaders: only the training batches are reshuffled every epoch
train_dl = DataLoader(train_ds, batch_size=16, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=16, shuffle=False)

In [13]:


# ----------------------------
# Audio Classification Model
# ----------------------------
class AudioClassifier (nn.Module):
    """Four-block CNN that classifies 2-channel spectrograms into 50 classes.

    Each block is Conv2d (stride 2) -> ReLU -> BatchNorm2d. The blocks are
    followed by global average pooling and a single linear layer. Every layer
    is registered twice, once under its own name (``conv1``, ``relu1``,
    ``bn1``, ...) and once inside the ``conv`` Sequential, so the state_dict
    carries both sets of keys.
    """

    # (in_channels, out_channels, kernel size, padding) of the four blocks
    _BLOCKS = ((2, 8, 5, 2), (8, 16, 3, 1), (16, 32, 3, 1), (32, 64, 3, 1))

    def __init__(self):
        super().__init__()
        stacked = []

        for number, (c_in, c_out, k, pad) in enumerate(self._BLOCKS, start=1):
            conv = nn.Conv2d(c_in, c_out, kernel_size=(k, k), stride=(2, 2), padding=(pad, pad))
            relu = nn.ReLU()
            bn = nn.BatchNorm2d(c_out)

            # Register under the per-layer names, in conv/relu/bn order
            setattr(self, f"conv{number}", conv)
            setattr(self, f"relu{number}", relu)
            setattr(self, f"bn{number}", bn)

            # Kaiming initialisation for the kernel, zero bias
            init.kaiming_normal_(conv.weight, a=0.1)
            conv.bias.data.zero_()

            stacked.extend((conv, relu, bn))

        # Classifier head: global average pool, then 64 -> 50 linear layer
        self.ap = nn.AdaptiveAvgPool2d(output_size=1)
        self.lin = nn.Linear(in_features=64, out_features=50)

        # Chain the convolutional blocks so forward() can run them in one call
        self.conv = nn.Sequential(*stacked)

    def forward(self, x):
        """Return the 50 class scores for a batch of spectrograms."""
        features = self.conv(x)

        # Pool to one value per feature map, then flatten to (batch, 64)
        pooled = self.ap(features)
        flat = pooled.view(pooled.shape[0], -1)

        return self.lin(flat)

# Pick the first GPU when CUDA is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Instantiate the classifier directly on the chosen device
myModel = AudioClassifier().to(device)

# Show where the parameters ended up
next(myModel.parameters()).device

device(type='cuda', index=0)

In [14]:
# ----------------------------
# Training Loop
# ----------------------------
def training(model, train_dl, num_epochs):
    """Train ``model`` on ``train_dl`` for ``num_epochs`` epochs.

    Uses cross-entropy loss, Adam (lr 0.001) and a linear one-cycle learning
    rate schedule that is stepped once per batch. Every batch is standardised
    with its own mean and standard deviation before the forward pass. The
    average loss and the accuracy over the epoch are printed after each epoch.

    Args:
        model: module to optimise; it is switched to training mode.
        train_dl: sized iterable of ``(inputs, labels)`` batches.
        num_epochs: number of passes over ``train_dl``.
    """
    # Follow the model's own device instead of relying on a notebook-global
    # `device` variable that may be stale or undefined.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Make sure BatchNorm/Dropout layers are in training mode even if the
    # caller previously switched the model to eval().
    model.train()

    # Loss Function, Optimizer and Scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    num_batches = len(train_dl)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
                                                    steps_per_epoch=int(num_batches),
                                                    epochs=num_epochs,
                                                    anneal_strategy='linear')

    # Repeat for each epoch
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_prediction = 0
        total_prediction = 0

        # Repeat for each batch in the training set
        for data in train_dl:
            # Get the input features and target labels, and put them on the model's device
            inputs, labels = data[0].to(device), data[1].to(device)

            # Normalize the inputs (per batch)
            inputs_m, inputs_s = inputs.mean(), inputs.std()
            inputs = (inputs - inputs_m) / inputs_s

            # Zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            # Keep stats for Loss and Accuracy
            running_loss += loss.item()

            # Get the predicted class with the highest score
            _, prediction = torch.max(outputs, 1)
            # Count of predictions that matched the target label
            correct_prediction += (prediction == labels).sum().item()
            total_prediction += prediction.shape[0]

        # Print stats at the end of the epoch
        avg_loss = running_loss / num_batches
        acc = correct_prediction/total_prediction
        print(f'Epoch: {epoch}, Loss: {avg_loss:.2f}, Accuracy: {acc:.2f}')

    print('Finished Training')
  
# Number of passes over the training loader, then train the model created above.
num_epochs=50   # Just for demo, adjust this higher.
training(myModel, train_dl, num_epochs)

Epoch: 0, Loss: 3.86, Accuracy: 0.03
Epoch: 1, Loss: 3.78, Accuracy: 0.05
Epoch: 2, Loss: 3.69, Accuracy: 0.06
Epoch: 3, Loss: 3.60, Accuracy: 0.07
Epoch: 4, Loss: 3.50, Accuracy: 0.10
Epoch: 5, Loss: 3.42, Accuracy: 0.11
Epoch: 6, Loss: 3.34, Accuracy: 0.12
Epoch: 7, Loss: 3.26, Accuracy: 0.15
Epoch: 8, Loss: 3.14, Accuracy: 0.18
Epoch: 9, Loss: 3.03, Accuracy: 0.20
Epoch: 10, Loss: 2.92, Accuracy: 0.22
Epoch: 11, Loss: 2.81, Accuracy: 0.24
Epoch: 12, Loss: 2.74, Accuracy: 0.27
Epoch: 13, Loss: 2.63, Accuracy: 0.29
Epoch: 14, Loss: 2.57, Accuracy: 0.31
Epoch: 15, Loss: 2.48, Accuracy: 0.32
Epoch: 16, Loss: 2.43, Accuracy: 0.35
Epoch: 17, Loss: 2.33, Accuracy: 0.38
Epoch: 18, Loss: 2.28, Accuracy: 0.37
Epoch: 19, Loss: 2.22, Accuracy: 0.40
Epoch: 20, Loss: 2.16, Accuracy: 0.41
Epoch: 21, Loss: 2.07, Accuracy: 0.44
Epoch: 22, Loss: 2.05, Accuracy: 0.43
Epoch: 23, Loss: 2.01, Accuracy: 0.45
Epoch: 24, Loss: 1.95, Accuracy: 0.46
Epoch: 25, Loss: 1.93, Accuracy: 0.47
Epoch: 26, Loss: 1.86,

### Save Pre-trained Model

In [17]:
# Persist only the learned weights (the state_dict); loading them back requires
# instantiating AudioClassifier first.
PATH = '../Models/cnn_model-50.pt'
torch.save(myModel.state_dict(), PATH)

### Load Pre-Trained Model

In [21]:
# Rebuild the architecture and restore the saved weights for evaluation.
device =  torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = AudioClassifier()
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU machine also loads on a CPU-only one.
model.load_state_dict(torch.load(PATH, map_location=device))
model.eval()
model.to(device)

AudioClassifier(
  (conv1): Conv2d(2, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
  (relu1): ReLU()
  (bn1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (relu2): ReLU()
  (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (relu3): ReLU()
  (bn3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (relu4): ReLU()
  (bn4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (ap): AdaptiveAvgPool2d(output_size=1)
  (lin): Linear(in_features=64, out_features=50, bias=True)
  (conv): Sequential(
    (0): Conv2d(2, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
    (1): ReLU()
    (2): BatchNorm2d(8, eps=1e-05, mome

In [22]:
# ----------------------------
# Inference
# ----------------------------
def inference (model, val_dl):
    """Print the classification accuracy of ``model`` on ``val_dl``.

    The model is evaluated in eval mode with gradients disabled; its previous
    train/eval mode is restored afterwards. Every batch is standardised with
    its own mean and standard deviation, mirroring the training loop.

    Args:
        model: trained module to evaluate.
        val_dl: iterable of ``(inputs, labels)`` batches. An empty iterable
            reports an accuracy of ``nan`` instead of dividing by zero.
    """
    # Follow the model's own device instead of relying on a notebook-global
    # `device` variable that may be stale or undefined.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    correct_prediction = 0
    total_prediction = 0

    # BatchNorm must use its running statistics during evaluation
    was_training = model.training
    model.eval()
    try:
        # Disable gradient updates
        with torch.no_grad():
            for data in val_dl:
                # Get the input features and target labels, and put them on the model's device
                inputs, labels = data[0].to(device), data[1].to(device)

                # Normalize the inputs
                inputs_m, inputs_s = inputs.mean(), inputs.std()
                inputs = (inputs - inputs_m) / inputs_s

                # Get predictions
                outputs = model(inputs)

                # Get the predicted class with the highest score
                _, prediction = torch.max(outputs, 1)
                # Count of predictions that matched the target label
                correct_prediction += (prediction == labels).sum().item()
                total_prediction += prediction.shape[0]
    finally:
        # Leave the model in whatever mode the caller had it in
        model.train(was_training)

    acc = correct_prediction/total_prediction if total_prediction else float('nan')
    print(f'Accuracy: {acc:.2f}, Total items: {total_prediction}')

# Evaluate the reloaded model on the validation loader (val_dl) built from the 80:20 split

inference(model, val_dl)

Accuracy: 0.47, Total items: 400


In [None]:
# Note: running 100 epochs took about 22 minutes and reached an accuracy of 0.75
# (presumably on the training set); the test accuracy was 0.53.