In [46]:
# !pip install pydub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pydub
import os 
import librosa
import soundfile as sf
import torch
import torchaudio
from torchaudio import transforms
import random
from torch.utils.data import DataLoader, Dataset, random_split
import torch.nn.functional as F
from torch.nn import init
import torch.nn as nn

In [82]:
def set_seed(seed = 42):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)

In [48]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [49]:
sound_label = ["Dog", "Rooster", "Pig", "Cow", "Frog", "Cat", "Hen", "Insects (flying)", "Sheep", "Crow"
                ,"Rain", "Sea waves", "Crackling fire", "Crickets", "Chirping birds", "Water drops", "Wind", "Pouring water", "Toilet flush", "Thunderstorm"
                ,"Crying baby", "Sneezing", "Clapping", "Breathing", "Coughing", "Footsteps", "Laughing", "Brushing teeth", "Snoring", "Drinking, sipping"
                , "Door knock", "Mouse click", "Keyboard typing", "Door, wood creaks", "Can opening", "washing machine", "Vacuum cleaner", "Clock alarm", "Clock tick", "Glass breaking"
                , "Helicopter", "Chainsaw", "Siren", "Car horn", "Engine", "Train", "Church bells", "Airplane", "Fireworks", "Hand saw"]
sounds = dict(zip(range(50), sound_label))

In [50]:
df = pd.read_csv('/content/drive/MyDrive/ASR Project/ESC-50-master/meta/esc50.csv')
df.head(10)


Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A
5,1-101296-B-19.wav,1,19,thunderstorm,False,101296,B
6,1-101336-A-30.wav,1,30,door_wood_knock,False,101336,A
7,1-101404-A-34.wav,1,34,can_opening,False,101404,A
8,1-103298-A-9.wav,1,9,crow,False,103298,A
9,1-103995-A-30.wav,1,30,door_wood_knock,False,103995,A


In [51]:
class AudioUtil():
    @staticmethod
    def open(audio_file):
        sig, sr = torchaudio.load(audio_file)
        return (sig,sr)
    
    # ----------------------------
    # Standardizing sample rate to 44100Hz
    # ----------------------------
    def resample(audio, srate):
        sig, sr = audio
        if (sr == srate):
            return audio
        no_channels = sig.shape[0]

        #Resample 1st channel:
        resig = torchaudio.transforms.Resample(sr, srate)(sig[:1,:])
        if (no_channels > 1):
            #Resample 2nd channel and merge both
            retwo = torchaudio.transforms.Resample(sr, srate)(sig[1:,:])
            resig = torch.cat([resig, retwo])

        return ((resig, srate))


    # ----------------------------
    # Some audios are mono, some are stereo. We need everything to have the same dimensions.
    # Thus, we can either only select the first channel of stereo or duplicate the first channel of mono
    # ----------------------------
    @staticmethod
    def rechannel(audio, channel):
        sig, sr = audio
        if (sig.shape[0]==channel):
            return audio
        if (channel==1):
            resig = sig[:1,:]
        else:
            resig = torch.cat([sig,sig])

        return ((resig, sr))

    

    # ----------------------------
    # Standardize the length of the audio - that is, either pad or truncate the audio
    # ----------------------------
    @staticmethod
    def resize_aud(audio, ms):
        sig, sr = audio
        no_rows, sig_len = sig.shape
        max_len = sr // 1000 * ms

        #Truncate
        if (sig_len > max_len):
            sig = sig[:, :max_len]
        #Padding
        elif (sig_len < max_len):
            #Length of the paddings at the start and end of the signal
            len_start = random.randint(0, max_len-sig_len)
            len_end = max_len - len_start - sig_len

            pad_start = torch.zeros((no_rows, len_start))
            pad_end = torch.zeros((no_rows, len_end))

            sig = torch.cat((pad_start, sig, pad_end), 1)

        return (sig, sr)


    # ----------------------------
    # Refer to textbox_1 for the reasoning of this method
    # ----------------------------
    @staticmethod
    def time_shift(aud, shift_limit):
        sig,sr = aud
        _, sig_len = sig.shape
        shift_amt = int(random.random() * shift_limit * sig_len)
        return (sig.roll(shift_amt), sr)

    # ----------------------------
    # Generating Spectrogram
    # ----------------------------
    @staticmethod
    def spectro_gram(audio, n_mels=64, n_fft=1024, hop_len=None):
        sig, sr = audio
        top_db = 80 #if we have more time, we can try 80
        spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)
        #shape of spec is [channel (mono or stereo etc), n_mels, time]
        spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
        return (spec)


    # ----------------------------
    # Augment the Spectrogram by masking out some sections of it in both the frequency
    # dimension (ie. horizontal bars) and the time dimension (vertical bars) to prevent
    # overfitting and to help the model generalise better. The masked sections are
    # replaced with the mean value.
    # ----------------------------
    @staticmethod
    def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
        _, n_mels, n_steps = spec.shape
        mask_value = spec.mean()
        aug_spec = spec

        freq_mask_param = max_mask_pct * n_mels
        for _ in range(n_freq_masks):
            aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)

        time_mask_param = max_mask_pct * n_steps
        for _ in range(n_time_masks):
            aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)

        return aug_spec
    

                

In [52]:
class SoundDS(Dataset):
  def __init__(self, df, path):
    self.df = df
    self.path = str(path)
    self.duration = 5000 #our audio is 5 seconds
    self.sr = 44100
    self.channel = 2
    self.shift_pct = 0.4


  def __len__(self):
    return len(self.df)
  
  def __shape__(self):
    return self.df.shape
  
  def __getitem__(self, index):
    file = self.path + self.df.loc[index, 'filename']
    class_id = self.df.loc[index, 'target'] #the index of the label aka target
    fold = self.df.loc[index, 'fold']

    audio = AudioUtil.open(file)
    #print(f"Original shape {audio[0].shape} and sample rate of {audio[1]}")
    rechannel = AudioUtil.rechannel(audio, self.channel)
    #print(f"Rechanneling shape {rechannel[0].shape} and sample rate of {rechannel[1]}")
    resamp = AudioUtil.resample(rechannel, self.sr)
    #print(f"Resampling shape {resamp[0].shape} and sample rate of {resamp[1]}")
    padded = AudioUtil.resize_aud(resamp, self.duration)
    #print(f"Padded shape {padded[0].shape} and sample rate of {padded[1]}")
    shifted = AudioUtil.time_shift(padded, self.shift_pct)
    #print(f"Time shift shape {shifted[0].shape} and sample rate of {shifted[1]}")
    sgram = AudioUtil.spectro_gram(shifted, n_mels=64, n_fft=1024, hop_len=None)
    #print(f"Mel spectrogram shape {sgram.shape}")
    aug_sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)
    #print(f"Augmented spectrogram shape {aug_sgram.shape} of (num_channels, Mel freq_bands, time_steps)")
    return aug_sgram, class_id


In [53]:
data_path = r'/content/drive/MyDrive/ASR Project/ESC-50-master/audio/'


In [54]:
myds = SoundDS(df, data_path)

# Random split of 80:20 between training and validation
num_items = len(myds)
num_train = round(num_items * 0.8)
num_val = num_items - num_train
train_ds, val_ds = random_split(myds, [num_train, num_val])

# Create training and validation data loaders
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=16, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=16, shuffle=False)

In [72]:
a, b = myds[0]
print(a.shape)
print(a.shape)
# inputs, classes = next(iter(train_dl))
# print("inputs:", inputs)
# print("classes:", classes)

torch.Size([2, 64, 430])
torch.Size([2, 64, 430])


In [100]:
# ----------------------------
# Audio Classification Model
# ----------------------------
class AudioClassifierE2E(nn.Module):
    # ----------------------------
    # Build the model architecture
    # ----------------------------
    def __init__(self):
        super().__init__()
        conv_layers = []

        # First Convolution Block with Relu and Batch Norm. Use Kaiming Initialization
        self.conv1 = nn.Conv2d(2, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2)) #change first param if change no_channels
        self.drop1 = nn.Dropout(0.1)
        self.relu1 = nn.ReLU()
        self.bn1 = nn.BatchNorm2d(8)
        init.kaiming_normal_(self.conv1.weight, a=0.1)
        self.conv1.bias.data.zero_()
        conv_layers += [self.conv1, self.relu1, self.bn1]
        # conv_layers += [self.conv1, self.drop1, self.relu1, self.bn1]

        # Second Convolution Block
        self.conv2 = nn.Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.drop2 = nn.Dropout(0.2)
        self.relu2 = nn.ReLU()
        self.bn2 = nn.BatchNorm2d(16)
        init.kaiming_normal_(self.conv2.weight, a=0.1)
        self.conv2.bias.data.zero_()
        conv_layers += [self.conv2, self.relu2, self.bn2]
        # conv_layers += [self.conv2, self.drop2, self.relu2, self.bn2]

        # Third Convolution Block
        self.conv3 = nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.drop3 = nn.Dropout(0.2)
        self.relu3 = nn.ReLU()
        self.bn3 = nn.BatchNorm2d(32)
        init.kaiming_normal_(self.conv3.weight, a=0.1)
        self.conv3.bias.data.zero_()
        conv_layers += [self.conv3, self.relu3, self.bn3]
        # conv_layers += [self.conv3, self.drop3, self.relu3, self.bn3]

        # Fourth Convolution Block
        self.conv4 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.drop4 = nn.Dropout(0.1)
        self.relu4 = nn.ReLU()
        self.bn4 = nn.BatchNorm2d(64)
        init.kaiming_normal_(self.conv4.weight, a=0.1)
        self.conv4.bias.data.zero_()
        conv_layers += [self.conv4, self.relu4, self.bn4]
        # conv_layers += [self.conv4, self.drop4, self.relu4, self.bn4]

        # Linear Classifier
        self.ap = nn.AdaptiveAvgPool2d(output_size=1)
        self.lin = nn.Linear(in_features=64, out_features=50)

        # Wrap the Convolutional Blocks
        self.conv = nn.Sequential(*conv_layers)
 
    # ----------------------------
    # Forward pass computations
    # ----------------------------
    def forward(self, x):
        # Run the convolutional blocks
        x = self.conv(x)

        # Adaptive pool and flatten for input to linear layer
        x = self.ap(x)
        x = x.view(x.shape[0], -1)

        # Linear layer
        x = self.lin(x)

        # Final output
        return x

# Create the model and put it on the GPU if available
myModel = AudioClassifierE2E()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
myModel = myModel.to(device)
# Check that it is on Cuda
next(myModel.parameters()).device

device(type='cuda', index=0)

In [101]:
set_seed(100)
# ----------------------------
# Training Loop
# ----------------------------
def training(model, train_dl, num_epochs):
  # Loss Function, Optimizer and Scheduler
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.AdamW(model.parameters(),lr=0.001, weight_decay=1e-6)
  scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
                                                steps_per_epoch=int(len(train_dl)),
                                                epochs=num_epochs,
                                                anneal_strategy='linear')

#   scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=100)
  best_accuracy = 0
  # Repeat for each epoch
  for epoch in range(num_epochs):
    running_loss = 0.0
    correct_prediction = 0
    total_prediction = 0

    # Repeat for each batch in the training set
    for i, data in enumerate(train_dl):
        # Get the input features and target labels, and put them on the GPU
        inputs, labels = data[0].to(device), data[1].to(device)

        # Normalize the inputs
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s

        # Zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Keep stats for Loss and Accuracy
        running_loss += loss.item()


        # Get the predicted class with the highest score
        _, prediction = torch.max(outputs,1)
        # Count of predictions that matched the target label
        correct_prediction += (prediction == labels).sum().item()
        total_prediction += prediction.shape[0]

        #if i % 10 == 0:    # print every 10 mini-batches
        #    print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 10))
    
    
    # Print stats at the end of the epoch
    num_batches = len(train_dl)
    avg_loss = running_loss / num_batches
    acc = correct_prediction/total_prediction
    print(f'Epoch: {epoch}, Loss: {avg_loss:.2f}, Accuracy: {acc:.2f}')
    if acc > best_accuracy:
        state = {'epoch': epoch+1, 'state_dict': model.state_dict(),'optimizer': optimizer.state_dict(), 'scheduler':scheduler.state_dict()}
        path = f'cnn-{num_epochs}_epoch_{epoch+1}.pt'
        torch.save(state, path)

  print('Finished Training')
  
num_epochs=100
training(myModel, train_dl, num_epochs)

Epoch: 0, Loss: 3.94, Accuracy: 0.02
Epoch: 1, Loss: 3.84, Accuracy: 0.05
Epoch: 2, Loss: 3.74, Accuracy: 0.06
Epoch: 3, Loss: 3.65, Accuracy: 0.07
Epoch: 4, Loss: 3.57, Accuracy: 0.08
Epoch: 5, Loss: 3.50, Accuracy: 0.10
Epoch: 6, Loss: 3.45, Accuracy: 0.10
Epoch: 7, Loss: 3.40, Accuracy: 0.11
Epoch: 8, Loss: 3.34, Accuracy: 0.14
Epoch: 9, Loss: 3.28, Accuracy: 0.14
Epoch: 10, Loss: 3.21, Accuracy: 0.16
Epoch: 11, Loss: 3.13, Accuracy: 0.18
Epoch: 12, Loss: 3.05, Accuracy: 0.19
Epoch: 13, Loss: 2.98, Accuracy: 0.22
Epoch: 14, Loss: 2.95, Accuracy: 0.22
Epoch: 15, Loss: 2.84, Accuracy: 0.25
Epoch: 16, Loss: 2.77, Accuracy: 0.26
Epoch: 17, Loss: 2.70, Accuracy: 0.27
Epoch: 18, Loss: 2.62, Accuracy: 0.29
Epoch: 19, Loss: 2.55, Accuracy: 0.32
Epoch: 20, Loss: 2.49, Accuracy: 0.34
Epoch: 21, Loss: 2.41, Accuracy: 0.34
Epoch: 22, Loss: 2.32, Accuracy: 0.35
Epoch: 23, Loss: 2.27, Accuracy: 0.37
Epoch: 24, Loss: 2.20, Accuracy: 0.38
Epoch: 25, Loss: 2.11, Accuracy: 0.41
Epoch: 26, Loss: 2.08,

In [98]:
# !rm -r cnn-*

In [102]:
PATH = r'/content/cnn-100_epoch_100.pt'
#device =  torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = AudioClassifierE2E()
checkpoint = torch.load(PATH,map_location=torch.device('cpu'))
model.load_state_dict(checkpoint["state_dict"])
model.eval()
model.to(device)

AudioClassifierE2E(
  (conv1): Conv2d(2, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
  (drop1): Dropout(p=0.1, inplace=False)
  (relu1): ReLU()
  (bn1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (drop2): Dropout(p=0.2, inplace=False)
  (relu2): ReLU()
  (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (drop3): Dropout(p=0.2, inplace=False)
  (relu3): ReLU()
  (bn3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (drop4): Dropout(p=0.1, inplace=False)
  (relu4): ReLU()
  (bn4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (ap): AdaptiveAvgPool2d(output_size=1)
  (lin): Linear(in_features=64, out_feature

In [103]:
def endtoend(model, audiofile):
    audio = AudioUtil.open(audiofile)
    rechannel = AudioUtil.rechannel(audio, 2) #change number of channel
    resamp = AudioUtil.resample(rechannel, 44100)

    padded = AudioUtil.resize_aud(resamp, 5000)
    shifted = AudioUtil.time_shift(padded, 0.4)
    sgram = AudioUtil.spectro_gram(shifted, n_mels=64, n_fft=1024, hop_len=None)
    aug_sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)
    input_loader = torch.utils.data.DataLoader(aug_sgram, batch_size=16, shuffle=False)
    with torch.no_grad():
        for input in input_loader:
            #print(input.shape)
            input = input.to(device)
            input = input.reshape([-1,2,64,430]) #change 2nd number as u change number of channel
            #print(input.shape)
            input_m, input_s = input.mean(), input.std()
            input = (input - input_m) / input_s
            output = model(input)
            _, prediction = torch.max(output, 1)
            prediction = prediction.cpu().numpy()[0]
            #print(f"I think this is the sound of a {sounds[prediction]}")
    return prediction 
    

In [104]:
set_seed(100)
file = r'/content/drive/MyDrive/ASR Project/ESC-50-master/audio/5-200334-B-1.wav'
real_label = int(file.split("-")[-1].split(".")[0])
prediction = endtoend(model, file)
print(f"I think this is the sound of a {sounds[prediction]}")
print(f"This is actually the sound of a {sounds[real_label]}")

I think this is the sound of a Rooster
This is actually the sound of a Rooster


In [62]:
# ifile = r'../../Internet Audio/dogbark_pcm24.wav'
# prediction_online = endtoend(model, ifile)
# print(f"I think this is the sound of a {sounds[prediction_online]}")


In [105]:
def split_audio(audio_dir, out_dir, audiofile):
    audio_file = os.path.join(audio_dir, audiofile)
    wave, sr = librosa.load(audio_file, sr=None) 
    frame_length = 5 #cut into 5s 
    segment_length = int(sr * frame_length)
    num_sections = int(np.ceil(len(wave) / segment_length)) #the number of sections after splitting
    split = []    
    for s in range(0, len(wave), segment_length):
        t = wave[s: s + segment_length]
        split.append(t)
    outfiles = []
    recording_name = os.path.basename(audio_file[:-4])
    for i, segment in enumerate(split):
        out_file = f"{recording_name}_{i}.wav"
        outfiles.append(out_file)
        sf.write(os.path.join(out_dir, out_file), segment, sr)
    splittedfiles = dict(zip(range(len(outfiles)), outfiles))
    return splittedfiles

In [90]:
def audioWithTime(model, audiofile):
    audio_dir = r'../../Long Audio/original'
    out_dir = r'../../Long Audio/split'
    splittedfiles = split_audio(audio_dir, out_dir, audiofile)
    #predictions = []
    for f in splittedfiles:
        prediction = endtoend(model,out_dir + "/" + splittedfiles[f])
        print(f"From {f*5}s to {(f+1)*5}s, it sounds like a {sounds[prediction]}")

        

In [65]:
# longfile = "keyboard30s.wav"
# audioWithTime(model,longfile)