In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import torchaudio
import torch
from scipy.io.wavfile import read as read_wav

from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import os

import matplotlib.pyplot as plt

<div style='font-size:150%;
            font-family:fantasy;
            background-color:#F3F3F3;
            padding:10px;
            margin:5px;
            letter-spacing:1px'><h1 style='text-align:center;'>Building dataset</h1>
   </div>

In [None]:
# Collect the path of every bird recording and a matching list of class names.
path = "/kaggle/input/cats-vs-dogs-vs-birds-audio-classification/Animals/bird"
# os.walk yields (dirpath, dirnames, filenames); keep only the filename lists.
birdarr = [x[2] for x in os.walk(path)]
if not birdarr:
    # os.walk yields nothing for a missing directory; fail with a readable
    # message instead of an IndexError on birdarr[0].
    raise FileNotFoundError("Audio directory not found: {}".format(path))
# Directory listings come back in arbitrary order; sorting keeps the row order
# (and therefore the seeded train/validation split) reproducible.
birds_path = [os.path.join(path, i) for i in sorted(birdarr[0])]
birds = ['bird']*len(birds_path)

In [None]:
# Collect the path of every cat recording and a matching list of class names.
path = "/kaggle/input/cats-vs-dogs-vs-birds-audio-classification/Animals/cat"
# os.walk yields (dirpath, dirnames, filenames); keep only the filename lists.
catarr = [x[2] for x in os.walk(path)]
if not catarr:
    # os.walk yields nothing for a missing directory; fail with a readable
    # message instead of an IndexError on catarr[0].
    raise FileNotFoundError("Audio directory not found: {}".format(path))
# Directory listings come back in arbitrary order; sorting keeps the row order
# (and therefore the seeded train/validation split) reproducible.
cats_path = [os.path.join(path, i) for i in sorted(catarr[0])]
cats = ['cat']*len(cats_path)

In [None]:
# Collect the path of every dog recording and a matching list of class names.
path = "/kaggle/input/cats-vs-dogs-vs-birds-audio-classification/Animals/dog"
# os.walk yields (dirpath, dirnames, filenames); keep only the filename lists.
dogarr = [x[2] for x in os.walk(path)]
if not dogarr:
    # os.walk yields nothing for a missing directory; fail with a readable
    # message instead of an IndexError on dogarr[0].
    raise FileNotFoundError("Audio directory not found: {}".format(path))
# Directory listings come back in arbitrary order; sorting keeps the row order
# (and therefore the seeded train/validation split) reproducible.
dogs_path = [os.path.join(path, i) for i in sorted(dogarr[0])]
dogs = ['dog']*len(dogs_path)

In [None]:
# Column-oriented table: file paths and class names, aligned by position
# (birds first, then cats, then dogs).
dd = {
    'images': birds_path + cats_path + dogs_path,
    'labels': birds + cats + dogs,
}

In [None]:
# One row per recording: column 'images' holds the file path, 'labels' the class.
df = pd.DataFrame(data=dd)

In [None]:
# Share of recordings per class; count once and reuse for wedges and captions.
label_counts = df['labels'].value_counts()
plt.pie(label_counts,
        labels=label_counts.index,
        autopct='%0.2f%%')

In [None]:
# Read the first recording in the table to learn its sample rate (`rate`);
# the decoded samples end up in `data`.
temp = df['images'].iloc[0]
rate, data = read_wav(temp)

In [None]:
# Sample rate every clip is converted to before it reaches the model.
new_rate = 8000

# Resampler from the probed rate of the first recording to `new_rate`.
transform = torchaudio.transforms.Resample(rate, new_rate)

In [None]:
# Class name -> integer id.
label_index = {name: idx for idx, name in enumerate(('bird', 'cat', 'dog'))}

# Integer id -> class name (exact inverse of `label_index`).
index_label = {idx: name for name, idx in label_index.items()}

In [None]:
# Replace class names with their integer ids from `label_index`.
# Values that are not keys of `label_index` (in particular ids produced by a
# previous run of this cell) pass through unchanged, so re-running the cell no
# longer turns the whole column into NaN.
df['labels'] = df['labels'].map(lambda name: label_index.get(name, name))

<div style='font-size:150%;
            font-family:fantasy;
            background-color:#F3F3F3;
            letter-spacing:1px;
            margin:5px;
            padding:15px'>
    <h1 style='text-align:center;'>
        Dataset pipeline, collate batch for DataLoader
    </h1>
</div>

In [None]:
def pad_sequence(batch):
    """Zero-pad a list of 2-D tensors to a common length and stack them.

    Each item is transposed so its last axis becomes the padded axis, the
    items are stacked batch-first with zeros appended to the shorter ones,
    and the last two axes are swapped back before returning.
    """
    transposed = [waveform.t() for waveform in batch]
    stacked = torch.nn.utils.rnn.pad_sequence(
        transposed,
        batch_first=True,
        padding_value=0.,
    )
    return torch.permute(stacked, (0, 2, 1))


def collate_fn(batch):
    """Turn a batch of (path, label) pairs into padded waveforms and a label tensor.

    Each file is loaded with torchaudio, resampled to the target rate of the
    module-level `transform`, and zero-padded to the longest clip in the batch.

    Args:
        batch: iterable of (audio_path, label) pairs.

    Returns:
        (tensors, labels): the stacked, zero-padded waveforms produced by
        `pad_sequence`, and an int64 tensor with one label per item.
    """
    tensors, labels = [], []

    for audio, label in batch:
        au, sample_rate = torchaudio.load(audio)
        if sample_rate == transform.orig_freq:
            # Common case: reuse the pre-built resampler.
            au = transform(au)
        else:
            # `transform` was built for one specific source rate; a file recorded
            # at a different rate must be resampled from its own rate, otherwise
            # it would not end up at the target rate.
            au = torchaudio.functional.resample(au, sample_rate, transform.new_freq)
        tensors.append(au)
        labels.append(label)

    tensors = pad_sequence(tensors)
    labels = torch.tensor(labels, dtype=torch.int64)

    return tensors, labels

In [None]:
# Use CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Training hyper-parameters.
BATCH_SIZE = 16
LR = 0.1
EPOCHS = 20

In [None]:
# Rows of the table as a NumPy array: one (path, label) pair per recording.
tr = df.to_numpy()

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
train, val = train_test_split(tr, test_size=0.2, random_state=42)

# Both loaders batch through `collate_fn`; only the training loader shuffles.
train_loader = DataLoader(
    dataset=train,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    dataset=val,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

<div style='font-size:150%;
            font-family:fantasy;
            background-color:#F3F3F3;
            letter-spacing:1px;
            margin:10px;
            padding:15px'><h1 style='text-align:center;'>Defining model</h1>
</div>

In [None]:
class Simple(torch.nn.Module):
    """Small 1-D convolutional classifier.

    Three Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d stages are followed by an
    average pool over the whole remaining last axis and three linear layers.

    Args:
        in_channels: number of input channels fed to the first convolution.
        out_channels: size of the last output dimension (one score per class).
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = torch.nn.Conv1d(in_channels, out_channels=32, kernel_size=80, stride=16)
        self.batch1 = torch.nn.BatchNorm1d(32)
        self.pool1 = torch.nn.MaxPool1d(4)
        self.conv2 = torch.nn.Conv1d(32, 64, kernel_size=3)
        self.batch2 = torch.nn.BatchNorm1d(64)
        self.pool2 = torch.nn.MaxPool1d(4)
        self.conv3 = torch.nn.Conv1d(64, 128, kernel_size=3)
        self.batch3 = torch.nn.BatchNorm1d(128)
        self.pool3 = torch.nn.MaxPool1d(4)
        # NOTE(review): there is no non-linearity between the linear layers, so
        # together they compose to a single affine map. Left as-is to keep the
        # architecture and parameter names unchanged.
        self.fc1 = torch.nn.Linear(128, 64)
        self.fc2 = torch.nn.Linear(64, 32)
        self.fc3 = torch.nn.Linear(32, out_channels)

    def forward(self, x):
        """Compute raw class scores (logits) for a batch of inputs.

        Args:
            x: tensor of shape (batch, in_channels, length).

        Returns:
            Tensor of shape (batch, 1, out_channels) with unnormalised logits.
            No softmax is applied here: torch.nn.CrossEntropyLoss performs the
            log-softmax itself, so normalising first would apply it twice and
            flatten the gradients. argmax over the last dimension is unaffected;
            apply softmax on the result when probabilities are needed.
        """
        stages = (
            (self.conv1, self.batch1, self.pool1),
            (self.conv2, self.batch2, self.pool2),
            (self.conv3, self.batch3, self.pool3),
        )
        for conv, norm, pool in stages:
            x = pool(torch.nn.functional.relu(norm(conv(x))))

        # Global average pool: collapse whatever length is left to 1.
        x = torch.nn.functional.avg_pool1d(x, x.shape[-1])
        # (batch, 128, 1) -> (batch, 1, 128) so the linear layers act on the features.
        x = x.permute(0, 2, 1)
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)

        return x

In [None]:
# Single-channel input, three output classes; parameters live on `device`.
model = Simple(in_channels=1, out_channels=3).to(device)

# Plain SGD with the configured learning rate, trained against cross-entropy.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=LR)

In [None]:
from copy import deepcopy
import time

In [None]:
# Train for EPOCHS epochs. After each epoch the model is scored on the
# validation loader and a copy of the best-scoring model is kept in `best_model`.
# `history` collects the mean training loss per epoch, `acc` the validation accuracy.
best_model = deepcopy(model)
best_acc = 0
history = []
acc = []
start = time.time()
for i in range(1, EPOCHS+1):
    model.train()
    train_loss = 0
    train_total = 0

    for audio, label in train_loader:
        # The model lives on `device`; the batch has to be moved there too,
        # otherwise the forward pass fails when running on a GPU.
        audio = audio.to(device)
        label = label.to(device)

        optimizer.zero_grad()

        out = model(audio)

        # Model output is (batch, 1, classes); drop the singleton axis for the loss.
        loss = criterion(out.squeeze(1), label)

        loss.backward()
        optimizer.step()

        # criterion returns the batch mean, so weight it by the batch size and
        # accumulate the sample count; dividing afterwards gives the mean loss
        # per sample over the whole epoch (previously only the size of the last
        # batch was used as the divisor).
        batch_size = out.size(0)
        train_loss += loss.item() * batch_size
        train_total += batch_size

    model.eval()

    val_acc = 0
    val_total = 0

    with torch.no_grad():
        for audio, label in val_loader:
            audio = audio.to(device)
            label = label.to(device)

            out = model(audio)

            val_acc += (out.squeeze(1).argmax(1) == label).sum().item()

            val_total += out.size(0)

    trainl = train_loss/train_total
    vall = val_acc/val_total

    # Keep a snapshot of the model with the best validation accuracy so far.
    if vall > best_acc:
        best_acc = vall
        best_model = deepcopy(model)

    history += [trainl]
    acc += [vall]

    print("Epoch {} train loss: {} val: {}".format(i, trainl, vall))

end = time.time()

print("Time required {}".format(end - start))

<div style='font-size:150%;
            background-color:#F3F3F3;
            font-family:fantasy;
            padding:20px;
            letter-spacing:1px;'>
    <h1 style='text-align:center;'>
        Benchmark results
    </h1>
    <div style='font-family:cursive;'>
        <p style='text-align:center;'>Training loss history and validation accuracy progression 🚀</p>
    </div>
</div>

In [None]:
# Side-by-side curves: training loss on the left, validation accuracy on the right.
epochs = [epoch for epoch in range(1, EPOCHS + 1)]
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
loss_ax, acc_ax = axes
loss_ax.plot(epochs, history)
loss_ax.set_title("Training loss")
acc_ax.plot(epochs, acc)
acc_ax.set_title("Accuracy history")

<div style='font-size:150%;
            font-family:fantasy;
            letter-spacing:1px;
            padding:15px;
            margin:5px;
            background-color:#F3F3F3;'>
    <h1 style='text-align:center'>Prediction pipeline and testing trained model</h1>
</div>

In [None]:
def predict(value):
    """Classify a single audio file with `best_model`.

    Args:
        value: path of the audio file to classify.

    Returns:
        The class name looked up in `index_label` for the highest-scoring class.
    """
    a, sample_rate = torchaudio.load(value)
    if sample_rate == transform.orig_freq:
        transformed = transform(a)
    else:
        # `transform` only fits files recorded at its own source rate; resample
        # other files from their actual rate to the same target rate.
        transformed = torchaudio.functional.resample(a, sample_rate, transform.new_freq)
    padded = pad_sequence([transformed])

    ml = best_model
    ml.eval()
    # Run the input on whatever device the model's parameters are on, so the
    # function also works when the model was trained on a GPU.
    model_device = next(ml.parameters()).device
    with torch.no_grad():  # inference only, no autograd graph needed
        out = ml(padded.to(model_device))

    # .item() yields a plain Python int regardless of the tensor's device.
    lb = out.squeeze(1).argmax(1).item()
    return index_label[lb]

In [None]:
# Classify the recording stored in row 45 of the table.
to_predict = df['images'].iloc[45]
predict(to_predict)

In [None]:
# Ground-truth class name of the same row, for comparison with the prediction.
index_label[df['labels'].iloc[45]]