In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import torch
import torchaudio
import os
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split

from scipy.io.wavfile import read as read_wav
import os

In [None]:
# Load the annotation spreadsheet from the Kaggle input directory.
df = pd.read_excel("/kaggle/input/it-spectrum-dataset/sample_audio.xlsx")

In [None]:
# Preview only the first rows instead of dumping the whole frame into the output.
df.head()

In [None]:
# Inspect the distinct values of the 'text' column; these are used as class labels.
df['text'].unique()

In [None]:
# Build the two lookup tables between label text and integer class id.
# Ids are assigned in order of first appearance in the 'text' column.
values = df['text'].unique()
label_id = dict(zip(values, range(len(values))))  # label text -> class id
id_label = dict(enumerate(values))                # class id  -> label text

In [None]:
# Replace each 'text' value with its integer id from label_id.
# NOTE: this overwrites the column in place, so the cell is not idempotent:
# re-running it maps the already-converted ids through label_id again, and
# values that are not keys of the dict become NaN. Restart and run all cells
# instead of re-running this one.
df['text'] = df['text'].map(label_id)

In [None]:
# Probe one sample file to learn its sample rate; `rate` is reused as the
# source rate for resampling in the next section.
temp = "/kaggle/input/it-spectrum-dataset/sample/eu.0124f456-13b8-4765-936a-36bfd483683e.wav"
rate, data = read_wav(temp)  # scipy returns (sample_rate, samples)
print(rate)

# Transforming audio to tensors

In [None]:
old_rate = rate  # sample rate of the probe file; kept for reference only
new_rate = 8000  # target sample rate (Hz) for every waveform fed to the model

# One Resample transform per source rate, so the resampling kernel is built
# once instead of on every file load.
_resamplers = {}


def audio_process(x):
    """Load an audio file and resample it to ``new_rate``.

    The source rate is taken from the file itself (as reported by
    ``torchaudio.load``) rather than from the global ``old_rate``, so files
    recorded at a different rate than the probe file are still resampled
    correctly.

    Args:
        x: Path to an audio file readable by ``torchaudio.load``.

    Returns:
        The waveform tensor returned by ``torchaudio.load``, resampled to
        ``new_rate``.
    """
    waveform, sample_rate = torchaudio.load(x)
    resampler = _resamplers.get(sample_rate)
    if resampler is None:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=new_rate)
        _resamplers[sample_rate] = resampler
    return resampler(waveform)


# Alias used by the collate function and the inference cell.
transforms = audio_process

In [None]:
def pad_sequence(batch):
    """Zero-pad a list of tensors to a common length and stack them.

    Each item is transposed so its last axis becomes the leading (sequence)
    axis, the items are right-padded with zeros to the longest one, and the
    result is permuted back so the padded axis is last.

    Args:
        batch: list of 2-D tensors whose last axis may differ in length.

    Returns:
        A single tensor of shape (len(batch), dim0, max_length).
    """
    time_major = list(map(lambda item: item.t(), batch))
    padded = torch.nn.utils.rnn.pad_sequence(
        time_major,
        batch_first=True,
        padding_value=0.,
    )
    # (batch, time, channel) -> (batch, channel, time)
    return padded.permute(0, 2, 1)

def collate_batch(batch):
    """Turn a list of (target, audio) rows into model-ready tensors.

    For every row the audio file ``<dir_path>/<audio>.wav`` is loaded through
    ``transforms`` and the target is converted to an int64 tensor. Waveforms
    are zero-padded to a common length with ``pad_sequence``.

    Args:
        batch: iterable of (target, audio) pairs, where ``audio`` is the file
            path relative to the dataset directory without the ``.wav`` suffix.

    Returns:
        Tuple ``(tensors, targets)``: the padded waveform batch and a 1-D
        int64 tensor of targets.
    """
    dir_path = "/kaggle/input/it-spectrum-dataset"
    waveforms, labels = [], []
    for target, audio in batch:
        wav_path = os.path.join(dir_path, audio+".wav")
        waveforms.append(transforms(wav_path))
        labels.append(torch.tensor(target, dtype=torch.int64))

    return pad_sequence(waveforms), torch.stack(labels)

In [None]:
# Keep only the two columns the loaders need. Rows are later unpacked as
# (target, audio) in collate_batch, so column 0 is the label and column 5 is
# the audio file stem.
dataset = df.iloc[:, [0, 5]]

In [None]:
BATCH_SIZE=8  # samples per DataLoader batch
EPOCHS = 10  # number of passes over the training split
LR = 0.001  # learning rate passed to the optimizer

# Dataset pipeline

In [None]:
# Labels and audio stems as separate arrays (kept for inspection; the loaders
# below consume whole (label, audio) rows).
y = dataset.iloc[:, 0].values
x = dataset.iloc[:, 1].values
# Hold out 10% of the rows for validation; fixed seed keeps the split reproducible.
train, test = train_test_split(dataset.values, random_state=42, test_size=0.1)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Compare the device *type*: a torch.device never equals a plain string, so
# `device == "cuda"` was always False and the GPU settings were never applied.
if device.type == "cuda":
    num_workers = 1
    pin_memory = True
else:
    num_workers = 0
    pin_memory = False

# Training batches are reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(
    train,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_batch,
    num_workers=num_workers,
    pin_memory=pin_memory,
)

# Validation batches keep their order and include the final partial batch.
test_loader = torch.utils.data.DataLoader(
    test,
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    collate_fn=collate_batch,
    num_workers=num_workers,
    pin_memory=pin_memory,
)

# Defining model

In [None]:
class SpeechRecognition(torch.nn.Module):
    """1-D convolutional classifier operating on raw waveforms.

    Four Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d(4) stages are followed by
    average pooling over the whole remaining time axis and a linear layer
    that yields per-class log-probabilities.

    Args:
        n_input: number of input channels.
        n_output: number of output classes.
        stride: stride of the first convolution.
        n_channel: channel width of the first two stages (doubled for the
            last two).
    """

    def __init__(self, n_input=1, n_output=11, stride=16, n_channel=32):
        super().__init__()
        nn = torch.nn
        wide = 2 * n_channel

        # Stage 1: wide kernel with a large stride on the raw signal.
        self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride)
        self.bn1 = nn.BatchNorm1d(n_channel)
        self.pool1 = nn.MaxPool1d(4)

        # Stage 2: same channel width, small kernel.
        self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(n_channel)
        self.pool2 = nn.MaxPool1d(4)

        # Stage 3: channel width doubles.
        self.conv3 = nn.Conv1d(n_channel, wide, kernel_size=3)
        self.bn3 = nn.BatchNorm1d(wide)
        self.pool3 = nn.MaxPool1d(4)

        # Stage 4: keeps the doubled width.
        self.conv4 = nn.Conv1d(wide, wide, kernel_size=3)
        self.bn4 = nn.BatchNorm1d(wide)
        self.pool4 = nn.MaxPool1d(4)

        # Classifier head.
        self.fc1 = nn.Linear(wide, n_output)

    def forward(self, x):
        """Map a (batch, n_input, time) waveform batch to log-probabilities.

        Returns:
            Tensor of shape (batch, 1, n_output) holding log-softmax scores.
        """
        functional = torch.nn.functional
        stages = (
            (self.conv1, self.bn1, self.pool1),
            (self.conv2, self.bn2, self.pool2),
            (self.conv3, self.bn3, self.pool3),
            (self.conv4, self.bn4, self.pool4),
        )
        for conv, norm, pool in stages:
            x = pool(functional.relu(norm(conv(x))))

        # Average over whatever time steps remain, leaving a length-1 axis.
        pooled = functional.avg_pool1d(x, x.shape[-1])
        # (batch, channel, 1) -> (batch, 1, channel) so Linear acts on channels.
        scores = self.fc1(pooled.permute(0, 2, 1))
        return functional.log_softmax(scores, dim=2)

In [None]:
# Size the output layer from the label vocabulary built earlier instead of
# relying on the constructor's hard-coded default of 11 classes.
model = SpeechRecognition(n_output=len(label_id))

# Optimizers, loss functions

In [None]:
model = model.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# The model's forward pass already ends in log_softmax, so the matching
# criterion is NLLLoss. CrossEntropyLoss would apply log_softmax a second
# time, which is redundant on values that are already log-probabilities.
criterion = torch.nn.NLLLoss()

In [None]:
from copy import deepcopy

# Low-accuracy model, just a benchmark

In [None]:
# Train for EPOCHS epochs, validating after each one and keeping a copy of the
# model with the best validation accuracy seen so far.
best_model = deepcopy(model)
accs = 0
for i in range(1, EPOCHS+1):
    model.train()

    train_loss = 0.0
    train_count = 0

    for value, label in train_loader:
        # Batches are collated on the CPU; move them to the model's device.
        value = value.to(device)
        label = label.to(device)

        optimizer.zero_grad()
        output = model(value)
        loss = criterion(output.squeeze(1), label)
        loss.backward()
        optimizer.step()

        # The criterion returns the batch mean, so weight it by the batch size
        # and accumulate; dividing by the sample count below then gives the
        # per-sample mean over the whole epoch.
        batch_size = label.size(0)
        train_loss += loss.item() * batch_size
        train_count += batch_size

    print("Epoch {} training loss: {} || ".format(i, train_loss/train_count), end='')

    model.eval()
    val_acc = 0
    val_count = 0
    with torch.no_grad():
        for value, label in test_loader:
            value = value.to(device)
            label = label.to(device)
            output = model(value)
            # squeeze(1) removes only the singleton middle axis; a bare
            # squeeze() would also drop the batch axis for a 1-sample batch.
            predictions = output.squeeze(1).argmax(1)
            val_acc += (predictions == label).sum().item()
            val_count += label.size(0)

    epoch_acc = val_acc/val_count
    if epoch_acc > accs:
        accs = epoch_acc
        best_model = deepcopy(model)

    print("val accuracy: {}".format(epoch_acc))

In [None]:
# Classify the first recording of the dataframe with the best checkpoint.
# NOTE(review): assumes the last dataframe column holds the audio file stem,
# the same value the loaders read from column 5 -- confirm.
case = os.path.join("/kaggle/input/it-spectrum-dataset", df.iloc[0, -1]+".wav")
sample = pad_sequence([transforms(case)]).to(device)  # match the model's device

# Inference mode: use BatchNorm running statistics and skip autograd tracking.
best_model.eval()
with torch.no_grad():
    res = best_model(sample)
id_label[res.squeeze(1).argmax(1).item()]

# Afterword

This is my first time working on a speech recognition task.

A larger dataset with more samples would likely lead to a better model and higher accuracy.