# Training a Small Network
In this implementation, we will use the `UrbanSound8K` dataset to demonstrate how speech recognition can be done and which components are usually required in such pipelines.

## Importing  Requirements

In [None]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from tensorboardX import SummaryWriter
from torch import nn
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm


In [None]:
# Reproducibility: seed every random number generator the notebook relies on and
# pin cuDNN to deterministic kernels so repeated runs give comparable results.
SEED = 1234
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# Auto-tuning may pick a different convolution algorithm on each run; turn it off.
torch.backends.cudnn.benchmark = False
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#  Dataset

The UrbanSound8K dataset contains 8732 labeled sound recordings from 10 classes, namely air_conditioner, car_horn, children_playing, dog_bark, drilling, engine_idling, gun_shot, jackhammer, siren, and street_music. The files are in .wav format.

The UrbanSound8K dataset is available at https://urbansounddataset.weebly.com/urbansound8k.html. Make sure to download and uncompress this dataset into the `../data` folder before running this implementation.

In [None]:
# Read the UrbanSound8K metadata file into a pandas DataFrame.
METADATA_CSV = "../data/UrbanSound8K/metadata/UrbanSound8K.csv"
data = pd.read_csv(METADATA_CSV)

In [None]:
# Preview the first five metadata records.
data.head(n=5)

In [None]:
# Statistics: number of data points listed for each fold
data.loc[:, "fold"].value_counts()

# Generating features
Various features can be extracted from such sound files, such as:
1. Melspectrogram  : Compute a mel-scaled spectrogram.
2. MFCC (Mel-frequency cepstral coefficients)
3. chroma_stft :  Compute a chromagram from a waveform or power spectrogram.
4. chroma_cq :  Constant-Q chromagram
5. chroma_cens : Computes the chroma variant “Chroma Energy Normalized” (CENS)



In [None]:
# Feature set for one example clip (this file is a dog bark).
# The audio signal and sample rate are passed by keyword everywhere: newer
# librosa releases no longer accept them as positional arguments.
y, sr = librosa.load("../data/UrbanSound8K/audio/fold5/100032-3-0-0.wav")
mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
melspectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=40, fmax=8000)
chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr, n_chroma=40)
chroma_cq = librosa.feature.chroma_cqt(y=y, sr=sr, n_chroma=40)
chroma_cens = librosa.feature.chroma_cens(y=y, sr=sr, n_chroma=40)
# Show the shape of every feature matrix side by side.
melspectrogram.shape, chroma_stft.shape, chroma_cq.shape, chroma_cens.shape, mfccs.shape

Once all these features are generated, you can visualize each individual feature as shown below.

### MFCC 


In [None]:
# MFCC of the dog bark clip.
# matplotlib is already imported in the notebook's import cell, so it is not re-imported here.
plt.figure(figsize=(10, 4))
mfcc_img = librosa.display.specshow(mfccs, x_axis='time')
# Hand the drawn image to the colorbar explicitly instead of relying on the "current image".
plt.colorbar(mfcc_img)
plt.title('MFCC')
plt.tight_layout()
plt.show()

### Melspectrogram

In [None]:
# Mel spectrogram of the example clip, with power converted to dB relative to its peak.
plt.figure(figsize=(10, 4))
mel_db = librosa.power_to_db(melspectrogram, ref=np.max)
mel_img = librosa.display.specshow(mel_db, x_axis='time', y_axis='mel', fmax=8000)
plt.colorbar(mel_img, format='%+2.0f dB')
plt.title('Mel spectrogram')
plt.tight_layout()
plt.show()

### Chromagram

In [None]:
# Chromagram (STFT based) of the example clip.
plt.figure(figsize=(10, 4))
chroma_stft_img = librosa.display.specshow(chroma_stft, x_axis='time', y_axis='chroma')
plt.colorbar(chroma_stft_img)
plt.title('Chromagram')
plt.tight_layout()
plt.show()

### Chroma cqt

In [None]:
# Constant-Q chromagram of the example clip.
plt.figure(figsize=(10, 4))
chroma_cq_img = librosa.display.specshow(chroma_cq, x_axis='time', y_axis='chroma')
plt.colorbar(chroma_cq_img)
plt.title('chroma_cqt')
plt.tight_layout()
plt.show()

### Chroma cens

In [None]:
# CENS chromagram of the example clip.
plt.figure(figsize=(10, 4))
chroma_cens_img = librosa.display.specshow(chroma_cens, x_axis='time', y_axis='chroma')
plt.colorbar(chroma_cens_img)
plt.title('chroma_cens')
plt.tight_layout()
plt.show()

## Stacking all the features together

In [None]:
# Feature set for one clip, averaged over time so every feature becomes a 40-value vector.
# Audio arguments are passed by keyword: newer librosa releases reject positional y/sr.
y, sr = librosa.load("../data/UrbanSound8K/audio/fold5/100263-2-0-137.wav")
mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
melspectrogram = np.mean(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=40, fmax=8000).T, axis=0)
chroma_stft = np.mean(librosa.feature.chroma_stft(y=y, sr=sr, n_chroma=40).T, axis=0)
chroma_cq = np.mean(librosa.feature.chroma_cqt(y=y, sr=sr, n_chroma=40).T, axis=0)
chroma_cens = np.mean(librosa.feature.chroma_cens(y=y, sr=sr, n_chroma=40).T, axis=0)
melspectrogram.shape, chroma_stft.shape, chroma_cq.shape, chroma_cens.shape, mfccs.shape

# Stacking: vstack gives (5, 40) with one row per feature type; transposing yields
# (40, 5), so row i holds the i-th coefficient of each of the five features.
# (np.reshape to (40, 5) would NOT transpose: it would cut every feature vector
# into consecutive chunks of five values and mix them across rows.)
features = np.vstack((mfccs, melspectrogram, chroma_stft, chroma_cq, chroma_cens)).T
features.shape

# Writing features to disk

This allows the features to be generated once and then reused across any number of experiments.

In [None]:
# Preprocessing using only MFCC: one time-averaged 40-coefficient vector per clip.
# Clips listed under fold 10 form the test split; every other fold goes to training.
# NOTE(review): the full-feature-set cell that follows re-initialises these same four
# lists, so its results replace the ones computed here.
x_train=[]
x_test=[]
y_train=[]
y_test=[]
path="../data/UrbanSound8K/audio/fold"
# Walk the three needed columns together instead of doing three .iloc lookups per row.
records = zip(data["fold"], data["slice_file_name"], data["classID"])
for fold, file, label in tqdm(records, total=len(data)):
    fold_no = str(fold)
    filename = path + fold_no + "/" + file
    y, sr = librosa.load(filename)
    # Keyword arguments: newer librosa releases reject positional y/sr.
    mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
    if fold_no != '10':
        x_train.append(mfccs)
        y_train.append(label)
    else:
        x_test.append(mfccs)
        y_test.append(label)

In [None]:
# Preprocessing using the entire feature set: a (40, 5) matrix per clip whose
# columns are MFCC, mel spectrogram, chroma_stft, chroma_cqt and chroma_cens,
# each averaged over time.
# Clips listed under fold 10 form the test split; every other fold goes to training.
x_train=[]
x_test=[]
y_train=[]
y_test=[]
path="../data/UrbanSound8K/audio/fold"
# Walk the three needed columns together instead of doing three .iloc lookups per row.
records = zip(data["fold"], data["slice_file_name"], data["classID"])
for fold, file, label in tqdm(records, total=len(data)):
    fold_no = str(fold)
    filename = path + fold_no + "/" + file
    y, sr = librosa.load(filename)
    # Keyword arguments: newer librosa releases reject positional y/sr.
    mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
    melspectrogram = np.mean(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=40, fmax=8000).T, axis=0)
    chroma_stft = np.mean(librosa.feature.chroma_stft(y=y, sr=sr, n_chroma=40).T, axis=0)
    chroma_cq = np.mean(librosa.feature.chroma_cqt(y=y, sr=sr, n_chroma=40).T, axis=0)
    chroma_cens = np.mean(librosa.feature.chroma_cens(y=y, sr=sr, n_chroma=40).T, axis=0)
    # Transpose the (5, 40) stack so row i holds coefficient i of every feature.
    # np.reshape to (40, 5) would not transpose; it would scramble values across features.
    features = np.vstack((mfccs, melspectrogram, chroma_stft, chroma_cq, chroma_cens)).T
    if fold_no != '10':
        x_train.append(features)
        y_train.append(label)
    else:
        x_test.append(features)
        y_test.append(label)

Converting the features into numpy arrays; these features will then be used in the final model.

In [None]:
# Turn the accumulated Python lists into numpy arrays and report their shapes.
x_train, x_test = np.array(x_train), np.array(x_test)
y_train, y_test = np.array(y_train), np.array(y_test)
tuple(split.shape for split in (x_train, x_test, y_train, y_test))

In [None]:
# Flatten each sample's 2-D feature matrix into a single row so it fits the CSV format.
n_train, train_rows, train_cols = x_train.shape[:3]
n_test, test_rows, test_cols = x_test.shape[:3]
x_train_2d = x_train.reshape((n_train, train_rows * train_cols))
x_test_2d = x_test.reshape((n_test, test_rows * test_cols))
x_train_2d.shape, x_test_2d.shape

In [None]:
# Write the flattened features and the labels to comma-separated text files, one per array.
for csv_name, array in (
    ("train_data.csv", x_train_2d),
    ("test_data.csv", x_test_2d),
    ("train_labels.csv", y_train),
    ("test_labels.csv", y_test),
):
    np.savetxt(csv_name, array, delimiter=",")

## Loading features for further experimentation
From now on, instead of recalculating all the features for each experiment, you can load these files directly.

In [None]:
# Read the saved CSV files back into numpy arrays (features and labels for both splits).
from numpy import genfromtxt
x_train, y_train, x_test, y_test = (
    genfromtxt(csv_name, delimiter=',')
    for csv_name in ('train_data.csv', 'train_labels.csv', 'test_data.csv', 'test_labels.csv')
)

In [None]:
# Checking the shapes of the train and test arrays
shape_report = (
    "Train shape : ", x_train.shape,
    " Test shape : ", x_test.shape,
    " Train label shape : ", y_train.shape,
    " Test label shape : ", y_test.shape,
)
print(*shape_report)

# Constructing Dataloaders

The data loaders will perform the following steps before handing data to the model:

1. One hot conversion of the label
2. Reshaping the features for input into Conv2D

In [None]:
class MyDataset(Dataset):
    """Map-style dataset that pairs feature rows with one-hot encoded labels.

    Args:
        x_train: indexable collection of feature rows; every row must be
            reshapeable to ``feature_shape``.
        y_train: indexable collection of class ids (anything ``int()`` accepts),
            with the same length as ``x_train``.
        class_num: number of classes, i.e. the length of each one-hot vector.
        feature_shape: shape each feature row is reshaped to in ``__getitem__``.
            Defaults to ``(40, 5, 1)``, the layout used by this notebook's network.

    Raises:
        ValueError: if ``x_train`` and ``y_train`` have different lengths.
    """

    def __init__(self, x_train, y_train, class_num = 10, feature_shape = (40, 5, 1)):
        if len(x_train) != len(y_train):
            raise ValueError(
                "x_train and y_train must have the same length, got %d and %d"
                % (len(x_train), len(y_train))
            )
        self.x_train = x_train
        self.y_train = y_train
        self.class_num = class_num
        self.feature_shape = tuple(feature_shape)

    def __len__(self):
        """Return the number of samples."""
        return len(self.x_train)

    def _to_categorical(self, y):
        """Return a one-hot integer vector of length ``class_num`` for class id ``y``.

        Raises:
            IndexError: if ``y`` is outside ``[0, class_num)``. Negative ids are
                rejected explicitly; plain indexing would silently wrap them onto
                a class at the end of the vector.
        """
        class_id = int(y)
        if not 0 <= class_id < self.class_num:
            raise IndexError(
                "label %r is outside the valid range [0, %d)" % (y, self.class_num)
            )
        one_hot = np.zeros(self.class_num, dtype=int)
        one_hot[class_id] = 1
        return one_hot

    def __getitem__(self, index):
        """Return ``(features reshaped to feature_shape, one-hot label)`` for ``index``."""
        selected_x = self.x_train[index]
        selected_y = self._to_categorical(self.y_train[index])
        return selected_x.reshape(self.feature_shape), selected_y

# Model
The model is a very simple convolutional network with various layers such as Convolution 2D, Batch Normalization, Max pooling, and Linear/Dense layers, along with the ReLU activation function.

The model accepts input of shape $[m, 40, 5, 1]$ as produced by the data loader, where $m$ is the batch size. Two convolutional transformations, each followed by batch normalization and a ReLU activation, are applied to it. Finally, the dense layers reduce the result to shape $[m, 10]$, where 10 is the number of classes and $m$ is the batch size.

This model is too simple to produce exceptional accuracy, but it will provide an idea of how voice recognition pipelines are designed.

In [None]:
class simple_network(nn.Module):
    """Small convolutional classifier: two conv/batch-norm/ReLU stages and two dense layers.

    The network expects input of shape ``[m, 40, 5, 1]`` (the size-40 axis is used
    as the convolution channel axis) and returns a ``[m, 10]`` tensor of raw,
    unnormalised class scores (logits).
    """

    def __init__(self):
        super(simple_network, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=40, out_channels=64, kernel_size=3, padding=1,stride=1)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU()
        self.drop= nn.Dropout(0.2)
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1, stride=1)
        self.bn2 = nn.BatchNorm2d(128)
        self.maxpool = nn.MaxPool2d(kernel_size=2, padding=1)
        # After pooling, a [m, 40, 5, 1] input has 128 channels x 3 x 1 positions.
        self.dense1 = nn.Linear(in_features=128*3, out_features=128*2)
        self.dense2 = nn.Linear(in_features=128*2, out_features=10)

    def forward(self, input_):
        """Run the network on a batch.

        Args:
            input_: float tensor of shape ``[m, 40, 5, 1]``.

        Returns:
            Tensor of shape ``[m, 10]`` holding one logit per class. No ReLU,
            dropout or softmax is applied to the output layer: the notebook trains
            with ``BCEWithLogitsLoss``, which applies the sigmoid itself, and the
            arg-max used for accuracy is the same on logits as on probabilities.
        """
        # Stage 1: conv -> batch norm -> ReLU, then pool: [m, 64, 5, 1] -> [m, 64, 3, 1].
        stage1 = self.relu(self.bn1(self.conv1(input_)))
        pooled = self.maxpool(stage1)

        # Stage 2: conv -> batch norm -> ReLU -> dropout: [m, 128, 3, 1].
        stage2 = self.drop(self.relu(self.bn2(self.conv2(pooled))))

        # Flatten everything except the batch axis for the dense layers.
        flat = stage2.view(stage2.shape[0], -1)
        hidden = self.drop(self.relu(self.dense1(flat)))

        return self.dense2(hidden)
        

In [None]:
model = simple_network()
model = model.to(device)

# Training
## Supporting Functions

In [None]:
def binary_accuracy(preds, y):
    """
    Fraction of rows in the batch whose highest-scoring class in `preds`
    matches the highest-scoring class in `y` (e.g. 8 of 10 correct gives 0.8).
    """
    predicted_class = preds.argmax(dim=1)
    target_class = y.argmax(dim=1)
    hits = (predicted_class == target_class).float()  # float so the ratio is not an integer division
    return hits.sum() / len(hits)

In [None]:
def test(model, iterator, criterion):
    """Evaluate `model` on every batch of `iterator` without training it.

    The model is switched to evaluation mode (dropout off, batch-norm using its
    running statistics) and gradients are disabled for the duration of the call;
    the mode the model was in beforehand is restored on exit.

    Args:
        model: network to evaluate; called on float batches moved to `device`.
        iterator: iterable of `(x, y)` batches that also supports `len()`.
        criterion: loss called as `criterion(predictions, targets)` on CPU float tensors.

    Returns:
        Tuple `(mean batch loss, mean batch accuracy)`, each averaged over the batches.
    """
    epoch_loss = 0
    epoch_acc = 0
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x, y in iterator:
                x = x.type(torch.FloatTensor)
                # Bring predictions and targets to CPU float tensors once and reuse them.
                predictions = model(x.to(device)).type(torch.FloatTensor)
                targets = y.type(torch.FloatTensor)
                loss = criterion(predictions, targets)
                acc = binary_accuracy(predictions, targets)
                epoch_loss += loss.item()
                epoch_acc += acc.item()
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Train `model` for one pass over `iterator`.

    Args:
        model: network to train; called on float batches moved to `device`.
        iterator: iterable of `(x, y)` batches that also supports `len()`.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss called as `criterion(predictions, targets)` on CPU float tensors.

    Returns:
        Tuple `(mean batch loss, mean batch accuracy)`, each averaged over the batches.
    """
    epoch_loss = 0
    epoch_acc = 0
    # Make sure dropout and batch norm are in training mode, whatever ran before.
    model.train()
    for x, y in iterator:
        optimizer.zero_grad()
        x = x.type(torch.FloatTensor)
        # Bring predictions and targets to CPU float tensors once and reuse them.
        predictions = model(x.to(device)).type(torch.FloatTensor)
        targets = y.type(torch.FloatTensor)
        loss = criterion(predictions, targets)
        acc = binary_accuracy(predictions, targets)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

## Constructing data loaders

In [None]:
# Wrap the loaded arrays in datasets and batch them.
BATCH_SIZE = 32
training_set = MyDataset(x_train, y_train)
# Training batches are reshuffled every epoch.
training_generator = DataLoader(training_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=1)
test_set = MyDataset(x_test, y_test)
# Evaluation keeps a fixed order: metrics are averaged per batch, so shuffling would
# make them depend on which samples happen to land in the last, smaller batch.
test_generator = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=1)

In [None]:
# Loss: binary cross-entropy computed on logits, applied against the one-hot targets.
criteria = nn.BCEWithLogitsLoss()
# Optimizer: Adam over every model parameter, starting at a learning rate of 0.001.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

## Training process

In [None]:
# Train for 100 epochs, logging train/test loss and accuracy for TensorBoard.
writer = SummaryWriter()
try:
    for epoch in tqdm(range(0,100)):
        # Halve the learning rate every 20 epochs (epoch 0 keeps the initial rate).
        if epoch != 0 and epoch % 20 == 0:
            for param_group in optimizer.param_groups:
                param_group['lr'] = param_group['lr']/2

        train_loss, train_acc = train(model, training_generator, optimizer, criteria)
        test_loss, test_acc = test(model, test_generator, criteria)
        writer.add_scalar('Test/Loss', test_loss, epoch)
        writer.add_scalar('Test/Accuracy', test_acc,epoch)
        writer.add_scalar('Train/Loss', train_loss,epoch)
        writer.add_scalar('Train/Accuracy', train_acc,epoch)
finally:
    # Close the writer even if an epoch fails, so already-logged scalars reach disk.
    writer.close()
