# WaveNet for raw audio classification
Model implementation adapted from this [Kaggle notebook](https://www.kaggle.com/hanjoonchoe/wavenet-lstm-pytorch-ignite-ver).

## Import libraries

In [1]:
import pandas as pd
import numpy as np
import os
import librosa
import librosa.display
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, recall_score

## Prepare data set

In [2]:
# Directory that holds the audio files referenced by the label table.
folder_path = "C:/native_language/wav/"
# Tab-separated label table (one row per recording).
label_file = 'C:/native_language/lab/ComParE2016_Nativeness.tsv'

In [3]:
# Load the label table once, then carve out the two partitions by the
# substring that appears in each row's file name.
data_file = pd.read_csv(label_file, delimiter="\t")

# Boolean row masks: True where the file name contains the partition tag.
train_files = data_file['file_name'].str.contains('train')
dev_files = data_file['file_name'].str.contains('devel')

train_df = data_file.loc[train_files]
dev_df = data_file.loc[dev_files]

In [4]:
def get_raw_audio(file_path, start=80000, length=80000):
    """Load an audio file and return a fixed-length window of raw samples.

    The file is decoded at its native sample rate (``sr=None``), so the
    window is expressed in samples rather than seconds. The defaults keep
    samples 80000..159999, which is seconds 5-10 only when the recording
    is sampled at 16 kHz (assumption -- TODO confirm against the corpus).

    Args:
        file_path: Path of the audio file to read.
        start: Index of the first sample to keep.
        length: Number of samples in the returned window.

    Returns:
        Array holding exactly ``length`` samples. When the recording ends
        before ``start + length`` the missing tail is filled with zeros,
        so every clip has the same length and can be stacked into a batch.
    """
    wav, _ = librosa.load(file_path, sr=None)

    clip = wav[start:start + length]
    missing = length - len(clip)
    if missing > 0:
        # Short recording: pad on the right so downstream shapes stay fixed.
        clip = np.pad(clip, (0, missing))
    return clip

In [5]:
class Data(Dataset):
    """In-memory dataset of raw-audio clips paired with integer class labels.

    Every file listed in ``df`` is decoded with ``get_raw_audio`` when the
    dataset is constructed, so construction cost grows with the number of
    rows and all clips are held in memory afterwards.

    Args:
        df: Table with one row per recording.
        base: Directory that is joined with each value of ``in_col``.
        in_col: Column holding the file name of each recording.
        out_col: Column holding the class label of each recording.
        categories: Optional ordered collection of class labels. When given,
            it defines the label -> index mapping, which lets several
            datasets (e.g. train and dev) share identical indices. When
            omitted, the sorted unique values of ``df[out_col]`` are used.

    Raises:
        KeyError: If a row's label is not present in ``categories``.
    """
    def __init__(self, df, base='C:/native_language/wav/', in_col='file_name', out_col='L1', categories=None):
        self.df = df
        if categories is None:
            categories = sorted(df[out_col].unique())
        self.categories = list(categories)

        # Forward and reverse lookup between class label and class index.
        self.c2i = {category: i for i, category in enumerate(self.categories)}
        self.i2c = {i: category for i, category in enumerate(self.categories)}

        self.data = []
        self.labels = []
        for file_name, category in zip(df[in_col], df[out_col]):
            file_path = os.path.join(base, file_name)
            # Prepend a channel axis so each item is (1, n_samples).
            self.data.append(get_raw_audio(file_path)[np.newaxis, ...])
            self.labels.append(self.c2i[category])

    def __len__(self):
        """Return the number of clips held by the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(clip, label_index)`` pair stored at position ``idx``."""
        return self.data[idx], self.labels[idx]

In [6]:
# Decode every training clip into memory up front. The audio directory comes
# from the `folder_path` setting instead of a repeated path literal.
train_data = Data(train_df, base=folder_path,
                  in_col='file_name', out_col='L1')

In [7]:
# Decode every development clip into memory, reading from `folder_path`.
# NOTE(review): Data derives its label -> index mapping from the frame it is
# given, so these indices match train_data's only if both partitions contain
# the same set of L1 values -- confirm before comparing predictions.
dev_data = Data(dev_df, base=folder_path,
                in_col='file_name', out_col='L1')

In [8]:
BATCH_SIZE = 8 #16
# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
# Validation only measures loss/accuracy, so a fixed order is enough and keeps
# the per-batch validation losses comparable from one epoch to the next.
valid_loader = DataLoader(dev_data, batch_size=BATCH_SIZE, shuffle=False)

In [9]:
# Use the first CUDA GPU when one is visible, otherwise stay on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

## Build Wavenet

In [10]:
class Attention(nn.Module):
    """Weighted pooling over the step axis of a ``(batch, step, feature)`` tensor.

    Each step gets a scalar score (a learned projection of its feature vector
    plus an optional per-step bias), the scores are squashed with ``tanh``,
    exponentiated and normalised across steps, and the input is summed over
    steps using those weights.

    Args:
        feature_dim: Size of the last axis of the input.
        step_dim: Size of the step axis of the input.
        bias: Whether to learn one additive bias per step.
        **kwargs: Forwarded to ``nn.Module.__init__``.
    """
    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super().__init__(**kwargs)

        self.supports_masking = True
        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0

        # Projection vector that maps one feature vector to one score.
        projection = nn.init.kaiming_uniform_(torch.zeros(feature_dim, 1))
        self.weight = nn.Parameter(projection)

        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))

    def forward(self, x, mask=None):
        """Pool ``x`` over its step axis.

        Args:
            x: Tensor viewable as ``(-1, feature_dim)`` with ``step_dim`` steps
                per sample.
            mask: Optional multiplicative mask applied to the unnormalised
                step weights.

        Returns:
            Tensor with the step axis summed out.
        """
        flat = x.contiguous().view(-1, self.feature_dim)
        scores = torch.mm(flat, self.weight).view(-1, self.step_dim)

        if self.bias:
            scores = scores + self.b

        weights = torch.exp(torch.tanh(scores))
        if mask is not None:
            weights = weights * mask

        # Small epsilon keeps the division defined when every weight is masked.
        weights = weights / (torch.sum(weights, 1, keepdim=True) + 1e-10)

        return torch.sum(x * torch.unsqueeze(weights, -1), 1)

In [11]:
class Wave_Block(nn.Module):
    """Stack of gated, dilated 1-D convolutions with a shared skip input.

    A 1x1 convolution first maps ``in_channels`` to ``out_channels``. Then,
    for dilations 1, 2, 4, ..., ``2**(dilation_rates - 1)``, a tanh "filter"
    convolution is multiplied by a sigmoid "gate" convolution, passed through
    a 1x1 convolution, and the output of the initial 1x1 convolution is added.

    Args:
        in_channels: Channels of the input tensor.
        out_channels: Channels used by every layer after the first.
        dilation_rates: Number of gated layers (an int, not a list).
    """

    def __init__(self,in_channels,out_channels,dilation_rates):
        super().__init__()
        self.num_rates = dilation_rates
        self.convs = nn.ModuleList()
        self.filter_convs = nn.ModuleList()
        self.gate_convs = nn.ModuleList()

        self.convs.append(nn.Conv1d(in_channels, out_channels, kernel_size=1))
        for exponent in range(dilation_rates):
            rate = 2 ** exponent
            # padding == dilation with kernel_size 3 keeps the length unchanged.
            dilated = dict(kernel_size=3, padding=rate, dilation=rate)
            self.filter_convs.append(nn.Conv1d(out_channels, out_channels, **dilated))
            self.gate_convs.append(nn.Conv1d(out_channels, out_channels, **dilated))
            self.convs.append(nn.Conv1d(out_channels, out_channels, kernel_size=1))

    def forward(self,x):
        """Apply the block to ``x`` and return a tensor with ``out_channels`` channels."""
        x = self.convs[0](x)
        res = x
        for layer in range(self.num_rates):
            filtered = torch.tanh(self.filter_convs[layer](x))
            gate = torch.sigmoid(self.gate_convs[layer](x))
            x = self.convs[layer + 1](filtered * gate)
            x += res
        return x

In [12]:
class Wave_LSTM(nn.Module):
    """Raw-audio classifier: three wave blocks, a bidirectional GRU and attention.

    The input is expected as ``(batch, 1, n_samples)``. Each wave block is
    followed by an average pool of width 10, so the time axis shrinks by a
    factor of 1000 overall. The recurrent layer (a GRU, despite the ``LSTM``
    attribute name) runs with ``batch_first=True`` over the 64 channels of the
    last wave block and uses the pooled time axis as its feature axis.

    Args:
        n_samples: Number of samples in each input clip.
        n_classes: Number of output logits.
        gru_layers: Number of stacked GRU layers.
        dropout: Drop probability applied to the attention output.

    NOTE(review): the default of 64 stacked bidirectional GRU layers is kept
    for compatibility, but it is unusually deep and the recorded run's loss
    stays at about ln(11) -- try a small value such as 2.
    """
    def __init__(self, n_samples=80000, n_classes=11, gru_layers=64, dropout=0.2):
        super().__init__()
        in_channels = n_samples #150000 result of number of frames
        self.dropout_p = dropout

        self.wave_block1 = Wave_Block(1,16,8)
        self.wave_block2 = Wave_Block(16,32,5)
        self.wave_block3 = Wave_Block(32,64,3)
        self.avgpool1d = nn.AvgPool1d(10)

        # Length of the time axis after three pools of width 10.
        pooled_len = in_channels//10**3
        # A bidirectional GRU concatenates both directions on the feature axis.
        gru_out = 2 * pooled_len

        self.LSTM = nn.GRU(input_size=pooled_len, hidden_size=pooled_len,
                           num_layers=gru_layers, batch_first=True, bidirectional=True)
        # step_dim is 64 because wave_block3 emits 64 channels.
        self.attention = Attention(gru_out,64)

        # Linear layers keep their historical `conv*` names so saved
        # state dicts continue to load.
        self.conv1 = nn.Linear(gru_out,128)
        self.conv2 = nn.Linear(128,n_classes)

    def forward(self,x):
        """Return class logits of shape ``(batch, n_classes)`` for a batch of clips."""
        x = self.wave_block1(x)
        #shrinking
        x = self.avgpool1d(x)
        x = self.wave_block2(x)
        #shrinking
        x = self.avgpool1d(x)
        x = self.wave_block3(x)
        #shrinking
        x = self.avgpool1d(x)
        x,_ = self.LSTM(x)
        x = self.attention(x)
        # F.dropout defaults to training=True; tie it to the module's mode so
        # model.eval() really disables dropout.
        x = F.dropout(x, self.dropout_p, training=self.training)
        x = self.conv1(x)
        x = self.conv2(x)
        return x

In [13]:
# Leftover debugging notes: tensor sizes printed for a batch of 2.
# NOTE(review): these numbers (150 / 300) appear to come from the earlier
# 150000-sample configuration; with in_channels = 80000 the corresponding
# sizes would be 80 / 160 -- confirm or delete this cell.
"""
Before LSTM torch.Size([2, 64, 150])
After LSTM torch.Size([2, 64, 300])
feature_dim 300 step_dim 64
After attention torch.Size([2, 300])
"""

'\nBefore LSTM torch.Size([2, 64, 150])\nAfter LSTM torch.Size([2, 64, 300])\nfeature_dim 300 step_dim 64\nAfter attention torch.Size([2, 300])\n'

In [14]:
# Build the network with its default sizes, then place it on the chosen device.
model = Wave_LSTM()
model = model.to(device)

In [15]:
# Optimisation settings; `learning_rate` is also read by the lr schedule.
epochs = 30
learning_rate = 2e-5

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

# One list of per-batch losses is appended to each of these per epoch.
train_losses, valid_losses = [], []

In [16]:
def train(model, loss_fn, train_loader, valid_loader, epochs, optimizer, train_losses, valid_losses, change_lr=None):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Args:
        model: Network to optimise; batches are moved to the device of its
            first parameter (CPU if it has no parameters).
        loss_fn: Callable ``loss_fn(y_hat, y)`` returning a scalar loss tensor.
        train_loader: Iterable of ``(x, y)`` training batches.
        valid_loader: Iterable of ``(x, y)`` validation batches.
        epochs: Number of epochs to run; epochs are numbered from 1.
        optimizer: Optimizer that updates ``model``'s parameters.
        train_losses: List that receives one list of batch losses per epoch.
        valid_losses: List that receives one list of batch losses per epoch.
        change_lr: Optional ``change_lr(optimizer, epoch)`` hook, called at
            the start of every epoch; its return value replaces ``optimizer``.

    Returns:
        None. Results are reported through ``train_losses``/``valid_losses``
        and printed once per phase.
    """
    # Follow the model's own placement instead of depending on a
    # notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device('cpu')

    for epoch in range(1,epochs+1):
        model.train()
        batch_losses=[]
        if change_lr:
            optimizer = change_lr(optimizer, epoch)
        for x, y in train_loader:
            optimizer.zero_grad()
            x = x.to(target, dtype=torch.float32)
            y = y.to(target, dtype=torch.long)
            y_hat = model(x)

            loss = loss_fn(y_hat, y)
            loss.backward()
            batch_losses.append(loss.item())
            optimizer.step()
        train_losses.append(batch_losses)
        print(f'Epoch - {epoch} Train-Loss : {np.mean(train_losses[-1]):.3f}')

        model.eval()
        batch_losses=[]
        trace_y = []
        trace_yhat = []
        # No gradients are needed for evaluation; skipping graph construction
        # saves memory and time.
        with torch.no_grad():
            for x, y in valid_loader:
                x = x.to(target, dtype=torch.float32)
                y = y.to(target, dtype=torch.long)
                y_hat = model(x)
                loss = loss_fn(y_hat, y)
                trace_y.append(y.cpu().numpy())
                trace_yhat.append(y_hat.cpu().numpy())
                batch_losses.append(loss.item())
        valid_losses.append(batch_losses)
        trace_y = np.concatenate(trace_y)
        trace_yhat = np.concatenate(trace_yhat)
        # Predicted class is the arg-max logit of each row.
        accuracy = np.mean(trace_yhat.argmax(axis=1)==trace_y)
        print(f'Epoch - {epoch} Valid-Loss : {np.mean(valid_losses[-1]):.3f} Valid-Accuracy : {accuracy:.3f}')

In [None]:
def setlr(optimizer, lr):
    """Overwrite the learning rate of every parameter group in ``optimizer``.

    Args:
        optimizer: Object exposing ``param_groups``, a sequence of dicts.
        lr: Learning rate to store under each group's ``'lr'`` key.

    Returns:
        The same ``optimizer`` object, modified in place.
    """
    for group in optimizer.param_groups:
        group.update(lr=lr)
    return optimizer

In [17]:
def lr_decay(optimizer, epoch):
    """Learning-rate hook: on every 5th epoch set lr to ``learning_rate / 10**(epoch // 20)``.

    Reads the notebook-level ``learning_rate`` and applies the new value with
    ``setlr``. On other epochs the optimizer is returned untouched.

    NOTE(review): for epochs 5, 10 and 15 the divisor is 1, so the message is
    printed although the value equals the base rate -- confirm the schedule.
    """
    if epoch % 5 != 0:
        return optimizer

    new_lr = learning_rate / (10**(epoch//20))
    optimizer = setlr(optimizer, new_lr)
    print(f'Changed learning rate to {new_lr}')
    return optimizer

In [18]:
# Repeat of the debugging notes cell shown earlier in the notebook.
# NOTE(review): sizes (150 / 300) look like they belong to the earlier
# 150000-sample configuration, not the current 80000-sample one -- confirm or
# delete this cell.
"""
Before LSTM torch.Size([2, 64, 150])
After LSTM torch.Size([2, 64, 300])
feature_dim 300 step_dim 64
After attention torch.Size([2, 300])
"""

'\nBefore LSTM torch.Size([2, 64, 150])\nAfter LSTM torch.Size([2, 64, 300])\nfeature_dim 300 step_dim 64\nAfter attention torch.Size([2, 300])\n'

In [19]:
# lr_decay calls setlr, so the cell defining setlr must have been executed
# before this one; otherwise the run stops with a NameError at epoch 5.
train(model, loss_fn, train_loader, valid_loader,
      epochs=epochs, optimizer=optimizer,
      train_losses=train_losses, valid_losses=valid_losses,
      change_lr=lr_decay)

Epoch - 1 Train-Loss : 2.401
Epoch - 1 Valid-Loss : 2.400 Valid-Accuracy : 0.089
Epoch - 2 Train-Loss : 2.399
Epoch - 2 Valid-Loss : 2.398 Valid-Accuracy : 0.101
Epoch - 3 Train-Loss : 2.400
Epoch - 3 Valid-Loss : 2.399 Valid-Accuracy : 0.088
Epoch - 4 Train-Loss : 2.399
Epoch - 4 Valid-Loss : 2.399 Valid-Accuracy : 0.083


NameError: name 'setlr' is not defined

In [None]:
# Remove the notebook-level name `model`; any later cell that needs the
# network has to build it again first.
del model