# Imports

In [None]:
%pip install wandb

Note: you may need to restart the kernel to use updated packages.


In [None]:
import wandb

wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33martgoldman[0m (use `wandb login --relogin` to force relogin)


True

In [None]:
%pip install torchaudio==0.9.1

Note: you may need to restart the kernel to use updated packages.


In [None]:
from typing import Tuple, Union, List, Callable, Optional
from tqdm import tqdm
from itertools import islice
import pathlib
import dataclasses

import numpy as np
import pandas as pd

import torch
import torch.nn.functional as F
from torch import nn
from torch import distributions
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from torch.nn.utils.rnn import pad_sequence

import torchaudio
from IPython import display as display_

# Task
In this notebook we will implement a model for finding a keyword in a stream.

We will implement the CRNN variant (from https://www.dropbox.com/s/22ah2ba7dug6pzw/KWS_Attention.pdf) because it is easy to implement and improves the model's quality.

In [None]:
@dataclasses.dataclass
class TaskConfig:
    """Hyper-parameters shared by the data pipeline, the CRNN model and the SGD run."""
    # Words treated as keyword classes; every other word is labelled 0 by SpeechCommandDataset.
    keyword: Tuple[str, str, str] = ('sheila', 'happy', 'eight')
    # Batch size of both the training and the validation DataLoader.
    batch_size: int = 128
    # Optimiser settings passed to torch.optim.SGD.
    learning_rate: float = 3e-3
    momentum: float = 0.9
    weight_decay: float = 1e-5
    # Number of passes of the training loop.
    num_epochs: int = 30
    # Number of mel bins produced by LogMelspec; CRNN derives its conv output height from it.
    n_mels: int = 40
    # Conv2d settings; index 0 acts on the mel/frequency axis (see CRNN.conv_out_frequency),
    # index 1 presumably on the time axis.
    cnn_out_channels: int = 8
    kernel_size: Tuple[int, int] = (5, 20)
    stride: Tuple[int, int] = (2, 8)
    # GRU settings.
    hidden_size: int = 64
    gru_num_layers: int = 2
    bidirectional: bool = False
    # Output size of the classifier; presumably len(keyword) + 1 for the "other" label 0.
    num_classes: int = 4
    # Sample rate handed to MelSpectrogram.
    sample_rate: int = 16000
    # Evaluated once, when this class body is executed: first GPU if available, else CPU.
    device: torch.device = torch.device(
        'cuda:0' if torch.cuda.is_available() else 'cpu')

# Data

In [None]:
# `-c` resumes a partial download and is a no-op when the archive is already complete,
# so re-running this cell does not fetch the 1.4 GB file again.
!wget -c http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz
# `mkdir -p` succeeds when the directory already exists; with plain `mkdir` the `&&`
# short-circuited on a re-run and the archive was never extracted.
!mkdir -p speech_commands && tar -C speech_commands -xvzf speech_commands_v0.01.tar.gz 1> log

--2022-01-25 10:05:52--  http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz
Resolving download.tensorflow.org (download.tensorflow.org)... 173.194.222.128, 2a00:1450:4010:c05::80
Connecting to download.tensorflow.org (download.tensorflow.org)|173.194.222.128|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1489096277 (1,4G) [application/gzip]
Saving to: ‘speech_commands_v0.01.tar.gz’


2022-01-25 10:06:21 (49,7 MB/s) - ‘speech_commands_v0.01.tar.gz’ saved [1489096277/1489096277]

mkdir: cannot create directory ‘speech_commands’: File exists


In [None]:
class SpeechCommandDataset(Dataset):
    """Dataset of spoken-word recordings indexed by a (path, keyword, label) table.

    The table (``self.csv``) is either passed in directly or built by scanning
    ``path2dir``: every sub-directory whose name does not start with ``_`` is one
    word, and every ``*.wav`` found below it is one sample.

    Labels: words listed in ``keywords`` get 1, 2, ... following the order of
    ``keywords`` (words without a matching directory are skipped, so the labels
    stay contiguous); every other word gets label 0.

    Args:
        transform: optional callable applied to the 1-D waveform in ``__getitem__``.
        path2dir: dataset root; required when ``csv`` is not given.
        keywords: a single word or a list/tuple of words to detect.
        csv: ready-made table with columns ``path``, ``keyword``, ``label``;
            when given, ``path2dir`` and ``keywords`` are ignored.

    Raises:
        TypeError: if neither ``csv`` nor ``path2dir`` is provided.
    """

    def __init__(
        self,
        transform: Optional[Callable] = None,
        path2dir: str = None,
        keywords: Union[str, List[str]] = None,
        csv: Optional[pd.DataFrame] = None
    ):
        self.transform = transform

        if csv is not None:
            self.csv = csv
            return

        if path2dir is None:
            raise TypeError("SpeechCommandDataset needs either `csv` or `path2dir`")
        path2dir = pathlib.Path(path2dir)

        if isinstance(keywords, (tuple, list)):
            keywords = list(keywords)
        else:
            keywords = [keywords]

        # Sorted so that row order (and therefore any seeded split of the table)
        # does not depend on the file system's directory listing order.
        all_keywords = sorted(
            p.name for p in path2dir.glob('*')
            if p.is_dir() and not p.name.startswith('_')
        )

        # Label ids follow the order of `keywords`, not the order in which the
        # directories happen to be listed; dict.fromkeys drops duplicates.
        present = set(all_keywords)
        label_of = {
            keyword: ind
            for ind, keyword in enumerate(
                (kw for kw in dict.fromkeys(keywords) if kw in present), start=1
            )
        }

        triplets = []
        for keyword in all_keywords:
            label = label_of.get(keyword, 0)
            for path2wav in sorted((path2dir / keyword).rglob('*.wav')):
                triplets.append((path2wav.as_posix(), keyword, label))

        self.csv = pd.DataFrame(
            triplets,
            columns=['path', 'keyword', 'label']
        )

    def __getitem__(self, index: int):
        """Load sample ``index`` and return a dict with the waveform, word and label."""
        instance = self.csv.iloc[index]

        wav, _ = torchaudio.load(instance['path'])
        # Collapse the channel axis; for single-channel files this is a plain squeeze.
        wav = wav.sum(dim=0)

        if self.transform:
            wav = self.transform(wav)

        return {
            'wav': wav,
            'keyword': instance['keyword'],
            # Misspelled key kept so existing consumers keep working.
            'keywors': instance['keyword'],
            'label': instance['label']
        }

    def __len__(self):
        return len(self.csv)

In [None]:
# Index every recording under `speech_commands`; the words in TaskConfig.keyword
# become the keyword classes and all remaining words get label 0.
dataset = SpeechCommandDataset(
    path2dir='speech_commands', keywords=TaskConfig.keyword
)

## Augmentations

In [None]:
class AugsCreation:
    """Waveform augmentation: each call applies one transform picked uniformly at random.

    The candidates are: identity, additive Gaussian noise (std 0.01), volume
    reduction (``Vol(.25)``) and mixing with a random background-noise segment.
    """

    def __init__(self):
        self.background_noises = [
            'speech_commands/_background_noise_/white_noise.wav',
            'speech_commands/_background_noise_/dude_miaowing.wav',
            'speech_commands/_background_noise_/doing_the_dishes.wav',
            'speech_commands/_background_noise_/exercise_bike.wav',
            'speech_commands/_background_noise_/pink_noise.wav',
            'speech_commands/_background_noise_/running_tap.wav'
        ]

        # Load every noise recording once; squeeze() removes singleton dimensions.
        self.noises = [
            torchaudio.load(p)[0].squeeze()
            for p in self.background_noises
        ]

    def add_rand_noise(self, audio):
        """Mix ``audio`` with a random segment of a random background noise.

        Args:
            audio: 1-D waveform tensor.

        Returns:
            A new tensor of the same length, clamped to [-1, 1].
        """
        # randomly choose noise
        noise_num = torch.randint(
            low=0, high=len(self.background_noises), size=(1,)).item()
        noise = self.noises[noise_num]

        # Energy is measured over the whole recording, not only the mixed-in segment.
        noise_energy = torch.norm(noise)
        if noise_energy == 0:
            # Silent (or empty) noise: nothing to mix in, and dividing by its
            # energy would turn the whole signal into NaN.
            return audio.clamp(-1, 1)

        audio_len = audio.size(0)
        if noise.size(0) < audio_len:
            # Tile short noise clips so a full-length segment can always be cut.
            repeats = -(-audio_len // noise.size(0))  # ceil division
            noise = noise.repeat(repeats)

        # Fixed attenuation in dB. NOTE(review): the original comment "[0, 40]"
        # suggests a random level in that range was intended - confirm.
        noise_level = torch.Tensor([1])

        audio_energy = torch.norm(audio)
        alpha = (audio_energy / noise_energy) * \
            torch.pow(10, -noise_level / 20)

        start = torch.randint(
            low=0,
            high=max(int(noise.size(0) - audio_len - 1), 1),
            size=(1,)
        ).item()
        noise_sample = noise[start: start + audio_len]

        audio_new = audio + alpha * noise_sample
        audio_new.clamp_(-1, 1)
        return audio_new

    @staticmethod
    def _identity(x):
        """Return the waveform unchanged."""
        return x

    @staticmethod
    def _add_gaussian_noise(x):
        """Add N(0, 0.01) noise to every sample and clamp to [-1, 1]."""
        return (x + distributions.Normal(0, 0.01).sample(x.size())).clamp_(-1, 1)

    @staticmethod
    def _lower_volume(x):
        """Scale the waveform with torchaudio's Vol(.25) transform."""
        return torchaudio.transforms.Vol(.25)(x)

    def __call__(self, wav):
        # Methods instead of lambdas stored on the instance: the object stays
        # picklable for DataLoader workers and nothing is rebuilt per call.
        augs = (
            self._identity,
            self._add_gaussian_noise,
            self._lower_volume,
            self.add_rand_noise,
        )
        aug_num = torch.randint(low=0, high=len(augs), size=(1,)).item()   # choose 1 random aug from augs
        return augs[aug_num](wav)

In [None]:
# Seeded 80/20 train/validation split over a random permutation of the table rows.
torch.manual_seed(68)
indexes = torch.randperm(len(dataset))
n_train = int(len(dataset) * 0.8)
train_indexes, val_indexes = indexes[:n_train], indexes[n_train:]

train_df = dataset.csv.iloc[train_indexes].reset_index(drop=True)
val_df = dataset.csv.iloc[val_indexes].reset_index(drop=True)

In [None]:
# Each sample is a dict holding the waveform, the spoken word and the integer label.
# Waveform augmentations are applied to the training split only.
train_set = SpeechCommandDataset(csv=train_df, transform=AugsCreation())
val_set = SpeechCommandDataset(csv=val_df)

## Sampler for oversampling:

In [None]:
# WeightedRandomSampler needs one weight per sample; weighting each sample by
# 1 / (size of its class) makes every class equally likely to be drawn.

def get_sampler(target):
    """Build a class-balancing sampler for the given per-sample labels.

    Args:
        target: 1-D array-like of class labels, one per sample. Labels need not
            be contiguous or start at 0.

    Returns:
        A WeightedRandomSampler that draws ``len(target)`` samples per epoch,
        each with probability proportional to 1 / (number of samples in its class).
    """
    target = np.asarray(target)
    # `class_index[i]` is the position of target[i] among the unique labels, so the
    # lookup below is correct for arbitrary label values (indexing a weight array
    # by the raw label only worked for labels 0..K-1).
    _, class_index, class_sample_count = np.unique(
        target, return_inverse=True, return_counts=True)
    samples_weight = torch.from_numpy(
        1. / class_sample_count[class_index.reshape(-1)])
    return WeightedRandomSampler(samples_weight, len(samples_weight))

In [None]:
# Oversample rare classes: training samples are drawn with weight 1 / class size.
train_sampler = get_sampler(train_set.csv['label'].values)


In [None]:
class Collator:
    """Collate dataset samples into a zero-padded waveform batch and a label tensor."""

    def __call__(self, data):
        """
        Args:
            data: list of sample dicts with at least the keys 'wav' (1-D tensor)
                and 'label' (integer).

        Returns:
            (wavs, labels): wavs has shape (batch, longest waveform in the batch),
            shorter waveforms are right-padded with 0.0; labels is an int64 tensor.
        """
        wavs = [el['wav'] for el in data]
        labels = [int(el['label']) for el in data]

        # torch.nn.utils.rnn.pad_sequence takes list(Tensors) and returns padded (with 0.0) Tensor
        wavs = pad_sequence(wavs, batch_first=True)
        # Built directly as int64: going through a float32 tensor first would
        # round integers above 2**24.
        labels = torch.tensor(labels, dtype=torch.long)
        return wavs, labels

## Dataloaders

In [None]:
# shuffle must stay False for the training loader: the order is already randomised
# by train_sampler, and DataLoader does not accept a sampler together with shuffle=True.

train_loader = DataLoader(train_set, batch_size=TaskConfig.batch_size,
                          shuffle=False, collate_fn=Collator(),
                          sampler=train_sampler,
                          num_workers=2)

# Validation runs over the split in its stored order, without resampling.
val_loader = DataLoader(val_set, batch_size=TaskConfig.batch_size,
                        shuffle=False, collate_fn=Collator(),
                        num_workers=2)

In [None]:
# Pull one training batch (padded waveforms and their labels) to sanity-check the pipeline.
batch, labels = next(iter(train_loader))

In [None]:
# Listen to (at most) the first 10 clips of the batch next to their labels.
# Uses the `display_` alias imported at the top of the notebook, the configured
# sample rate, and islice so that a batch shorter than 10 does not raise IndexError.
for wav, label in islice(zip(batch, labels), 10):
    print("~~~~")
    display_.display(display_.Audio(wav, rate=TaskConfig.sample_rate))
    print(label.item())

~~~~


1
~~~~


1
~~~~


0
~~~~


0
~~~~


0
~~~~


1
~~~~


2
~~~~


0
~~~~


1
~~~~


0


## Creating MelSpecs on GPU for speed:

In [None]:
class LogMelspec:
    """Log-mel-spectrogram extractor that lives on ``config.device``.

    With ``is_train`` set, the mel spectrogram is followed by frequency and time
    masking; otherwise the plain mel spectrogram is used.
    """

    def __init__(self, is_train, config):
        mel = torchaudio.transforms.MelSpectrogram(
            sample_rate=config.sample_rate,
            n_fft=400,
            win_length=400,
            hop_length=160,
            n_mels=config.n_mels
        )

        if not is_train:
            # no augmentations
            self.melspec = mel.to(config.device)
            return

        # with augmentations
        self.melspec = nn.Sequential(
            mel,
            torchaudio.transforms.FrequencyMasking(freq_mask_param=15),
            torchaudio.transforms.TimeMasking(time_mask_param=35),
        ).to(config.device)

    def __call__(self, batch):
        """Return log(melspec(batch)); ``batch`` must already be on the device."""
        features = self.melspec(batch)
        # Keep values in [1e-9, 1e9] so the logarithm stays finite.
        features.clamp_(min=1e-9, max=1e9)
        return torch.log(features)

In [None]:
# Training features include frequency/time masking; validation features do not.
melspec_train = LogMelspec(is_train=True, config=TaskConfig)
melspec_val = LogMelspec(is_train=False, config=TaskConfig)

## Quality measurement functions:

In [None]:
# FA (false alarm)  - true: 0 (no keyword), model: keyword
# FR (false reject) - true: keyword,        model: 0 (no keyword)

def count_FA_FR(preds, labels):
    """Return the false-alarm and false-reject rates of ``preds`` against ``labels``.

    Class 0 means "no keyword"; any non-zero value counts as a keyword. For
    0/1 inputs this is the usual binary definition; for multi-class inputs it
    counts errors instead of summing class indices.

    Args:
        preds: integer tensor of predicted classes.
        labels: integer tensor of true classes, same shape as ``preds``.

    Returns:
        (FA, FR) as Python floats, both normalised by the total number of
        predictions; (0.0, 0.0) for empty input.
    """
    # torch.numel - returns total number of elements in tensor
    total = torch.numel(preds)
    if total == 0:
        return 0.0, 0.0

    pred_keyword = preds != 0
    true_keyword = labels != 0

    FA = torch.sum(pred_keyword & ~true_keyword)
    FR = torch.sum(~pred_keyword & true_keyword)

    return FA.item() / total, FR.item() / total

In [None]:
def get_au_fa_fr(probs, labels):
    """Area under the FA/FR curve obtained by sweeping the decision threshold.

    Args:
        probs: 1-D tensor of keyword probabilities.
        labels: sequence of label tensors; concatenated along dim 0 before use.

    Returns:
        Trapezoidal-rule area under FR as a function of FA.
    """
    labels = torch.cat(labels, dim=0)

    # Every observed probability is a threshold, plus the two extremes 0 and 1.
    thresholds, _ = torch.sort(probs)
    thresholds = torch.cat((torch.Tensor([0]), thresholds, torch.Tensor([1])))

    curve = [count_FA_FR((probs >= thr) * 1, labels) for thr in thresholds]
    FAs = [fa for fa, _ in curve]
    FRs = [fr for _, fr in curve]

    # ~ area under curve using trapezoidal rule; the sign is flipped on the raw integral
    return -np.trapz(FRs, x=FAs)


# Model

In [None]:
class Attention(nn.Module):
    """Soft attention pooling over the second-to-last dimension of its input.

    A small MLP scores every position; the scores are softmax-normalised along
    dim -2 and used as weights for a weighted sum of the input along that dim.
    """

    def __init__(self, hidden_size: int):
        super().__init__()

        scorer = [
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, 1),
        ]
        self.energy = nn.Sequential(*scorer)

    def forward(self, input):
        """Map (..., steps, hidden_size) to (..., hidden_size)."""
        scores = self.energy(input)
        weights = scores.softmax(dim=-2)
        return torch.sum(input * weights, dim=-2)

class CRNN(nn.Module):
    """Conv2d -> GRU -> attention pooling -> linear classifier.

    Args:
        config: object providing ``cnn_out_channels``, ``kernel_size``, ``stride``,
            ``n_mels``, ``hidden_size``, ``gru_num_layers``, ``bidirectional`` and
            ``num_classes``.
    """

    def __init__(self, config: TaskConfig):
        super().__init__()
        self.config = config

        self.conv = nn.Sequential(
            nn.Conv2d(
                in_channels=1, out_channels=config.cnn_out_channels,
                kernel_size=config.kernel_size, stride=config.stride
            ),
            # Merge the channel and frequency axes into one feature axis.
            nn.Flatten(start_dim=1, end_dim=2),
        )

        # Size of the frequency axis after the (unpadded) convolution.
        self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \
            config.stride[0] + 1

        self.gru = nn.GRU(
            input_size=self.conv_out_frequency * config.cnn_out_channels,
            hidden_size=config.hidden_size,
            num_layers=config.gru_num_layers,
            dropout=0.1,
            bidirectional=config.bidirectional,
            batch_first=True
        )

        # A bidirectional GRU concatenates both directions, doubling the feature
        # size; the layers that consume its output must be sized accordingly.
        gru_out_size = config.hidden_size * (2 if config.bidirectional else 1)

        self.attention = Attention(gru_out_size)
        self.classifier = nn.Linear(gru_out_size, config.num_classes)

    def forward(self, input):
        """Return class logits of shape (batch, num_classes).

        ``input`` is expected as (batch, n_mels, time) - presumably the output of
        LogMelspec.
        """
        input = input.unsqueeze(dim=1)                        # add the channel axis
        conv_output = self.conv(input).transpose(-1, -2)      # -> (batch, steps, features)
        gru_output, _ = self.gru(conv_output)
        context_vector = self.attention(gru_output)           # pool over steps
        output = self.classifier(context_vector)
        return output

# Instantiate the model with the default hyper-parameters and show its module tree.
config = TaskConfig()
model = CRNN(config)
model

CRNN(
  (conv): Sequential(
    (0): Conv2d(1, 8, kernel_size=(5, 20), stride=(2, 8))
    (1): Flatten(start_dim=1, end_dim=2)
  )
  (gru): GRU(144, 64, num_layers=2, batch_first=True, dropout=0.1)
  (attention): Attention(
    (energy): Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=1, bias=True)
    )
  )
  (classifier): Linear(in_features=64, out_features=4, bias=True)
)

In [None]:
def train_epoch(model, opt, sched, loader, log_melspec, device, writer, clip_grad=None):
    """Run one epoch of mini-batch training (one optimiser step per batch).

    Args:
        model: network mapping features to class logits.
        opt: optimiser over ``model``'s parameters.
        sched: LR scheduler; stepped after every batch.
        loader: iterable of (waveform batch, labels) with a defined ``len``.
        log_melspec: callable turning a waveform batch on ``device`` into features.
        device: device the batch and labels are moved to.
        writer: object with a ``log(dict)`` method (e.g. the wandb module);
            receives the current learning rate after every step.
        clip_grad: optional max gradient norm; ``None`` disables clipping.

    Returns:
        Mean of the per-batch cross-entropy losses.
    """
    model.train()
    all_loss = []
    for batch, labels in tqdm(loader, total=len(loader)):
        batch, labels = batch.to(device), labels.to(device)
        batch = log_melspec(batch)

        opt.zero_grad()

        # cross_entropy works on raw logits; no softmax/argmax is needed because
        # no prediction-based metric is logged during training.
        logits = model(batch)
        loss = F.cross_entropy(logits, labels)

        loss.backward()
        if clip_grad is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

        opt.step()
        sched.step()
        writer.log({"lr": sched.get_last_lr()[0]})

        all_loss.append(loss.item())

    return sum(all_loss) / len(all_loss)

In [None]:
@torch.no_grad()
def validation(model, loader, log_melspec, device):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Args:
        model: network mapping features to class logits.
        loader: iterable of (waveform batch, labels).
        log_melspec: callable turning a waveform batch on ``device`` into features.
        device: device the batch and labels are moved to.

    Returns:
        (mean_val_loss, mean_acc): mean of the per-batch cross-entropy losses and
        the fraction of correctly classified samples over the whole loader.
    """
    model.eval()

    val_losses = []
    correct, total = 0, 0
    for batch, labels in tqdm(loader):
        batch, labels = batch.to(device), labels.to(device)
        batch = log_melspec(batch)

        output = model(batch)
        val_losses.append(F.cross_entropy(output, labels).item())

        # softmax is monotonic, so the arg-max of the logits is the predicted class.
        predictions = torch.argmax(output, dim=-1)
        correct += (predictions == labels).sum().item()
        total += labels.numel()

    mean_val_loss = sum(val_losses) / len(val_losses)
    mean_acc = correct / total
    return mean_val_loss, mean_acc

In [None]:
@torch.no_grad()
def get_grad_norm(model, norm_type=2):
    """Return the ``norm_type``-norm of all gradients currently stored on ``model``.

    The norm is computed per parameter and then once more over the stacked
    per-parameter norms. Parameters without a gradient are ignored.

    Args:
        model: module whose ``parameters()`` are inspected.
        norm_type: order of the norm (default 2).

    Returns:
        The total gradient norm as a Python float; 0.0 when no parameter has a
        gradient (e.g. before the first backward pass).
    """
    grads = [p.grad for p in model.parameters() if p.grad is not None]
    if not grads:
        # torch.stack would raise on an empty list.
        return 0.0

    per_param_norms = torch.stack(
        [torch.norm(g.detach(), norm_type).cpu() for g in grads]
    )
    return torch.norm(per_param_norms, norm_type).item()

In [None]:
from collections import defaultdict
from IPython.display import clear_output
from matplotlib import pyplot as plt

# Per-metric list container. NOTE(review): not referenced by any later cell
# visible in this notebook - confirm whether it is still needed.
history = defaultdict(list)

# Training SGD

In [None]:
# Fresh model on the configured device for the mini-batch SGD run.
config = TaskConfig()
model = CRNN(config).to(config.device)

print(model)

# SGD with Nesterov momentum; hyper-parameters come from TaskConfig.
opt = torch.optim.SGD(
    model.parameters(),
    lr=config.learning_rate,
    momentum=config.momentum,
    nesterov=True,
    weight_decay=config.weight_decay
)

# The string literal below is a disabled OneCycleLR alternative, kept for reference.
"""
sched = torch.optim.lr_scheduler.OneCycleLR(opt, max_lr = 1e-2, epochs = config.num_epochs,
                                           steps_per_epoch = len(train_loader), anneal_strategy='cos',
                                           pct_start=0.3)
"""
# gamma=1. leaves the learning rate unchanged on every scheduler step.
sched = torch.optim.lr_scheduler.ExponentialLR(opt, gamma=1.)

CRNN(
  (conv): Sequential(
    (0): Conv2d(1, 8, kernel_size=(5, 20), stride=(2, 8))
    (1): Flatten(start_dim=1, end_dim=2)
  )
  (gru): GRU(144, 64, num_layers=2, batch_first=True, dropout=0.1)
  (attention): Attention(
    (energy): Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=1, bias=True)
    )
  )
  (classifier): Linear(in_features=64, out_features=4, bias=True)
)


In [None]:
# Total number of scalar parameters in the model.
sum(p.numel() for p in model.parameters())

70573

In [None]:
# TRAIN
# Mini-batch SGD run: one wandb run, `config.num_epochs` epochs, no gradient clipping.
wandb.init(project="FullGD-HSE-RS", name = 'SGD_audio_multi')
for n in range(config.num_epochs):

    train_loss = train_epoch(model, opt, sched, train_loader,
                melspec_train, config.device, wandb, None)
    
    # Norm of the gradients left on the parameters by the last backward pass of the epoch.
    gr_norm = get_grad_norm(model)

    val_loss, acc = validation(model, val_loader,
                        melspec_val, config.device)
    wandb.log({"grad_norm": gr_norm,"train_loss": train_loss,
               "val_loss": val_loss, "val_acc": acc})
    # Drop the progress bars of this epoch from the cell output.
    clear_output()


    print('END OF EPOCH', n)
wandb.finish()
# Pickles the whole module object (not only its state_dict).
# NOTE(review): the Full GD training cell further down saves to this same path.
torch.save(model, 'base_model.pth')

END OF EPOCH 29


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
grad_norm,▁▁▃▄▂▅▄▄█▄▄▆▄▇▅▄▄▅▆▅▅▆▇▄▂▄▅▃▄▄
lr,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,██▇▆▅▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
val_acc,▅▆▁▂▃▄▆▆▆▆▇▇▇▇▇▇▇█▇▇█▇████████
val_loss,█▆█▇▇▆▄▄▃▃▃▂▂▃▂▂▂▂▂▂▁▂▁▂▁▂▁▁▁▁

0,1
grad_norm,3.44612
lr,0.003
train_loss,0.48616
val_acc,0.89378
val_loss,0.27599


# Training Full GD

In [None]:
def train_epoch_full(model, opt, sched, loader, log_melspec, device, writer, clip_grad=None):
    """Run one epoch of full-batch gradient descent (a single optimiser step per epoch).

    Gradients of every batch are accumulated on the parameters; the optimiser and
    the scheduler are stepped once after the whole loader has been consumed. The
    accumulated gradient is the sum of the per-batch gradients of the batch-mean
    loss, i.e. it is not divided by the number of batches.

    Args:
        model: network mapping features to class logits.
        opt: optimiser over ``model``'s parameters.
        sched: LR scheduler; stepped once per epoch.
        loader: iterable of (waveform batch, labels) with a defined ``len``.
        log_melspec: callable turning a waveform batch on ``device`` into features.
        device: device the batch and labels are moved to.
        writer: object with a ``log(dict)`` method; receives the current learning
            rate once per batch.
        clip_grad: optional max norm applied once to the fully accumulated
            gradient; ``None`` disables clipping.

    Returns:
        Mean of the per-batch cross-entropy losses.
    """
    model.train()
    opt.zero_grad()
    all_loss = []
    for batch, labels in tqdm(loader, total=len(loader)):
        batch, labels = batch.to(device), labels.to(device)
        batch = log_melspec(batch)

        logits = model(batch)
        loss = F.cross_entropy(logits, labels)

        # No zero_grad() inside the loop: backward() adds to the stored gradients.
        loss.backward()
        all_loss.append(loss.item())

        writer.log({"lr": sched.get_last_lr()[0]})

    # Clip once, on the complete gradient. Clipping after every batch would rescale
    # the partial sum and weight earlier batches less than later ones.
    if clip_grad is not None:
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

    opt.step()
    sched.step()

    return sum(all_loss) / len(all_loss)

In [None]:
@dataclasses.dataclass
class FullGDConfig(TaskConfig):
    """TaskConfig variant for the full-batch gradient-descent run."""
    # Same values as in TaskConfig, restated here. The data loaders were built from
    # TaskConfig.batch_size, so changing batch_size here does not affect them.
    batch_size: int = 128
    learning_rate: float = 3e-3
    momentum: float = 0.9
    weight_decay: float = 1e-5
    # One optimiser step per epoch, hence many more epochs than the SGD run.
    num_epochs: int = 300
    # Passed to torch.optim.SGD.
    nesterov: bool = True
    # Max gradient norm handed to train_epoch_full as `clip_grad`.
    grad_clip: int = 50

In [None]:
# Fresh model, optimiser and scheduler for the full-batch run; this rebinds the
# `config`, `model`, `opt` and `sched` names used by the SGD run above.
config = FullGDConfig()
model = CRNN(config).to(config.device)

print(model)

opt = torch.optim.SGD(
    model.parameters(),
    lr=config.learning_rate,
    momentum=config.momentum,
    nesterov=config.nesterov,
    weight_decay=config.weight_decay
)

# The string literal below is a disabled OneCycleLR alternative, kept for reference.
"""
sched = torch.optim.lr_scheduler.OneCycleLR(opt, max_lr = 1e-2, epochs = config.num_epochs,
                                           steps_per_epoch = len(train_loader), anneal_strategy='cos',
                                           pct_start=0.3)
"""
# gamma=1. leaves the learning rate unchanged on every scheduler step.
sched = torch.optim.lr_scheduler.ExponentialLR(opt, gamma=1.)

CRNN(
  (conv): Sequential(
    (0): Conv2d(1, 8, kernel_size=(5, 20), stride=(2, 8))
    (1): Flatten(start_dim=1, end_dim=2)
  )
  (gru): GRU(144, 64, num_layers=2, batch_first=True, dropout=0.1)
  (attention): Attention(
    (energy): Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=1, bias=True)
    )
  )
  (classifier): Linear(in_features=64, out_features=4, bias=True)
)


In [None]:
# Total number of scalar parameters in the model.
sum(p.numel() for p in model.parameters())

70573

In [None]:
# TRAIN
# Full-batch run: one optimiser step per epoch, gradient norm limited by config.grad_clip.
wandb.init(project="FullGD-HSE-RS", name = 'FullGD_audio_multi')
for n in range(config.num_epochs):

    train_loss = train_epoch_full(model, opt, sched, train_loader,
                melspec_train, config.device, wandb, config.grad_clip)
    
    # Norm of the gradients still stored on the parameters after the epoch's step.
    gr_norm = get_grad_norm(model)

    val_loss, acc = validation(model, val_loader,
                        melspec_val, config.device)
    wandb.log({"grad_norm": gr_norm,"train_loss": train_loss,
               "val_loss": val_loss, "val_acc": acc})
    # Drop the progress bars of this epoch from the cell output.
    clear_output()


    print('END OF EPOCH', n)
wandb.finish()
# NOTE(review): same path as the SGD run's checkpoint, so this overwrites that
# file - confirm whether a separate file name was intended.
torch.save(model, 'base_model.pth')

END OF EPOCH 299


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
grad_norm,█▁█▆██▆██▇█████████▇████████▆██████████▁
lr,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,████▇▇▆▆▆▅▅▅▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▁▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▂▁▅▆▅▄▅▅▆▅▅▆▆▇▆▇▇▇▇▇▇▇█▇▇███▇▇█████████
val_loss,███▇▅▅▆▆▆▆▅▅▄▄▄▄▃▃▂▃▂▂▂▂▂▂▁▂▂▂▂▁▁▁▁▁▁▁▂▁

0,1
grad_norm,49.52197
lr,0.003
train_loss,0.50347
val_acc,0.89386
val_loss,0.28053
