# Task description
- Classify the speakers of given features.
- Baselines:
  - Easy: Run sample code and know how to use transformer.
  - Medium: Know how to adjust parameters of transformer.
  - Strong: Construct a [conformer](https://arxiv.org/abs/2005.08100), which is a variant of the transformer.
  - Boss: Implement [Self-Attention Pooling](https://arxiv.org/pdf/2008.01077v1.pdf) & [Additive Margin Softmax](https://arxiv.org/pdf/1801.05599.pdf) to further boost the performance.


# Download dataset
- Data is [here](https://drive.google.com/drive/folders/1vI1kuLB-q1VilIftiwnPOCAeOOFfBZge?usp=sharing)

In [1]:
# import module
import os
import glob
import random
from datetime import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch import optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
from torchsummary import summary

import matplotlib.pyplot as plt
import numpy as np
import logging
from tqdm import tqdm


# seed setting
def same_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (plus the CUDA
    generators when CUDA is available), then sets the cuDNN flags
    ``benchmark=False`` and ``deterministic=True``.

    Args:
        seed: Seed value handed to each generator.
    """
    # CPU-side generators: Python, NumPy, PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # CUDA generators (current device, then all devices).
    if torch.cuda.is_available():
        for cuda_seed_fn in (torch.cuda.manual_seed,
                             torch.cuda.manual_seed_all):
            cuda_seed_fn(seed)

    # cuDNN switches.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Seed all random number generators once, before any data or model is created.
same_seeds(1212)

  from .autonotebook import tqdm as notebook_tqdm


# Data

## Dataset
- Original dataset is [Voxceleb2](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox2.html).
- The [license](https://creativecommons.org/licenses/by/4.0/) and [complete version](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/files/license.txt) of Voxceleb2.
- We randomly select 600 speakers from Voxceleb2.
- Then preprocess the raw waveforms into mel-spectrograms.

- Args:
  - data_dir: The path to the data directory.
  - metadata_path: The path to the metadata.
  - segment_length: The length (in frames) of the audio segment used for training.
- The architecture of data directory \\
  - data directory \\
  |---- metadata.json \\
  |---- testdata.json \\
  |---- mapping.json \\
  |---- uttr-{random string}.pt \\

- The information in metadata
  - "n_mels": The dimension of mel-spectrogram.
  - "speakers": A dictionary. 
    - Key: speaker ids.
    - value: "feature_path" and "mel_len"


For efficiency, we cut the mel-spectrograms into fixed-length segments during the training step.

In [2]:
import json

## Create my dataset

class Mydataset(Dataset):
    """Training dataset yielding ``(mel, speaker_id)`` pairs.

    On construction it reads ``mapping.json`` (key ``'speaker2id'``) and
    ``metadata.json`` (key ``'speakers'``) from ``data_dir`` and builds a flat
    list of ``[feature_path, speaker_id]`` entries, one per utterance.

    Args:
        data_dir: Directory holding the JSON files and the feature files.
        segment_length: Maximum number of frames returned per item
            (default 128). Shorter utterances are returned whole.
    """

    def __init__(self, data_dir, segment_length=128):
        self.data_dir = data_dir
        self.segment_length = segment_length

        # Load the mapping from speaker name to their corresponding id.
        # ``with`` closes the file handle as soon as the JSON is parsed.
        mapping_path = os.path.join(data_dir, 'mapping.json')
        with open(mapping_path) as mapping_file:
            self.speaker2id = json.load(mapping_file)['speaker2id']

        # Load metadata of training data.
        metadata_path = os.path.join(data_dir, 'metadata.json')
        with open(metadata_path) as metadata_file:
            metadata = json.load(metadata_file)['speakers']

        # Get the total number of speakers.
        self.speaker_num = len(metadata)

        # One [feature_path, speaker_id] entry per utterance.
        self.data = []
        for speaker, utterances in metadata.items():
            speaker_id = self.speaker2id[speaker]
            self.data.extend(
                [utterance['feature_path'], speaker_id]
                for utterance in utterances
            )

    def __len__(self):
        """Return the number of utterances."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load utterance ``idx`` and return ``(mel, speaker)`` tensors.

        If the stored mel-spectrogram has more than ``segment_length`` frames,
        a random window of exactly ``segment_length`` frames is returned;
        otherwise the whole mel-spectrogram is returned.
        """
        fpath, speaker = self.data[idx]
        # Load preprocessed mel-spectrogram.
        mel = torch.load(os.path.join(self.data_dir, fpath))

        # Segment mel-spectrogram into "segment_length" frames.
        if len(mel) > self.segment_length:
            # Randomly pick the starting frame (randint is inclusive on both
            # ends, so the last valid start is len(mel) - segment_length).
            start = random.randint(0, len(mel) - self.segment_length)
            mel = torch.Tensor(mel[start:start + self.segment_length])
        else:
            mel = torch.Tensor(mel)

        # Wrap the speaker id in a tensor for computing the loss later.
        speaker = torch.tensor(speaker)
        return mel, speaker

    def get_speaker_num(self):
        """Return the number of speakers listed in the metadata."""
        return self.speaker_num
    

        

### Dataloader
- Split dataset into training dataset(90%) and validation dataset(10%).
- Create dataloader to iterate the data.

In [3]:
def collate_batch(batch):
    """Combine dataset samples into a single padded batch.

    Args:
        batch: Sequence of ``(mel, speaker)`` pairs.

    Returns:
        A tuple ``(mels, speakers)``: the mel tensors padded to a common
        length with the value ``-20`` and stacked batch-first, and the
        speaker ids as a long tensor.
    """
    mels, speakers = zip(*batch)

    # Samples in one batch may differ in length, so they are padded up to the
    # longest one before being stacked.
    padded_mels = pad_sequence(mels, batch_first=True, padding_value=-20)
    speaker_ids = torch.Tensor(speakers).long()
    return padded_mels, speaker_ids


def get_dataloader(data_dir, batch_size):
    """Create the training and validation data loaders.

    The dataset found in ``data_dir`` is randomly split 90% / 10% into a
    training and a validation subset. Both loaders shuffle, drop the last
    incomplete batch, pin memory and collate with ``collate_batch``.

    Args:
        data_dir: Directory passed to ``Mydataset``.
        batch_size: Number of samples per batch for both loaders.

    Returns:
        ``(train_loader, valid_loader, speaker_num)``.
    """
    full_set = Mydataset(data_dir)

    # 90% of the samples go to training, the remainder to validation.
    n_total = len(full_set)
    n_train = int(0.9 * n_total)
    subsets = random_split(full_set, lengths=[n_train, n_total - n_train])

    # Both loaders share exactly the same options.
    loader_options = dict(
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
        pin_memory=True,
        collate_fn=collate_batch,
    )
    train_loader, valid_loader = (
        DataLoader(subset, **loader_options) for subset in subsets
    )

    return train_loader, valid_loader, full_set.get_speaker_num()

# Model
- TransformerEncoderLayer:
  - Base transformer encoder layer in [Attention Is All You Need](https://arxiv.org/abs/1706.03762)
  - Parameters:
    - d_model: the number of expected features of the input (required).

    - nhead: the number of heads of the multiheadattention models (required).

    - dim_feedforward: the dimension of the feedforward network model (default=2048).

    - dropout: the dropout value (default=0.1).

    - activation: the activation function of intermediate layer, relu or gelu (default=relu).

- TransformerEncoder:
  - TransformerEncoder is a stack of N transformer encoder layers
  - Parameters:
    - encoder_layer: an instance of the TransformerEncoderLayer() class (required).

    - num_layers: the number of sub-encoder-layers in the encoder (required).

    - norm: the layer normalization component (optional).

In [3]:
## Baseline model

class Classifier(nn.Module):
    """Baseline speaker classifier built on a transformer encoder.

    A linear pre-net projects the 40-dimensional input features to
    ``d_model``; two pre-norm transformer encoder layers follow; the sequence
    is mean-pooled over time and fed to a two-layer prediction head.

    Args:
        d_model: Width of the encoder (default 80). Must be divisible by the
            4 attention heads.
        n_spks: Number of output classes (default 600).
        dropout: Dropout probability of the transformer encoder layers
            (default 0.1, which is also ``nn.TransformerEncoderLayer``'s own
            default, so default behaviour is unchanged).
    """

    def __init__(self, d_model=80, n_spks=600, dropout=0.1):
        super().__init__()

        # Project the 40-dim input features into the encoder width.
        self.prenet = nn.Linear(40, d_model)
        # `dropout` is forwarded here; it used to be accepted but ignored.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, batch_first=True,
            dim_feedforward=256, nhead=4,
            norm_first=True, dropout=dropout,
        )
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=2)

        self.pred_layer = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, n_spks),
        )

    def forward(self, x):
        """
        x: (batch, length, 40)
        out: (batch, n_spks)
        """
        y = self.prenet(x)
        y = self.encoder(y)
        # Mean pooling over the time axis: (batch, length, d_model) -> (batch, d_model).
        y = y.mean(dim=1)
        y = self.pred_layer(y)
        return y

### Build the [conformer](https://arxiv.org/abs/2005.08100)

In [22]:
## Feedforward Module
class Feedforwardmodule(nn.Module):
    """Feed-forward block with a built-in residual connection.

    The branch is LayerNorm -> Linear (expand) -> SiLU -> Dropout ->
    Linear (project back) -> Dropout; ``forward`` returns ``x + branch(x)``.

    Args:
        embedding_dim: Size of the last dimension of the input.
        dropout_p: Dropout probability used twice in the branch (default 0.2).
        expansion_factor: Hidden width as a multiple of ``embedding_dim``
            (default 2).
    """

    def __init__(self, embedding_dim, dropout_p=0.2, expansion_factor=2):
        super().__init__()
        hidden_dim = embedding_dim * expansion_factor
        stages = [
            nn.LayerNorm(embedding_dim),
            nn.Linear(embedding_dim, hidden_dim),
            nn.SiLU(),
            nn.Dropout(dropout_p),
            nn.Linear(hidden_dim, embedding_dim),
            nn.Dropout(dropout_p),
        ]
        self.layer = nn.Sequential(*stages)

    def forward(self, x):
        """Return the input plus the feed-forward branch output."""
        return x + self.layer(x)

## Multi-head self-attention module
class MHST(nn.Module):
    """Multi-head self-attention block with a built-in residual connection.

    The input is layer-normalised, projected by three separate linear layers
    into query, key and value, passed through ``nn.MultiheadAttention``
    (batch-first) and dropout, and finally added back onto the input.

    Args:
        embedding_dim: Size of the last dimension of the input.
        num_heads: Number of attention heads.
        dropout_p: Dropout probability applied to the attention output
            (default 0.2).
    """

    def __init__(self, embedding_dim, num_heads, dropout_p=0.2):
        super().__init__()
        self.layer_norm = nn.LayerNorm(embedding_dim)

        self.Mulithead = nn.MultiheadAttention(
            embedding_dim, num_heads=num_heads, batch_first=True
        )
        self.dropout = nn.Dropout(p=dropout_p)

        # Separate query / key / value projections applied before attention.
        for proj_name in ('Q_layer', 'K_layer', 'V_layer'):
            setattr(self, proj_name, nn.Linear(embedding_dim, embedding_dim))

    def forward(self, inputs):
        """Return ``inputs`` plus the (dropped-out) attention output."""
        normed = self.layer_norm(inputs)
        query = self.Q_layer(normed)
        key = self.K_layer(normed)
        value = self.V_layer(normed)

        # The attention weights are returned as well but are not needed here.
        attended, _ = self.Mulithead(query, key, value)
        return inputs + self.dropout(attended)

    
## Convolution module
class Transpose(nn.Module):
    """Module that swaps two dimensions of its input tensor.

    Wrapping the transpose in a module lets it be used as a stage inside
    ``nn.Sequential``.

    Args:
        dim0: First dimension to swap.
        dim1: Second dimension to swap.
    """

    def __init__(self, dim0, dim1):
        super().__init__()
        self.dim0, self.dim1 = dim0, dim1

    def forward(self, x):
        """Return ``x`` with ``dim0`` and ``dim1`` exchanged."""
        return torch.transpose(x, self.dim0, self.dim1)

class ConvolutionModule(nn.Module):
    """Convolution block with a built-in residual connection.

    Branch: LayerNorm -> transpose to channels-first -> pointwise Conv1d that
    doubles the channels -> GLU (halves them again) -> depthwise Conv1d ->
    BatchNorm1d -> SiLU -> pointwise Conv1d -> Dropout. ``forward`` transposes
    the branch output back and adds it to the input.

    Input and output shape: (B, L, in_channels).

    Args:
        in_channels: Size of the last dimension of the input.
        kernal_size: Kernel size of the depthwise convolution (default 3).
            The parameter spelling is kept for backward compatibility.
        dropout_p: Dropout probability at the end of the branch (default 0.2).
    """

    def __init__(self, in_channels, kernal_size=3, dropout_p=0.2):
        super().__init__()
        self.layer = nn.Sequential(
            nn.LayerNorm(in_channels),  # shape: (B, L, in_channels)
            Transpose(1, 2),  # shape: (B, in_channels, L)
            nn.Conv1d(in_channels=in_channels, out_channels=in_channels * 2,
                      kernel_size=1),  # shape: (B, 2*in_channels, L)
            nn.GLU(dim=1),  # shape: (B, in_channels, L)
            # Depthwise convolution. "same" padding keeps the length L for
            # every kernel size; the previous fixed padding=1 only did so for
            # kernal_size=3 and broke the residual add otherwise.
            nn.Conv1d(in_channels=in_channels, out_channels=in_channels,
                      kernel_size=kernal_size, groups=in_channels,
                      padding="same"),  # shape: (B, in_channels, L)
            nn.BatchNorm1d(in_channels),  # shape: (B, in_channels, L)
            nn.SiLU(),
            nn.Conv1d(in_channels=in_channels, out_channels=in_channels,
                      kernel_size=1),
            nn.Dropout(p=dropout_p),
        )

    def forward(self, x):
        """Return ``x`` plus the convolution branch output, shape (B, L, C)."""
        y = self.layer(x)  # (B, in_channels, L)
        y = y.transpose(1, 2)  # back to (B, L, in_channels)
        return x + y
    
        

## Conformer layer    
class Conformer(nn.Module):
    """One conformer block: FFN -> self-attention -> convolution -> FFN -> LayerNorm.

    ``Feedforwardmodule``, ``MHST`` and ``ConvolutionModule`` each already
    return ``input + branch(input)``. The previous ``forward`` added the input
    a second time after every sub-module, which double-counted each residual.
    The block now follows the conformer formulation:

        x1 = x  + 0.5 * FFN1(x)
        x2 = x1 + MHSA(x1)
        x3 = x2 + Conv(x2)
        y  = LayerNorm(x3 + 0.5 * FFN2(x3))

    NOTE: this changes the numerical output of the block, so checkpoints
    trained with the earlier double-residual version will not reproduce
    their previous predictions.

    Args:
        embedding_dim: Feature size of the input and output.
        dropout: Accepted for interface compatibility; it is currently not
            forwarded to the sub-modules, which use their own defaults.
        num_heads: Number of attention heads (default 5); must divide
            ``embedding_dim``.
    """

    def __init__(self, embedding_dim, dropout=0.1, num_heads=5):
        super().__init__()
        self.FFL1 = Feedforwardmodule(embedding_dim=embedding_dim)
        self.mhst = MHST(embedding_dim=embedding_dim, num_heads=num_heads)
        self.convolution = ConvolutionModule(in_channels=embedding_dim)
        self.FFL2 = Feedforwardmodule(embedding_dim=embedding_dim)
        self.layernorm = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        """Apply the block to ``x`` of shape (batch, length, embedding_dim)."""
        # FFL1(x) == x + f(x), hence 0.5 * (x + FFL1(x)) == x + 0.5 * f(x).
        y1 = 0.5 * (x + self.FFL1(x))
        # The attention and convolution modules carry their own residual.
        y2 = self.mhst(y1)
        y3 = self.convolution(y2)
        # Second half-step feed-forward, same identity as above.
        y4 = 0.5 * (y3 + self.FFL2(y3))
        output = self.layernorm(y4)

        return output


## conformer encode model

class ConformerEncoder(nn.Module):
    """Speaker classifier built from a stack of six conformer blocks.

    A linear pre-net (with dropout) projects the 40-dimensional input
    features to ``d_model``; six ``Conformer`` blocks follow; the sequence is
    mean-pooled over time and fed to a two-layer prediction head.

    Args:
        d_model: Width of the conformer blocks (default 80). Must be
            divisible by the number of attention heads used by ``Conformer``.
        n_spks: Number of output classes (default 600).
        dropout: Dropout probability of the pre-net (default 0.1).
    """

    def __init__(self, d_model=80, n_spks=600, dropout=0.1):
        super().__init__()
        # Use d_model rather than a hard-coded 80 so that the blocks always
        # match the pre-net's output width.
        self.conformers = nn.ModuleList(
            [Conformer(embedding_dim=d_model) for _ in range(6)]
        )
        self.prenet = nn.Sequential(
            nn.Linear(40, d_model),
            nn.Dropout(p=dropout),
        )
        self.pred_layer = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, n_spks),
        )

    def forward(self, x):
        """
        x: (batch, length, 40)
        out: (batch, n_spks)
        """
        x1 = self.prenet(x)

        for layer in self.conformers:
            x1 = layer(x1)

        # Mean pooling over the time axis.
        x2 = x1.mean(dim=1)
        output = self.pred_layer(x2)

        return output

In [23]:
# Smoke test: push a random batch through a freshly built conformer model and
# print the output shape followed by the total parameter count.
conformer_model = ConformerEncoder()
x = torch.randn(32, 108, 40)
y = conformer_model(x)
print(y.shape)

num_pars = sum(par.numel() for par in conformer_model.parameters())
print(num_pars)

torch.Size([32, 600])
764920


## Learning rate schedule
- For transformer architecture, the design of learning rate schedule is different from that of CNN.
- The warmup of the learning rate is useful for training models with transformer architectures.
- The warmup schedule
  - Set learning rate to 0 in the beginning.
  - The learning rate increases linearly from 0 to initial learning rate during warmup period.

In [24]:
import math
from torch.optim import Optimizer

def lr_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps,
                            num_cycles=0.5, last_epoch=-1):
    """Create a linear-warmup, cosine-decay learning rate schedule.

    The learning rate factor rises linearly from 0 to 1 over the first
    ``num_warmup_steps`` scheduler steps, then follows a cosine curve from 1
    down towards 0 over the remaining steps. The factor multiplies the
    initial learning rate set in the optimizer.

    Args:
        optimizer (:class:`~torch.optim.Optimizer`):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (:obj:`int`):
            The number of steps for the warmup phase. With 0 there is no
            warmup and the schedule starts at the full learning rate.
        num_training_steps (:obj:`int`):
            The total number of training steps.
        num_cycles (:obj:`float`, `optional`, defaults to 0.5):
            The number of waves in the cosine schedule (the default just
            decreases from the max value to 0 following a half-cosine).
        last_epoch (:obj:`int`, `optional`, defaults to -1):
            The index of the last epoch when resuming training.

    Return:
        :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """

    def lr_lambda(current_step):
        # Warmup phase. The comparison is strict so that num_warmup_steps == 0
        # does not yield a factor of 0 at step 0; at
        # current_step == num_warmup_steps the cosine branch returns 1.0,
        # exactly what the warmup branch would return.
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))

        # Cosine decay phase.
        progress = float(current_step - num_warmup_steps) / float(
            max(1, num_training_steps - num_warmup_steps)
        )
        return max(
            0.0,
            0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)),
        )

    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda, last_epoch)

## Train&Validate process

In [25]:
## model forward function

def model_fn(batch, model, criterion, device):
    """Run one forward pass in training mode and score it.

    Args:
        batch: Pair ``(mels, labels)`` of tensors.
        model: Network called on the mel tensor.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device both tensors are moved to before the forward pass.

    Returns:
        ``(loss, accuracy)`` where ``accuracy`` is the fraction of samples
        whose arg-max prediction equals the label, as a scalar tensor.
    """
    model.train()

    features, targets = (tensor.to(device) for tensor in batch)
    logits = model(features)
    loss = criterion(logits, targets)

    # A prediction is correct when the highest-scoring class is the label.
    correct = logits.argmax(dim=1) == targets
    return loss, correct.float().mean()

def valid(dataloader, model, criterion, device):
    """Evaluate ``model`` on ``dataloader`` and return mean loss and accuracy.

    The model is switched to eval mode for the pass and back to train mode
    before returning. The forward passes run under ``torch.no_grad()`` so no
    autograd graph is built during validation.

    Args:
        dataloader: Iterable yielding ``(mels, labels)`` batches.
        model: Network to evaluate.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device the batches are moved to.

    Returns:
        ``(mean_loss, mean_accuracy)``, each the mean of the per-batch values.
    """
    model.eval()

    loss_seq = []
    accuracy_seq = []
    bar = tqdm(dataloader, desc='Valid')

    # Gradients are never used here; skipping them saves memory and time.
    with torch.no_grad():
        for mels, labels in bar:
            mels = mels.to(device)
            labels = labels.to(device)
            outputs = model(mels)

            loss = criterion(outputs, labels)

            preds = outputs.argmax(dim=1)
            accuracy = torch.mean((preds == labels).float())

            bar.set_postfix(loss=loss.item(), accuracy=accuracy.item())
            loss_seq.append(loss.item())
            accuracy_seq.append(accuracy.item())
    bar.close()

    # Hand the model back in training mode for the caller's training loop.
    model.train()
    return np.mean(loss_seq), np.mean(accuracy_seq)
    


def train(config):
    """Train a speaker classifier according to ``config``.

    Args:
        config: Dictionary with the keys
            ``data_dir`` (dataset directory),
            ``save_path`` (directory for checkpoints; created if missing),
            ``batch_size``,
            ``warmup_steps`` (scheduler warmup steps),
            ``valid_steps`` (validate every this many epochs),
            ``train_steps`` (number of passes over the training loader, also
            passed to the scheduler as its total step count),
            ``early_stops`` (stop once more than this many consecutive
            validations bring no improvement) and
            ``model_type`` (``'classic'`` or ``'conformer'``).

    Raises:
        ValueError: If ``model_type`` is not a supported value.
    """
    data_dir = config['data_dir']
    save_path = config['save_path']
    batch_size = config['batch_size']
    warmup_steps = config['warmup_steps']
    valid_steps = config['valid_steps']
    train_steps = config['train_steps']
    early_stops = config['early_stops']
    model_type = config['model_type']

    # Fail early with a clear message instead of hitting an unbound `model`
    # after the data has already been loaded.
    if model_type not in ('classic', 'conformer'):
        raise ValueError(
            f"Unknown model_type {model_type!r}; "
            "expected 'classic' or 'conformer'"
        )

    # set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'[Info]: {device} is used')

    # load data
    trainloader, validloader, speaker_num = get_dataloader(data_dir, batch_size)

    # creating model
    if model_type == 'classic':
        model = Classifier(n_spks=speaker_num).to(device)
    else:
        model = ConformerEncoder(n_spks=speaker_num).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
    # NOTE(review): the scheduler is stepped once per batch, while its horizon
    # is `train_steps`, which the loop below uses as the number of epochs.
    # Confirm which unit is intended.
    lr_scheduler = lr_schedule_with_warmup(optimizer, warmup_steps, train_steps)

    # Make sure checkpoints have somewhere to go before training starts.
    os.makedirs(save_path, exist_ok=True)

    best_accuracy = -1
    early_step = 0

    for e in range(train_steps):

        train_bar = tqdm(trainloader)
        train_bar.set_description(f'Epoch: {e}/{train_steps}')
        for batch in train_bar:
            loss, accuracy = model_fn(batch, model, criterion, device)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

            train_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item())

        # Validate every `valid_steps` epochs and checkpoint on improvement.
        if (e + 1) % valid_steps == 0:
            valid_loss, valid_accuracy = valid(validloader, model, criterion, device)

            if valid_accuracy > best_accuracy:
                best_accuracy = valid_accuracy
                best_state_dict = model.state_dict()
                torch.save(best_state_dict,
                           os.path.join(save_path, f'{model_type}_{e}.pth'))
                print('The best model is saved')
                early_step = 0
            else:
                early_step += 1

        # Early stopping: too many validations in a row without improvement.
        if early_step > early_stops:
            print("The model didn't improve, so we decide to shut down it")
            return


In [27]:
# Training configuration consumed by `train`.
config = {
    "data_dir": "D://Datasets/Dataset/Dataset/",
    # Raw string: a Windows path contains backslashes that would otherwise be
    # read as (invalid) escape sequences such as "\D" or "\s".
    "save_path": r"D:\Dropbox\Deep learning\Deep Learning Hongyi Li\4\save_models",
    "batch_size": 32,
    "valid_steps": 2,
    "warmup_steps": 1000,
    "save_steps": 10000,
    "train_steps": 70000,
    "early_stops": 5,
    "model_type": "conformer",
}

In [None]:
# Start training with the settings collected in `config`.
train(config)

## Inference

In [89]:
import csv

## Inference dataset
class Inferencedataset(Dataset):
    """Test-time dataset yielding ``(feature_path, mel)`` pairs.

    The utterance list is read from the ``'utterances'`` key of
    ``testdata.json`` inside ``data_dir``.

    Args:
        data_dir: Directory holding ``testdata.json`` and the feature files.
    """

    def __init__(self, data_dir):
        testdata_path = os.path.join(data_dir, 'testdata.json')
        # ``with`` closes the file handle as soon as the JSON is parsed.
        with open(testdata_path) as testdata_file:
            self.data = json.load(testdata_file)['utterances']
        self.data_dir = data_dir

    def __len__(self):
        """Return the number of test utterances."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the relative feature path and the loaded mel for ``idx``."""
        utterance = self.data[idx]
        feat_path = utterance['feature_path']
        mel = torch.load(os.path.join(self.data_dir, feat_path))
        return feat_path, mel

def inference_collate_bath(batch):
    """Collate ``(feature_path, mel)`` samples for inference.

    Returns:
        A tuple of the feature paths and the mels stacked along a new
        leading batch dimension.
    """
    paths, mels = zip(*batch)
    stacked_mels = torch.stack(mels, dim=0)
    return paths, stacked_mels

def Inference(config):
    """Predict the speaker of every test utterance and write a CSV file.

    Args:
        config: Dictionary with the keys
            ``data_dir`` (directory with ``mapping.json``, ``testdata.json``
            and the feature files),
            ``model_path`` (checkpoint holding a model ``state_dict``),
            ``output_path`` (CSV file to write) and, optionally,
            ``model_type`` (``'classic'`` (default) or ``'conformer'``;
            must match the architecture the checkpoint was trained with).

    Raises:
        ValueError: If ``model_type`` is not a supported value.

    The CSV has the header ``Id,Category`` followed by one row per utterance:
    its feature path and the predicted speaker name.
    """
    data_dir = config['data_dir']
    model_path = config['model_path']
    output_path = config['output_path']
    model_type = config.get('model_type', 'classic')

    if model_type not in ('classic', 'conformer'):
        raise ValueError(
            f"Unknown model_type {model_type!r}; "
            "expected 'classic' or 'conformer'"
        )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    with open(os.path.join(data_dir, 'mapping.json')) as mapping_file:
        mapping = json.load(mapping_file)
    id2speaker = mapping['id2speaker']

    dataset = Inferencedataset(data_dir)
    dataloader = DataLoader(dataset, batch_size=1, shuffle=False, drop_last=False,
                            collate_fn=inference_collate_bath)

    speaker_num = len(id2speaker)

    if model_type == 'classic':
        model = Classifier(n_spks=speaker_num).to(device)
    else:
        model = ConformerEncoder(n_spks=speaker_num).to(device)
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only host.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    results = [['Id', 'Category']]
    with torch.no_grad():
        for feat_paths, mels in tqdm(dataloader):
            outputs = model(mels.to(device))
            preds = outputs.argmax(dim=1).cpu().numpy()
            # Pair each path with ITS OWN prediction. The previous code always
            # used preds[0], which is only correct for a batch size of 1.
            results.extend(
                [feat_path, id2speaker[str(pred)]]
                for feat_path, pred in zip(feat_paths, preds)
            )

    with open(output_path, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(results)

In [90]:
# Inference configuration consumed by `Inference`.
# The Windows paths are raw strings so that their backslashes are not read as
# (invalid) escape sequences such as "\D", "\s", "\m" or "\p".
config_inf = {
    'data_dir': "D://Datasets/Dataset/Dataset/",
    'model_path': r'D:\Dropbox\Deep learning\Deep Learning Hongyi Li\4\save_models\model_51.pth',
    'output_path': r'D:\Dropbox\Deep learning\Deep Learning Hongyi Li\4\prediction.csv',
}

In [91]:
# Run inference with the settings in `config_inf`; writes the prediction CSV.
Inference(config_inf)

100%|█████████████████████████████████████████████████████████████████████████████| 8000/8000 [00:14<00:00, 559.24it/s]


In [7]:
# Scratch cell: build the training dataset from the configured data directory.
dataset = Mydataset(config['data_dir'])

In [40]:
# Scratch cell: random tensor of shape (32, 108, 40).
x = torch.randn(32, 108, 40)