# **Homework 2 Phoneme Classification**

* Slides: https://docs.google.com/presentation/d/1v6HkBWiJb8WNDcJ9_-2kwVstxUWml87b9CnA16Gdoio/edit?usp=sharing
* Kaggle: https://www.kaggle.com/c/ml2022spring-hw2
* Video: TBA


In [1]:
import numpy as np
import torch
import os


os.environ['NEPTUNE_API_TOKEN'] = "eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiJhMmJkNDllNC0yYjEzLTQwZDQtYmUyZi02YmM3MTA3YzMzZTEifQ=="
seed = 87                        # random seed

#fix seed
def same_seeds(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  
    np.random.seed(seed)  
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

same_seeds(seed)
# !pip install pytorch-lightning
!nvidia-smi

Fri Mar 11 01:20:27 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 511.65       Driver Version: 511.65       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:09:00.0  On |                  N/A |
|  0%   44C    P8    30W / 215W |    859MiB /  8192MiB |     19%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Hyper-parameters

In [2]:
# data prarameters
train_ratio = 0.8  # the ratio of data used for training, the rest will be used for validation

# training parameters
batch_size = 8  # batch size
num_epoch = 100  # maximum number of training epoch
learning_rate = 5e-4  # learning rate
weight_decay = 1e-3
model_path = './model.ckpt'  # the path where the checkpoint will be saved
early_stop_patient = 5

# model parameters

input_dim = 39  # the input dim of the model, you should not change the value
hidden_layers = 3  # the number of hidden layers
hidden_dim = 512  # the hidden dim
bidirectional = True
dropout = 0.3
reconstruct_alpha = 0.
random_swap = 10
# pooling_method = 'attn'

## Download Data
Download data from google drive, then unzip it.

You should have
- `libriphone/train_split.txt`
- `libriphone/train_labels`
- `libriphone/test_split.txt`
- `libriphone/feat/train/*.pt`: training feature<br>
- `libriphone/feat/test/*.pt`:  testing feature<br>

after running the following block.

> **Notes: if the links are dead, you can download the data directly from [Kaggle](https://www.kaggle.com/c/ml2022spring-hw2/data) and upload it to the workspace, or you can use [the Kaggle API](https://www.kaggle.com/general/74235) to directly download the data into colab.**


### Download train/test metadata

In [3]:
# Main link
# !wget -O libriphone.zip "https://github.com/xraychen/shiny-robot/releases/download/v1.0/libriphone.zip"

# # Backup Link 0
# # !pip install --upgrade gdown
# # !gdown --id '1o6Ag-G3qItSmYhTheX6DYiuyNzWyHyTc' --output libriphone.zip

# # Backup link 1
# # !pip install --upgrade gdown
# # !gdown --id '1R1uQYi4QpX0tBfUWt2mbZcncdBsJkxeW' --output libriphone.zip

# # Backup link 2
# # !wget -O libriphone.zip "https://www.dropbox.com/s/wqww8c5dbrl2ka9/libriphone.zip?dl=1"

# # Backup link 3
# # !wget -O libriphone.zip "https://www.dropbox.com/s/p2ljbtb2bam13in/libriphone.zip?dl=1"

# !unzip -q libriphone.zip
# !ls libriphone

### Preparing Data

**Helper functions to pre-process the training data from raw MFCC features of each utterance.**

A phoneme may span several frames and is dependent to past and future frames. \
Hence we concatenate neighboring phonemes for training to achieve higher accuracy. The **concat_feat** function concatenates past and future k frames (total 2k+1 = n frames), and we predict the center frame.

Feel free to modify the data preprocess functions, but **do not drop any frame** (if you modify the functions, remember to check that the number of frames are the same as mentioned in the slides)

In [4]:
import os
import random
import torch
from tqdm import tqdm


def load_feat(path):
    feat = torch.load(path)
    return feat


def shift(x, n):
    if n < 0:
        left = x[0].repeat(-n, 1)
        right = x[:n]

    elif n > 0:
        right = x[-1].repeat(n, 1)
        left = x[n:]
    else:
        return x

    return torch.cat((left, right), dim=0)


def concat_feat(x, concat_n: int):
    assert concat_n % 2 == 1  # n must be odd
    if concat_n < 2:
        return x
    seq_len, feature_dim = x.size(0), x.size(1)
    x = x.repeat(1, concat_n)
    x = x.view(seq_len, concat_n,
               feature_dim).permute(1, 0, 2)  # concat_n, seq_len, feature_dim
    mid = (concat_n // 2)
    for r_idx in range(1, mid + 1):
        x[mid + r_idx, :] = shift(x[mid + r_idx], r_idx)
        x[mid - r_idx, :] = shift(x[mid - r_idx], -r_idx)

    return x.permute(1, 0, 2).view(seq_len, concat_n * feature_dim)


def preprocess_data(split,
                    feat_dir,
                    phone_path,
                    train_ratio=0.8,
                    train_val_seed=1337):
    class_num = 41  # NOTE: pre-computed, should not need change
    mode = 'train' if (split == 'train' or split == 'val') else 'test'

    label_dict = {}
    if mode != 'test':
        phone_file = open(os.path.join(phone_path,
                                       f'{mode}_labels.txt')).readlines()

        for line in phone_file:
            line = line.strip('\n').split(' ')
            label_dict[line[0]] = [int(p) for p in line[1:]]

    if split == 'train' or split == 'val':
        # split training and validation data
        usage_list = open(os.path.join(phone_path,
                                       'train_split.txt')).readlines()
        random.seed(train_val_seed)
        random.shuffle(usage_list)
        percent = int(len(usage_list) * train_ratio)
        usage_list = usage_list[:percent] if split == 'train' else usage_list[
            percent:]
    elif split == 'test':
        usage_list = open(os.path.join(phone_path,
                                       'test_split.txt')).readlines()
    else:
        raise ValueError(
            'Invalid \'split\' argument for dataset: PhoneDataset!')

    usage_list = [line.strip('\n') for line in usage_list]
    print('[Dataset] - # phone classes: ' + str(class_num) +
          ', number of utterances for ' + split + ': ' + str(len(usage_list)))

    X = []
    if mode != 'test':
        y = []

    idx = 0
    for i, fname in tqdm(enumerate(usage_list)):
        feat = load_feat(os.path.join(feat_dir, mode, f'{fname}.pt'))
        cur_len = len(feat)
        if mode != 'test':
            label = torch.LongTensor(label_dict[fname])
            y.append(label)

        X.append(feat)

        idx += cur_len

    print(f'[INFO] {split} set')
    print(len(X), X[0].shape)
    if mode != 'test':
        print(len(y), y[0].shape)
        return X, y
    else:
        return X


# preprocess data
train_X, train_y = preprocess_data(split='train',
                                   feat_dir='./libriphone/feat',
                                   phone_path='./libriphone',
                                   train_ratio=train_ratio)
val_X, val_y = preprocess_data(split='val',
                               feat_dir='./libriphone/feat',
                               phone_path='./libriphone',
                               train_ratio=train_ratio)


[Dataset] - # phone classes: 41, number of utterances for train: 3428


3428it [00:01, 3340.44it/s]


[INFO] train set
3428 torch.Size([936, 39])
3428 torch.Size([936])
[Dataset] - # phone classes: 41, number of utterances for val: 858


858it [00:00, 3243.07it/s]

[INFO] val set
858 torch.Size([831, 39])
858 torch.Size([831])





## Define Dataset

In [5]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


class LibriDataset(Dataset):
    def __init__(self, X, y=None, num_swaps=0):
        self.data = X
        self.num_swaps = num_swaps
        if y is not None:
            self.label = y
        else:
            self.label = None
        
    def rand_swap(self, x):
        a, b = torch.randint(0, x.shape[-1] - 1, size=(2, ))
        x[a], x[b] = x[b], x[a]
        return x
    
    def __getitem__(self, idx):
        x = torch.tensor(self.data[idx])
        
        if self.label is not None:       
            for _ in range(self.num_swaps):
                x = self.rand_swap(x)
            return x, self.label[idx]
        else:
            return x, None

    def __len__(self):
        return len(self.data)

## Define Model

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    def __init__(self, input_dim, output_dim, dropout):
        super(BasicBlock, self).__init__()

        self.block = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(input_dim, output_dim),
            nn.LeakyReLU(),
        )

    def forward(self, x):
        x = self.block(x)
        return x


class AdditiveAttention(torch.nn.Module):
    def __init__(self, in_dim=100, v_size=200):
        super().__init__()

        self.in_dim = in_dim
        self.v_size = v_size
        self.proj = nn.Sequential(nn.Linear(self.in_dim, self.v_size),
                                  nn.Tanh(), nn.Linear(self.v_size, 1))

    def forward(self, context):
        """Additive Attention
        Args:
            context (tensor): [B, seq_len, in_dim]
        Returns:
            outputs, weights: [B, seq_len, out_dim], [B, seq_len]
        """
        # weights = self.proj(context) @ self.v
        weights = self.proj(context).squeeze(-1)
        weights = torch.softmax(weights, dim=-1)  # [B, seq_len]
        return torch.bmm(weights.unsqueeze(1), context).squeeze(
            1)  # [B, 1, seq_len], [B, seq_len, dim]


class Reconstruction(nn.Module):
    def __init__(self, hidden_dim, inp_dim, dropout) -> None:
        super().__init__()
        self.down = nn.Sequential(nn.Linear(hidden_dim, hidden_dim // 2),
                                  nn.Dropout(dropout), nn.LeakyReLU(),
                                  nn.Linear(hidden_dim // 2, inp_dim))

    def forward(self, hidden: torch.Tensor, x_original: torch.Tensor):
        hidden = self.down(hidden)
        return F.mse_loss(hidden, x_original)


class LstmClassifier(nn.Module):
    def __init__(self,
                 input_dim,
                 output_dim=41,
                 hidden_layers=1,
                 hidden_dim=256,
                 bidirectional=True,
                 dropout=.3,
                 reconstruct_alpha=0):
        super().__init__()
        self.output_dim = output_dim
        self.reconstruct_alpha = reconstruct_alpha

        self.lstm = nn.LSTM(input_dim,
                            hidden_dim,
                            hidden_layers,
                            batch_first=True,
                            bidirectional=bidirectional,
                            dropout=dropout)
        lstm_size = hidden_dim * 2 if bidirectional else hidden_dim
        # self.conv1d = nn.LazyConv1d(lstm_size // 2, 5, padding=2)
        # self.mha = nn.MultiheadAttention(input_dim, 13, dropout=dropout, batch_first=True, )
        self.fc = nn.Sequential(BasicBlock(lstm_size, lstm_size // 4, dropout),
                                nn.Linear(lstm_size // 4, output_dim))
        if reconstruct_alpha > 0:
            self.recons = Reconstruction(hidden_dim, input_dim, dropout)
        self.loss = nn.CrossEntropyLoss()

    def forward(self, x, y=None):
        x = nn.utils.rnn.pad_sequence(x, batch_first=True)
        hidden, _ = self.lstm(x)  # [B, maxlen, his_size]
        # hidden, _ = self.mha(hidden, hidden, hidden)
        # hidden_2, _ = self.mha(x, x, x)
        # hidden = torch.cat((hidden, hidden_2), dim=-1)
        # hidden = self.conv1d(hidden.transpose(1, 2)).transpose(1, 2)
        logits = self.fc(hidden)

        if y is None:
            return logits
        else:
            y = nn.utils.rnn.pad_sequence(y,
                                          batch_first=True,
                                          padding_value=-1).reshape(-1)

            mask = y != -1
            logits = logits.reshape(-1, self.output_dim)[mask]
            loss = self.loss(logits, y[mask])

            if self.reconstruct_alpha != 0:
                loss += self.reconstruct_alpha * self.recons(hidden, x)
            return logits, loss


## Prepare dataset and model

In [7]:
import gc


def my_collate(batch):
    data = [item[0] for item in batch]
    target = [item[1] for item in batch]

    if target[0] == None:
        return data
    return [data, target]


# get dataset
train_set = LibriDataset(train_X, train_y, num_swaps=random_swap)
val_set = LibriDataset(val_X, val_y)

# remove raw feature to save memory
del train_X, train_y, val_X, val_y
gc.collect()

# get dataloader
train_loader = DataLoader(train_set,
                          batch_size=batch_size,
                          shuffle=True,
                          collate_fn=my_collate)
val_loader = DataLoader(val_set,
                        batch_size=batch_size,
                        shuffle=False,
                        collate_fn=my_collate)


In [8]:
model = LstmClassifier(input_dim,
                       hidden_layers=hidden_layers,
                       hidden_dim=hidden_dim,
                       bidirectional=bidirectional,
                       dropout=dropout)
for dl in train_loader:
    print(model(dl[0], dl[1]))
    break
del model

  x = torch.tensor(self.data[idx])


(tensor([[0.0272, 0.0172, 0.0517,  ..., 0.0401, 0.0523, 0.0224],
        [0.0189, 0.0165, 0.0488,  ..., 0.0384, 0.0572, 0.0210],
        [0.0218, 0.0138, 0.0558,  ..., 0.0412, 0.0526, 0.0186],
        ...,
        [0.0158, 0.0156, 0.0478,  ..., 0.0411, 0.0563, 0.0181],
        [0.0162, 0.0147, 0.0530,  ..., 0.0383, 0.0546, 0.0191],
        [0.0162, 0.0191, 0.0485,  ..., 0.0429, 0.0528, 0.0189]],
       grad_fn=<IndexBackward0>), tensor(3.7061, grad_fn=<NllLossBackward0>))


In [9]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(f'DEVICE: {device}')

DEVICE: cuda:0


In [10]:
# fix random seed

# create model, define a loss function, and optimizer
# model = Classifier(input_dim=input_dim, hidden_layers=hidden_layers, hidden_dim=hidden_dim).to(device)
# model = LstmClassifier(input_dim,
#                        hidden_layers=hidden_layers,
#                        hidden_dim=hidden_dim,
#                        bidirectional=bidirectional,
#                        dropout=dropout,
#                        pooling_method=pooling_method).to(device)
# criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

## Training

In [11]:
import pytorch_lightning as pl


class Model(pl.LightningModule):
    best_acc = 0.

    def __init__(self, lr, weight_decay, input_dim, hidden_layers, hidden_dim,
                 bidirectional, dropout, reconstruct_alpha):
        super().__init__()
        self.save_hyperparameters(
            dict(lr=lr,
                 weight_decay=weight_decay,
                 input_dim=input_dim,
                 hidden_layers=hidden_layers,
                 hidden_dim=hidden_dim,
                 bidirectional=bidirectional,
                 dropout=dropout,
                 reconstruct_alpha=reconstruct_alpha))
        self.lr = lr
        self.weight_decay = weight_decay
        self.model = LstmClassifier(input_dim,
                                    hidden_layers=hidden_layers,
                                    hidden_dim=hidden_dim,
                                    bidirectional=bidirectional,
                                    dropout=dropout)

    @torch.no_grad()
    def forward(self, x):
        self.model.eval()
        return self.model(x)

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.model.parameters(),
                                      lr=self.lr,
                                      weight_decay=self.weight_decay)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                               20,
                                                               eta_min=1e-5)
        return [optimizer], [scheduler]

    def training_step(self, batch, _):
        pred, loss = self.model(batch[0], batch[1])
        return {'loss': loss, 'preds': pred.detach(), 'ground': batch[1]}

    def training_epoch_end(self, outputs):
        loss = torch.tensor([o['loss'] for o in outputs]).reshape(-1)
        self.log('train_mean_loss', loss.mean().item(), prog_bar=True)

    @torch.no_grad()
    def validation_step(self, batch, _):
        pred, loss = self.model(batch[0], batch[1])
        ground = torch.cat([b for b in batch[1]], dim=0)
        return {'preds': pred, 'ground': ground}

    def validation_epoch_end(self, outputs):
        preds = torch.cat([o['preds'] for o in outputs], dim=0)
        preds = preds.argmax(-1)
        ground = torch.cat([o['ground'] for o in outputs], dim=0).reshape(-1)
        acc = (preds == ground).sum() / len(preds)
        self.best_acc = max(self.best_acc, acc.item())
        self.log('val_acc', acc, prog_bar=True)


## Hyperparmeters Tuning

In [12]:
import optuna
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
import warnings
from pytorch_lightning.loggers import NeptuneLogger


warnings.filterwarnings('ignore')

def objective(trial: optuna.trial.Trial) -> float:
    hidden_layers = trial.suggest_categorical('hidden_layers', [2, 3, 4])
    hidden_dim = trial.suggest_categorical('hidden_dim', [128, 256, 384])
    reconstruct_alpha = trial.suggest_categorical('reconstruct_alpha',
                                                  [0, 0.2, 0.4])
    dropout = trial.suggest_categorical('dropout', [0.2, 0.4])
    weight_decay = trial.suggest_categorical('weight_deacay',
                                             [1e-3, 1e-4, 5e-4, 2e-3])
    learning_rate = trial.suggest_categorical('lr',
                                              [1e-4, 2e-4, 5e-4, 1e-3, 2e-3])
    model = Model(learning_rate,
                  weight_decay,
                  input_dim,
                  hidden_layers=hidden_layers,
                  hidden_dim=hidden_dim,
                  bidirectional=bidirectional,
                  dropout=dropout,
                  reconstruct_alpha=reconstruct_alpha)
    neptune_logger = NeptuneLogger(
        project="aqweteddy/NTUML-HW2",  # format "<WORKSPACE/PROJECT>"
        tags=[f"hidden_layers_{hidden_layers}", f"hidden_dim_{hidden_dim}", 
              f'reconstruct_alpha_{reconstruct_alpha}', f'learning_rate_{learning_rate}'],  # optional
        log_model_checkpoints=False
    )
    trainer = pl.Trainer(gpus=1,
                         enable_progress_bar=True,
                         max_epochs=num_epoch,
                         reload_dataloaders_every_n_epochs=1,
                         checkpoint_callback=False,
                         weights_summary=None,
                        logger=neptune_logger,
                         callbacks=[
                             EarlyStopping(monitor="val_acc",
                                           patience=early_stop_patient,
                                           mode='max')
                         ])
    trainer.fit(
        model,
        train_dataloaders=train_loader,
        val_dataloaders=val_loader,
    )
    neptune_logger.experiment.stop()
    
    return model.best_acc


# study = optuna.create_study('sqlite:///tune.db',
#                             direction="maximize",
#                             load_if_exists=True,
#                             study_name='lstm_seq')
# study.optimize(objective, n_trials=20)
# print(f'best_trial: {study.best_trial.value}')
# print(f'best_params:')
# for key, value in study.best_trial.params.items():
#     print("    {}: {}".format(key, value))
# params = study.best_trial.params

## training

In [13]:
params =  {'hidden_layers': 5, 'hidden_dim': 256, 'reconstruct_alpha': 0, 'dropout': 0.4, 'weight_decay': 0.0001, 'lr': 0.002}
model = Model(params['lr'],
              params['weight_decay'],
              input_dim,
              hidden_layers=params['hidden_layers'],
              hidden_dim=params['hidden_dim'],
              bidirectional=bidirectional,
              dropout=params['dropout'],
              reconstruct_alpha=params['reconstruct_alpha'])
# model = Model(learning_rate,
#               weight_decay,
#               input_dim,
#               hidden_layers=hidden_layers,
#               hidden_dim=hidden_dim,
#               bidirectional=bidirectional,
#               dropout=dropout,
#               reconstruct_alpha=reconstruct_alpha)
checkpoint_callback = ModelCheckpoint(monitor="val_acc",
                                      save_top_k=2,
                                      mode='max')
trainer = pl.Trainer(gpus=1,
                     enable_progress_bar=True,
                     max_epochs=num_epoch,
                     reload_dataloaders_every_n_epochs=1,
                     callbacks=[
                         EarlyStopping(monitor="val_acc",
                                       patience=early_stop_patient,
                                       mode='max'), checkpoint_callback
                     ])
trainer.fit(
    model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)



GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type           | Params
-----------------------------------------
0 | model | LstmClassifier | 7.0 M 
-----------------------------------------
7.0 M     Trainable params
0         Non-trainable params
7.0 M     Total params
27.948    Total estimated model params size (MB)


Epoch 26: 100%|██████████| 537/537 [01:57<00:00,  4.57it/s, loss=0.265, v_num=11, val_acc=0.836, train_mean_loss=0.249]


## Testing
Create a testing dataset, and load model from the saved checkpoint.

In [14]:
# load data
test_X = preprocess_data(split='test',
                         feat_dir='./libriphone/feat',
                         phone_path='./libriphone')
test_set = LibriDataset(test_X, None)
test_loader = DataLoader(test_set,
                         batch_size=batch_size,
                         shuffle=False,
                         collate_fn=my_collate)


[Dataset] - # phone classes: 41, number of utterances for test: 1078


1078it [00:00, 3703.54it/s]

[INFO] test set
1078 torch.Size([818, 39])





In [15]:
# load model
best_path = checkpoint_callback.best_model_path
model.load_from_checkpoint(best_path)
model.to(device)

Model(
  (model): LstmClassifier(
    (lstm): LSTM(39, 256, num_layers=5, batch_first=True, dropout=0.4, bidirectional=True)
    (fc): Sequential(
      (0): BasicBlock(
        (block): Sequential(
          (0): Dropout(p=0.4, inplace=False)
          (1): Linear(in_features=512, out_features=128, bias=True)
          (2): LeakyReLU(negative_slope=0.01)
        )
      )
      (1): Linear(in_features=128, out_features=41, bias=True)
    )
    (loss): CrossEntropyLoss()
  )
)

Make prediction.

In [16]:
test_acc = 0.0
test_lengths = 0
pred = []
model.eval()
with torch.no_grad():
    for i, batch in enumerate(tqdm(test_loader)):
        features = batch
        features = [f.to(device) for f in features]

        outputs = model(features)

        _, test_pred = torch.max(
            outputs,
            -1)  # get the index of the class with the highest probability
        for p, f in zip(test_pred, features):
            p = p[:len(f)]
            pred.extend(p.detach().cpu().tolist())

100%|██████████| 135/135 [00:13<00:00, 10.18it/s]


Write prediction to a CSV file.

After finish running this block, download the file `prediction.csv` from the files section on the left-hand side and submit it to Kaggle.

In [17]:
with open('prediction.csv', 'w') as f:
    f.write('Id,Class\n')
    for i, y in enumerate(pred):
        f.write('{},{}\n'.format(i, y))