In [None]:
! pip install optuna

In [37]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning import seed_everything, loggers as pl_loggers
from pytorch_lightning.callbacks import ModelCheckpoint
from transformers import DistilBertModel, DistilBertTokenizerFast

import pandas as pd
import random

from scripts.dataset import *
from scripts.utils import *

import optuna
from optuna.integration import PyTorchLightningPruningCallback


In [38]:
# Read the slot-label list (a single header-less column named SLOTS) and
# build the index -> label lookup that is handed to slot_F1 during evaluation.
final_slots = pd.read_csv(
    "./data/multiATIS/slots_list.csv", sep=",", header=None, names=["SLOTS"]
)["SLOTS"].tolist()

# Position in the file is the slot id.
idx2slots = dict(enumerate(final_slots))


In [39]:
# Experiment configuration, grouped into four sections.
config = dict(
    # model parameters
    mc=dict(
        model_name="distilbert-base-multilingual-cased",
        tokenizer_name="distilbert-base-multilingual-cased",
        joint_loss_coef=0.5,
    ),
    # training parameters
    tc=dict(
        lr=5e-5,
        epoch=13,
        batch_size=64,
        weight_decay=3e-3,
        shuffle_data=True,
        num_worker=8,
    ),
    # data parameters
    dc=dict(
        train_dir="./data/multiATIS/split/train/clean/train.tsv",
        val_dir="./data/multiATIS/split/valid/clean/val.tsv",
        max_len=56,
    ),
    # miscellaneous runtime settings
    misc=dict(
        fix_seed=False,
        gpus=-1,
        log_dir="./",
        precision=16,
    ),
)

In [40]:
class IC_NER(nn.Module):
    """Joint intent-classification and slot-tagging heads on a DistilBERT encoder.

    The intent branch takes the encoder vector at sequence position 0 and
    passes it through two (GELU -> dropout -> linear) stages. The slot branch
    applies (ReLU -> dropout -> linear) to every position. ``forward`` returns
    both cross-entropy losses, their weighted sum and the arg-max predictions.

    Args:
        idropout_1: dropout probability before the first intent layer.
        idropout_2: dropout probability before the second intent layer.
        sdropout: dropout probability before the slot layer.
        ihidden_size: output width of the first intent layer.
        num_intents: number of intent classes (default 18, the value that
            used to be hard-coded).
        num_slots: number of slot labels (default 159, the value that used
            to be hard-coded).
        joint_loss_coef: weight of the intent loss in the joint loss; the
            slot loss is weighted by ``1 - joint_loss_coef`` (default 0.5).
        model_name: checkpoint name handed to
            ``DistilBertModel.from_pretrained``.
    """

    def __init__(
        self,
        idropout_1,
        idropout_2,
        sdropout,
        ihidden_size,
        num_intents=18,
        num_slots=159,
        joint_loss_coef=0.5,
        model_name="distilbert-base-multilingual-cased",
    ):
        super().__init__()

        self.encoder = DistilBertModel.from_pretrained(
            model_name,
            return_dict=True,
            output_hidden_states=True,
        )
        # Read the feature width from the loaded checkpoint rather than
        # assuming 768, so the heads always match the encoder.
        encoder_dim = self.encoder.config.dim

        # intent layers
        self.intent_dropout_1 = nn.Dropout(idropout_1)
        self.intent_dropout_2 = nn.Dropout(idropout_2)
        self.intent_FC1 = nn.Linear(encoder_dim, ihidden_size)
        self.intent_FC2 = nn.Linear(ihidden_size, num_intents)

        # slots layer
        self.slots_dropout = nn.Dropout(sdropout)
        self.slots_FC = nn.Linear(encoder_dim, num_slots)

        self.intent_loss_fn = nn.CrossEntropyLoss()
        self.slot_loss_fn = nn.CrossEntropyLoss()

        self.jlc = joint_loss_coef

    def forward(self, input_ids, attention_mask, intent_target, slots_target):
        """Run both heads and compute the losses against the given targets.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: attention mask passed to the encoder.
            intent_target: class index per example for the intent loss.
            slots_target: class index per token for the slot loss; it is
                flattened before the loss is computed.

        Returns:
            dict with ``joint_loss``, ``ic_loss``, ``ner_loss``,
            ``intent_pred`` and ``slot_pred``.
        """
        encoded_output = self.encoder(
            input_ids=input_ids, attention_mask=attention_mask
        )
        # Element 0 of the encoder output holds the per-token vectors.
        token_states = encoded_output[0]

        # intent data flow (uses the vector at sequence position 0)
        intent_hidden = self.intent_FC1(
            self.intent_dropout_1(F.gelu(token_states[:, 0]))
        )
        intent_logits = self.intent_FC2(self.intent_dropout_2(F.gelu(intent_hidden)))

        intent_loss = self.intent_loss_fn(intent_logits, intent_target)
        # Softmax is monotonic, so arg-max over the raw logits is equivalent.
        intent_pred = intent_logits.argmax(dim=1)

        # slots data flow
        slots_logits = self.slots_FC(self.slots_dropout(F.relu(token_states)))
        slot_pred = slots_logits.argmax(dim=2)

        # Flatten to (tokens, classes); the class count is read from the
        # logits so it always agrees with the slot layer's width.
        slot_loss = self.slot_loss_fn(
            slots_logits.reshape(-1, slots_logits.size(-1)),
            slots_target.reshape(-1),
        )

        joint_loss = self.jlc * intent_loss + (1.0 - self.jlc) * slot_loss

        return {
            "joint_loss": joint_loss,
            "ic_loss": intent_loss,
            "ner_loss": slot_loss,
            "intent_pred": intent_pred,
            "slot_pred": slot_pred,
        }


In [41]:
class jointBert(pl.LightningModule):
    """Lightning module that trains and evaluates the joint ``IC_NER`` model.

    Batches are dicts with the keys ``token_ids``, ``mask``, ``intent_id``
    and ``slots_id``. Every metric is logged once per epoch.

    Args:
        cfg: nested configuration dict; ``cfg["tc"]["weight_decay"]`` is
            read when the optimizer is created.
        idropout_1: forwarded to ``IC_NER``.
        idropout_2: forwarded to ``IC_NER``.
        sdropout: forwarded to ``IC_NER``.
        ihidden_size: forwarded to ``IC_NER``.
        lr: learning rate used by AdamW.
    """

    def __init__(self, cfg, idropout_1, idropout_2, sdropout, ihidden_size, lr):
        super().__init__()
        self.IC_NER = IC_NER(idropout_1, idropout_2, sdropout, ihidden_size)
        self.cfg = cfg
        self.lr = lr

    def forward(self, input_ids, attention_mask, intent_target, slots_target):
        """Delegate to the wrapped ``IC_NER`` module and return its output dict."""
        return self.IC_NER(input_ids, attention_mask, intent_target, slots_target)

    def _run_batch(self, batch):
        """Unpack a batch dict, run the model, return (output, intent_target, slots_target)."""
        intent_target, slots_target = batch["intent_id"], batch["slots_id"]
        out = self(batch["token_ids"], batch["mask"], intent_target, slots_target)
        return out, intent_target, slots_target

    def _log_epoch(self, name, value):
        """Log ``value`` under ``name`` with epoch-level aggregation only."""
        self.log(name, value, on_step=False, on_epoch=True, logger=True)

    def training_step(self, batch, batch_idx):
        """Log the three training losses and return the joint loss."""
        out, _, _ = self._run_batch(batch)

        self._log_epoch("train_IC_NER_loss", out["joint_loss"])
        self._log_epoch("train_IC_loss", out["ic_loss"])
        self._log_epoch("train_NER_loss", out["ner_loss"])

        return out["joint_loss"]

    def validation_step(self, batch, batch_idx):
        """Log validation losses, intent accuracy and slot F1; return the joint loss."""
        out, intent_target, slots_target = self._run_batch(batch)

        self._log_epoch("val_IC_NER_loss", out["joint_loss"])
        self._log_epoch("val_IC_loss", out["ic_loss"])
        self._log_epoch("val_NER_loss", out["ner_loss"])
        # accuracy / slot_F1 are not defined in this notebook; presumably they
        # come from the star import of scripts.utils -- TODO confirm.
        self._log_epoch("val_intent_acc", accuracy(out["intent_pred"], intent_target))
        # NOTE(review): unlike the other validation metrics this key has no
        # "val_" prefix; it is left unchanged so logged names stay the same.
        self._log_epoch("slot_f1", slot_F1(out["slot_pred"], slots_target, idx2slots))

        return out["joint_loss"]

    def test_step(self, batch, batch_idx):
        """Log test intent accuracy and slot F1; return the joint loss."""
        out, intent_target, slots_target = self._run_batch(batch)

        self._log_epoch("test_intent_acc", accuracy(out["intent_pred"], intent_target))
        self._log_epoch(
            "test_slot_f1", slot_F1(out["slot_pred"], slots_target, idx2slots)
        )

        return out["joint_loss"]

    def configure_optimizers(self):
        """Build AdamW over all parameters with the configured weight decay."""
        return torch.optim.AdamW(
            self.parameters(), lr=self.lr, weight_decay=self.cfg["tc"]["weight_decay"]
        )


In [43]:
def objective(trial: optuna.trial.Trial) -> float:
    """Train one sampled ``jointBert`` configuration and return its validation loss.

    The trial samples the intent hidden width, the three dropout rates and
    the learning rate. The model is fitted for ``config["tc"]["epoch"]``
    epochs while ``PyTorchLightningPruningCallback`` monitors
    ``val_IC_NER_loss``.

    Args:
        trial: Optuna trial used to sample hyper-parameters and report to
            the pruner.

    Returns:
        The ``val_IC_NER_loss`` value found in ``trainer.callback_metrics``
        after fitting.
    """
    # We optimize the intent hidden width, the dropout rates and the learning rate.
    ihidden_size = trial.suggest_int("intent_hidden_size", 32, 512, log=True)
    idropout_1 = trial.suggest_float("idropout1", 0.2, 0.5)
    idropout_2 = trial.suggest_float("idropout2", 0.2, 0.5)
    sdropout = trial.suggest_float("sdropout", 0.2, 0.5)
    # Log-uniform search. The upper bound used to be 1000, many orders of
    # magnitude above the 5e-5 in config["tc"]["lr"]; it is capped at 1e-3 so
    # trials are not spent on such values. suggest_float(log=True) replaces
    # the deprecated suggest_loguniform.
    lr = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)

    model = jointBert(config, idropout_1, idropout_2, sdropout, ihidden_size, lr)

    # NOTE(review): the validation file is passed in two consecutive
    # positions; presumably the second one is the data module's test split --
    # confirm against NLU_Dataset_pl.
    dm = NLU_Dataset_pl(
        config["dc"]["train_dir"],
        config["dc"]["val_dir"],
        config["dc"]["val_dir"],
        config["mc"]["tokenizer_name"],
        config["dc"]["max_len"],
        config["tc"]["batch_size"],
        config["tc"]["num_worker"],
    )

    trainer = pl.Trainer(
        logger=True,
        checkpoint_callback=False,
        # Same value as the former literal 13, now read from the config.
        max_epochs=config["tc"]["epoch"],
        precision=config["misc"]["precision"],
        gpus=config["misc"]["gpus"],
        callbacks=[PyTorchLightningPruningCallback(trial, monitor="val_IC_NER_loss")],
    )
    # Existing keys are kept as they were; the sampled learning rate is added
    # so every sampled value is recorded with the run.
    hyperparameters = dict(
        hidden=ihidden_size,
        idropout1=idropout_1,
        idropout_2=idropout_2,
        sdropout=sdropout,
        learning_rate=lr,
    )
    trainer.logger.log_hyperparams(hyperparameters)
    trainer.fit(model, datamodule=dm)

    return trainer.callback_metrics["val_IC_NER_loss"].item()

In [None]:
# Run the search: a minimising study with a median pruner, limited to
# n_trials=100 and timeout=1000, then report the best trial.
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(pruner=pruner, direction="minimize")
study.optimize(objective, timeout=1000, n_trials=100)

print(f"Number of finished trials: {len(study.trials)}")

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2021-04-24 16:30:54,109] A new study created in memory with name: no-name-03497d8a-5928-4b14-ac4b-a62987c72375
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Using native 16bit precision.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type   | Params
----------------------------------
0 | IC_NER | IC_NER | 134 M 
----------------------------------
134 M     Trainable params
0         Non-trainable params
134 M     Total params
539.583   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]