In [None]:
from google.colab import drive
drive.mount('/content/drive')

import sys
sys.path.append('/content/drive/MyDrive/research/Infinite/')

! pip install optuna
! pip install pytorch-lightning
! pip install transformers
! pip install seqeval

In [37]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import DistilBertModel, DistilBertTokenizerFast

import pandas as pd
import random

from scripts.dataset import *
from scripts.utils import *

import optuna
from optuna.integration import PyTorchLightningPruningCallback

In [1]:
# Slot vocabulary: the CSV is read as a headerless file whose single column
# is named SLOTS, then flattened into a plain Python list.
# This cell needs `pd` and `torch` from the import cell to have been run first.
final_slots = pd.read_csv(
    "/content/drive/MyDrive/research/Infinite/data/ATIS/slots_list.csv",
    sep=",",
    header=None,
    names=["SLOTS"],
)["SLOTS"].tolist()

# Integer position in the list -> slot label.
idx2slots = dict(enumerate(final_slots))

# Run on the GPU when CUDA is available, otherwise on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

NameError: name 'pd' is not defined

In [39]:
# Experiment configuration, grouped by concern:
#   mc   -> model settings
#   tc   -> training settings
#   dc   -> data locations and tokenisation length
#   misc -> runtime switches
config = dict(
    mc=dict(
        model_name="distilbert-base-multilingual-cased",
        tokenizer_name="distilbert-base-multilingual-cased",
        joint_loss_coef=0.5,
    ),
    tc=dict(
        lr=3e-5,
        epoch=20,
        batch_size=64,
        weight_decay=0.003,
        shuffle_data=True,
        num_worker=2,
    ),
    dc=dict(
        train_dir="/content/drive/MyDrive/research/Infinite/data/ATIS/experiment/train/clean/train.tsv",
        val_dir="/content/drive/MyDrive/research/Infinite/data/ATIS/experiment/dev/clean/dev.tsv",
        max_len=56,
    ),
    misc=dict(
        fix_seed=False,
        gpus=-1,
        precision=16,
    ),
)

In [40]:
class IC_NER(nn.Module):
    """Joint intent-classification (IC) and slot-tagging (NER) model on DistilBERT.

    A shared DistilBERT encoder feeds two heads:

    * intent head: first-position hidden state -> GELU -> dropout -> linear
      -> GELU -> dropout -> linear over ``num_intents`` classes;
    * slot head: every position's hidden state -> ReLU -> dropout -> linear
      over ``num_slots`` classes.

    Args:
        idropout_1: Dropout probability applied before the first intent layer.
        idropout_2: Dropout probability applied before the second intent layer.
        sdropout: Dropout probability applied before the slot layer.
        ihidden_size: Width of the hidden intent layer.
        model_name: Pretrained DistilBERT checkpoint to load.
        num_intents: Number of intent classes.
        num_slots: Number of slot classes.
        joint_loss_coef: Weight ``w`` of the intent loss in
            ``w * intent_loss + (1 - w) * slot_loss``.

    The keyword defaults reproduce the values that used to be hard-coded, so
    ``IC_NER(idropout_1, idropout_2, sdropout, ihidden_size)`` builds the same
    model as before.
    """

    def __init__(
        self,
        idropout_1,
        idropout_2,
        sdropout,
        ihidden_size,
        model_name="distilbert-base-multilingual-cased",
        num_intents=18,
        num_slots=120,
        joint_loss_coef=0.5,
    ):
        super().__init__()

        self.encoder = DistilBertModel.from_pretrained(
            model_name,
            return_dict=True,
            output_hidden_states=True,
            sinusoidal_pos_embds=True,
        )
        # Take the hidden width from the loaded checkpoint instead of assuming 768.
        encoder_dim = self.encoder.config.dim

        # intent layers
        self.intent_dropout_1 = nn.Dropout(idropout_1)
        self.intent_dropout_2 = nn.Dropout(idropout_2)
        self.intent_FC1 = nn.Linear(encoder_dim, ihidden_size)
        self.intent_FC2 = nn.Linear(ihidden_size, num_intents)

        # slots layer
        self.slots_dropout = nn.Dropout(sdropout)
        self.slots_FC = nn.Linear(encoder_dim, num_slots)

        self.intent_loss_fn = nn.CrossEntropyLoss()
        self.slot_loss_fn = nn.CrossEntropyLoss()

        self.jlc = joint_loss_coef

    def forward(self, input_ids, attention_mask, intent_target=None, slots_target=None):
        """Run both heads and, when targets are supplied, compute the losses.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            intent_target: Optional intent class ids; ``None`` skips the intent loss.
            slots_target: Optional slot class ids; ``None`` skips the slot loss.

        Returns:
            dict with keys ``joint_loss``, ``ic_loss``, ``ner_loss``,
            ``intent_pred`` and ``slot_pred``. A loss whose target was not
            given is ``None``; ``joint_loss`` is ``None`` unless both targets
            were given.
        """
        encoded_output = self.encoder(input_ids, attention_mask)
        # Element 0 of the encoder output is the last layer's hidden states.
        token_states = encoded_output[0]

        # intent data flow: uses the hidden state at the first sequence position
        intent_hidden = token_states[:, 0]
        intent_hidden = self.intent_FC1(self.intent_dropout_1(F.gelu(intent_hidden)))
        intent_logits = self.intent_FC2(self.intent_dropout_2(F.gelu(intent_hidden)))
        # Softmax is monotonic, so the argmax of the raw logits is the same class.
        intent_pred = torch.argmax(intent_logits, dim=1)

        # slots data flow: one prediction per sequence position
        slots_logits = self.slots_FC(self.slots_dropout(F.relu(token_states)))
        slot_pred = torch.argmax(slots_logits, dim=2)

        intent_loss = None
        slot_loss = None
        joint_loss = None

        if intent_target is not None:
            intent_loss = self.intent_loss_fn(intent_logits, intent_target)

        if slots_target is not None:
            # Flatten to (positions, classes) vs (positions,) for CrossEntropyLoss.
            # NOTE(review): every position contributes to this loss unless its
            # target equals CrossEntropyLoss's default ignore_index (-100) --
            # confirm how the dataset labels padding positions.
            slot_loss = self.slot_loss_fn(
                slots_logits.reshape(-1, slots_logits.size(-1)),
                slots_target.reshape(-1),
            )

        if intent_loss is not None and slot_loss is not None:
            joint_loss = self.jlc * intent_loss + (1.0 - self.jlc) * slot_loss

        return {
            "joint_loss": joint_loss,
            "ic_loss": intent_loss,
            "ner_loss": slot_loss,
            "intent_pred": intent_pred,
            "slot_pred": slot_pred,
        }


In [43]:
def objective(trial: optuna.trial.Trial) -> "tuple[float, float]":
    """Optuna objective: train one IC_NER model and score it on the validation set.

    The trial samples the intent head's hidden width and the three dropout
    rates. A fresh model is then trained for ``config["tc"]["epoch"]`` epochs
    with AdamW on the joint loss and evaluated on the validation dataloader.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ``(intent_accuracy, slot_f1)``. Each is the mean of the per-batch
        scores returned by ``accuracy`` and ``slot_F1``, so a smaller final
        batch carries the same weight as a full one.

    Raises:
        ValueError: If the validation dataloader yields no batches.
    """

    def _unpack(batch):
        # Move the four tensors the model consumes onto the target device.
        return (
            batch["token_ids"].to(DEVICE),
            batch["mask"].to(DEVICE),
            batch["intent_id"].to(DEVICE),
            batch["slots_id"].to(DEVICE),
        )

    # Search space: intent hidden-layer width and the dropout rates.
    ihidden_size = trial.suggest_int("intent_hidden_size", 64, 512)
    idropout_1 = trial.suggest_float("idropout1", 0.2, 0.5)
    idropout_2 = trial.suggest_float("idropout2", 0.2, 0.5)
    sdropout = trial.suggest_float("sdropout", 0.2, 0.5)

    model = IC_NER(idropout_1, idropout_2, sdropout, ihidden_size).to(DEVICE)

    # NOTE(review): the validation file is passed twice; the third argument is
    # presumably the test split -- verify against NLU_Dataset_pl.
    dm = NLU_Dataset_pl(
        config["dc"]["train_dir"],
        config["dc"]["val_dir"],
        config["dc"]["val_dir"],
        config["mc"]["tokenizer_name"],
        config["dc"]["max_len"],
        config["tc"]["batch_size"],
        config["tc"]["num_worker"],
    )
    dm.setup()

    trainDL, valDL = dm.train_dataloader(), dm.val_dataloader()

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=config["tc"]["lr"],
        weight_decay=config["tc"]["weight_decay"],
    )

    # training
    model.train()
    for _ in range(config["tc"]["epoch"]):
        for batch in trainDL:
            token_ids, attention_mask, intent_target, slots_target = _unpack(batch)

            optimizer.zero_grad()
            out = model(token_ids, attention_mask, intent_target, slots_target)
            out["joint_loss"].backward()
            optimizer.step()

    # validation
    model.eval()
    acc_sum, f1_sum, n_batches = 0.0, 0.0, 0
    with torch.no_grad():
        for batch in valDL:
            token_ids, attention_mask, intent_target, slots_target = _unpack(batch)

            out = model(token_ids, attention_mask, intent_target, slots_target)

            acc_sum += accuracy(out["intent_pred"], intent_target)
            f1_sum += slot_F1(out["slot_pred"], slots_target, idx2slots)
            n_batches += 1

    if n_batches == 0:
        # Fail with a clear message instead of an opaque ZeroDivisionError.
        raise ValueError("validation dataloader yielded no batches; cannot score this trial")

    # Return plain floats, one per optimisation direction.
    return float(acc_sum / n_batches), float(f1_sum / n_batches)

In [None]:

# Multi-objective search: maximise both values returned by `objective`
# (intent accuracy, slot F1).
# NOTE(review): `optuna.multi_objective` is deprecated since Optuna 2.4.0 and
# scheduled for removal in 4.0.0. On newer releases, migrate to
# `optuna.create_study(directions=...)` and `study.best_trials`.
sampler = optuna.multi_objective.samplers.MOTPEMultiObjectiveSampler(
    n_startup_trials=21,
    n_ehvi_candidates=24,
)
# The sampler has to be handed to create_study explicitly; otherwise the study
# silently runs with its default sampler and the MOTPE settings above are unused.
study = optuna.multi_objective.create_study(
    directions=["maximize", "maximize"],
    sampler=sampler,
)

study.optimize(objective, n_trials=50, timeout=100000)

print("Number of finished trials: {}".format(len(study.trials)))

# A multi-objective study has no single best trial (and its trials expose
# `values`, not `value`), so report every trial on the Pareto front.
pareto_trials = study.get_pareto_front_trials()
print("Pareto-optimal trials: {}".format(len(pareto_trials)))

for trial in pareto_trials:
    print("Trial {}:".format(trial.number))
    print("  Values: {}".format(trial.values))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[32m[I 2021-04-24 16:30:54,109][0m A new study created in memory with name: no-name-03497d8a-5928-4b14-ac4b-a62987c72375[0m
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Using native 16bit precision.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type   | Params
----------------------------------
0 | IC_NER | IC_NER | 134 M 
----------------------------------
134 M     Trainable params
0         Non-trainable params
134 M     Total params
539.583   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]