In [1]:
import dataclasses
from pathlib import Path

import nlp
import torch
import numpy as np
from transformers import BertTokenizerFast
from transformers import BertForSequenceClassification
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.model_selection import train_test_split

try:
    from apex import amp
    APEX_AVAILABLE = True
except ModuleNotFoundError:
    APEX_AVAILABLE = False

from pytorch_helper_bot import (
    BaseBot, MovingAverageStatsTrackerCallback,  CheckpointCallback,
    LearningRateSchedulerCallback, MultiStageScheduler, Top1Accuracy,
    LinearLR
)

In [2]:
# Root working directory of this notebook; the bot logs, the training
# checkpoints and the exported model are all written underneath it.
CACHE_DIR = Path("cache/")
CACHE_DIR.mkdir(parents=False, exist_ok=True)

Reference:

    * https://github.com/huggingface/nlp/blob/master/notebooks/Overview.ipynb

In [3]:
# Load the "sst2" configuration of the GLUE dataset through the `nlp` library.
# The cells below index the result by split name ("train", "validation").
dataset = nlp.load_dataset(
    'glue',
    "sst2",
)

In [4]:
# Distinct label values that occur in the training split.
{example['label'] for example in dataset["train"]}

{0, 1}

In [5]:
# Tokenizer for the same 'bert-base-uncased' checkpoint that is fine-tuned below.
tokenizer = BertTokenizerFast.from_pretrained(
    'bert-base-uncased'
)

In [6]:
def convert_to_features(example_batch, max_length=64):
    """Tokenize a batch of SST-2 sentences into fixed-length model inputs.

    Intended as the callback for ``Dataset.map(..., batched=True)``.

    Args:
        example_batch: Batch of examples; only its ``'sentence'`` entry
            (the raw sentences) is read. SST-2 is a single-sentence task,
            so no sentence pairs are involved.
        max_length: Forwarded to the tokenizer as ``max_length``. Together
            with ``pad_to_max_length=True`` every sequence is padded to
            this length. Defaults to 64, the value originally hard-coded.

    Returns:
        The encodings produced by ``tokenizer.batch_encode_plus``. The
        following cells read ``input_ids``, ``token_type_ids`` and
        ``attention_mask`` from them.
    """
    # NOTE(review): newer transformers releases deprecate `pad_to_max_length`
    # in favour of `padding="max_length"` and need an explicit
    # `truncation=True` for over-long inputs - confirm against the installed
    # version before upgrading.
    return tokenizer.batch_encode_plus(
        example_batch['sentence'],
        pad_to_max_length=True,
        max_length=max_length,
    )

In [7]:
# Tokenize the train and validation splits, then make them return torch
# tensors for the model-input columns and the label.
columns = ['input_ids', 'token_type_ids', 'attention_mask', "label"]
for split_name in ("train", "validation"):
    tokenized_split = dataset[split_name].map(convert_to_features, batched=True)
    tokenized_split.set_format(type='torch', columns=columns)
    dataset[split_name] = tokenized_split

In [8]:
# Sanity check: decode one tokenized training example back to text so the
# special tokens and the padding are visible.
sample_input_ids = dataset['train'][6]["input_ids"]
tokenizer.decode(sample_input_ids.numpy())

'[CLS] demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small, personal film with an emotional wallop. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [9]:
# Inspect the attention mask of the first training example.
first_train_example = dataset['train'][0]
first_train_example["attention_mask"]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [10]:
class SST2Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a dict of index-aligned columns.

    ``entries_dict`` must provide the keys ``"input_ids"``,
    ``"attention_mask"``, ``"token_type_ids"`` and ``"label"``; each value
    must support ``len()`` (for ``"label"``) and integer indexing.
    """

    def __init__(self, entries_dict):
        super().__init__()
        # Kept by reference; nothing is copied.
        self.entries_dict = entries_dict

    def __len__(self):
        """Number of examples, taken from the length of the label column."""
        labels = self.entries_dict["label"]
        return len(labels)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, token_type_ids, label)`` at ``idx``."""
        store = self.entries_dict
        return tuple(
            store[key][idx]
            for key in ("input_ids", "attention_mask", "token_type_ids", "label")
        )

In [11]:
# Split the row indices of the GLUE validation split in half (fixed seed);
# one half is used for validation, the other as a held-out test set.
validation_row_indices = list(range(len(dataset["validation"])))
valid_idx, test_idx = train_test_split(
    validation_row_indices,
    test_size=0.5,
    random_state=42,
)

In [12]:
# Columns that SST2Dataset reads from every split.
_ENTRY_KEYS = ("input_ids", "attention_mask", "token_type_ids", "label")


def _select_entries(split, indices=None):
    """Collect the model-input columns of ``split`` into a plain dict.

    Replaces three copy-pasted dict literals with one code path.

    Args:
        split: A dataset split that returns a whole column when indexed by
            column name (e.g. ``dataset['train']``).
        indices: Optional row indices. When given, every column is indexed
            with them (``column[indices]``); when ``None`` the full columns
            are kept.

    Returns:
        Dict mapping each key in ``_ENTRY_KEYS`` to its (optionally
        row-selected) column.
    """
    full_columns = {key: split[key] for key in _ENTRY_KEYS}
    if indices is None:
        return full_columns
    return {key: column[indices] for key, column in full_columns.items()}


# Training uses the whole train split; validation and test are the two
# halves of the GLUE validation split selected by valid_idx / test_idx.
train_dict = _select_entries(dataset['train'])
valid_dict = _select_entries(dataset['validation'], valid_idx)
test_dict = _select_entries(dataset['validation'], test_idx)

In [13]:
# Wrap every split in an SST2Dataset, then build one DataLoader per split.
# Only the training loader shuffles; all loaders use batches of 32.
train_dataset = SST2Dataset(train_dict)
valid_dataset = SST2Dataset(valid_dict)
test_dataset = SST2Dataset(test_dict)

train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=32, shuffle=True
)
valid_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=32, drop_last=False
)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=32, drop_last=False
)

In [14]:
@dataclasses.dataclass
class SST2Bot(BaseBot):
    """Training bot for the SST-2 fine-tuning run.

    Customises ``BaseBot`` in three ways: the log directory, the loss
    format string, and how predictions are extracted from the model output.
    """

    # The annotation is required: dataclasses only treat *annotated* class
    # attributes as fields. Without it this is a plain class attribute, and
    # if BaseBot declares `log_dir` as a dataclass field, the generated
    # __init__ assigns the inherited default to the instance and silently
    # ignores this override.
    # NOTE(review): assumes BaseBot is a dataclass with a `log_dir` field -
    # confirm against pytorch_helper_bot.
    log_dir: Path = CACHE_DIR / "logs"

    def __post_init__(self):
        """Run the base-class initialisation, then set the loss format."""
        super().__post_init__()
        # Assigned after the base __post_init__, as in the original code.
        self.loss_format = "%.6f"

    @staticmethod
    def extract_prediction(output):
        """Return the first element of the model output.

        Presumably this is the logits tensor of the tuple returned by
        ``BertForSequenceClassification`` - confirm against the installed
        transformers version.
        """
        return output[0]

In [15]:
# Load the pretrained checkpoint with a sequence-classification head (no
# `num_labels` is passed, so the library default applies), then move the
# model to the GPU.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model = model.cuda()

In [16]:
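# Disabled experiment: re-initialise the classifier head and the BERT pooler
# (Kaiming-normal weights, zero biases) before fine-tuning. Uncomment to try it.
# torch.nn.init.kaiming_normal_(model.classifier.weight)
# torch.nn.init.constant_(model.classifier.bias, 0)
# torch.nn.init.kaiming_normal_(model.bert.pooler.dense.weight)
# torch.nn.init.constant_(model.bert.pooler.dense.bias, 0);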

In [17]:
# Plain Adam over all model parameters. The learning-rate schedule built
# further below is attached to this optimizer.
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=2e-5,
)

In [18]:
# Only when apex could be imported: wrap model and optimizer for mixed
# precision at optimisation level "O1".
if APEX_AVAILABLE:
    amp_wrapped = amp.initialize(model, optimizer, opt_level="O1")
    model, optimizer = amp_wrapped

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


In [19]:
# Train for three passes over the training loader.
steps_per_epoch = len(train_loader)
total_steps = steps_per_epoch * 3

# Two-stage learning-rate schedule: the first 20% of the steps run under
# LinearLR, the remaining 80% under cosine annealing.
warmup_steps = int(total_steps*0.2)
annealing_steps = int(np.ceil(total_steps*0.8))
lr_durations = [warmup_steps, annealing_steps]
# Step at which each stage starts: 0 for the first, the end of the first
# stage for the second.
break_points = [0] + list(np.cumsum(lr_durations))[:-1]

# Keeps a single checkpoint, ranked by the "accuracy" metric.
checkpoints = CheckpointCallback(
    keep_n_checkpoints=1,
    checkpoint_dir=CACHE_DIR / "model_cache/",
    monitor_metric="accuracy"
)
stats_tracker = MovingAverageStatsTrackerCallback(
    avg_window=steps_per_epoch // 8,
    log_interval=steps_per_epoch // 10
)
# NOTE(review): 0.01 is presumably the starting fraction of the base
# learning rate for the linear stage - confirm against pytorch_helper_bot.
lr_stages = [
    LinearLR(optimizer, 0.01, warmup_steps),
    CosineAnnealingLR(optimizer, annealing_steps)
]
lr_schedule = LearningRateSchedulerCallback(
    MultiStageScheduler(lr_stages, start_at_epochs=break_points)
)
callbacks = [stats_tracker, lr_schedule, checkpoints]

bot = SST2Bot(
    model=model,
    train_loader=train_loader,
    valid_loader=valid_loader,
    criterion=torch.nn.CrossEntropyLoss(),
    optimizer=optimizer,
    clip_grad=10.,
    callbacks=callbacks,
    metrics=(Top1Accuracy(),),
    use_amp=APEX_AVAILABLE,
    echo=True,
    pbar=False,
    use_tensorboard=False
)

[INFO][06/15/2020 14:19:31] SEED: 9293
[INFO][06/15/2020 14:19:31] # of parameters: 109,483,778
[INFO][06/15/2020 14:19:31] # of trainable parameters: 109,483,778


In [20]:
# Run the full schedule, evaluating/checkpointing twice per epoch.
print(total_steps)
half_epoch_steps = len(train_loader) // 2
bot.train(total_steps=total_steps, checkpoint_interval=half_epoch_steps)

# Reload the top-ranked checkpoint into the bot, then delete all checkpoint
# files. Element [1] of a `best_performers` entry is what `load_model`
# expects - presumably the checkpoint path; confirm against pytorch_helper_bot.
top_entry = checkpoints.best_performers[0]
bot.load_model(top_entry[1])
checkpoints.remove_checkpoints(keep=0)

[INFO][06/15/2020 14:19:31] Optimizer Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    initial_lr: 2e-05
    lr: 2e-05
    weight_decay: 0
)
[INFO][06/15/2020 14:19:31] Batches per epoch: 2105


6315
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0


[INFO][06/15/2020 14:19:54] Step   210 | loss 0.642613 | lr: 3.49e-06 | 0.113s per step
[INFO][06/15/2020 14:20:18] Step   420 | loss 0.363807 | lr: 6.79e-06 | 0.114s per step
[INFO][06/15/2020 14:20:42] Step   630 | loss 0.272996 | lr: 1.01e-05 | 0.112s per step
[INFO][06/15/2020 14:21:06] Step   840 | loss 0.244767 | lr: 1.34e-05 | 0.117s per step
[INFO][06/15/2020 14:21:31] Step  1050 | loss 0.219585 | lr: 1.67e-05 | 0.119s per step
[INFO][06/15/2020 14:21:32] Metrics at step 1052:
[INFO][06/15/2020 14:21:32] loss: 0.270236
[INFO][06/15/2020 14:21:32] accuracy: 91.06%
[INFO][06/15/2020 14:21:56] Step  1260 | loss 0.206357 | lr: 2.00e-05 | 0.118s per step
[INFO][06/15/2020 14:22:20] Step  1470 | loss 0.194531 | lr: 1.99e-05 | 0.111s per step
[INFO][06/15/2020 14:22:43] Step  1680 | loss 0.190104 | lr: 1.97e-05 | 0.111s per step
[INFO][06/15/2020 14:23:06] Step  1890 | loss 0.175202 | lr: 1.93e-05 | 0.110s per step


Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0


[INFO][06/15/2020 14:23:29] Step  2100 | loss 0.170110 | lr: 1.87e-05 | 0.110s per step
[INFO][06/15/2020 14:23:30] Metrics at step 2104:
[INFO][06/15/2020 14:23:30] loss: 0.254708
[INFO][06/15/2020 14:23:30] accuracy: 91.74%
[INFO][06/15/2020 14:23:55] Step  2310 | loss 0.132254 | lr: 1.80e-05 | 0.123s per step
[INFO][06/15/2020 14:24:19] Step  2520 | loss 0.119210 | lr: 1.71e-05 | 0.116s per step
[INFO][06/15/2020 14:24:43] Step  2730 | loss 0.118474 | lr: 1.61e-05 | 0.115s per step
[INFO][06/15/2020 14:25:08] Step  2940 | loss 0.127645 | lr: 1.51e-05 | 0.116s per step
[INFO][06/15/2020 14:25:31] Step  3150 | loss 0.121345 | lr: 1.39e-05 | 0.111s per step
[INFO][06/15/2020 14:25:32] Metrics at step 3156:
[INFO][06/15/2020 14:25:32] loss: 0.253494
[INFO][06/15/2020 14:25:32] accuracy: 91.06%
[INFO][06/15/2020 14:25:54] Step  3360 | loss 0.116062 | lr: 1.27e-05 | 0.112s per step
[INFO][06/15/2020 14:26:18] Step  3570 | loss 0.114419 | lr: 1.14e-05 | 0.114s per step
[INFO][06/15/2020 14

In [21]:
# Export the fine-tuned model with `save_pretrained` into a directory under
# the cache.
TARGET_DIR = CACHE_DIR.joinpath("sst2_bert_uncased")
TARGET_DIR.mkdir(parents=False, exist_ok=True)
fine_tuned_model = bot.model
fine_tuned_model.save_pretrained(TARGET_DIR)

In [22]:
# Metrics of the restored model on the validation half.
valid_metrics = bot.eval(valid_loader)
valid_metrics

{'loss': (0.28851168882956196, '0.288512'),
 'accuracy': (-0.9220183486238532, '92.20%')}

In [23]:
# Metrics of the restored model on the held-out test half.
test_metrics = bot.eval(test_loader)
test_metrics

{'loss': (0.19422287623816675, '0.194223'),
 'accuracy': (-0.9380733944954128, '93.81%')}

In [24]:
# Id the tokenizer uses for its padding token.
pad_token_id = tokenizer.pad_token_id
pad_token_id

0