In [1]:
import torch
from torch import nn
from torch.utils import data
from torch.cuda.amp import autocast, GradScaler

import numpy as np
from tqdm.notebook import tqdm

import sys
import datetime

sys.path.append('../code')
from dataset import get_data, MaskedDataset, make_vocab

from transformers import (
    AdamW, get_linear_schedule_with_warmup
)

from models import MaskedRDModel

In [2]:
# Load the pretrained 'bert-base-uncased' tokenizer through torch.hub
# (huggingface/pytorch-transformers entry point).
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')

Using cache found in /home/ubuntu/.cache/torch/hub/huggingface_pytorch-transformers_master


In [3]:
# mask_size is shared by the model, the vocabulary builder and the datasets in this notebook.
mask_size = 5
# Start from the pretrained bert-base-uncased weights.
model = MaskedRDModel.from_pretrained('bert-base-uncased')
# Project-specific setup step defined in the `models` module; it receives mask_size.
model.initialize(mask_size)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing MaskedRDModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing MaskedRDModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MaskedRDModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Load all data splits from the WantWords baseline directory.
# NOTE(review): word2vec=False presumably skips loading word2vec vectors — confirm in dataset.get_data.
d = get_data('../wantwords-english-baseline/data', word2vec=False)

Loading data...
Training data: 675715 word-def pairs
Dev data: 75873 word-def pairs
Test data: 1200 word-def pairs


In [5]:
# Unpack the six splits returned by get_data into individually named variables.
(
    train_data,
    train_data_def,
    dev_data,
    test_data_seen,
    test_data_unseen,
    test_data_desc,
) = d

In [6]:
# Build the target vocabulary: the per-target token-id matrix plus lookups in both directions.
target_matrix, target2idx, idx2target = make_vocab(d, tokenizer, mask_size=mask_size)

In [7]:
# target2idx maps target words to indices; idx2target is the inverse lookup.
# target_matrix maps target indices to bpe sequences, padded/truncated to mask_size
# (the row displayed for 'book' is padded with id 103, which this notebook decodes as [MASK]).
target2idx['book'], target_matrix[target2idx['book']], idx2target[target2idx['book']]

(16187, tensor([2338,  103,  103,  103,  103]), 'book')

In [8]:
# Wrap every split in a MaskedDataset with identical settings.
# The training set combines train_data with train_data_def.
(
    train_dataset,
    dev_dataset,
    test_dataset_seen,
    test_dataset_unseen,
    test_dataset_desc,
) = (
    MaskedDataset(split, tokenizer, target2idx, mask_size=mask_size)
    for split in (
        train_data + train_data_def,
        dev_data,
        test_data_seen,
        test_data_unseen,
        test_data_desc,
    )
)

In [9]:
batch_size = 55
num_workers = 4

# Settings shared by every loader. The collate function of train_dataset is
# reused for all splits.
loader_params = {
    'pin_memory': False,
    'batch_size': batch_size,
    'num_workers': num_workers,
    'collate_fn': train_dataset.collate_fn
}

# Only the training data is shuffled. Evaluation loaders keep a fixed order so
# that metrics averaged per batch (including the final, shorter batch) are
# reproducible from one epoch/run to the next.
train_loader = data.DataLoader(train_dataset, shuffle=True, **loader_params)
dev_loader = data.DataLoader(dev_dataset, shuffle=False, **loader_params)
test_loader_seen = data.DataLoader(test_dataset_seen, shuffle=False, **loader_params)
test_loader_unseen = data.DataLoader(test_dataset_unseen, shuffle=False, **loader_params)
test_loader_desc = data.DataLoader(test_dataset_desc, shuffle=False, **loader_params)

In [10]:
# Sanity check: show the first training example (token ids, target index) and its decoded tokens.
train_dataset[0], tokenizer.convert_ids_to_tokens(train_dataset[0][0])

((tensor([  101,   103,   103,   103,   103,   103,   102,  2000,  2713,  1037,
           2711,  2013,  7750, 11819,  2013,  6531,  1037,  7979,  4735,  2040,
           2001, 14933,  2098,  2011,  1996,  3099,   102]),
  0),
 ['[CLS]',
  '[MASK]',
  '[MASK]',
  '[MASK]',
  '[MASK]',
  '[MASK]',
  '[SEP]',
  'to',
  'release',
  'a',
  'person',
  'from',
  'punishment',
  'exempt',
  'from',
  'penalty',
  'a',
  'convicted',
  'criminal',
  'who',
  'was',
  'pardon',
  '##ed',
  'by',
  'the',
  'governor',
  '[SEP]'])

In [11]:
# Device used for the model and every batch.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# `epoch` is the index the training loop starts (or resumes) from.
epochs, epoch = 10, 0

lr = 2e-5
warmup_duration = 0.01 # portion of the first epoch spent on lr warmup

# Linear warmup followed by linear decay over all training steps
# (one step per batch, `epochs` passes over train_loader).
optim = AdamW(model.parameters(), lr=lr)
scheduler = get_linear_schedule_with_warmup(
    optim,
    num_warmup_steps=len(train_loader) * warmup_duration,
    num_training_steps=len(train_loader) * epochs,
)

# scaler = GradScaler()

In [12]:
import wandb

# Start a run in the shared project and record this run's hyper-parameters.
wandb.init(project='reverse-dictionary', entity='reverse-dict')

config = wandb.config
for _key, _value in (
    ('learning_rate', lr),
    ('epochs', epochs),
    ('batch_size', batch_size),
    ('optimizer', type(optim).__name__),
    ('scheduler', type(scheduler).__name__),
    ('warmup_duration', warmup_duration),
):
    setattr(config, _key, _value)

# Let wandb hook into the model.
wandb.watch(model)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mreverse-dict[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.29 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[<wandb.wandb_torch.TorchGraph at 0x7f0ea4382410>]

In [13]:
# Move the target token matrix to the compute device; it is passed to the model on every forward call.
target_matrix = target_matrix.to(device)

In [14]:
# Move the model parameters to the compute device.
model = model.to(device)

In [15]:
def evaluate(pred, gt, test=False):
    """Score ranked predictions against ground-truth target indices.

    Args:
        pred: 2-D tensor (or sequence of 1-D tensors); row ``i`` holds the
            candidate target indices for example ``i``, best candidate first.
        gt: 1-D tensor (or sequence) with the correct target index per example.
        test: if True, return raw hit counts plus the per-example rank;
            otherwise return accuracies as fractions of ``len(pred)``.

    Returns:
        ``test=False``: ``(acc@1, acc@10, acc@100)`` as floats
        (all ``0.0`` for an empty batch).
        ``test=True``: ``(hits@1, hits@10, hits@100, ranks)`` where the hits
        are integer counts and ``ranks`` is a float32 tensor with the
        zero-based position of the ground truth in each row, capped at 1000.
        The cap is also used when the ground truth is absent from the row.
    """
    max_rank = 1000
    acc1 = acc10 = acc100 = 0
    n = len(pred)
    pred_rank = []
    for p, word in zip(pred, gt):
        # nonzero(as_tuple=True) always returns a 1-tuple for a 1-D row, so the
        # number of matches has to be read from the index tensor itself.
        positions = (p == word).nonzero(as_tuple=True)[0]
        rank = int(positions[0]) if positions.numel() > 0 else None
        if test:
            pred_rank.append(max_rank if rank is None else min(rank, max_rank))
        # A single rank lookup replaces three separate membership tests.
        if rank is not None and rank < 100:
            acc100 += 1
            if rank < 10:
                acc10 += 1
                if rank == 0:
                    acc1 += 1
    if test:
        pred_rank = torch.tensor(pred_rank, dtype=torch.float32)
        return (acc1, acc10, acc100, pred_rank)
    if n == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0, 0.0, 0.0
    return acc1/n, acc10/n, acc100/n

In [16]:
inc = 10  # refresh the progress-bar description every `inc` batches
losses = []  # NOTE(review): never appended to in this cell

for epoch in range(epoch, epochs):
    # ---- Training: one full pass over train_loader ----
    model.train()
    train_loss = 0.0
    with tqdm(total=len(train_loader)) as pbar:
        for i, (x, y) in enumerate(train_loader):
            if i % inc == 0 and i != 0:
                display_loss = train_loss / i
                pbar.set_description(f'Epoch {epoch+1}, Train Loss: {display_loss}')

            optim.zero_grad()

            x = x.to(device)
            # Attend to every position that is not padding.
            attention_mask = (x != train_dataset.pad_id)
            y = y.to(device)

            loss, out = model(input_ids=x, attention_mask=attention_mask,
                              target_matrix=target_matrix, ground_truth=y)

            loss.backward()

            # Clamp each gradient element to [-5, 5] before the optimizer step.
            nn.utils.clip_grad_value_(model.parameters(), 5)

            optim.step()

            # detach() so the running sum does not hold on to each step's graph.
            train_loss += loss.detach()

            scheduler.step()

            pbar.update(1)

            # Release batch tensors before the next iteration allocates new ones.
            del x, y, out, loss, attention_mask

    # ---- Checkpoint: pickles the whole module object after every epoch ----
    model_name = type(model).__name__
    filename = f'../trained_models/{model_name} Epoch {epoch+1} at {datetime.datetime.now()}'.replace(' ', '_')
    with open(filename, 'wb+') as f:
        torch.save(model, f)

    # ---- Validation ----
    model.eval()
    val_loss = 0.0
    val_acc1, val_acc10, val_acc100 = 0.0, 0.0, 0.0
    with torch.no_grad():
        with tqdm(total=len(dev_loader)) as pbar:
            for i, (x, y) in enumerate(dev_loader):
                if i % inc == 0 and i != 0:
                    display_loss = val_loss / i
                    pbar.set_description(f'Epoch {epoch+1}, Val Loss: {display_loss}')

                x = x.to(device)
                attention_mask = (x != train_dataset.pad_id)
                y = y.to(device)

                # Same call signature as the training step. The previous
                # `criterion=criterion` argument referenced a name that is not
                # defined anywhere in this notebook (NameError on a fresh kernel).
                loss, out = model(input_ids=x, attention_mask=attention_mask,
                                  target_matrix=target_matrix, ground_truth=y)

                val_loss += loss.detach()

                pbar.update(1)

                # Only the 100 best-scoring targets are needed for acc@1/10/100.
                result, indices = torch.topk(out, k=100, dim=-1, largest=True, sorted=True)

                acc1, acc10, acc100 = evaluate(indices, y)
                val_acc1 += acc1
                val_acc10 += acc10
                val_acc100 += acc100

                del x, y, out, loss, attention_mask

    # Log plain Python floats rather than (possibly CUDA) tensors.
    wandb.log({
        'train_loss': float(train_loss) / len(train_loader),
        'val_loss': float(val_loss) / len(dev_loader),
        'val_acc1': val_acc1 / len(dev_loader),
        'val_acc10': val_acc10 / len(dev_loader),
        'val_acc100': val_acc100 / len(dev_loader)
    })
    

  0%|          | 0/12286 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 142.00 MiB (GPU 0; 14.76 GiB total capacity; 12.69 GiB already allocated; 45.75 MiB free; 13.45 GiB reserved in total by PyTorch)

In [33]:
# NOTE(review): not code — this string literal appears to be progress-bar output pasted from an
# earlier run, kept for reference. Its batch counts differ from the loader sizes of the current run.
'''
Epoch 1, Train Loss: 5.3207106590271: 100%|██████████| 14078/14078 [1:39:36<00:00,  2.36it/s]   
Epoch 1, Val Loss: 7.255414962768555: 100%|██████████| 1581/1581 [04:09<00:00,  6.34it/s] 
'''

In [22]:
# Decode the token ids of the first training example back into word pieces.
tokenizer.convert_ids_to_tokens(train_dataset[0][0].numpy())

['[CLS]',
 '[MASK]',
 '[MASK]',
 '[MASK]',
 '[MASK]',
 '[MASK]',
 '[SEP]',
 'to',
 'release',
 'a',
 'person',
 'from',
 'punishment',
 'exempt',
 'from',
 'penalty',
 'a',
 'convicted',
 'criminal',
 'who',
 'was',
 'pardon',
 '##ed',
 'by',
 'the',
 'governor',
 '[SEP]']

In [16]:
def getPredFromDesc(model, desc : str, mask_size=5, top_n=10):
    """Return the ``top_n`` highest-scoring target words for a free-text description.

    The description is tokenized and prefixed with
    ``[CLS] + mask_size * [MASK] + [SEP]`` before being scored by ``model``.
    Relies on the notebook globals ``tokenizer``, ``train_dataset`` (special
    token ids), ``device``, ``target_matrix`` and ``idx2target``.

    Args:
        model: the model to query; called without ``ground_truth`` so that it
            returns only the score tensor.
        desc: description text.
        mask_size: number of [MASK] slots placed before the description.
        top_n: number of candidates to return.

    Returns:
        ``(words, indices)`` — the predicted words, best first, and the tensor
        of their target indices.
    """
    desc_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(desc))
    cls_id, mask_id, sep_id, pad_id = (
        train_dataset.cls_id,
        train_dataset.mask_id,
        train_dataset.sep_id,
        train_dataset.pad_id,
    )
    input_ids = [cls_id] + [mask_id] * mask_size + [sep_id] + desc_ids
    x = torch.tensor(input_ids).unsqueeze(0).to(device)
    attention_mask = (x != pad_id)

    # Inference only: switch off dropout and autograd bookkeeping, then restore
    # whatever train/eval mode the caller had set.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            out = model(input_ids=x, attention_mask=attention_mask, target_matrix=target_matrix)
    finally:
        model.train(was_training)

    result, indices = torch.topk(out, k=top_n, dim=-1, largest=True, sorted=True)

    indices = indices[0]
    # Look words up with plain ints so this works for list- or dict-like idx2target.
    return [idx2target[i] for i in indices.tolist()], indices
    

In [17]:
# Qualitative check: the 100 best candidates for a hand-written description.
getPredFromDesc(model, 'an inhabitant of a cold country', 5, 100)

(['arctic',
  'rustic',
  'winters',
  'countryman',
  'landsman',
  'northerner',
  'vegan',
  'deserter',
  'trappist',
  'quagmire',
  'landscapist',
  'chiller',
  'sylvan',
  'landside',
  'quarantined',
  'icer',
  'gypsy',
  'arcadian',
  'naan',
  'outdoorsman',
  'country',
  'snowman',
  'furan',
  'iceman',
  'frozen',
  'borer',
  'icepick',
  'dendrite',
  'outflank',
  'outlander',
  'rustication',
  'spectrin',
  'highlander',
  'winter',
  'thar',
  'stoner',
  'denier',
  'inglenook',
  'replant',
  'alpine',
  'borzoi',
  'landman',
  'nomad',
  'barbarian',
  'countrywoman',
  'labradorite',
  'bittern',
  'orientalist',
  'northern',
  'landrover',
  'stranger',
  'dendritic',
  'frontiersman',
  'desert',
  'thaler',
  'pastoralist',
  'norther',
  'lander',
  'cold',
  'denizen',
  'coldhearted',
  'lanthanide',
  'philistine',
  'narcissist',
  'nordic',
  'glacial',
  'folksinger',
  'labrador',
  'barranca',
  'groundhog',
  'grr',
  'bushranger',
  'nihilist',

In [51]:
# Inspect an encoded training example: (token ids, target index).
train_dataset[1000]

(tensor([  101,   103,   103,   103,   103,   103,   102,  2583,  1998,  5627,
          2000,  4553,  6570,  3085,  2402, 15608]),
 75)

In [52]:
# The raw record behind the encoded example, for comparison.
train_data[1000]

{'word': 'teachable',
 'lexnames': ['adj.all'],
 'root_affix': ['able'],
 'sememes': ['willing', 'undergo', 'teach'],
 'definitions': 'able and willing to learn teachable youngsters'}

In [45]:
# Decode a dev example back to text to check the [CLS]/[MASK]/[SEP] layout.
tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(dev_dataset[120][0]))

'[CLS] [MASK] [MASK] [MASK] [MASK] [MASK] [SEP] a whipping or flogging the discipline of the scourge'

In [20]:
def test(loader, name):
    """Evaluate the global ``model`` on ``loader``, print the metrics and log them to wandb.

    Relies on the notebook globals ``model``, ``device``, ``train_dataset``
    (padding id), ``target_matrix`` and ``evaluate``.

    Args:
        loader: iterable of ``(input_ids, target_index)`` batches.
        name: prefix for the printed and logged metric names (e.g. 'seen').

    Reported metrics: mean batch loss, acc@1/10/100 over all examples, and the
    median and spread of the zero-based rank of the ground truth. The spread
    is the square root of ``torch.var`` (a standard deviation); it keeps the
    historical ``*_test_rank_variance`` key.
    """
    inc = 3  # refresh the progress-bar description every `inc` batches
    model.eval()
    test_loss = 0.0
    test_acc1 = test_acc10 = test_acc100 = 0.0
    total_seen = 0
    all_pred = []
    with torch.no_grad():
        with tqdm(total=len(loader)) as pbar:
            for i, (x,y) in enumerate(loader):
                if i % inc == 0 and i != 0:
                    display_loss = test_loss / i
                    pbar.set_description(f'Test Loss: {display_loss}')

                x = x.to(device)
                attention_mask = (x != train_dataset.pad_id)
                y = y.to(device)

                # Same call signature as the training step. The previous
                # `criterion=criterion` argument referenced a name that is not
                # defined anywhere in this notebook (NameError on a fresh kernel).
                loss, out = model(input_ids=x, attention_mask=attention_mask,
                                  target_matrix=target_matrix, ground_truth=y)

                test_loss += loss.detach()

                pbar.update(1)

                # Full ranking of all targets, needed for the rank statistics.
                result, indices = torch.sort(out, descending=True)

                b = len(x)
                acc1, acc10, acc100, pred_rank = evaluate(indices, y, test=True)
                test_acc1 += acc1
                test_acc10 += acc10
                test_acc100 += acc100
                total_seen += b
                all_pred.extend(pred_rank.tolist())

                del x, y, out, loss
                if i % 20 == 0:
                    torch.cuda.empty_cache()

    test_loss = float(test_loss) / len(loader)
    test_acc1 /= total_seen
    test_acc10 /= total_seen
    test_acc100 /= total_seen
    all_pred = torch.tensor(all_pred)
    median = torch.median(all_pred).item()
    var = (torch.var(all_pred)**0.5).item()

    print(f'{name}_test_loss:', test_loss)
    print(f'{name}_test_acc1:', test_acc1)
    print(f'{name}_test_acc10:', test_acc10)
    print(f'{name}_test_acc100:', test_acc100)
    print(f'{name}_test_rank_median:', median)
    print(f'{name}_test_rank_variance', var)

    # Log the rank statistics that were actually computed; previously two
    # placeholders that were never updated (always 0.0) were logged instead.
    wandb.log({
        f'{name}_test_loss': test_loss,
        f'{name}_test_acc1': test_acc1,
        f'{name}_test_acc10': test_acc10,
        f'{name}_test_acc100': test_acc100,
        f'{name}_test_rank_median': median,
        f'{name}_test_rank_variance': var
    })
    

In [21]:
# Metrics on the 'seen' test split; the trailing note records which checkpoint was evaluated.
test(test_loader_seen, 'seen') # epoch 1

Test Loss: 4.620787620544434: 100%|██████████| 11/11 [00:01<00:00,  5.95it/s]

seen_test_loss: tensor(4.5265, device='cuda:0')
seen_test_acc1: 0.318
seen_test_acc10: 0.568
seen_test_acc100: 0.748
seen_test_rank_median: tensor(5.)
seen_test_rank_variance tensor(325.9005)





In [22]:
# Metrics on the 'unseen' test split for the same checkpoint.
test(test_loader_unseen, 'unseen') # epoch 1

Test Loss: 7.362970352172852: 100%|██████████| 11/11 [00:01<00:00,  6.96it/s]

unseen_test_loss: tensor(7.3486, device='cuda:0')
unseen_test_acc1: 0.094
unseen_test_acc10: 0.276
unseen_test_acc100: 0.498
unseen_test_rank_median: tensor(107.)
unseen_test_rank_variance tensor(434.1319)





In [23]:
# Metrics on the 'desc' test split for the same checkpoint.
test(test_loader_desc, 'desc') # epoch 1

Test Loss: 3.122372627258301: 100%|██████████| 5/5 [00:00<00:00,  6.37it/s]

desc_test_loss: tensor(2.7965, device='cuda:0')
desc_test_acc1: 0.405
desc_test_acc10: 0.795
desc_test_acc100: 0.96
desc_test_rank_median: tensor(1.)
desc_test_rank_variance tensor(58.7754)





In [25]:
# Re-evaluation of the 'seen' split with a later checkpoint.
# NOTE(review): the saved output reports the wandb backend had shut down, so these
# metrics were probably not logged — confirm in the wandb run.
test(test_loader_seen, 'seen') # epoch 8

Error in callback <function _WandbInit._resume_backend at 0x7f21431ece60> (for pre_run_cell):


Exception: The wandb backend process has shutdown

Test Loss: 2.36694073677063: 100%|██████████| 11/11 [00:02<00:00,  5.39it/s] 

seen_test_loss: tensor(2.2075, device='cuda:0')
seen_test_acc1: 0.608
seen_test_acc10: 0.88
seen_test_acc100: 0.928
seen_test_rank_median: tensor(0.)
seen_test_rank_variance tensor(233.3596)





Exception: The wandb backend process has shutdown

Error in callback <function _WandbInit._pause_backend at 0x7f21431eccb0> (for post_run_cell):


Exception: The wandb backend process has shutdown

In [27]:
# Re-evaluation of the 'unseen' split with a later checkpoint.
# NOTE(review): the saved output reports the wandb backend had shut down — metrics probably not logged.
test(test_loader_unseen, 'unseen') # epoch 8

Error in callback <function _WandbInit._resume_backend at 0x7f21431ece60> (for pre_run_cell):


Exception: The wandb backend process has shutdown

Test Loss: 9.864437103271484: 100%|██████████| 11/11 [00:01<00:00,  6.89it/s]

unseen_test_loss: tensor(9.8111, device='cuda:0')
unseen_test_acc1: 0.08
unseen_test_acc10: 0.302
unseen_test_acc100: 0.532
unseen_test_rank_median: tensor(69.)
unseen_test_rank_variance tensor(435.2313)





Exception: The wandb backend process has shutdown

Error in callback <function _WandbInit._pause_backend at 0x7f21431eccb0> (for post_run_cell):


Exception: The wandb backend process has shutdown

In [28]:
# Re-evaluation of the 'desc' split with a later checkpoint.
# NOTE(review): the saved output reports the wandb backend had shut down — metrics probably not logged.
test(test_loader_desc, 'desc') # epoch 8

Error in callback <function _WandbInit._resume_backend at 0x7f21431ece60> (for pre_run_cell):


Exception: The wandb backend process has shutdown

Test Loss: 3.3613228797912598: 100%|██████████| 5/5 [00:00<00:00,  6.74it/s]


desc_test_loss: tensor(2.8807, device='cuda:0')
desc_test_acc1: 0.38
desc_test_acc10: 0.745
desc_test_acc100: 0.93
desc_test_rank_median: tensor(2.)
desc_test_rank_variance tensor(158.7534)


Exception: The wandb backend process has shutdown

Error in callback <function _WandbInit._pause_backend at 0x7f21431eccb0> (for post_run_cell):


Exception: The wandb backend process has shutdown

In [37]:
# Pull a single collated batch from the training loader for the experiments that follow.
input_ids, labels = next(iter(train_loader))
input_ids

tensor([[101, 103, 103,  ...,   0,   0,   0],
        [101, 103, 103,  ...,   0,   0,   0],
        [101, 103, 103,  ...,   0,   0,   0],
        ...,
        [101, 103, 103,  ...,   0,   0,   0],
        [101, 103, 103,  ...,   0,   0,   0],
        [101, 103, 103,  ...,   0,   0,   0]])

In [56]:
# Flag the position right after every [SEP] (id 102) by shifting the [SEP] mask one step right.
sep_locations = (input_ids == 102).roll(shifts=1, dims=-1)
# The roll wraps the last column around to position 0; clear it.
sep_locations[:, 0] = False
# Everything after the first [SEP] becomes segment 1, everything up to and including it segment 0.
token_type_ids = sep_locations.cumsum(dim=-1).gt(0).long()

In [57]:
# Check that the segment ids have an integer dtype.
token_type_ids.dtype

torch.int64

In [58]:
# Segment ids of the first example in the batch.
token_type_ids[0]

tensor([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [36]:
# Token ids of the first example in the batch, for comparison with its segment ids.
input_ids[0]

tensor([  101,   103,   103,   103,   103,   103,   102,  6331, 20976,  2819,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])