In [1]:
import torch
from torch import nn
from torch.utils import data
from torch.cuda.amp import autocast, GradScaler

import numpy as np
from tqdm import tqdm

import sys
import datetime

sys.path.append('../code')
from dataset import get_data, MaskedDataset, make_vocab

from transformers import (
    AdamW, get_linear_schedule_with_warmup
)

from models import MaskedRDModel

In [2]:
# Fetch the uncased BERT tokenizer through torch.hub.
tokenizer = torch.hub.load(
    'huggingface/pytorch-transformers',
    'tokenizer',
    'bert-base-uncased',
)

Using cache found in /home/ubuntu/.cache/torch/hub/huggingface_pytorch-transformers_master


In [3]:
# Number of mask positions handed to the model and reused when building the
# vocabulary and datasets below.
mask_size = 5
# Initialise from the pretrained 'bert-base-uncased' checkpoint.
model = MaskedRDModel.from_pretrained('bert-base-uncased')
model.set_mask_size(mask_size)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing MaskedRDModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing MaskedRDModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MaskedRDModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Load every data split in one call; the result is unpacked into six named splits below.
d = get_data('../wantwords-english-baseline/data', word2vec=False)

Loading data...
Training data: 675715 word-def pairs
Dev data: 75873 word-def pairs
Test data: 1200 word-def pairs


In [5]:
# Give each of the six loaded splits its own name.
(
    train_data,
    train_data_def,
    dev_data,
    test_data_seen,
    test_data_unseen,
    test_data_desc,
) = d

In [6]:
# Build the target vocabulary: a per-target matrix of token ids plus the
# word -> index and index -> word lookups.
target_matrix, target2idx, idx2target = make_vocab(d, tokenizer, mask_size=mask_size)

In [7]:
# Sanity check on one word:
#   target2idx    : target word  -> target index
#   target_matrix : target index -> sub-word id sequence, padded/truncated to mask_size
#   idx2target    : target index -> target word
book_idx = target2idx['book']
book_idx, target_matrix[book_idx], idx2target[book_idx]

(16187, tensor([2338,  103,  103,  103,  103]), 'book')

In [8]:
def _to_masked(records):
    """Wrap a collection of records in a MaskedDataset using the shared tokenizer and vocabulary."""
    return MaskedDataset(records, tokenizer, target2idx, mask_size=mask_size)


# The training set is the concatenation of both training splits.
train_dataset = _to_masked(train_data + train_data_def)
dev_dataset = _to_masked(dev_data)
test_dataset_seen = _to_masked(test_data_seen)
test_dataset_unseen = _to_masked(test_data_unseen)
test_dataset_desc = _to_masked(test_data_desc)

In [9]:
batch_size = 32
num_workers = 4

# Settings shared by every loader. The collate function is taken from the training
# dataset and reused for all splits.
loader_params = {
    'pin_memory': False,
    'batch_size': batch_size,
    'num_workers': num_workers,
    'collate_fn': train_dataset.collate_fn
}

# Only the training split is shuffled. Evaluation loaders keep a fixed order so that
# repeated evaluations iterate over identical batches.
train_loader = data.DataLoader(train_dataset, shuffle=True, **loader_params)
dev_loader = data.DataLoader(dev_dataset, shuffle=False, **loader_params)
test_loader_seen = data.DataLoader(test_dataset_seen, shuffle=False, **loader_params)
test_loader_unseen = data.DataLoader(test_dataset_unseen, shuffle=False, **loader_params)
test_loader_desc = data.DataLoader(test_dataset_desc, shuffle=False, **loader_params)

In [10]:
epochs = 10

lr = 2e-5
optim = AdamW(model.parameters(), lr=lr)

criterion = nn.CrossEntropyLoss()

warmup_duration = 0.05 # portion of the first epoch spent on lr warmup
steps_per_epoch = len(train_loader)
# The scheduler takes step counts, so the fractional warmup length is truncated to an int.
scheduler = get_linear_schedule_with_warmup(
    optim,
    num_warmup_steps=int(steps_per_epoch * warmup_duration),
    num_training_steps=steps_per_epoch * epochs,
)

# Epoch the training loop starts (or resumes) from; re-running this cell resets it to 0.
epoch = 0

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Gradient scaler for mixed-precision training; the training loop does not currently call it.
scaler = GradScaler()

In [11]:
import wandb

wandb.init(project='reverse-dictionary', entity='reverse-dict')

# Record this run's hyperparameters on the wandb config object.
config = wandb.config
run_settings = {
    'learning_rate': lr,
    'epochs': epochs,
    'batch_size': batch_size,
    'optimizer': type(optim).__name__,
    'scheduler': type(scheduler).__name__,
    'warmup_duration': warmup_duration,
}
for setting_name, setting_value in run_settings.items():
    setattr(config, setting_name, setting_value)

wandb.watch(model)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mreverse-dict[0m (use `wandb login --relogin` to force relogin)


[<wandb.wandb_torch.TorchGraph at 0x7fbdd03b4250>]

In [12]:
# Place the target lookup matrix on the same device the batches are moved to.
target_matrix = target_matrix.to(device)

In [13]:
# Move the model's parameters to the selected device (GPU when available, else CPU).
model = model.to(device)

In [14]:
def evaluate(pred, gt, test=False, max_rank=1000):
    """Score ranked predictions against ground-truth target indices.

    Args:
        pred: One ranking per example; each ranking is a 1-D sequence of candidate
            target indices ordered best first.
        gt: The ground-truth target index of each example, aligned with ``pred``.
        test: Selects the return format (see Returns).
        max_rank: Cap applied to reported ranks; also the rank assigned to a target
            that does not appear in the ranking at all.

    Returns:
        If ``test`` is False: ``(acc@1, acc@10, acc@100)`` as fractions of ``len(pred)``
        (all 0.0 for an empty batch).
        If ``test`` is True: ``(hits@1, hits@10, hits@100, ranks)`` where the hits are
        raw counts (so callers can aggregate across batches) and ``ranks`` is a float32
        tensor holding the 0-based rank of each target, capped at ``max_rank``.
    """
    acc1 = acc10 = acc100 = 0
    n = len(pred)
    pred_rank = []
    for p, word in zip(pred, gt):
        # Positions at which the target occurs in this ranking (empty if absent).
        hits = (torch.as_tensor(p) == word).nonzero(as_tuple=True)[-1]
        rank = int(hits[0]) if hits.numel() > 0 else None
        if test:
            pred_rank.append(max_rank if rank is None else min(rank, max_rank))
        # Nested thresholds: a top-1 hit is also a top-10 and a top-100 hit.
        if rank is not None and rank < 100:
            acc100 += 1
            if rank < 10:
                acc10 += 1
                if rank == 0:
                    acc1 += 1
    if test:
        return (acc1, acc10, acc100, torch.tensor(pred_rank, dtype=torch.float32))
    if n == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0, 0.0, 0.0
    return acc1/n, acc10/n, acc100/n

In [82]:
import os

inc = 10  # refresh the progress-bar loss every `inc` batches

# Create the checkpoint directory up front so a finished epoch is never lost to a
# missing folder at save time.
checkpoint_dir = '../trained_models'
os.makedirs(checkpoint_dir, exist_ok=True)

# Starts from the current value of `epoch`, so re-running this cell after an
# interruption resumes at the interrupted epoch instead of epoch 0.
for epoch in range(epoch, epochs):
    # Training
    model.train()
    train_loss = 0.0
    with tqdm(total=len(train_loader)) as pbar:
        for i, (x, y) in enumerate(train_loader):
            if i % inc == 0 and i != 0:
                display_loss = train_loss / i
                pbar.set_description(f'Epoch {epoch+1}, Train Loss: {display_loss}')

            optim.zero_grad()

            x = x.to(device)
            attention_mask = (x != train_dataset.pad_id)
            y = y.to(device)
            
            loss, out = model(input_ids=x, attention_mask=attention_mask, target_matrix=target_matrix,
                          criterion=criterion, ground_truth=y)
            
            # Plain full-precision backward/step; no gradient scaling is applied.
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 5)
            optim.step()
            
            # Accumulate a detached tensor so the running loss does not keep the graph alive.
            train_loss += loss.detach()
            
            scheduler.step()
            
            pbar.update(1)
            
            del x, y, out, loss, attention_mask
            
    # Checkpoint the whole model object once per epoch.
    model_name = type(model).__name__
    filename = f'{checkpoint_dir}/{model_name} Epoch {epoch+1} at {datetime.datetime.now()}'.replace(' ', '_')
    with open(filename, 'wb+') as f:
        torch.save(model, f)
    
    # Validation
    model.eval()
    val_loss = 0.0
    val_acc1, val_acc10, val_acc100 = 0.0, 0.0, 0.0
    val_seen = 0
    with torch.no_grad():
        with tqdm(total=len(dev_loader)) as pbar:
            for i, (x, y) in enumerate(dev_loader):
                if i % inc == 0 and i != 0:
                    display_loss = val_loss / i
                    pbar.set_description(f'Epoch {epoch+1}, Val Loss: {display_loss}')

                x = x.to(device)
                attention_mask = (x != train_dataset.pad_id)
                y = y.to(device)

                loss, out = model(input_ids=x, attention_mask=attention_mask, target_matrix=target_matrix,
                              criterion=criterion, ground_truth=y)

                val_loss += loss.detach()

                pbar.update(1)                
                
                # Validation accuracy only looks at the first 100 candidates, so a top-k
                # selection replaces a full sort over every target.
                top_k = min(100, out.size(-1))
                _, indices = torch.topk(out, top_k, dim=-1)
                
                # evaluate() returns per-batch fractions; weight them by batch size so the
                # final figures are means over examples rather than means of batch means.
                acc1, acc10, acc100 = evaluate(indices, y)
                n_batch = len(y)
                val_acc1 += acc1 * n_batch
                val_acc10 += acc10 * n_batch
                val_acc100 += acc100 * n_batch
                val_seen += n_batch

                del x, y, out, loss, attention_mask, indices
    
    wandb.log({
        'train_loss': float(train_loss) / len(train_loader),
        'val_loss': float(val_loss) / len(dev_loader),
        'val_acc1': val_acc1 / val_seen,
        'val_acc10': val_acc10 / val_seen,
        'val_acc100': val_acc100 / val_seen
    })
    
    

Epoch 2, Train Loss: 4.103667736053467:   4%|▍         | 901/21117 [04:14<1:35:09,  3.54it/s] 


KeyboardInterrupt: 

In [35]:
# Debug peek at the shape of a model output.
# NOTE(review): relies on an `out` left in the kernel by an earlier run; confirm it is
# still defined on a fresh Restart & Run All.
out.shape

torch.Size([1, 5, 50477])

In [34]:
# Sum the scores over dim 1, then take the index of the best-scoring entry per row.
out.sum(dim=1).argmax(dim=-1)

tensor([4128])

In [29]:
# Reverse lookup: find the target word stored under a given index.
query_idx = 23413
w = next((word for word in target2idx if target2idx[word] == query_idx), None)
if w is None:
    # Fail loudly rather than print an unrelated word when the index is absent.
    raise KeyError(f'no target word has index {query_idx}')
print(w)

unicyclist


In [32]:
# Decode the first sequence of the batch back to text to inspect the model input.
# .tolist() yields plain Python ints and works whether `x` lives on the CPU or the GPU
# (calling .numpy() directly fails on a CUDA tensor).
tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(x[0].tolist()))

'[CLS] [MASK] [MASK] [MASK] [MASK] [MASK] [SEP] characterized by three colors in a specific sense having the three fundamental color sensations of red green and purple as the normal eye in distinction from a color blind eye which can perceive only two of the fundamental colors'

In [73]:
def getPredFromDesc(model, desc : str, mask_size=5, top_n=10):
    """Return the model's best target-word guesses for a free-form description.

    The input is laid out as ``[CLS] [MASK]*mask_size [SEP] <description tokens>``.
    Uses the notebook-level ``tokenizer``, ``train_dataset`` (for the special token
    ids), ``device``, ``target_matrix`` and ``idx2target``.

    Args:
        model: The model to query; called without a criterion, so it is expected to
            return only the target scores.
        desc: The description text.
        mask_size: Number of mask tokens placed before the description.
        top_n: Number of candidates to return.

    Returns:
        ``(words, indices)``: the ``top_n`` candidate words, best first, and a tensor
        with their target indices.
    """
    desc = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(desc))
    cls_id, mask_id, sep_id, pad_id = (
        train_dataset.cls_id, train_dataset.mask_id, train_dataset.sep_id, train_dataset.pad_id
    )
    desc_ids = [cls_id] + [mask_id] * mask_size + [sep_id] + desc
    x = torch.tensor(desc_ids).unsqueeze(0).to(device)
    attention_mask = (x != pad_id)

    # Predict in eval mode without autograd so the output does not depend on whichever
    # mode the model was last left in; the previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            out = model(input_ids=x, attention_mask=attention_mask, target_matrix=target_matrix)
    finally:
        model.train(was_training)

    _, indices = torch.sort(out, descending=True)

    indices = indices[0][:top_n]
    # Look words up with plain ints rather than tensor elements.
    return [idx2target[i] for i in indices.tolist()], indices
    
    

In [77]:
# Query the model with a free-form description and ask for its 100 best guesses.
getPredFromDesc(
    model,
    'an inhabitant of a cold country',
    mask_size=5,
    top_n=100,
)

(['arctic',
  'rustic',
  'nordic',
  'northerner',
  'winters',
  'deserter',
  'countryman',
  'arcadian',
  'alpine',
  'gypsy',
  'glacial',
  'russ',
  'highlander',
  'northern',
  'continental',
  'icer',
  'bannister',
  'viking',
  'chiller',
  'winter',
  'frozen',
  'outflank',
  'bulgar',
  'southerner',
  'merganser',
  'iceman',
  'bohemian',
  'cold',
  'borer',
  'vegan',
  'inglenook',
  'country',
  'snowy',
  'scandinavian',
  'outlander',
  'labradorite',
  'icepick',
  'siberian',
  'greengrocer',
  'labrador',
  'desert',
  'icy',
  'lugnut',
  'himalayan',
  'deserts',
  'lurcher',
  'norther',
  'summers',
  'snowman',
  'cannister',
  'flatterer',
  'amazonian',
  'yankee',
  'arcadia',
  'lubricant',
  'polar',
  'fragrant',
  'crofter',
  'barbarian',
  'coldhearted',
  'bittern',
  'pagan',
  'muggle',
  'russet',
  'antarctic',
  'maine',
  'thaler',
  'thar',
  'melter',
  'peregrine',
  'harsher',
  'fratricide',
  'folksinger',
  'husky',
  'northeaster'

In [51]:
# Inspect one encoded training example: a tensor of token ids paired with an integer
# (presumably the target index — confirm against MaskedDataset).
train_dataset[1000]

(tensor([  101,   103,   103,   103,   103,   103,   102,  2583,  1998,  5627,
          2000,  4553,  6570,  3085,  2402, 15608]),
 75)

In [52]:
# The raw record at the same position, for comparison with the encoded example.
train_data[1000]

{'word': 'teachable',
 'lexnames': ['adj.all'],
 'root_affix': ['able'],
 'sememes': ['willing', 'undergo', 'teach'],
 'definitions': 'able and willing to learn teachable youngsters'}

In [45]:
# Decode one encoded dev example back to text to check the input layout.
dev_sample_ids = dev_dataset[120][0]
dev_sample_tokens = tokenizer.convert_ids_to_tokens(dev_sample_ids)
tokenizer.convert_tokens_to_string(dev_sample_tokens)

'[CLS] [MASK] [MASK] [MASK] [MASK] [MASK] [SEP] a whipping or flogging the discipline of the scourge'

In [84]:
def test(loader, name):
    """Evaluate the notebook-level ``model`` on one test loader, then print and log the metrics.

    Uses the notebook-level ``model``, ``criterion``, ``target_matrix``, ``device``,
    ``train_dataset`` (for the pad id), ``evaluate`` and ``wandb``.

    Args:
        loader: DataLoader yielding ``(x, y)`` batches.
        name: Prefix for the printed and logged metric names (e.g. ``'seen'``).

    Reported metrics: mean batch loss, accuracy at 1/10/100 over all examples, and the
    median and spread of the target's rank. The value reported under the
    ``rank_variance`` key is the square root of the variance.
    """
    inc = 3  # refresh the progress-bar loss every `inc` batches
    model.eval()
    test_loss = 0.0
    test_acc1 = test_acc10 = test_acc100 = 0.0
    total_seen = 0
    all_pred = []
    with torch.no_grad():
        with tqdm(total=len(loader)) as pbar:
            for i, (x,y) in enumerate(loader):
                if i % inc == 0 and i != 0:
                    display_loss = test_loss / i
                    pbar.set_description(f'Test Loss: {display_loss}')

                x = x.to(device)
                attention_mask = (x != train_dataset.pad_id)
                y = y.to(device)

                loss, out = model(input_ids=x, attention_mask=attention_mask, target_matrix=target_matrix,
                                  criterion=criterion, ground_truth=y)

                test_loss += loss.detach()

                pbar.update(1)

                # Full ranking of every target, best first; ranks are read off this ordering.
                _, indices = torch.sort(out, descending=True)
                
                # In test mode evaluate() returns raw hit counts plus one rank per example.
                acc1, acc10, acc100, pred_rank = evaluate(indices, y, test=True)
                test_acc1 += acc1
                test_acc10 += acc10
                test_acc100 += acc100
                total_seen += len(x)
                all_pred.append(pred_rank.reshape(-1))
                
                del x, y, out, loss
                if i % 20 == 0:
                    torch.cuda.empty_cache()
    
    # Loss is averaged over batches; accuracies over examples.
    test_loss /= len(loader)
    test_acc1 /= total_seen
    test_acc10 /= total_seen
    test_acc100 /= total_seen
    all_pred = torch.cat(all_pred)
    median = torch.median(all_pred)
    var = torch.var(all_pred)**0.5
    
    print(f'{name}_test_loss:', test_loss)
    print(f'{name}_test_acc1:', test_acc1)
    print(f'{name}_test_acc10:', test_acc10)
    print(f'{name}_test_acc100:', test_acc100)
    print(f'{name}_test_rank_median:', median)
    print(f'{name}_test_rank_variance', var)
    
    # Log the same, already-normalised values that were printed, as plain floats.
    wandb.log({
        f'{name}_test_loss': float(test_loss),
        f'{name}_test_acc1': test_acc1,
        f'{name}_test_acc10': test_acc10,
        f'{name}_test_acc100': test_acc100,
        f'{name}_test_rank_median': float(median),
        f'{name}_test_rank_variance': float(var)
    })
    
    

Error in callback <function _WandbInit._resume_backend at 0x7fbdd037e290> (for pre_run_cell):


Exception: The wandb backend process has shutdown

Error in callback <function _WandbInit._pause_backend at 0x7fbdd037e5f0> (for post_run_cell):


Exception: The wandb backend process has shutdown

In [79]:
# Evaluate on the "seen" test split.
test(loader=test_loader_seen, name='seen')

Test Loss: 4.968989849090576: 100%|██████████| 16/16 [00:01<00:00,  9.49it/s] 

seen_test_loss: tensor(4.9500, device='cuda:0')
seen_test_acc1: 0.278
seen_test_acc10: 0.514
seen_test_acc100: 0.726
seen_test_rank_median: tensor(8.)
seen_test_rank_variance tensor(347.2525)





In [80]:
# Evaluate on the "unseen" test split.
test(loader=test_loader_unseen, name='unseen')

Test Loss: 7.104979038238525: 100%|██████████| 16/16 [00:01<00:00, 10.96it/s]

unseen_test_loss: tensor(7.0823, device='cuda:0')
unseen_test_acc1: 0.106
unseen_test_acc10: 0.296
unseen_test_acc100: 0.518
unseen_test_rank_median: tensor(83.)
unseen_test_rank_variance tensor(430.1956)





In [81]:
# Evaluate on the "desc" test split.
test(loader=test_loader_desc, name='desc')

Test Loss: 3.049093008041382: 100%|██████████| 7/7 [00:00<00:00,  9.10it/s]

desc_test_loss: tensor(2.8834, device='cuda:0')
desc_test_acc1: 0.43
desc_test_acc10: 0.77
desc_test_acc100: 0.94
desc_test_rank_median: tensor(1.)
desc_test_rank_variance tensor(126.9127)





In [78]:
# Look at the first raw record of the description test split.
test_data_desc[0]

{'word': 'forget',
 'lexnames': ['verb.cognition'],
 'root_affix': [],
 'sememes': ['forget'],
 'definitions': 'when you knew a fact or to do something in the past but then without trying you lost this knowledge'}