In [9]:
import torch
from torch import nn
from torch.utils import data
from torch.cuda.amp import autocast, GradScaler

import numpy as np
from tqdm import tqdm

import sys
import datetime

sys.path.append('../code')
from dataset import get_data, MaskedDataset, make_vocab, read_json

from transformers import (
    AdamW, get_linear_schedule_with_warmup
)

from models import MaskedRDModel

In [10]:
# Fetch the 'tokenizer' entry point of the huggingface/pytorch-transformers hub repo
# for 'bert-base-uncased' (served from the local torch hub cache when one exists).
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')

Using cache found in /home/ubuntu/.cache/torch/hub/huggingface_pytorch-transformers_master


In [11]:
# Number of [MASK] positions reserved for the target word in every input.
mask_size = 5
# model = MaskedRDModel.from_pretrained('bert-base-uncased')
# model.initialize(mask_size=mask_size, multilabel=True)

# Resume from a whole-model checkpoint (written with torch.save(model, ...)).
# NOTE(review): torch.load unpickles arbitrary Python objects -- only load
# checkpoints that come from a trusted source.
# map_location='cpu' lets a checkpoint saved from a GPU be restored on a
# machine without one; the model is moved to the selected device in a later cell.
model = torch.load('../trained_models/MaskedRDModel_Epoch_1_at_2021-05-03_07:52:35.719441',
                   map_location='cpu')

In [12]:
# Load every data split from the WantWords English baseline directory.
# word2vec=False is forwarded to get_data (presumably skips loading word2vec
# embeddings -- TODO confirm against dataset.get_data).
d = get_data('../wantwords-english-baseline/data', word2vec=False)

Loading data...
Training data: 675715 word-def pairs
Dev data: 75873 word-def pairs
Test data: 1200 word-def pairs


In [13]:
# Name the six splits returned by get_data, in the order it returns them.
(
    train_data,
    train_data_def,
    dev_data,
    test_data_seen,
    test_data_unseen,
    test_data_desc,
) = d

In [14]:
# Build the target-vocabulary lookups used throughout the notebook: the per-target
# token-id matrix (sized by mask_size), the word -> index map, and its inverse.
target_matrix, target2idx, idx2target = make_vocab(d, tokenizer, mask_size=mask_size)

In [15]:
# target2idx maps target words to indices
# target_matrix maps target indices to bpe sequences, padded/truncated to mask_size
# idx2target maps target indices back to words
# Sanity check: show all three lookups for one word (it should round-trip to 'book').
target2idx['book'], target_matrix[target2idx['book']], idx2target[target2idx['book']]

(16187, tensor([2338,  103,  103,  103,  103]), 'book')

In [16]:
# Side data read from JSON (presumably WordNet relations per word -- TODO confirm),
# and the relation categories that are handed to MaskedDataset below.
wn_data = read_json('../data/wn_data.json')
wn_categories = ['synonyms', 'hyponyms', 'hypernyms', 'related_forms']

In [17]:
# Training set: both training splits concatenated into one dataset.
train_dataset = MaskedDataset(
    train_data + train_data_def,
    tokenizer,
    target2idx,
    wn_data=wn_data,
    wn_categories=wn_categories,
    mask_size=mask_size,
)

In [18]:
# Dev and test datasets are built exactly like the training set; only the split differs,
# so construct them in one pass (same order as before: dev, seen, unseen, desc).
dev_dataset, test_dataset_seen, test_dataset_unseen, test_dataset_desc = (
    MaskedDataset(
        split,
        tokenizer,
        target2idx,
        wn_data=wn_data,
        wn_categories=wn_categories,
        mask_size=mask_size,
    )
    for split in (dev_data, test_data_seen, test_data_unseen, test_data_desc)
)

In [19]:
# Spot-check one dev example: decode the indices held in the item's last element
# (a sparse tensor, given the coalesce()/indices() calls) next to the word for
# the item's element [1] (presumably the ground-truth target index -- TODO confirm).
index = 1593

[idx2target[x] for x in dev_dataset[index][-1].coalesce().indices().squeeze(0)], idx2target[dev_dataset[index][1]]

(['classic',
  'authorized',
  'importance',
  'authoritative',
  'classical',
  'definitive',
  'important'],
 'authoritative')

In [20]:
batch_size = 40
num_workers = 0

# Settings shared by every DataLoader; the training dataset's collate_fn is reused for all splits.
loader_params = dict(
    pin_memory=False,
    batch_size=batch_size,
    num_workers=num_workers,
    collate_fn=train_dataset.collate_fn,
)

# Train and dev loaders shuffle; the three test loaders keep dataset order.
train_loader = data.DataLoader(train_dataset, shuffle=True, **loader_params)
dev_loader = data.DataLoader(dev_dataset, shuffle=True, **loader_params)
test_loader_seen = data.DataLoader(test_dataset_seen, shuffle=False, **loader_params)
test_loader_unseen = data.DataLoader(test_dataset_unseen, shuffle=False, **loader_params)
test_loader_desc = data.DataLoader(test_dataset_desc, shuffle=False, **loader_params)

In [23]:
# Starting from epoch 2
# NOTE(review): `epoch` is set to 0 below, so the training loop labels the first resumed
# epoch (progress bar and checkpoint filename use epoch+1) as "Epoch 1" -- confirm intended.
epochs = 9
# Base rate 1e-5 scaled by 0.905 -- presumably the point the original linear decay had
# reached after the first epoch; TODO confirm.
lr = 1e-5 * 0.905
optim = AdamW(model.parameters(), lr=lr)
# Linear schedule over len(train_loader) * epochs optimiser steps with a single warmup step.
scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps=1, 
                                            num_training_steps=len(train_loader) * epochs)
epoch = 0
# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [14]:
# epochs = 10

# lr = 1e-5
# optim = AdamW(model.parameters(), lr=lr)

# warmup_duration = 0.05 # portion of the first epoch spent on lr warmup
# scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps=len(train_loader) * warmup_duration, 
#                                             num_training_steps=len(train_loader) * epochs)

# epoch = 0

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# scaler = GradScaler()

In [25]:
import wandb

# Start a Weights & Biases run in the 'reverse-dictionary' project of the 'reverse-dict' entity.
wandb.init(project='reverse-dictionary', entity='reverse-dict')

# Record this run's hyperparameters on the run config; optimizer and scheduler
# are stored by class name.
config = wandb.config
config.learning_rate = lr
config.epochs = epochs
config.batch_size = batch_size
config.optimizer = type(optim).__name__
config.scheduler = type(scheduler).__name__
# config.warmup_duration = warmup_duration

# wandb.watch(model)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [26]:
# Move the target-token matrix to the compute device; it is passed to the model on every forward call.
target_matrix = target_matrix.to(device)

In [27]:
# Move the model to the same device as the inputs and target matrix.
model = model.to(device)

In [28]:
def evaluate(pred, gt, test=False):
    """Score ranked predictions against ground-truth target indices.

    Args:
        pred: sequence of 1-D index tensors, one ranking per example, best first.
        gt: sequence of ground-truth target indices aligned with ``pred``.
        test: when True, also record the rank of every ground truth and return
            raw hit counts; when False, return hit rates over the batch.

    Returns:
        test=True:  ``(acc1, acc10, acc100, pred_rank)`` where the first three are
            hit counts and ``pred_rank`` is a float32 tensor holding the 0-based
            rank of each ground truth, capped at 1000 (1000 is also used when the
            ground truth does not occur in the ranking at all).
        test=False: ``(acc1 / n, acc10 / n, acc100 / n)``; all zeros for an empty batch.
    """
    max_rank = 1000  # ranks are clipped here; also the value for "not found"
    acc1 = acc10 = acc100 = 0
    n = len(pred)
    pred_rank = []
    for p, word in zip(pred, gt):
        if test:
            # nonzero(as_tuple=True) returns one index tensor per dimension, so the
            # tuple itself is never empty -- emptiness has to be checked on the
            # index tensor, otherwise a missing word crashes instead of scoring 1000.
            loc = (p == word).nonzero(as_tuple=True)[-1]
            if loc.numel() != 0:
                pred_rank.append(min(int(loc[0]), max_rank))
            else:
                pred_rank.append(max_rank)
        # Hits are nested: a top-1 hit is also a top-10 and a top-100 hit.
        if word in p[:100]:
            acc100 += 1
            if word in p[:10]:
                acc10 += 1
                if word == p[0]:
                    acc1 += 1
    if test:
        pred_rank = torch.tensor(pred_rank, dtype=torch.float32)
        return (acc1, acc10, acc100, pred_rank)
    if n == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0, 0.0, 0.0
    return acc1/n, acc10/n, acc100/n

In [29]:
def test(loader, name, log=False):
    """Evaluate the global ``model`` on ``loader`` and return its metrics.

    Uses the notebook-level ``model``, ``device``, ``target_matrix``,
    ``train_dataset`` (for the pad id) and ``evaluate``. The model is left in
    eval mode when the function returns.

    Args:
        loader: DataLoader yielding ``(x, y, wn_ids)`` batches.
        name: prefix for the printed and returned metric names.
        log: unused; kept so existing calls keep working.

    Returns:
        dict with ``{name}_test_loss``, ``{name}_test_acc1``, ``{name}_test_acc10``,
        ``{name}_test_acc100``, ``{name}_test_rank_median`` and
        ``{name}_test_rank_variance``. Note that the last entry holds the square
        root of the rank variance (i.e. the standard deviation); the key name is
        kept for compatibility with previously logged runs.
    """
    inc = 3  # refresh the progress-bar description every `inc` batches
    model.eval()
    test_loss = 0.0
    test_acc1 = test_acc10 = test_acc100 = 0.0
    total_seen = 0
    rank_batches = []  # one float32 rank tensor per batch
    with torch.no_grad():
        with tqdm(total=len(loader)) as pbar:
            for i, (x,y, wn_ids) in enumerate(loader):
                if i % inc == 0 and i != 0:
                    display_loss = test_loss / i
                    pbar.set_description(f'Test Loss: {display_loss}')

                x = x.to(device)
                attention_mask = (x != train_dataset.pad_id)
                y = y.to(device)
                wn_ids = wn_ids.to_dense().to(device).float()

#                 with autocast():
                loss, out = model(input_ids=x, attention_mask=attention_mask, 
                                  target_matrix=target_matrix, ground_truth=y,
                                  wn_ids=wn_ids, weight_gt=5)

                test_loss += loss.detach()

                pbar.update(1)

                # Full descending sort (not top-k): the rank of every ground truth is needed.
                _, indices = torch.sort(out, descending=True)

                acc1, acc10, acc100, pred_rank = evaluate(indices, y, test=True)
                test_acc1 += acc1
                test_acc10 += acc10
                test_acc100 += acc100
                total_seen += len(x)
                rank_batches.append(pred_rank)

                # Drop every per-batch tensor before the next batch is moved to the device.
                del x, y, wn_ids, attention_mask, out, loss, indices
                if i % 20 == 0:
                    torch.cuda.empty_cache()

    # Loss is averaged per batch; accuracies are averaged per example.
    test_loss /= len(loader)
    test_acc1 /= total_seen
    test_acc10 /= total_seen
    test_acc100 /= total_seen
    all_pred = torch.cat(rank_batches)
    median = torch.median(all_pred)
    # Standard deviation of the ranks; reported under the historical "variance" name.
    std = torch.var(all_pred)**0.5

    print(f'{name}_test_loss:', test_loss)
    print(f'{name}_test_acc1:', test_acc1)
    print(f'{name}_test_acc10:', test_acc10)
    print(f'{name}_test_acc100:', test_acc100)
    print(f'{name}_test_rank_median:', median)
    print(f'{name}_test_rank_variance', std)

    return {
            f'{name}_test_loss': test_loss,
            f'{name}_test_acc1': test_acc1,
            f'{name}_test_acc10': test_acc10,
            f'{name}_test_acc100': test_acc100,
            f'{name}_test_rank_median': median,
            f'{name}_test_rank_variance': std
           }
    

In [None]:
inc = 10  # refresh the progress-bar description every `inc` batches
losses = []
print('Training beginning!')

# `epoch` doubles as the resume point: re-running this cell after an interruption
# restarts from the epoch that was in progress.
for epoch in range(epoch, epochs):
    # Training
    model.train()
    train_loss = 0.0
    # Train on subset of training data to save time
    with tqdm(total=len(train_loader)) as pbar:
        for i, (x, y, wn_ids) in enumerate(train_loader):
            if i % inc == 0 and i != 0:
                pbar.set_description(f'Epoch {epoch+1}, Train Loss: {train_loss / i}')

            optim.zero_grad()

            x = x.to(device)
            attention_mask = (x != train_dataset.pad_id)
            y = y.to(device)
            wn_ids = wn_ids.to_dense().to(device).float()

            loss, out = model(input_ids=x, attention_mask=attention_mask,
                              target_matrix=target_matrix, ground_truth=y,
                              wn_ids=wn_ids, weight_gt=5)

#             scaler.scale(loss).backward()
            loss.backward()

#             scaler.unscale_(optim)
            # Clip the global gradient norm to 5 before the optimiser step.
            nn.utils.clip_grad_norm_(model.parameters(), 5)

#             scaler.step(optim)
            optim.step()
#             scaler.update()

            train_loss += loss.detach()

            # The schedule is defined per optimiser step, so advance it every batch.
            scheduler.step()

            pbar.update(1)

            del x, y, wn_ids, out, loss, attention_mask

    # Checkpoint the whole model after every epoch (spaces in the name become underscores).
    model_name = type(model).__name__
    filename = f'../trained_models/{model_name} Epoch {epoch+1} at {datetime.datetime.now()}'.replace(' ', '_')
    with open(filename, 'wb+') as f:
        torch.save(model, f)

    # Validation
    model.eval()
    val_loss = 0.0
    val_acc1, val_acc10, val_acc100 = 0.0, 0.0, 0.0
    val_batches = 0  # batches actually scored; equals len(dev_loader) unless validation aborts
    try:
        with torch.no_grad():
            with tqdm(total=len(dev_loader)) as pbar:
                for i, (x, y, wn_ids) in enumerate(dev_loader):
                    if i % inc == 0 and i != 0:
                        pbar.set_description(f'Epoch {epoch+1}, Val Loss: {val_loss / i}')

                    x = x.to(device)
                    attention_mask = (x != train_dataset.pad_id)
                    y = y.to(device)
                    wn_ids = wn_ids.to_dense().to(device).float()

    #                 with autocast():
                    loss, out = model(input_ids=x, attention_mask=attention_mask,
                                      target_matrix=target_matrix, ground_truth=y,
                                      wn_ids=wn_ids, weight_gt=5)

                    val_loss += loss.detach()

                    pbar.update(1)

                    # Only the top 100 candidates are needed for acc@1/10/100.
                    result, indices = torch.topk(out, k=100, dim=-1, largest=True, sorted=True)

                    acc1, acc10, acc100 = evaluate(indices, y)
                    val_acc1 += acc1
                    val_acc10 += acc10
                    val_acc100 += acc100
                    val_batches += 1

                    del x, y, wn_ids, attention_mask, out, loss
    except Exception as err:
        # Validation stays best-effort, but the cause is reported and
        # KeyboardInterrupt is no longer swallowed (a bare `except:` caught it).
        print(f'Error encountered, aborting validation! ({err!r})')

    # Average over the batches that were actually scored so that an aborted
    # validation pass does not under-report its metrics.
    val_denominator = max(val_batches, 1)
    wandb.log({
        'train_loss': train_loss / len(train_loader),
        'val_loss': val_loss / val_denominator,
        'val_acc1': val_acc1 / val_denominator,
        'val_acc10': val_acc10 / val_denominator,
        'val_acc100': val_acc100 / val_denominator,
        **test(test_loader_seen, 'seen'),
        **test(test_loader_unseen, 'unseen'),
        **test(test_loader_desc, 'desc')
    })
    

  0%|          | 0/16893 [00:00<?, ?it/s]

Training beginning!


Epoch 1, Train Loss: 144.08778381347656:   6%|▌         | 1035/16893 [06:28<1:29:22,  2.96it/s]

In [21]:
def getPredFromDesc(model, desc : str, mask_size=5, top_n=10):
    """Return the ``top_n`` highest-scoring target words for a free-text description.

    The input is built as ``[CLS] + [MASK] * mask_size + [SEP] + description tokens``
    using the special-token ids of the notebook-level ``train_dataset``; it also
    relies on the notebook-level ``tokenizer``, ``device``, ``target_matrix`` and
    ``idx2target``.

    Inference runs in eval mode and without gradient tracking, so predictions are
    not perturbed by dropout and no autograd graph is kept on the device. The
    model's previous train/eval mode is restored afterwards.

    Args:
        model: model called as ``model(input_ids=..., attention_mask=..., target_matrix=...)``;
            without a ground truth it is expected to return the score tensor only.
        desc: description text to look up.
        mask_size: number of mask tokens placed before the description.
        top_n: number of candidates to return.

    Returns:
        ``(words, indices)``: the candidate words, best first, and the tensor of
        their target indices.
    """
    desc = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(desc))
    cls_id, mask_id, sep_id, pad_id = train_dataset.cls_id, train_dataset.mask_id, train_dataset.sep_id, train_dataset.pad_id
    desc_ids = [cls_id] + [mask_id] * mask_size + [sep_id] + desc
    x = torch.tensor(desc_ids).unsqueeze(0).to(device)
    attention_mask = (x != pad_id)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            out = model(input_ids=x, attention_mask=attention_mask, target_matrix=target_matrix)
            result, indices = torch.topk(out, k=top_n, dim=-1, largest=True, sorted=True)
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    # Batch of one: keep the single row of candidate indices.
    indices = indices[0]
    return [idx2target[i] for i in indices.tolist()], indices
    

In [27]:
# Qualitative check: top 100 candidates for a description of a person.
getPredFromDesc(model, desc='an inhabitant of a cold country', mask_size=5, top_n=100)

(['arctic',
  'icy',
  'cold',
  'glacial',
  'coldness',
  'frozen',
  'coldhearted',
  'ice',
  'icer',
  'rustic',
  'cool',
  'frost',
  'chiller',
  'iceman',
  'winters',
  'european',
  'norwegian',
  'chill',
  'glacier',
  'northerner',
  'coolie',
  'nordic',
  'country',
  'iceberg',
  'icepick',
  'coolness',
  'barbarian',
  'bohemian',
  'outflank',
  'highlander',
  'cooler',
  'vagrant',
  'polar',
  'inhabitant',
  'dweller',
  'northern',
  'alpine',
  'scandinavian',
  'russian',
  'freezing',
  'peasant',
  'icebound',
  'outlander',
  'winter',
  'gypsy',
  'highland',
  'deserter',
  'chilliness',
  'somebody',
  'bleak',
  'icelandic',
  'snowy',
  'countryman',
  'muscovite',
  'irish',
  'fragrant',
  'continental',
  'siberian',
  'dwell',
  'hardy',
  'skiing',
  'cooly',
  'frosty',
  'snow',
  'frosting',
  'yankee',
  'maine',
  'icecap',
  'blizzard',
  'inhabit',
  'russ',
  'viking',
  'aussie',
  'groat',
  'petticoat',
  'skier',
  'wild',
  'climate'

In [49]:
# Qualitative check: what the model ranks highest when the description is empty.
getPredFromDesc(model, desc='', mask_size=5, top_n=100)

(['tree',
  'wood',
  'treed',
  'shrub',
  'treetop',
  'teakwood',
  'pinewood',
  'lime',
  'linden',
  'spruce',
  'bush',
  'logwood',
  'maple',
  'chestnut',
  'timber',
  'birch',
  'oak',
  'pollard',
  'woodcut',
  'mahogany',
  'hazel',
  'woodpecker',
  'boxwood',
  'conifer',
  'bearberry',
  'fir',
  'hop',
  'eucalyptus',
  'forest',
  'brushwood',
  'manoeuver',
  'rosewood',
  'huckleberry',
  'vine',
  'laurel',
  'pine',
  'bark',
  'pineapple',
  'cottonwood',
  'herb',
  'raspberry',
  'beechwood',
  'redwood',
  'palm',
  'cypress',
  'hazelnut',
  'plum',
  'bushwhack',
  'dogwood',
  'grow',
  'fig',
  'tea',
  'grass',
  'lumber',
  'gum',
  'logjam',
  'ebony',
  'bushmeat',
  'log',
  'cover',
  'hit',
  'vinifera',
  'cedarwood',
  'leatherwood',
  'sap',
  'cordwood',
  'berry',
  'elderberry',
  'bayberry',
  'tap',
  'elm',
  'support',
  'shoot',
  'touchwood',
  'gage',
  'hawthorn',
  'woodruff',
  'willow',
  'taproot',
  'oakleaf',
  'root',
  'limes

In [25]:
# Qualitative check: top 100 candidates for a description of a fast road.
getPredFromDesc(model, desc='a road on which cars can go fast', mask_size=5, top_n=100)

(['road',
  'drive',
  'runway',
  'expressway',
  'freeway',
  'highway',
  'trackway',
  'roads',
  'move',
  'motorway',
  'track',
  'drag',
  'turn',
  'driveway',
  'route',
  'turnpike',
  'travel',
  'fast',
  'parkway',
  'beltway',
  'speedway',
  'pass',
  'raceway',
  'belt',
  'street',
  'speed',
  'hit',
  'motor',
  'strip',
  'make',
  'roadster',
  'straightway',
  'crisscross',
  'render',
  'thruway',
  'straight',
  'autobahn',
  'maneuver',
  'chase',
  'get',
  'mean',
  'drift',
  'go',
  'take',
  'dragster',
  'rip',
  'lift',
  'range',
  'way',
  'channel',
  'itinerary',
  'supply',
  'railway',
  'incline',
  'glide',
  'course',
  'railroad',
  'flyway',
  'run',
  'path',
  'canal',
  'trail',
  'reach',
  'pike',
  'thoroughfare',
  'avenue',
  'draw',
  'fly',
  'superhighway',
  'bypass',
  'divide',
  'interchange',
  'tail',
  'zip',
  'reverberate',
  'straightaway',
  'curve',
  'rails',
  'carry',
  'fix',
  'getaway',
  'cutting',
  'section',
 

In [47]:
# Qualitative check: top 100 candidates for a short two-word description.
getPredFromDesc(model, desc='an intelligent person', mask_size=5, top_n=100)

(['intellect',
  'intelligent',
  'genius',
  'intelligence',
  'intellectual',
  'brain',
  'somebody',
  'brainiac',
  'brainpower',
  'expert',
  'einstein',
  'mind',
  'intelligentsia',
  'person',
  'brainstem',
  'brains',
  'soul',
  'minder',
  'expertness',
  'scientist',
  'sensitive',
  'wit',
  'smart',
  'brainy',
  'philosopher',
  'sensible',
  'psychic',
  'intellectualism',
  'mindfulness',
  'brainwashed',
  'adept',
  'scholar',
  'clever',
  'eyewitness',
  'sensitiveness',
  'brainstorming',
  'braincase',
  'smartness',
  'sentient',
  'brainchild',
  'emollient',
  'mastermind',
  'intellection',
  'visionary',
  'individual',
  'insight',
  'sentience',
  'individualist',
  'cognition',
  'keen',
  'mindful',
  'adeptness',
  'brainwash',
  'cleverness',
  'creativity',
  'think',
  'soulmate',
  'minding',
  'knowledge',
  'artist',
  'psyche',
  'brilliant',
  'wisdom',
  'soulful',
  'intelligently',
  'keener',
  'sense',
  'reader',
  'science',
  'able',


In [26]:
# Metrics on the "seen" test split.
test(loader=test_loader_seen, name='seen')  # epoch 1

  0%|          | 0/11 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 48.00 MiB (GPU 0; 14.76 GiB total capacity; 12.38 GiB already allocated; 17.75 MiB free; 13.48 GiB reserved in total by PyTorch)

In [21]:
# Metrics on the "unseen" test split.
test(loader=test_loader_unseen, name='unseen')  # epoch 1

Test Loss: 105.3856201171875: 100%|██████████| 11/11 [00:01<00:00,  7.55it/s]

unseen_test_loss: tensor(107.8782, device='cuda:0')
unseen_test_acc1: 0.056
unseen_test_acc10: 0.216
unseen_test_acc100: 0.366
unseen_test_rank_median: tensor(640.)
unseen_test_rank_variance tensor(458.6854)





{'unseen_test_loss': tensor(107.8782, device='cuda:0'),
 'unseen_test_acc1': 0.056,
 'unseen_test_acc10': 0.216,
 'unseen_test_acc100': 0.366,
 'unseen_test_rank_median': tensor(640.),
 'unseen_test_rank_variance': tensor(458.6854)}

In [25]:
# Metrics on the "desc" test split.
test(loader=test_loader_desc, name='desc')  # epoch 1

Test Loss: 199.60968017578125: 100%|██████████| 5/5 [00:00<00:00, 10.03it/s]

desc_test_loss: tensor(173.0187, device='cuda:0')
desc_test_acc1: 0.195
desc_test_acc10: 0.705
desc_test_acc100: 0.935
desc_test_rank_median: tensor(3.)
desc_test_rank_variance tensor(88.3132)





{'desc_test_loss': tensor(173.0187, device='cuda:0'),
 'desc_test_acc1': 0.195,
 'desc_test_acc10': 0.705,
 'desc_test_acc100': 0.935,
 'desc_test_rank_median': tensor(3.),
 'desc_test_rank_variance': tensor(88.3132)}

In [50]:
# Manual snapshot of the whole model object (not just a state_dict) under a fixed
# filename; re-running this cell overwrites the previous snapshot.
torch.save(model, '../trained_models/epoch2_mid.pt')