In [1]:
dataset = {
    'question_ids': [0],
    'questions': ["When was Arjun born ?"],
    'contexts': ["Arjun was born in 2001"], 
    'context_ids': [0]
}

In [14]:
import itertools
import json
import os

from multiprocessing import Pool
from tokenizer import SpacyTokenizer
from dictionary import Dictionary
from dotdict import DotDictify
from tqdm.auto import tqdm
TOK = None
# Each process has its own tokenizer


def init_tokenizer(annotators):
    global TOK
    TOK = SpacyTokenizer(annotators=annotators)

# Multiprocessing requires global function


def tokenize(text):
    global TOK
    return TOK.tokenize(text)


def tokenize_all(texts, annotators, num_workers=None):
    """Tokenization might take a long time, even when done in parallel"""
    init_tokenizer(annotators)
    tokens = map(tokenize,texts)
    return list(tokens)


In [15]:
questions = tokenize_all(dataset['questions'], [
    'lemma'])
contexts = tokenize_all(dataset['contexts'], [
    'lemma', 'pos', 'ner'])

In [20]:
examples = []
for qid, cid in tqdm(enumerate(dataset['context_ids'])):
    examples.append({
        'id': dataset['question_ids'][qid],
        'question': {key: questions[qid][key] for key in ['tokens', 'lemma']},
        'context_id': cid,
        'answers': {'spans': [], 'texts': []}
    })
output = {'contexts': contexts, 'examples': examples}


1it [00:00, 15141.89it/s]


In [21]:
import json
import logging
import os

import torch
import torch.nn.functional as F

from tqdm import tqdm
import drqa
import utils
from dictionary import Dictionary
from dataset import ReadingDataset, BatchSampler
from dotdict import DotDictify
args = {
    "seed": 42,
    "data": "/scratch/arjunth2001/data",
    "max_tokens": 16000,
    "batch_size": 32,
    "num_workers": 4,
    "max_epoch": 400,
    "clip_norm": 10,
    "lr": 2e-6,
    "momentum": 0.99,
    "weight_decay": 0.0,
    "lr_shrink": 0.1,
    "min_lr": 1e-8,
    "log_file": "/scratch/arjunth2001/logs/train.log",
    "tune_embed": 1000,
    "checkpoint_dir": "./models",
    'embed_dim': 300,
    'embed_path': '/scratch/arjunth2001/data/glove.840B.300d.txt',
    'hidden_size': 128,
    'context_layers': 3,
    'question_layers': 3,
    'dropout': 0.4,
    'bidirectional': True,
    'concat_layers': True,
    'question_embed': True,
    'use_in_question': True,
    'use_lemma': True,
    'use_pos': True,
    'use_ner': True,
    'use_tf': True,

}
args = DotDictify(args)


In [22]:
if torch.cuda.is_available():
        device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('device: ' + str(device))

torch.manual_seed(args.seed)

# Load a dictionary
dictionary = Dictionary.load(os.path.join(args.data, 'dict.txt'))
print(
    'Loaded a word dictionary with {} words'.format(len(dictionary)))

# Load a training and validation dataset
with open(os.path.join(args.data, 'train.json')) as file:
    train_contents = json.load(file)
    train_dataset = ReadingDataset(
        args, train_contents['contexts'], train_contents['examples'], dictionary, skip_no_answer=True, single_answer=True)

with open(os.path.join(args.data, 'dev.json')) as file:
    contents = json.load(file)
    valid_dataset = ReadingDataset(
        args, contents['contexts'], contents['examples'], dictionary, feature_dict=train_dataset.feature_dict, skip_no_answer=True, single_answer=True
    )
contents = output
test_dataset = ReadingDataset(
    args, contents['contexts'], contents['examples'], dictionary, feature_dict=train_dataset.feature_dict, skip_no_answer=False, single_answer=True
)

# Build a model
model = drqa.DrQA.build_model(args, dictionary).to(device)
print('Built a model with {} parameters'.format(
    sum(p.numel() for p in model.parameters())))


device: cuda
Loaded a word dictionary with 29934 words
Loading Embedding..
Loaded 29547 / 29934 word embeddings (98.71%)
Built a model with 13096101 parameters


In [23]:
# Build an optimizer and a learning rate schedule
optimizer = torch.optim.Adamax(
    model.parameters(), args.lr, weight_decay=args.weight_decay)
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', patience=3, factor=args.lr_shrink)

# Load last checkpoint if one exists
utils.load_checkpoint(args, model, optimizer, lr_scheduler, device)

Loaded Checkpoint


{'epoch': 1,
 'f1_score': 0.7765869040391028,
 'best_score': 0.7765869040391028,
 'last_epoch': 33,
 'model': OrderedDict([('embedding.weight',
               tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
                       [-0.0717,  0.0883,  0.2333,  ..., -0.0548, -0.0141, -0.2110],
                       [ 0.2644, -0.2087, -0.2541,  ..., -0.0267, -0.1617,  0.2764],
                       ...,
                       [-0.0951,  0.1825, -0.0320,  ...,  0.5656,  0.5199, -0.7232],
                       [ 1.6700,  0.3455,  0.0318,  ...,  0.1492, -3.0710,  0.1245],
                       [-0.7918, -0.1060,  1.0434,  ..., -0.0442,  1.2402, -0.7389]],
                      device='cuda:0')),
              ('context_question_attention.linear.weight',
               tensor([[-0.0874, -0.0071,  0.0127,  ...,  0.0129,  0.1181, -0.2294],
                       [ 0.0250,  0.0933,  0.0577,  ..., -0.0435, -0.0226, -0.0309],
                       [-0.0291,  0.1232, -0.07

In [24]:
model.eval()
test_loader = torch.utils.data.DataLoader(
    test_dataset, num_workers=args.num_workers, collate_fn=test_dataset.collater,
    batch_sampler=BatchSampler(
        test_dataset, args.max_tokens, args.batch_size, shuffle=False, seed=args.seed)
)
progress_bar = tqdm(
    test_loader, desc='Testing', leave=False)

for batch_id, sample in enumerate(progress_bar):
    sample = utils.move_to_device(sample, device)
    with torch.no_grad():
        start_scores, end_scores = model(
            sample['context_tokens'], sample['question_tokens'],
            context_features=sample['context_features']
        )
       

        start_pred, end_pred, scores = model.decode(
            start_scores, end_scores, max_len=15)

        for i, (start_ex, end_ex) in enumerate(zip(start_pred, end_pred)):
            context = test_dataset.contexts[test_dataset.context_ids[sample['id'][i]]]
            start_idx = context['offsets'][start_ex][0]
            end_idx = context['offsets'][end_ex][1]
            text_pred = context['text'][start_idx: end_idx]
            print(text_pred)


  pred_start.append(max_idx // scores.size(0))
                                                      

2001


