# 2 - NestedField, CharCNN and Inference

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets

import random
import numpy as np

SEED = 1234

np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

WORDS = data.Field(lower=True)
CHAR_NESTING = data.Field(tokenize=list, lower=True)
CHARS = data.NestedField(CHAR_NESTING)
UD_TAGS = data.Field(unk_token=None)
PTB_TAGS = data.Field(unk_token=None)

fields = [(("words", "chars"), (WORDS, CHARS)), ("udtags", UD_TAGS), ("ptbtags", PTB_TAGS)]

In [2]:
train_data, valid_data, test_data = datasets.UDPOS.splits(fields)

In [3]:
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

Number of training examples: 12543
Number of validation examples: 2002
Number of testing examples: 2077


In [4]:
print(vars(train_data.examples[0]))

{'words': ['al', '-', 'zaman', ':', 'american', 'forces', 'killed', 'shaikh', 'abdullah', 'al', '-', 'ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'qaim', ',', 'near', 'the', 'syrian', 'border', '.'], 'chars': [['a', 'l'], ['-'], ['z', 'a', 'm', 'a', 'n'], [':'], ['a', 'm', 'e', 'r', 'i', 'c', 'a', 'n'], ['f', 'o', 'r', 'c', 'e', 's'], ['k', 'i', 'l', 'l', 'e', 'd'], ['s', 'h', 'a', 'i', 'k', 'h'], ['a', 'b', 'd', 'u', 'l', 'l', 'a', 'h'], ['a', 'l'], ['-'], ['a', 'n', 'i'], [','], ['t', 'h', 'e'], ['p', 'r', 'e', 'a', 'c', 'h', 'e', 'r'], ['a', 't'], ['t', 'h', 'e'], ['m', 'o', 's', 'q', 'u', 'e'], ['i', 'n'], ['t', 'h', 'e'], ['t', 'o', 'w', 'n'], ['o', 'f'], ['q', 'a', 'i', 'm'], [','], ['n', 'e', 'a', 'r'], ['t', 'h', 'e'], ['s', 'y', 'r', 'i', 'a', 'n'], ['b', 'o', 'r', 'd', 'e', 'r'], ['.']], 'udtags': ['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'N

In [5]:
print(vars(train_data.examples[0])['words'])

['al', '-', 'zaman', ':', 'american', 'forces', 'killed', 'shaikh', 'abdullah', 'al', '-', 'ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'qaim', ',', 'near', 'the', 'syrian', 'border', '.']


In [6]:
print(vars(train_data.examples[0])['chars'])

[['a', 'l'], ['-'], ['z', 'a', 'm', 'a', 'n'], [':'], ['a', 'm', 'e', 'r', 'i', 'c', 'a', 'n'], ['f', 'o', 'r', 'c', 'e', 's'], ['k', 'i', 'l', 'l', 'e', 'd'], ['s', 'h', 'a', 'i', 'k', 'h'], ['a', 'b', 'd', 'u', 'l', 'l', 'a', 'h'], ['a', 'l'], ['-'], ['a', 'n', 'i'], [','], ['t', 'h', 'e'], ['p', 'r', 'e', 'a', 'c', 'h', 'e', 'r'], ['a', 't'], ['t', 'h', 'e'], ['m', 'o', 's', 'q', 'u', 'e'], ['i', 'n'], ['t', 'h', 'e'], ['t', 'o', 'w', 'n'], ['o', 'f'], ['q', 'a', 'i', 'm'], [','], ['n', 'e', 'a', 'r'], ['t', 'h', 'e'], ['s', 'y', 'r', 'i', 'a', 'n'], ['b', 'o', 'r', 'd', 'e', 'r'], ['.']]


In [7]:
print(vars(train_data.examples[0])['udtags'])

['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT']


In [8]:
print(vars(train_data.examples[0])['ptbtags'])

['NNP', 'HYPH', 'NNP', ':', 'JJ', 'NNS', 'VBD', 'NNP', 'NNP', 'NNP', 'HYPH', 'NNP', ',', 'DT', 'NN', 'IN', 'DT', 'NN', 'IN', 'DT', 'NN', 'IN', 'NNP', ',', 'IN', 'DT', 'JJ', 'NN', '.']


In [9]:
MIN_FREQ = 2

WORDS.build_vocab(train_data, 
                 min_freq = MIN_FREQ,
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

CHARS.build_vocab(train_data,
                  min_freq = MIN_FREQ,
                  vectors = "glove.6B.50d",
                  unk_init = torch.Tensor.normal_)

UD_TAGS.build_vocab(train_data)
PTB_TAGS.build_vocab(train_data)

In [10]:
print(f"Unique tokens in WORDS vocabulary: {len(WORDS.vocab)}")
print(f"Unique tokens in CHARS vocabulary: {len(CHARS.vocab)}")
print(f"Unique tokens in UD_TAG vocabulary: {len(UD_TAGS.vocab)}")
print(f"Unique tokens in PTB_TAG vocabulary: {len(PTB_TAGS.vocab)}")

Unique tokens in WORDS vocabulary: 8866
Unique tokens in CHARS vocabulary: 78
Unique tokens in UD_TAG vocabulary: 18
Unique tokens in PTB_TAG vocabulary: 51


In [11]:
print(CHARS.vocab.freqs.most_common())

[('e', 95341), ('t', 71276), ('a', 66834), ('o', 60181), ('i', 57634), ('n', 55029), ('s', 48772), ('r', 46948), ('h', 38062), ('l', 33300), ('d', 30102), ('u', 22951), ('c', 22852), ('m', 20382), ('y', 16994), ('f', 16355), ('g', 16040), ('p', 15996), ('w', 15814), ('b', 12319), ('.', 11491), ('v', 8567), (',', 7155), ('k', 7150), ('-', 3768), ('0', 3035), ("'", 2558), ('1', 1929), ('j', 1652), ('x', 1640), ('2', 1583), ('"', 1298), ('!', 1221), ('/', 1140), (':', 1092), ('q', 1065), ('3', 1048), (')', 938), ('?', 937), ('(', 866), ('5', 855), ('4', 717), ('z', 704), ('9', 702), ('6', 598), ('7', 597), ('8', 547), ('_', 539), ('=', 369), ('*', 310), ('$', 270), ('@', 177), ('&', 158), ('>', 151), ('<', 143), (';', 109), ('’', 94), ('#', 73), ('+', 54), ('%', 43), ('[', 34), (']', 34), ('“', 30), ('”', 30), ('|', 25), ('~', 17), ('`', 15), ('‘', 13), ('—', 9), ('–', 9), ('^', 8), ('…', 7), ('·', 6), ('{', 4), ('}', 3), ('é', 2), ('ã', 1), ('³', 1), ('á', 1), ('ç', 1), ('\xad', 1), ('£'

In [12]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    device = device)

In [13]:
class WordEncoder(nn.Module):
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 pad_idx):
    
        super().__init__()
    
        self.embedding_dim = embedding_dim
    
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
    
    def forward(self, words):
        
        #words = [sent len, batch size]
        
        embedded = self.embedding(words)
        
        #embedded = [sent len, batch size, emb dim]
        
        return embedded

In [14]:
class CharacterEncoder(nn.Module):
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 filter_size,
                 hidden_dim,
                 pad_idx):
    
        super().__init__()
        
        assert filter_size % 2 == 1, "Kernel size must be odd!"
        
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.cnn = nn.Conv1d(in_channels = embedding_dim,
                             out_channels = hidden_dim,
                             kernel_size = filter_size,
                             padding = (filter_size - 1) // 2)
        
    def forward(self, chars):
        
        #chars = [batch size, sent len, word len]
        
        batch_size = chars.shape[0]
        sent_len = chars.shape[1]
        word_len = chars.shape[2]
        
        embedded = self.embedding(chars)
        
        #embedded = [batch size, sent len, char emb dim, word len]
        
        embedded = embedded.view(-1, word_len, self.embedding_dim)
        embedded = embedded.permute(0, 2, 1)
        
        #embedded = [batch size * sent len, char emb dim, word len]
        
        embedded = self.cnn(embedded)
                
        #embedded = [batch size * sent len, hid dim, word len]
        
        embedded = embedded.view(batch_size, sent_len, self.hidden_dim, word_len)
                
        #embedded = [batch size, sent len, hid dim, word len]
        
        embedded = torch.max(embedded, dim = -1).values
                
        #embeded = [batch size, sent len, hid dim]
        
        embedded = embedded.permute(1, 0, 2)
        
        #embedded = [sent len, batch size, hid dim]
        
        return embedded

In [15]:
class RNNPOSTagger(nn.Module):
    def __init__(self, 
                 word_encoder,
                 char_encoder,
                 hidden_dim, 
                 output_dim, 
                 n_layers, 
                 bidirectional, 
                 dropout):
        
        super().__init__()
        
        assert word_encoder.embedding_dim == char_encoder.hidden_dim
        
        self.word_encoder = word_encoder
        self.char_encoder = char_encoder
        
        embedding_dim = 2 * word_encoder.embedding_dim
        
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers = n_layers, 
                           bidirectional = bidirectional)
        
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
          
    def forward(self, words, chars):

        #words = [sent len, batch size]
        #chars = [batch size, sent len, word len]
        
        words_embedded = self.dropout(self.word_encoder(words))
        chars_embedded = self.dropout(self.char_encoder(chars))
                
        #words_embedded = [sent len, batch size, emb dim]
        #chars_embeded = [sent len, batch size, emb dim]
        
        embedded = torch.cat((chars_embedded, words_embedded), dim = -1)
        
        #embedded = [sent len, batch size, emb dim * 2]
                
        outputs, (hidden, cell) = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim * n directions]
        #hidden/cell = [n layers * n directions, batch size, hid dim]
        
        predictions = self.fc(self.dropout(outputs))
        
        #predictions = [sent len, batch size, output dim]
        
        return predictions

In [16]:
WORD_INPUT_DIM = len(WORDS.vocab)
WORD_EMBEDDING_DIM = 100
WORD_PAD_IDX = WORDS.vocab.stoi[WORDS.pad_token]

CHAR_INPUT_DIM = len(CHARS.vocab)
CHAR_EMBEDDING_DIM = 50
CHAR_CNN_FILTER_SIZE = 3
CHAR_PAD_IDX = CHARS.vocab.stoi[CHARS.pad_token]

HIDDEN_DIM = 128
OUTPUT_DIM = len(UD_TAGS.vocab)
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

word_encoder = WordEncoder(WORD_INPUT_DIM,
                           WORD_EMBEDDING_DIM,
                           WORD_PAD_IDX)

char_encoder = CharacterEncoder(CHAR_INPUT_DIM,
                                CHAR_EMBEDDING_DIM,
                                CHAR_CNN_FILTER_SIZE,
                                WORD_EMBEDDING_DIM,
                                CHAR_PAD_IDX)

model = RNNPOSTagger(word_encoder,
                     char_encoder,
                     HIDDEN_DIM, 
                     OUTPUT_DIM, 
                     N_LAYERS, 
                     BIDIRECTIONAL, 
                     DROPOUT)

In [17]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.1)
        
model.apply(init_weights)

RNNPOSTagger(
  (word_encoder): WordEncoder(
    (embedding): Embedding(8866, 100, padding_idx=1)
  )
  (char_encoder): CharacterEncoder(
    (embedding): Embedding(78, 50, padding_idx=1)
    (cnn): Conv1d(50, 100, kernel_size=(3,), stride=(1,), padding=(1,))
  )
  (rnn): LSTM(200, 128, num_layers=2, bidirectional=True)
  (fc): Linear(in_features=256, out_features=18, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)

In [18]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,643,410 trainable parameters


In [19]:
pretrained_word_embeddings = WORDS.vocab.vectors

print(pretrained_word_embeddings.shape)

torch.Size([8866, 100])


In [20]:
pretrained_char_embeddings = CHARS.vocab.vectors

print(pretrained_char_embeddings.shape)

torch.Size([78, 50])


In [21]:
word_encoder.embedding.weight.data.copy_(pretrained_word_embeddings)
char_encoder.embedding.weight.data.copy_(pretrained_char_embeddings)

tensor([[ 1.0519,  0.7052,  0.8600,  ...,  0.0044, -2.0071, -0.0570],
        [-1.2102, -0.9203,  0.1332,  ...,  1.9241,  0.7073,  1.3111],
        [ 0.7383,  0.6545,  1.0873,  ..., -0.1680,  0.6562,  1.1014],
        ...,
        [-0.4286,  1.0551,  0.6042,  ..., -0.0753, -0.1357,  0.6105],
        [-0.6707,  0.6986,  0.6963,  ...,  0.0801,  0.1009,  0.9292],
        [-0.1443,  0.1088, -0.5041,  ...,  0.2690,  1.1543,  1.0493]])

In [22]:
WORD_UNK_IDX = WORDS.vocab.stoi[WORDS.unk_token]

word_encoder.embedding.weight.data[WORD_UNK_IDX] = torch.zeros(WORD_EMBEDDING_DIM)
word_encoder.embedding.weight.data[WORD_PAD_IDX] = torch.zeros(WORD_EMBEDDING_DIM)

print(word_encoder.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.9261,  2.3049,  0.5502,  ..., -0.3492, -0.5298, -0.1577],
        [-0.5972,  0.0471, -0.2406,  ..., -0.9446, -0.1126, -0.2260],
        [-0.4809,  2.5629,  0.9530,  ...,  0.5278, -0.4588,  0.7294]])


In [23]:
CHAR_UNK_IDX = CHARS.vocab.stoi[CHARS.unk_token]

char_encoder.embedding.weight.data[CHAR_UNK_IDX] = torch.zeros(CHAR_EMBEDDING_DIM)
char_encoder.embedding.weight.data[CHAR_PAD_IDX] = torch.zeros(CHAR_EMBEDDING_DIM)

print(char_encoder.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.7383,  0.6545,  1.0873,  ..., -0.1680,  0.6562,  1.1014],
        ...,
        [-0.4286,  1.0551,  0.6042,  ..., -0.0753, -0.1357,  0.6105],
        [-0.6707,  0.6986,  0.6963,  ...,  0.0801,  0.1009,  0.9292],
        [-0.1443,  0.1088, -0.5041,  ...,  0.2690,  1.1543,  1.0493]])


In [24]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [25]:
TAG_PAD_IDX = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)

In [26]:
model = model.to(device)
criterion = criterion.to(device)

In [27]:
def categorical_accuracy(preds, y, tag_pad_idx):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    non_pad_elements = (y != tag_pad_idx).nonzero()
    correct = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements])
    return correct.sum() / torch.FloatTensor([y[non_pad_elements].shape[0]])

In [28]:
def train(model, iterator, optimizer, criterion, tag_pad_idx):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        words = batch.words
        chars = batch.chars
        tags = batch.udtags
        
        optimizer.zero_grad()
        
        #words = [sent len, batch size]
        #chars = [batch size, sent len, word len]
        
        predictions = model(words, chars)
        
        #predictions = [sent len, batch size, output dim]
        #tags = [sent len, batch size]
        
        predictions = predictions.view(-1, predictions.shape[-1])
        tags = tags.view(-1)
        
        #predictions = [sent len * batch size, output dim]
        #tags = [sent len * batch size]
        
        loss = criterion(predictions, tags)
                
        acc = categorical_accuracy(predictions, tags, tag_pad_idx)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [29]:
def evaluate(model, iterator, criterion, tag_pad_idx):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            words = batch.words
            chars = batch.chars
            tags = batch.udtags
            
            predictions = model(words, chars)
            
            predictions = predictions.view(-1, predictions.shape[-1])
            tags = tags.view(-1)
            
            loss = criterion(predictions, tags)
            
            acc = categorical_accuracy(predictions, tags, tag_pad_idx)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [30]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [31]:
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, TAG_PAD_IDX)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, TAG_PAD_IDX)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 10s
	Train Loss: 0.806 | Train Acc: 74.70%
	 Val. Loss: 0.402 |  Val. Acc: 87.24%
Epoch: 02 | Epoch Time: 0m 10s
	Train Loss: 0.281 | Train Acc: 91.07%
	 Val. Loss: 0.315 |  Val. Acc: 89.61%
Epoch: 03 | Epoch Time: 0m 10s
	Train Loss: 0.213 | Train Acc: 93.18%
	 Val. Loss: 0.289 |  Val. Acc: 90.80%
Epoch: 04 | Epoch Time: 0m 10s
	Train Loss: 0.177 | Train Acc: 94.26%
	 Val. Loss: 0.268 |  Val. Acc: 91.32%
Epoch: 05 | Epoch Time: 0m 10s
	Train Loss: 0.153 | Train Acc: 95.01%
	 Val. Loss: 0.250 |  Val. Acc: 91.89%
Epoch: 06 | Epoch Time: 0m 10s
	Train Loss: 0.134 | Train Acc: 95.65%
	 Val. Loss: 0.247 |  Val. Acc: 92.18%
Epoch: 07 | Epoch Time: 0m 10s
	Train Loss: 0.120 | Train Acc: 96.06%
	 Val. Loss: 0.244 |  Val. Acc: 92.42%
Epoch: 08 | Epoch Time: 0m 10s
	Train Loss: 0.107 | Train Acc: 96.51%
	 Val. Loss: 0.233 |  Val. Acc: 93.01%
Epoch: 09 | Epoch Time: 0m 10s
	Train Loss: 0.098 | Train Acc: 96.76%
	 Val. Loss: 0.241 |  Val. Acc: 92.48%
Epoch: 10 | Epoch T

In [32]:
test_loss, test_acc = evaluate(model, test_iterator, criterion, TAG_PAD_IDX)

print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.244 |  Test Acc: 92.68%


In [33]:
import spacy

def tag_sentence(model, device, sentence, word_vocab, char_vocab, tag_vocab):
    
    if isinstance(sentence, str):
        nlp = spacy.load('en')
        tokens = [token.text.lower() for token in nlp(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    numericalized_words = [word_vocab.stoi[t] for t in tokens]

    unks = [t for t, n in zip(tokens, numericalized_words) if n == 0]
        
    chars = [list(t) for t in tokens]
    char_len = max([len(c) for c in chars])
    chars = [c + ['<pad>'] * (char_len - len(c)) for c in chars]
    

    numericalized_chars = [[char_vocab.stoi[i] for i in c] for c in chars]
    
    word_tensor = torch.LongTensor(numericalized_words)
    char_tensor = torch.LongTensor(numericalized_chars)
    
    word_tensor = word_tensor.unsqueeze(-1).to(device)
    char_tensor = char_tensor.unsqueeze(0).to(device)
    
    model.eval()
     
    predictions = model(word_tensor, char_tensor)
    
    top_predictions = predictions.argmax(-1)
    
    predicted_tags = [tag_vocab.itos[t.item()] for t in top_predictions]
    
    return tokens, predicted_tags, unks

In [34]:
example_index = 4

sentence = vars(train_data.examples[example_index])['words']
actual_tags = vars(train_data.examples[example_index])['udtags']

print(sentence)

['the', 'moi', 'in', 'iraq', 'is', 'equivalent', 'to', 'the', 'us', 'fbi', ',', 'so', 'this', 'would', 'be', 'like', 'having', 'j.', 'edgar', 'hoover', 'unwittingly', 'employ', 'at', 'a', 'high', 'level', 'members', 'of', 'the', 'weathermen', 'bombers', 'back', 'in', 'the', '1960s', '.']


In [35]:
tokens, pred_tags, unks = tag_sentence(model, device, sentence, WORDS.vocab, CHARS.vocab, UD_TAGS.vocab)

print(unks)

['moi', 'edgar', 'hoover', 'unwittingly', 'weathermen']


In [36]:
print("Pred. Tag\tActual Tag\tCorrect?\tToken\n")

for token, pred_tag, actual_tag in zip(tokens, pred_tags, actual_tags):
    correct = '✔' if pred_tag == actual_tag else '✘'
    print(f"{pred_tag}\t\t{actual_tag}\t\t{correct}\t\t{token}")

Pred. Tag	Actual Tag	Correct?	Token

DET		DET		✔		the
NOUN		PROPN		✘		moi
ADP		ADP		✔		in
PROPN		PROPN		✔		iraq
AUX		AUX		✔		is
ADJ		ADJ		✔		equivalent
ADP		ADP		✔		to
DET		DET		✔		the
PROPN		PROPN		✔		us
PROPN		PROPN		✔		fbi
PUNCT		PUNCT		✔		,
ADV		ADV		✔		so
PRON		PRON		✔		this
AUX		AUX		✔		would
VERB		VERB		✔		be
SCONJ		SCONJ		✔		like
VERB		VERB		✔		having
PROPN		PROPN		✔		j.
PROPN		PROPN		✔		edgar
PROPN		PROPN		✔		hoover
ADV		ADV		✔		unwittingly
VERB		VERB		✔		employ
ADP		ADP		✔		at
DET		DET		✔		a
ADJ		ADJ		✔		high
NOUN		NOUN		✔		level
NOUN		NOUN		✔		members
ADP		ADP		✔		of
DET		DET		✔		the
PROPN		PROPN		✔		weathermen
NOUN		NOUN		✔		bombers
ADV		ADV		✔		back
ADP		ADP		✔		in
DET		DET		✔		the
NOUN		NOUN		✔		1960s
PUNCT		PUNCT		✔		.


In [37]:
sentence = 'The Queen will deliver a speech about the conflict in North Korea at 1pm tomorrow.'

tokens, tags, unks = tag_sentence(model, device, sentence, WORDS.vocab, CHARS.vocab, UD_TAGS.vocab)

print(unks)

[]


In [38]:
print("Pred. Tag\tToken\n")

for token, tag in zip(tokens, tags):
    print(f"{tag}\t\t{token}")

Pred. Tag	Token

DET		the
NOUN		queen
AUX		will
VERB		deliver
DET		a
NOUN		speech
ADP		about
DET		the
NOUN		conflict
ADP		in
PROPN		north
PROPN		korea
ADP		at
NUM		1
NOUN		pm
NOUN		tomorrow
PUNCT		.
