In [1]:
import torch
import numpy as np
from gensim.models import Word2Vec
import gensim
from nltk.tokenize import sent_tokenize, word_tokenize
from conllu import parse
import torch.nn as nn
from torch.utils.data import Dataset
from torchtext.vocab import build_vocab_from_iterator, Vocab
from torch.nn.utils.rnn import pad_sequence

In [2]:
train_file = "./ud-treebanks-v2.13/UD_English-Atis/en_atis-ud-train.conllu"
dev_file = "./ud-treebanks-v2.13/UD_English-Atis/en_atis-ud-dev.conllu"
test_file = "./ud-treebanks-v2.13/UD_English-Atis/en_atis-ud-test.conllu"

def read_conllu(file):
    with open(file, "r") as f:
        data = f.read()
    return parse(data)

train_data = read_conllu(train_file)
dev_data = read_conllu(dev_file)
test_data = read_conllu(test_file)

In [18]:
import torch
def get_sentences(data):
    sentences = []
    for sentence in data:
        sentence = [word['form'] for word in sentence]
        sentences.append(sentence)
    return sentences

train_sentences = get_sentences(train_data)
dev_sentences = get_sentences(dev_data)
test_sentences = get_sentences(test_data)


tags_to_num = {
    "ADJ": 0,
    "ADP": 1,
    "ADV": 2,
    "AUX": 3,
    "CCONJ": 4,
    "DET": 5,
    "INTJ": 6,
    "NOUN": 7,
    "NUM": 8,
    "PART": 9,
    "PRON": 10,
    "PROPN": 11,
    "PUNCT": 12,
    "SCONJ": 13,
    "SYM": 14,
    "VERB": 15,
    "X": 16,
    "PAD": 17,
}

num_to_tag = {
    0: "ADJ",
    1: "ADP",
    2: "ADV",
    3: "AUX",
    4: "CCONJ",
    5: "DET",
    6: "INTJ",
    7: "NOUN",
    8: "NUM",
    9: "PART",
    10: "PRON",
    11: "PROPN",
    12: "PUNCT",
    13: "SCONJ",
    14: "SYM",
    15: "VERB",
    16: "X",
    17: "PAD",

}

def get_tags(data):
    tags = []
    for sentence in data:
        sentence = [tags_to_num[word['upostag']] for word in sentence]
        tags.append(sentence)
    return tags

train_tags = get_tags(train_data)
dev_tags = get_tags(dev_data)
test_tags = get_tags(test_data)




In [4]:
START_TOKEN = "<s>"
END_TOKEN = "</s>"
UNKNOWN_TOKEN = "<unk>"
PAD_TOKEN = "<pad>"

class POSDataset(Dataset):
  def __init__(self, p, s, data: list[tuple[list[str], list[int]]], vocabulary:Vocab|None=None):
    """Initialize the dataset. Setup Code goes here"""
    self.p = p
    self.s = s
    self.sentences = [i[0] for i in data]
    self.labels = [i[1] for i in data]


    if vocabulary is None:
      self.vocabulary = build_vocab_from_iterator(self.sentences, specials=[START_TOKEN, END_TOKEN, UNKNOWN_TOKEN, PAD_TOKEN]) # use min_freq for handling unkown words better
      self.vocabulary.set_default_index(self.vocabulary[UNKNOWN_TOKEN])
    else:
      # if vocabulary provided use that
      self.vocabulary = vocabulary

    self.sentences = []
    self.labels = []
    for sentence, label in data:
      sentence = [START_TOKEN] + sentence + [END_TOKEN]
      label = [tags_to_num["PAD"]] + label + [tags_to_num["PAD"]]
      sentence = [PAD_TOKEN] * (self.p) + sentence + [PAD_TOKEN] * (self.s)
      label = [tags_to_num["PAD"]] * (self.p) + label + [tags_to_num["PAD"]] * (self.s)

      # split into p+s+1 chunks
      for i in range(self.p, len(sentence)-self.s-1):
        temp = sentence[i-self.p:i+self.s+1]
        self.sentences.append(temp)
        self.labels.append(torch.nn.functional.one_hot(torch.tensor(label[i]), num_classes=len(tags_to_num)))
        

  def __len__(self) -> int:
    """Returns number of datapoints."""
    return len(self.sentences)

  def __getitem__(self, index: int) -> tuple[torch.Tensor, torch.Tensor]:
    """Get the datapoint at `index`."""
    return torch.tensor(self.vocabulary.lookup_indices(self.sentences[index])), torch.tensor(self.labels[index])

  def collate(self, batch: list[tuple[torch.Tensor, torch.Tensor]]) -> tuple[torch.Tensor, torch.Tensor]:
    """Given a list of datapoints, batch them together"""
    sentences = [i[0] for i in batch]
    labels = [i[1] for i in batch]
    padded_sentences = pad_sequence(sentences, batch_first=True, padding_value=self.vocabulary[PAD_TOKEN]) # pad sentences with pad token id
    padded_labels = pad_sequence(labels, batch_first=True, padding_value=torch.tensor(17)) # pad labels with 17

    return padded_sentences, padded_labels

In [5]:
class FNN_POS_Tagger(nn.Module):
    def __init__(self, p, s, vocabulary_size: int):
        super().__init__()
        self.p = p
        self.s = s

        self.embedding_module = torch.nn.Embedding(vocabulary_size, 32)
        self.entity_predictor = torch.nn.Sequential(
                                    torch.nn.Linear(32, 20),
                                    torch.nn.ReLU(),
                                    torch.nn.Linear(20, 18))


    def forward(self, x):
        x = self.embedding_module(x)
        x = self.entity_predictor(x)

        # x = self.p i self.s
        x = x[:, self.p, :]
        return x


In [6]:
p, s = 3, 2

train_dataset = POSDataset(p,s, list(zip(train_sentences, train_tags)))
dev_dataset = POSDataset(p,s,list(zip(dev_sentences, dev_tags)), vocabulary=train_dataset.vocabulary)
test_dataset = POSDataset(p,s,list(zip(test_sentences, test_tags)), vocabulary=train_dataset.vocabulary)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=32)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32)

In [7]:
for i, j in train_loader:
    print(i.shape, j.shape)
    break   

torch.Size([32, 6]) torch.Size([32, 18])


  return torch.tensor(self.vocabulary.lookup_indices(self.sentences[index])), torch.tensor(self.labels[index])


In [22]:
loss_fn = torch.nn.CrossEntropyLoss() # use ignore index to ignore losses for padding value indices
entity_predictor = FNN_POS_Tagger(p, s, len(train_dataset.vocabulary))
optimizer = torch.optim.SGD(entity_predictor.parameters(), lr=1e-1)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# entity_predictor = torch.nn.DataParallel(entity_predictor)
entity_predictor.to(device)

from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

for epoch in range(10):
    for step, (word, tag) in enumerate(train_loader):
        word, tag = word.to(device), tag.to(device)
        optimizer.zero_grad()
        output = entity_predictor(word)
        tag = tag.float()
        loss = loss_fn(output, tag)
        loss.backward()
        optimizer.step()

        if step%1000 == 0:
            print(f"Epoch {epoch} Step {step} Loss: {loss.item():.3f}")

    correct = 0
    total = 0
    with torch.no_grad():
        for word, tag in dev_loader:
            word, tag = word.to(device), tag.to(device)
            output = entity_predictor(word)
            output = torch.argmax(output, dim=1)
            tag = torch.argmax(tag, dim=1)
            correct += (output == tag).sum()

    print()
    print(f"Epoch {epoch} Accuracy: {correct/len(dev_dataset):.3f}")
    print()
        

correct = 0
total = 0
with torch.no_grad(): 
    for word, tag in test_loader:
        word, tag = word.to(device), tag.to(device)
        output = entity_predictor(word)
        output = torch.argmax(output, dim=1)
        tag = torch.argmax(tag, dim=1)
        correct += (output == tag).sum()


print()
print(f"Test Accuracy: {correct/len(test_dataset):.3f}")


        

Epoch 0 Step 0 Loss: 2.859
Epoch 0 Step 1000 Loss: 0.501
Epoch 0 Accuracy: 0.896
Epoch 1 Step 0 Loss: 0.272
Epoch 1 Step 1000 Loss: 0.097
Epoch 1 Accuracy: 0.918
Epoch 2 Step 0 Loss: 0.145
Epoch 2 Step 1000 Loss: 0.353
Epoch 2 Accuracy: 0.939
Epoch 3 Step 0 Loss: 0.047
Epoch 3 Step 1000 Loss: 0.136
Epoch 3 Accuracy: 0.927
Epoch 4 Step 0 Loss: 0.135
Epoch 4 Step 1000 Loss: 0.274
Epoch 4 Accuracy: 0.936
Epoch 5 Step 0 Loss: 0.019
Epoch 5 Step 1000 Loss: 0.012
Epoch 5 Accuracy: 0.951
Epoch 6 Step 0 Loss: 0.138
Epoch 6 Step 1000 Loss: 0.233
Epoch 6 Accuracy: 0.953
Epoch 7 Step 0 Loss: 0.103
Epoch 7 Step 1000 Loss: 0.040
Epoch 7 Accuracy: 0.949
Epoch 8 Step 0 Loss: 0.169
Epoch 8 Step 1000 Loss: 0.270
Epoch 8 Accuracy: 0.955
Epoch 9 Step 0 Loss: 0.174
Epoch 9 Step 1000 Loss: 0.557
Epoch 9 Accuracy: 0.955

Test Accuracy: 0.954


In [37]:
sentence = "There were 70 children there."
sentence = sentence.lower()
sentence = word_tokenize(sentence)

sentence = test_sentences[0]
sentence = [START_TOKEN] + sentence + [END_TOKEN]
sentence = [PAD_TOKEN] * p + sentence + [PAD_TOKEN] * s

sentence = torch.tensor(train_dataset.vocabulary.lookup_indices(sentence)).to(device)
# split into chunks of p+s+1
chunks = [sentence[i:i+p+s+1] for i in range(len(sentence)-p-s)]

tag_chunk = [tags_to_num["PAD"]] + test_tags[0] + [tags_to_num["PAD"]]

for i, chunk in enumerate(chunks):
    output = entity_predictor(chunk.unsqueeze(0))
    output = torch.argmax(output, dim=1)
    print(num_to_tag[output.item()], train_dataset.vocabulary.get_itos()[chunk[p].item()], num_to_tag[tag_chunk[i]])

PAD <s> PAD
PRON what PRON
AUX are AUX
DET the DET
NOUN coach NOUN
NOUN flights NOUN
ADP between ADP
PROPN dallas PROPN
CCONJ and CCONJ
PROPN baltimore PROPN
VERB leaving VERB
NOUN august NOUN
ADJ tenth ADJ
CCONJ and CCONJ
VERB returning VERB
NOUN august NOUN
VERB <unk> NUM
PROPN </s> PAD
