In [17]:
import torch.nn as nn
import torch
import copy
import torch.optim as optim
import torch.nn.functional as F
import math
import matplotlib.pyplot as plt
import numpy as np
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Tagger

In [18]:
class tag_Dataset():

    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        tmp = []
        with open(self.filename, 'rt', encoding='utf-8') as lines:
            for line in lines:
                line = line.rstrip()
                if line:
                    tmp.append(tuple(line.split('\t')))
                else:
                    yield tmp
                    tmp = []

tag_train_data = tag_Dataset('train.txt')
tag_dev_data = tag_Dataset('dev.txt')

In [19]:
def accuracy(tagger, gold_data):
    tp = 0
    total = 0
    for sentence in gold_data:
        tokens = []
        tags = []
        for pair in sentence:
            tokens.append(pair[0])
            tags.append(pair[1])
        for i, pred_tag in enumerate(tagger.predict(tokens)):
            if sentence[i][1] == pred_tag:
                tp += 1
            total += 1
    return tp/total

In [20]:
PAD = '<pad>'
UNK = '<unk>'
def make_vocabs(gold_data):
    word_vocab = {PAD:0,UNK:1}
    tag_vocab = {PAD:0}
    inverse_word_vocab = {0:PAD,1:UNK}
    inverse_tag_vocab = {0:PAD}
    word_i = 2
    tag_i = 1
    for sentence in gold_data:
        for pair in sentence:
            if not (pair[0] in word_vocab):
                word_vocab[pair[0]] = word_i
                inverse_word_vocab[word_i] = pair[0]
                word_i += 1
            if not (pair[1] in tag_vocab):
                tag_vocab[pair[1]] = tag_i
                inverse_tag_vocab[tag_i] = pair[1]
                tag_i += 1
    return word_vocab, tag_vocab, inverse_word_vocab, inverse_tag_vocab

vocab_words, vocab_tags, inverse_vocab_words, inverse_vocab_tags = make_vocabs(tag_train_data)

In [21]:
class tag_FixedWindowModel(nn.Module):

    def __init__(self, embedding_specs = [(3, len(vocab_words), 50), (1, len(vocab_tags), 10)],
                 hidden_dim = 100, output_dim = len(vocab_tags)):
        super().__init__()
        embed_out_dim = 0
        embed_list = []
        n_cols = 0
        iters = []
        embed_i = 0
        for n, num_words, word_dim in embedding_specs:
            tmp_embedding_layer = nn.Embedding(num_words, word_dim)
            nn.init.normal_(tmp_embedding_layer.weight, std=0.01)
            embed_list.append(tmp_embedding_layer)
            embed_out_dim += word_dim * n
            iters += [embed_i] * n
            embed_i += 1
            n_cols += n
        self.iters = iters
        self.n_cols = n_cols
        self.embed = nn.ModuleList(embed_list)
        self.linear1 = nn.Linear(embed_out_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, features):
        if len(features.shape) == 1:
            features = torch.unsqueeze(features, dim=0)
        features = torch.cat(tuple(self.embed[self.iters[i]](features[:,i]) for i in range(self.n_cols)), 1)
        features = self.linear1(features)
        features = self.relu(features)
        features = self.linear2(features)
        return features

In [22]:
class Tagger(object):

    def predict(self, sentence):
        raise NotImplementedError

class FixedWindowTagger(Tagger):

    def __init__(self, vocab_words, vocab_tags, inverse_vocab_words, inverse_vocab_tags, output_dim = 4, word_dim=50, tag_dim=10, hidden_dim=100):
        self.fw_model = tag_FixedWindowModel()
        self.vocab_words = vocab_words
        self.vocab_tags = vocab_tags
        self.inverse_vocab_words = inverse_vocab_words
        self.inverse_vocab_tags = inverse_vocab_tags

    def featurize(self, words, i, pred_tags):
        tagger_conf = [0] * 4
        tagger_conf[0] = words[i]
        if i-1 < 0:
            tagger_conf[1] = self.vocab_words[PAD]
            tagger_conf[3] = self.vocab_tags[PAD]
        else:
            tagger_conf[1] = words[i-1]
            tagger_conf[3] = pred_tags[i-1]
        if i+1 > len(words)-1:
            tagger_conf[2] = self.vocab_words[PAD]
        else:
            tagger_conf[2] = words[i+1]
        return torch.LongTensor(tagger_conf)

    def predict(self, words):
        pred_tag_ids = []
        word_ids = []
        for word in words:

            word_ids.append(vocab_words[word] if word in vocab_words else vocab_words[UNK])

        for i in range(len(word_ids)):
            window = self.featurize(word_ids, i, pred_tag_ids)
            pred = self.fw_model.forward(window)
            pred_tag_ids.append(torch.argmax(pred).item())
        pred_tags = []
        for tag_id in pred_tag_ids:
            pred_tags.append(inverse_vocab_tags[tag_id])

        return pred_tags


In [23]:
def tag_training_examples(vocab_words, vocab_tags, inverse_vocab_words, inverse_vocab_tags, gold_data, tagger, batch_size=100):
  batch_i = 0
  batch_x = torch.zeros((batch_size, 4), dtype=int)
  batch_y = torch.zeros(batch_size, dtype=int)
  for sentence in gold_data:
      word_ids = []
      tag_ids = []
      for pair in sentence:
          word_ids.append(vocab_words[pair[0]])
          tag_ids.append(vocab_tags[pair[1]])
      for i in range(len(word_ids)):
          if batch_i > batch_size - 1: 
              yield batch_x, batch_y
              batch_x = torch.zeros((batch_size, 4), dtype=int)
              batch_y = torch.zeros(batch_size, dtype=int)
              batch_i = 0        
          batch_x[batch_i,:] = tagger.featurize(word_ids, i, tag_ids)
          batch_y[batch_i] = tag_ids[i]
          batch_i += 1
  if not batch_i == 0:
      yield batch_x[0:batch_i,:], batch_y[0:batch_i]

In [24]:
def tag_train_fixed_window(train_data, n_epochs=2, batch_size=100, lr=1e-2):
    vocab_words, vocab_tags, inverse_vocab_words, inverse_vocab_tags = make_vocabs(train_data)
    fw_tagger = FixedWindowTagger(vocab_words, vocab_tags, inverse_vocab_words, inverse_vocab_tags)
    print("Vocabulary and model created!")
    optimizer = optim.Adam(fw_tagger.fw_model.parameters(), lr=lr)

    for ep in range(n_epochs):
        fw_tagger.fw_model.train()
        loss_sum = 0
        n_batch = 0
        for bx, by in tag_training_examples(vocab_words, vocab_tags, inverse_vocab_words, inverse_vocab_tags, train_data, fw_tagger, batch_size=batch_size):
            bx = bx.to(device)
            by = by.to(device)
            optimizer.zero_grad()
            output = fw_tagger.fw_model.forward(bx)
            loss = F.cross_entropy(output, by)
            loss.backward()
            optimizer.step()
            loss_sum += loss.item()
            n_batch += 1      
        fw_tagger.fw_model.eval()
        print("Accuracy on validation data: " + str(accuracy(fw_tagger, tag_dev_data)))
    return fw_tagger

Parser

In [25]:
class Dataset():

    ROOT = ('<root>', '<root>', 0)  # Pseudo-root

    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        with open(self.filename, 'rt', encoding='utf-8') as lines:
            tmp = [Dataset.ROOT]
            for line in lines:
                if not line.startswith('#'):  # Skip lines with comments
                    line = line.rstrip()
                    if line:
                        columns = line.split('\t')
                        if columns[0].isdigit():  # Skip range tokens
                            tmp.append((columns[1], columns[3], int(columns[6])))
                    else:
                        yield tmp
                        tmp = [Dataset.ROOT]

parse_train_data = Dataset('en_ewt-ud-train-projectivized.conllu')
parse_dev_data = Dataset('en_ewt-ud-dev.conllu')

In [26]:
def uas(parser, gold_data):
    tp = 0
    total = 0
    for sentence in gold_data:
        words, tags, heads = zip(*sentence)
        pred = parser.predict(words, tags)
        for i in range(1, len(pred)):
            if heads[i] == pred[i]:
                tp += 1
            total += 1
            #print("Iteration: " + str(total) + ", UAS: " + str(tp/total))
    return tp/total

In [27]:
def oracle_moves(gold_heads):
 
    MOVES = tuple(range(3))
    SH, LA, RA = MOVES

    parser = ArcStandardParser()
    config = parser.initial_config(len(gold_heads))
  
   
    while (not parser.is_final_config(config)):
        valid_moves = parser.valid_moves(config)

        if len(valid_moves) >= 2:
            indices1 = [i for i, x in enumerate(gold_heads) if x == config[1][-2]]
            indices2 = [i for i, x in enumerate(config[2]) if x == config[1][-2]]
            indices3 = [i for i, x in enumerate(gold_heads) if x == config[1][-1]]
            indices4 = [i for i, x in enumerate(config[2]) if x == config[1][-1]]
            if LA in valid_moves and gold_heads[config[1][-2]] == config[1][-1] and all(item in indices2 for item in indices1):       
                yield (config, LA)
                config = parser.next_config(config, LA)
            elif ((RA in valid_moves) and (gold_heads[config[1][-1]] == config[1][-2]) and (all(item in indices4 for item in indices3))):
                yield (config, RA)
                config = parser.next_config(config, RA)
            else:
                yield (config, SH)
                config = parser.next_config(config, SH)
              
        else:
            yield (parser.next_config(config, SH), SH)
            config = parser.next_config(config, SH)

In [28]:
PAD = '<pad>'
UNK = '<unk>'
def make_vocabs(gold_data):
    # TODO: Replace the next line with your own code
    word_vocab = {PAD:0,UNK:1}
    tag_vocab = {PAD:0}
    inverse_word_vocab = {0:PAD,1:UNK}
    inverse_tag_vocab = {0:PAD}
    word_i = 2
    tag_i = 1
    for sentence in gold_data:
        for pair in sentence:
            if not (pair[0] in word_vocab):
                word_vocab[pair[0]] = word_i
                inverse_word_vocab[word_i] = pair[0]
                word_i += 1
            if not (pair[1] in tag_vocab):
                tag_vocab[pair[1]] = tag_i
                inverse_tag_vocab[tag_i] = pair[1]
                tag_i += 1
    return word_vocab, tag_vocab, inverse_word_vocab, inverse_tag_vocab

vocab_words, vocab_tags, inverse_vocab_words, inverse_vocab_tags = make_vocabs(parse_train_data)

In [29]:
class Parser(object):

    def predict(self, words, tags):
        raise NotImplementedError

class ArcStandardParser(Parser):

    MOVES = tuple(range(3))

    SH, LA, RA = MOVES  # Parser moves are specified as integers.

    @staticmethod
    def initial_config(num_words):
        return (0, [], [0]*num_words)

    @staticmethod
    def valid_moves(config):
        valid_moves = []
        i, stack, head = config
        if i < len(head):
            valid_moves.append(0)
        if len(stack) >= 2:
            valid_moves.append(1)
            valid_moves.append(2)
        return valid_moves

    @staticmethod
    def next_config(config, move):
        i, stack, head = config
        i, stack, head = copy.deepcopy(i), copy.deepcopy(stack), copy.deepcopy(head)
        if move == 0:
            stack.append(i)
            i += 1
            return (i, stack, head)
        if move == 1:
            head[stack[-2]] = stack[-1]
            del stack[-2]
            return (i, stack, head)
        if move == 2:
            head[stack[-1]] = stack[-2]
            del stack[-1]
            return (i, stack, head)
        print("Error!!!")

    @staticmethod
    def is_final_config(config):
        i, stack, head = config

        if i == len(head) and len(stack) == 1:
            return True
        return False

In [30]:
class parse_FixedWindowModel(nn.Module):

    def __init__(self, embedding_specs = [(3, len(vocab_words), 50), (3, len(vocab_tags), 10)],
                 hidden_dim = 180, output_dim = 3):
        super().__init__()
        embed_out_dim = 0
        embed_list = []
        n_cols = 0
        iters = []
        embed_i = 0
        for n, num_words, word_dim in embedding_specs:
            tmp_embedding_layer = nn.Embedding(num_words, word_dim)
            nn.init.normal_(tmp_embedding_layer.weight, std=0.01)
            embed_list.append(tmp_embedding_layer)
            embed_out_dim += word_dim * n
            iters += [embed_i] * n
            embed_i += 1
            n_cols += n
        self.iters = iters
        self.n_cols = n_cols
        self.embed = nn.ModuleList(embed_list)
        self.linear1 = nn.Linear(embed_out_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, features):
        if len(features.shape) == 1:
            features = torch.unsqueeze(features, dim=0)
        features = torch.cat(tuple(self.embed[self.iters[i]](features[:,i]) for i in range(self.n_cols)), 1)
        features = self.linear1(features)
        features = self.relu(features)
        features = self.linear2(features)
        return features

In [31]:
class FixedWindowParser(ArcStandardParser):

    def __init__(self, vocab_words, vocab_tags, hidden_dim=180):
        super().__init__()
        self.fw_model = parse_FixedWindowModel()
        self.vocab_words = vocab_words
        self.vocab_tags = vocab_tags
 

    def featurize(self, words, tags, config):
        features = [0]*6
        i, stack, head = config
        i, stack, head = copy.deepcopy(i), copy.deepcopy(stack), copy.deepcopy(head)
        if i < len(words):
            features[0] = words[i]
            features[3] = tags[i]
        else:
            features[0] = self.vocab_words[PAD]
            features[3] = self.vocab_tags[PAD]
        if len(stack) == 0:
            features[1] = self.vocab_words[PAD]
            features[4] = self.vocab_tags[PAD]
        else:
            features[1] = words[stack[-1]]
            features[4] = tags[stack[-1]]
        if len(stack) <= 1:
            features[2] = self.vocab_words[PAD]
            features[5] = self.vocab_tags[PAD]
        else:
            features[2] = words[stack[-2]]
            features[5] = tags[stack[-2]]

        return torch.LongTensor(features)


    def predict(self, words, tags, want_print = False):
        word_ids = []
        tag_ids = []
        for word in words:
            word_ids.append(self.vocab_words[word] if word in self.vocab_words else self.vocab_words[UNK])
        for tag in tags:
            tag_ids.append(self.vocab_tags[tag])
        config = self.initial_config(len(words))
        while not self.is_final_config(config):
            features = self.featurize(word_ids, tag_ids, config)
            pred = self.fw_model.forward(features)
            valid_moves = self.valid_moves(config)

            best_move_score = -math.inf
            best_move = None
            for move in valid_moves:
                if pred[0,move].item() > best_move_score:
                    best_move = move
                    best_move_score = pred[0,move].item()
            if want_print:
                print(config)
            config = self.next_config(config, best_move)
        return config[2]

In [32]:
moves = [0, 0, 0, 1, 0, 0, 1, 2, 0, 2, 2]    # 0 = SH, 1 = LA, 2 = RA

parser = ArcStandardParser()
example_sentence = list(parse_train_data)[531]
config = parser.initial_config(len(example_sentence))
for move in moves:
    assert move in parser.valid_moves(config)
    config = parser.next_config(config, move)
assert parser.is_final_config(config)
assert config == (6, [0], [0, 2, 0, 4, 2, 2])


gold_heads = [h for w, t, h in example_sentence]
gold_moves = [0, 0, 0, 1, 0, 0, 1, 2, 0, 2, 2]

assert list(m for _, m in oracle_moves(gold_heads)) == gold_moves

print('Looks good!')

Looks good!


In [33]:
def parse_training_examples(vocab_words, vocab_tags, inverse_vocab_words, inverse_vocab_tags, gold_data, parser, batch_size=100):
  batch_i = 0
  batch_x = torch.zeros((batch_size, 6), dtype=int)
  batch_y = torch.zeros(batch_size, dtype=int)
  for sentence in gold_data:
      word_ids = []
      tag_ids = []
      heads = []
      for trip in sentence:
          word_ids.append(vocab_words[trip[0]])
          tag_ids.append(vocab_tags[trip[1]])
          heads.append(trip[2])
      
      for config, move in tuple(oracle_moves(heads))[1:]:
          if batch_i > batch_size - 1:
              yield batch_x, batch_y
              batch_x = torch.zeros((batch_size, 6), dtype=int)
              batch_y = torch.zeros(batch_size, dtype=int)
              batch_i = 0
          batch_x[batch_i,:] = parser.featurize(word_ids, tag_ids, config)
          batch_y[batch_i] = move 
          batch_i += 1
  if not batch_i == 0:
      yield batch_x[0:batch_i,:], batch_y[0:batch_i]

In [34]:
def parse_train_fixed_window(train_data, n_epochs=2, batch_size=100, lr=1e-2):
    vocab_words, vocab_tags, inverse_vocab_words, inverse_vocab_tags = make_vocabs(train_data)
    parser = FixedWindowParser(vocab_words, vocab_tags)
    print("Vocabulary and model created!")
    optimizer = optim.Adam(parser.fw_model.parameters(), lr=lr)
    
    for ep in range(n_epochs):
        parser.fw_model.train()
        n_batch = 0
        for bx, by in parse_training_examples(vocab_words, vocab_tags, inverse_vocab_words, inverse_vocab_tags, train_data, parser, batch_size=batch_size):
            #if n_batch > 10:
             #   break
            bx = bx.to(device)
            by = by.to(device)
            optimizer.zero_grad()
            output = parser.fw_model.forward(bx)
            loss = F.cross_entropy(output, by)
            loss.backward()
            optimizer.step()
            n_batch += 1
        parser.fw_model.eval()
    
    return parser

Pipeline

In [35]:
vocab_words, vocab_tags, inverse_vocab_words, inverse_vocab_tags = make_vocabs(parse_train_data)
parser = parse_train_fixed_window(parse_train_data, n_epochs=1)
print('{:.4f}'.format(uas(parser, parse_dev_data)))

Vocabulary and model created!
0.7072


In [36]:
vocab_words, vocab_tags, inverse_vocab_words, inverse_vocab_tags = make_vocabs(tag_train_data)
tagger = tag_train_fixed_window(tag_train_data, n_epochs=2)
print('{:.4f}'.format(accuracy(tagger, tag_dev_data)))

Vocabulary and model created!
Accuracy on validation data: 0.869850485128042
Accuracy on validation data: 0.8946238269444886
0.8946


In [75]:
pipeline_valid_data = []
for sentence in list(parse_dev_data):
    words = []
    heads = []
    for trip in sentence:
        words.append(trip[0])
        heads.append(trip[2])
    pred_tags = tagger.predict(words)
    valid_sentence = []
    for i in range(len(words)):
        valid_sentence.append((words[i], pred_tags[i], heads[i]))
    pipeline_valid_data.append(valid_sentence)

print('{:.4f}'.format(uas(parser, pipeline_valid_data)))

0.6414
