In [None]:
import numpy as np
import pandas as pd

import string
import time
from collections import Counter

### Load data

In [None]:
# Each non-blank line holds one "token<TAB>tag" pair; a blank line ends the current sentence.
def load_data(data_path):
    """Load tokens and tags from a tab-separated, one-token-per-line file.

    Sentences are separated by blank lines. The final sentence is kept even
    when the file does not end with a blank line, and runs of blank lines do
    not produce empty sentences.

    Args:
        data_path: path of the file to read.

    Returns:
        (sentences, tags): two parallel lists with one inner list per sentence;
        ``sentences[i][j]`` is a token and ``tags[i][j]`` is its tag string.

    Raises:
        ValueError: if a non-blank line does not contain exactly one tab.
    """
    sentences = []
    tags = []
    curr_sen = []
    curr_tag = []

    with open(data_path) as data:
        for phrase in data:
            if phrase.strip():
                word, tag = phrase.split('\t')
                curr_sen.append(word)
                # strip removes the trailing '\n'
                curr_tag.append(tag.strip())
            elif curr_sen:
                # Blank line: close the sentence collected so far.
                sentences.append(curr_sen)
                tags.append(curr_tag)
                curr_sen = []
                curr_tag = []

    # Flush the last sentence when the file has no trailing blank line.
    if curr_sen:
        sentences.append(curr_sen)
        tags.append(curr_tag)

    return sentences, tags

In [None]:
def load_test_data(data_path):
    """Load an unlabeled, one-token-per-line file into a list of sentences.

    Sentences are separated by blank lines. Empty chunks (from an empty file
    or from several consecutive blank lines) are skipped so that no sentence
    consisting of a single empty token is produced.

    Args:
        data_path: path of the file to read.

    Returns:
        A list of sentences, each a list of whitespace-stripped lines.
    """
    with open(data_path) as data:
        raw_text = data.read()

    test_data = []
    for phrase in raw_text.strip().split('\n\n'):
        phrase = phrase.strip()
        if not phrase:
            continue
        test_data.append([line.strip() for line in phrase.split('\n')])
    return test_data

In [None]:
# Location of the training split; load_data returns one token list and one
# tag list per sentence.
train_data = 'data/train/train.txt'
sentences, tags = load_data(train_data)

### Preprocess data
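1. replace hashtags with a hashtag tag
2. replace mentions with a mention tag
3. replace punctuation with a punctuation tag
4. replace links with a link tag
5. replace digit-only tokens with a digit tag (replacing verbs with a verb tag is not implemented)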

In [None]:
def create_vocab_dicts(corpus):
    """Build frequency and id lookup tables for every token in ``corpus``.

    Args:
        corpus: iterable of sentences, each an indexable sequence of tokens.

    Returns:
        (vocab_counts, vocab_ids): ``vocab_counts`` maps token -> number of
        occurrences and ``vocab_ids`` maps token -> integer id. Both start
        with the reserved ``'-UNK-'`` entry (count 0, id 0); other tokens get
        ids 1, 2, ... in order of first appearance.
    """
    vocab_counts = {'-UNK-': 0}
    vocab_ids = {'-UNK-': 0}

    for sentence in corpus:
        for token in sentence:
            # Ids are dense, so the next free id is the current table size.
            if token not in vocab_ids:
                vocab_ids[token] = len(vocab_ids)
            vocab_counts[token] = vocab_counts.get(token, 0) + 1

    return vocab_counts, vocab_ids
    

In [None]:
# Token frequencies and token ids computed on the raw training sentences.
vocab_counts, vocab_ids = create_vocab_dicts(sentences)

In [None]:
def preprocess(corpus, vocab_count_dict, word_threshold):
    """Replace noisy token classes with placeholder tags and rebuild the vocabulary.

    Each token is rewritten IN PLACE according to the first rule that matches:

    * starts with ``#``                       -> ``-HASHTAG-``
    * starts with ``@``                       -> ``-MENTION-``
    * starts with a punctuation character     -> ``-PUNC-``
    * starts with ``http`` or ``www``         -> ``-LINK-``
    * consists only of digits                 -> ``-DIGIT-``

    Empty tokens are left untouched instead of raising ``IndexError``.

    Args:
        corpus: list of sentences (lists of string tokens); modified in place.
        vocab_count_dict: currently unused; kept for interface compatibility
            (rare-word replacement is disabled).
        word_threshold: currently unused; kept for interface compatibility.

    Returns:
        (corpus, vocab_counts, vocab_ids): the same ``corpus`` object plus the
        vocabulary tables rebuilt from the rewritten tokens.
    """
    # A set (rather than a string) so that an empty first character never matches.
    punctuation = set('!$%&\'()*+,-./:;<=>?[\\]^_`{|}~')

    num_hashtags = 0
    num_mentions = 0
    num_puncs = 0
    num_links = 0
    num_digit = 0

    for sentence in corpus:
        for j, curr_word in enumerate(sentence):
            # Slicing yields '' for an empty token, so no IndexError is raised.
            first_char = curr_word[:1]

            if first_char == '#':
                sentence[j] = '-HASHTAG-'
                num_hashtags += 1
            elif first_char == '@':
                sentence[j] = '-MENTION-'
                num_mentions += 1
            elif first_char in punctuation:
                sentence[j] = '-PUNC-'
                num_puncs += 1
            elif curr_word.startswith(('http', 'www')):
                sentence[j] = '-LINK-'
                num_links += 1
            elif curr_word.isdigit():
                sentence[j] = '-DIGIT-'
                num_digit += 1

    new_vocab_counts, new_vocab_ids = create_vocab_dicts(corpus)

    print('Total Changed Words')
    print('-------------------')
    print('Hashtags:      {}'.format(num_hashtags))
    print('Mentions:      {}'.format(num_mentions))
    print('Punctuation:   {}'.format(num_puncs))
    print('Links:         {}'.format(num_links))
    print('Digit:         {}'.format(num_digit))
    print('-------------------')
    print('Vocab Size:    {}'.format(len(new_vocab_counts)))

    return corpus, new_vocab_counts, new_vocab_ids

In [None]:
# Normalize the training sentences; preprocess rewrites `sentences` in place and
# returns it, so `corpus` and `sentences` refer to the same lists afterwards.
corpus, vocab_counts, vocab_ids = preprocess(sentences, vocab_counts, 1)

In [None]:
def words_to_ids(corpus, vocab2ID_dict):
    """Replace every token in ``corpus`` with its integer id, in place.

    Tokens missing from ``vocab2ID_dict`` are mapped to the id of ``'-UNK-'``
    and counted; the count is printed.

    Args:
        corpus: list of sentences (lists of tokens); modified in place.
        vocab2ID_dict: mapping token -> id; must contain ``'-UNK-'`` if any
            token is unknown.

    Returns:
        The same ``corpus`` object, now holding ids.

    Raises:
        KeyError: if an unknown token is found and ``'-UNK-'`` is not a key.
    """
    num_unk = 0

    for sentence in corpus:
        for j, token in enumerate(sentence):
            # Look tokens up in the mapping that was passed in, not a global.
            if token in vocab2ID_dict:
                sentence[j] = vocab2ID_dict[token]
            else:
                sentence[j] = vocab2ID_dict['-UNK-']
                num_unk += 1

    print('Number of Unknown Words: {}'.format(num_unk))

    return corpus

In [None]:
# Convert the training tokens to integer ids (the lists are updated in place).
corpus = words_to_ids(corpus, vocab_ids)

In [None]:
def tags_to_vectors(tags):
    """Replace B/I/O tag strings with 3-element one-hot lists, in place.

    'B' -> [1, 0, 0], 'I' -> [0, 1, 0], 'O' -> [0, 0, 1]. On any other value a
    message is printed and the rest of that sentence is left unconverted;
    processing continues with the next sentence.

    Args:
        tags: list of sentences, each a list of tag strings; modified in place.

    Returns:
        The same ``tags`` object.
    """
    tag_order = ['B', 'I', 'O']

    for sentence_tags in tags:
        for position, tag in enumerate(sentence_tags):
            if tag not in tag_order:
                print('bro what')
                break
            # Build a fresh list per token so no two entries share storage.
            vector = [0, 0, 0]
            vector[tag_order.index(tag)] = 1
            sentence_tags[position] = vector
    return tags

In [None]:
def tags_to_onehot(tags):
    """Replace B/I/O tag strings with integer class indices, in place.

    'B' -> 2, 'I' -> 1, 'O' -> 0. On any other value a message is printed and
    the rest of that sentence is left unconverted; processing continues with
    the next sentence.

    Args:
        tags: list of sentences, each a list of tag strings; modified in place.

    Returns:
        The same ``tags`` object.
    """
    # The position of a tag in this list is its class index.
    index_order = ['O', 'I', 'B']

    for sentence_tags in tags:
        for position, tag in enumerate(sentence_tags):
            if tag not in index_order:
                print('Invalid tag!')
                break
            sentence_tags[position] = index_order.index(tag)
    return tags

In [None]:
# Convert training tag strings to class indices (B=2, I=1, O=0), in place.
tags = tags_to_onehot(tags)

In [None]:
# Load the dev split and run it through the same pipeline as the training data.
dev_path = 'data/dev/dev.txt'
dev_data, dev_tags = load_data(dev_path)

dev_vocab_counts, dev_vocab_ids = create_vocab_dicts(dev_data)
# preprocess rewrites dev_data in place and returns it, so dev_corpus and
# dev_data refer to the same lists from here on.
dev_corpus, dev_vocab_counts, dev_vocab_ids = preprocess(dev_data, dev_vocab_counts, 1)

# The training vocab_ids are passed here, not dev_vocab_ids.
dev_corpus = words_to_ids(dev_corpus, vocab_ids)

dev_tags = tags_to_onehot(dev_tags)

In [None]:
# Load the test split (tokens only) and run it through the same pipeline.
test_path = 'data/test/test.nolabels.txt'
test_data = load_test_data(test_path)

test_vocab_counts, test_vocab_ids = create_vocab_dicts(test_data)
# preprocess rewrites test_data in place and returns it as test_corpus.
test_corpus, test_vocab_counts, test_vocab_ids = preprocess(test_data, test_vocab_counts, 1)

# The training vocab_ids are passed here, not test_vocab_ids.
test_corpus = words_to_ids(test_corpus, vocab_ids)

# test_tags = tags_to_onehot(test_tags)

# Create neural network

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
class Net(nn.Module):
    """Stacked-LSTM sequence tagger producing B/I/O log-probabilities per token.

    Architecture: embedding -> three LSTM layers -> two linear layers ->
    log-softmax over the 3 tag classes.

    Args:
        num_epochs: number of passes over the training data.
        learning_rate: SGD learning rate.
        momentum: SGD momentum.
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each token embedding.
        lstm_dim1, lstm_dim2, lstm_dim3: hidden sizes of the three LSTM layers.
    """

    def __init__(self, num_epochs, learning_rate, momentum,
                 vocab_size, embedding_dim, lstm_dim1, lstm_dim2, lstm_dim3):

        # Initialise nn.Module before assigning any attribute.
        super(Net, self).__init__()

        self.learning_rate = learning_rate
        self.momentum = momentum

        self.num_epochs = num_epochs
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.lstm_hidden_dim1 = lstm_dim1
        self.lstm_hidden_dim2 = lstm_dim2
        self.lstm_hidden_dim3 = lstm_dim3

        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)

        self.lstm1 = nn.LSTM(self.embedding_dim, self.lstm_hidden_dim1)
        self.lstm2 = nn.LSTM(self.lstm_hidden_dim1, self.lstm_hidden_dim2)
        self.lstm3 = nn.LSTM(self.lstm_hidden_dim2, self.lstm_hidden_dim3)

        fc_hidden_dim = int(self.lstm_hidden_dim3 / 2)
        self.fc1 = nn.Linear(self.lstm_hidden_dim3, fc_hidden_dim)
        # fc2 consumes the output of fc1, so its input size must match fc1's
        # output size; it projects onto the 3 tag classes.
        self.fc2 = nn.Linear(fc_hidden_dim, 3)

    def forward(self, x):
        """Return per-token log-probabilities for one sentence.

        Args:
            x: 1-D LongTensor of token ids for a single sentence.

        Returns:
            Tensor of shape (len(x), 3) holding log-probabilities per token.
        """
        x = self.embedding(x)  # (seq_len, embedding_dim)

        # The LSTMs use the default batch_first=False layout
        # (seq_len, batch, features). The sentence is one sequence with batch
        # size 1, so the recurrence runs over its tokens.
        x = x.unsqueeze(1)

        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        x, _ = self.lstm3(x)

        x = x.contiguous()
        x = x.view(-1, x.shape[2])  # (seq_len, lstm_hidden_dim3)

        x = F.relu(self.fc1(x))
        x = self.fc2(x)

        return F.log_softmax(x, dim = 1)

    def train_and_evaluate(self, data, labels, dev_data, dev_labels):
        """Train with SGD and report the summed dev loss after every epoch.

        Args:
            data: list of training sentences (lists of token ids).
            labels: list of label sequences (lists of class indices) aligned
                with ``data``.
            dev_data: list of dev sentences (lists of token ids).
            dev_labels: list of label sequences aligned with ``dev_data``.

        Returns:
            (total_training_loss, total_dev_loss): per-epoch lists of the loss
            summed over all training / dev sentences.
        """
        start_time = time.time()

        total_training_loss = []
        total_dev_loss = []

        # forward() already returns log-probabilities, so NLLLoss is the
        # matching criterion.
        criterion = nn.NLLLoss()
        optimizer = optim.SGD(self.parameters(), lr = self.learning_rate, momentum = self.momentum)

        for epoch in range(self.num_epochs):
            epoch_start = time.time()

            print('Starting Epoch: {}'.format(epoch))
            print('------------------------')

            # Switch back to training mode; evaluation below leaves eval mode on.
            self.train()
            train_loss = 0.0

            for sentence, label in zip(data, labels):
                if len(sentence) == 0:
                    # An empty sentence has nothing to learn from.
                    continue
                sentence, label = torch.LongTensor(sentence), torch.LongTensor(label)

                optimizer.zero_grad()

                outputs = self(sentence)
                loss = criterion(outputs, label)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()

            total_training_loss.append(train_loss)

            self.eval()
            running_dev_loss = 0.0
            with torch.no_grad():
                for sentencedev, labeldev in zip(dev_data, dev_labels):
                    if len(sentencedev) == 0:
                        continue
                    sentencedev, labeldev = torch.LongTensor(sentencedev), torch.LongTensor(labeldev)

                    prediction = self(sentencedev)

                    running_dev_loss += criterion(prediction, labeldev).item()

            # Record the loss summed over the whole dev set, not just the
            # loss of the last dev sentence.
            total_dev_loss.append(running_dev_loss)

            epoch_end = time.time()
            total_epoch_time = round((epoch_end - epoch_start) / 60, 2)

            print('Epoch: {}\nTraining Loss: {}\nDev Loss: {}'.format(epoch, train_loss, running_dev_loss))
            print('Epoch run time: {} minutes\n'.format(total_epoch_time))

        end_time = time.time()
        total_time = round((end_time - start_time) / 60, 2)

        print('Finished training in {} minutes'.format(total_time))
        return total_training_loss, total_dev_loss

In [None]:
# Sanity check: number of sentences and length of the first sentence, for the
# token ids and for the tags, on the training and dev splits.
print('', len(corpus), len(corpus[0]), '\n', len(tags), len(tags[0]))
print('', len(dev_corpus), len(dev_corpus[0]), '\n', len(dev_tags), len(dev_tags[0]))

# Vocabulary sizes after preprocessing (training, then dev).
print('', len(vocab_counts))
print('', len(dev_vocab_counts))


In [None]:
# Training hyperparameters and layer sizes.
num_epochs = 5
learning_rate = 0.01
momentum = 0.9
vocab_size = len(vocab_ids)
embedding_dim = 1000
lstm_dim1 = 500
lstm_dim2 = 200
lstm_dim3 = 100

net = Net(num_epochs, learning_rate, momentum, vocab_size, embedding_dim, lstm_dim1, lstm_dim2, lstm_dim3)

# Net defines train_and_evaluate; there is no train_edev method.
total_train_loss, total_dev_loss = net.train_and_evaluate(corpus, tags, dev_corpus, dev_tags)

In [None]:
# Show the module's repr as the cell output.
print('Net Architecture:')
net

In [None]:
def predict_and_transform(network, data, tag_index_dict):
    """Run ``network`` on every sample and translate its outputs to tag labels.

    Args:
        network: callable taking a 1-D LongTensor of token ids and returning
            a 2-D tensor with one row of class scores per token.
        data: iterable of samples, each a sequence of token ids.
        tag_index_dict: mapping from class index to tag label.

    Returns:
        A list with one list of tag labels per sample; each label is the
        ``tag_index_dict`` entry of the highest-scoring class for that token.
    """
    # First run the network over all samples without tracking gradients.
    with torch.no_grad():
        raw_outputs = [network(torch.LongTensor(sample)) for sample in data]

    # Then pick the best class for every token and map it to its label.
    return [
        [tag_index_dict[np.argsort(-scores.numpy())[0]] for scores in output]
        for output in raw_outputs
    ]

def export_predictions(file_path, predictions):
    """Write predicted labels to ``file_path``, one label per line.

    Each sample's labels are followed by one empty line.

    Args:
        file_path: destination path; the file is overwritten.
        predictions: iterable of samples, each an iterable of label strings.
    """
    with open(file_path, 'w') as out_file:
        for sample in predictions:
            out_file.writelines(label + '\n' for label in sample)
            # Blank line marks the end of the sample.
            out_file.write('\n')

In [None]:
import re, sys

def warning(msg):
    """Print ``msg`` to standard output, prefixed with ``WARNING:``."""
    prefix = "WARNING:"
    print(prefix, msg)

def convert_bio_to_spans(bio_sequence):
    """Convert a BIO tag sequence into a list of labeled spans.

    Args:
        bio_sequence: list of tag strings, each starting with "B", "I" or "O";
            "B"/"I" tags may carry a label after an optional "-" (e.g. "B-per").

    Returns:
        A list of ``(label, start, end)`` tuples where ``start`` is the index
        of the span's first tag and ``end`` is the index just past its last
        tag. ``label`` is the text after "B"/"B-" (empty string for a bare "B").

    An "I" tag with no open span, or whose label differs from the open span's
    label, is treated as a "B": the sequence is copied, that tag is rewritten,
    and the conversion restarts on the copy (the input list is not modified).
    """
    spans = []  # (label, startindex, endindex)
    cur_start = None
    cur_label = None
    N = len(bio_sequence)
    # Iterate one step past the end so a span still open at the last tag is closed.
    for t in range(N+1):
        # Close the open span on reaching the end or a tag starting with B or O.
        if ((cur_start is not None) and
                (t==N or re.search("^[BO]", bio_sequence[t]))):
            assert cur_label is not None
            spans.append((cur_label, cur_start, t))
            cur_start = None
            cur_label = None
        if t==N: continue
        # Every tag must be non-empty and start with B, I or O.
        assert bio_sequence[t] and bio_sequence[t][0] in ("B","I","O")
        if bio_sequence[t].startswith("B"):
            # Open a new span; the label is whatever follows "B" or "B-".
            cur_start = t
            cur_label = re.sub("^B-?","", bio_sequence[t]).strip()
        if bio_sequence[t].startswith("I"):
            if cur_start is None:
                #warning("BIO inconsistency: I without starting B. Rewriting to B.")
                # "I" without an open span: rewrite it to "B" on a copy and restart.
                newseq = bio_sequence[:]
                newseq[t] = "B" + newseq[t][1:]
                return convert_bio_to_spans(newseq)
            continuation_label = re.sub("^I-?","",bio_sequence[t])
            if continuation_label != cur_label:
                # "I" whose label differs from the open span: rewrite to "B" and restart.
                newseq = bio_sequence[:]
                newseq[t] = "B" + newseq[t][1:]
                #warning("BIO inconsistency: %s but current label is '%s'. Rewriting to %s" % (bio_sequence[t], cur_label, newseq[t]))
                return convert_bio_to_spans(newseq)

    # should have exited for last span ending at end by now
    assert cur_start is None
    # Verify that no span was emitted twice.
    spancheck(spans)
    return spans

def test_bio_conversion():
    """Check convert_bio_to_spans against a fixed table of sequences.

    Raises:
        AssertionError: on the first sequence whose spans differ from the
            expected ones.
    """
    # (input BIO sequence, expected spans), checked in this order.
    cases = [
        (["B"], [("",0,1)]),
        (["B","I"], [("",0,2)]),
        (["B","I","O"], [("",0,2)]),
        (["O","B","I","O","O"], [("",1,3)]),
        (["B","B"], [("",0,1), ("",1,2)]),
        (["B","I","B"], [("",0,2), ("",2,3)]),
        (["B-asdf","I-asdf","B"], [("asdf",0,2), ("",2,3)]),
        (["B-asdf","I-difftype","B"], [("asdf",0,1), ("difftype",1,2), ("",2,3)]),
        (["I","I"], [("",0,2)]),
        (["B-a","I-b"], [("a",0,1), ("b",1,2)]),
    ]
    for bio_sequence, expected_spans in cases:
        spans = convert_bio_to_spans(bio_sequence)
        assert spans==expected_spans


def spancheck(spanlist):
    """Assert that ``spanlist`` contains no duplicate spans.

    Raises:
        AssertionError: if any span appears more than once.
    """
    distinct_count = len(set(spanlist))
    total_count = len(spanlist)
    assert distinct_count==total_count, "spans are non-unique ... is this a bug in the eval script?"

def kill_labels(bio_seq):
    """Return a copy of ``bio_seq`` with labels stripped from B and I tags.

    Tags starting with "B" or "I" are reduced to the bare "B" / "I"; every
    other tag is returned unchanged.
    """
    def _strip_label(tag):
        if not re.search("^[BI]", tag):
            return tag
        without_b_label = re.sub("^B.*","B", tag)
        return re.sub("^I.*","I", without_b_label)

    return [_strip_label(tag) for tag in bio_seq]

def evaluate_taggings(goldseq_predseq_pairs, ignore_labels=False):
    """Print span-level precision, recall and F1 for gold/predicted tag pairs.

    Args:
        goldseq_predseq_pairs: a list of (goldtags, predtags) pairs; both are
            lists of strings of the same length.
        ignore_labels: when true, labels are stripped from B/I tags before the
            spans are compared.

    Raises:
        AssertionError: if a gold and a predicted sequence differ in length.
    """
    sentence_count = 0
    token_count = 0
    gold_span_count = 0
    pred_span_count = 0
    true_pos = 0
    false_pos = 0
    false_neg = 0

    for gold_tags, pred_tags in goldseq_predseq_pairs:
        length = len(gold_tags)
        assert length==len(pred_tags)
        sentence_count += 1
        token_count += length

        if ignore_labels:
            gold_tags = kill_labels(gold_tags)
            pred_tags = kill_labels(pred_tags)

        gold_spans = convert_bio_to_spans(gold_tags)
        pred_spans = convert_bio_to_spans(pred_tags)

        gold_span_count += len(gold_spans)
        pred_span_count += len(pred_spans)

        # A predicted span counts as correct only if it matches a gold span exactly.
        gold_set = set(gold_spans)
        pred_set = set(pred_spans)
        true_pos += len(gold_set & pred_set)
        false_pos += len(pred_set - gold_set)
        false_neg += len(gold_set - pred_set)

    predicted_total = true_pos + false_pos
    gold_total = true_pos + false_neg
    precision = true_pos/predicted_total if predicted_total>0 else 0
    recall = true_pos/gold_total if gold_total>0 else 0
    if (precision + recall) > 0:
        f_score = 2*precision*recall / (precision + recall)
    else:
        f_score = 0

    print("F = {f1:.4f},  Prec = {prec:.4f} ({tp}/{tpfp}),  Rec = {rec:.4f} ({tp}/{tpfn})".format(
            f1=f_score, prec=precision, rec=recall, tp=true_pos,
            tpfp=predicted_total, tpfn=gold_total))
    print("({num_sent} sentences, {num_tokens} tokens, {num_goldspans} gold spans, {num_predspans} predicted spans)".format(
            num_sent=sentence_count, num_tokens=token_count,
            num_goldspans=gold_span_count, num_predspans=pred_span_count))

def read_tokens_tags_file(filename):
    """Returns list of sentences.  each sentence is a pair (tokens, tags), each
    of which is a list of strings of the same length.

    The file holds one tab-separated "token<TAB>tag" pair per line, with
    sentences separated by blank lines.

    Raises:
        AssertionError: if a line does not hold exactly 2 tab-separated items.
    """
    # Read through a context manager so the file handle is always closed.
    with open(filename) as infile:
        sentences = infile.read().strip().split("\n\n")
    ret = []
    for sent in sentences:
        sent = sent.strip()
        lines = sent.split("\n")
        pairs = [L.split("\t") for L in lines]
        for pair in pairs:
            assert len(pair)==2, "Was expecting 2 tab-separated items per line."
        tokens = [tok for tok,tag in pairs]
        tags = [tag for tok,tag in pairs]
        ret.append( (tokens,tags) )
    return ret

def read_tags_file(filename):
    """Return a list of sentences, each a list of tag strings.

    The file holds one tag per line, with sentences separated by blank lines.

    Raises:
        AssertionError: if a line does not hold exactly 1 whitespace-separated item.
    """
    # Read through a context manager so the file handle is always closed.
    with open(filename) as infile:
        sentences = infile.read().strip().split("\n\n")
    ret = []
    for sent in sentences:
        sent = sent.strip()
        lines = sent.split("\n")
        for line in lines:
            assert len(line.split())==1, "Was expecting 1 item per line"
        ret.append( [line.strip() for line in lines] )
    return ret

def evaluate_tagging_file(gold_tags_file, predicted_tags_file):
    """Print span-level scores for a predictions file against a gold file.

    Args:
        gold_tags_file: path to a "token<TAB>tag" file with the gold tags.
        predicted_tags_file: path to a one-tag-per-line file with predictions.
    """
    gold_sequences = []
    for _tokens, gold_tags in read_tokens_tags_file(gold_tags_file):
        gold_sequences.append(gold_tags)
    predicted_sequences = read_tags_file(predicted_tags_file)

    print("Span-level NER evaluation")
    # Labels are ignored, so only the span boundaries are scored.
    paired_sequences = list(zip(gold_sequences, predicted_sequences))
    evaluate_taggings(paired_sequences, ignore_labels=True)

In [None]:
# Class index -> tag label, the inverse of the mapping used by tags_to_onehot.
tag_index_dict = {2: 'B', 1: 'I', 0: 'O'}

# predict_and_transform is the prediction helper defined in this notebook; it
# runs the network and maps the outputs to tag labels in one call.
# dev_data already holds token ids because preprocess / words_to_ids rewrote it in place.
trans_dev_pred = predict_and_transform(net, dev_data, tag_index_dict)

pred_file_path = 'results/dev/dev_pred.out'
export_predictions(pred_file_path, trans_dev_pred)

In [None]:
# Score a predictions file against the dev gold tags.
# NOTE(review): this path differs from the 'results/dev/dev_pred.out' file this notebook exports — confirm it is the intended file.
evaluate_tagging_file('data/dev/dev.txt', 'results/dev_preds.txt')

In [None]:
# Score a predictions file against the dev gold tags.
# NOTE(review): this notebook does not write this file — presumably a saved earlier run; confirm.
evaluate_tagging_file('data/dev/dev.txt', 'results/dev/dev_epoch_4_true.out')

In [None]:
# Class index -> tag label, the inverse of the mapping used by tags_to_onehot.
tag_index_dict = {2: 'B', 1: 'I', 0: 'O'}
# dev_data holds token ids here because preprocess / words_to_ids rewrote it in place.
dev_pred = predict_and_transform(net, dev_data, tag_index_dict)

# Write one label per line, with a blank line after each sentence.
pred_file_path = 'results/dev/dev_pred.out'
export_predictions(pred_file_path, dev_pred)

In [None]:
# Score a predictions file against the dev gold tags.
# NOTE(review): this path differs from the 'results/dev/dev_pred.out' file exported by this notebook — confirm it is the intended file.
evaluate_tagging_file('data/dev/dev.txt', 'results/dev/Preds.out')

In [3]:
# Tag vocabulary and lookup tables in both directions.
# NOTE(review): this numbering (B=0, I=1, O=2) differs from tags_to_onehot and tag_index_dict (B=2, I=1, O=0) — confirm which is intended.
classes = ["B", "I", "O"]
tag_to_idx = {t: i for i, t in enumerate(classes)}
# Extra entry for a padding label.
# NOTE(review): -100 presumably matches the default ignore_index of torch.nn.CrossEntropyLoss — confirm.
tag_to_idx['[PAD]'] = -100
idx_to_tag = {i: t for t, i in tag_to_idx.items()}

In [4]:
# Display the tag -> index mapping.
tag_to_idx

{'B': 0, 'I': 1, 'O': 2, '[PAD]': -100}

In [5]:
# Display the index -> tag mapping.
idx_to_tag

{0: 'B', 1: 'I', 2: 'O', -100: '[PAD]'}