In [1]:
from athnlp.readers.brown_pos_corpus import BrownPosTag
import numpy as np

In [2]:
# Load the corpus through the BrownPosTag reader and print a quick summary.
corpus = BrownPosTag()
print("vocabulary size: ", len(corpus.dictionary.x_dict))
print("train/dev/test set length: ", len(corpus.train), len(corpus.dev), len(corpus.test))
# Peek at the first sentence of the train and dev splits (rendered as word/tag pairs).
print("First train sentence: ", corpus.train[0])
print("First dev sentence: ", corpus.dev[0])
# The test-split peek is intentionally left disabled.
#print("First test sentence: ", corpus.test[0])

vocabulary size:  17427
train/dev/test set length:  10000 1000 1000
First train sentence:  Merger/noun proposed/verb 
First dev sentence:  To/prt provide/verb service/noun of/adp local/adj origin/noun to/adp as/adv many/adj listeners/noun as/adp possible/adj ./. 


In [3]:
# Display the second training sentence (rendered as word/tag pairs).
corpus.train[1]

It/pron urged/verb that/adp the/det city/noun ``/. take/verb steps/noun to/prt remedy/verb ''/. this/det problem/noun ./. 

In [4]:
# Same sentence in its integer encoding: .x is indexed per word, .y per tag
# (both lists have one entry per token).
print(corpus.train[1].x)
print(corpus.train[1].y)

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
[2, 1, 3, 4, 0, 5, 1, 0, 6, 1, 5, 4, 0, 5]


In [5]:
def one_hot(x, num):
    """Build a length-``num`` float vector that is 1 at position ``x`` and 0 elsewhere.

    Args:
        x: Index to switch on; passed unchanged to NumPy item assignment.
        num: Length of the resulting vector.

    Returns:
        A freshly allocated ``numpy.ndarray`` of ``num`` zeros with entry ``x`` set to 1.
    """
    encoded = np.zeros(num)
    encoded[x] = 1
    return encoded



In [6]:
# Sanity check: only index 2 of the length-5 vector should be 1.
one_hot(2, 5)

array([0., 0., 1., 0., 0.])

In [7]:
def feature_x(x):
    """Return the one-hot feature vector for word index ``x``.

    The vector length is the module-level ``word_count``, which is looked up
    when the function is called (so it may be assigned after this definition).
    """
    return one_hot(x, word_count)

def feature_y(y):
    """Return the one-hot vector for label index ``y``.

    The vector length is the module-level ``label_count``, which is looked up
    when the function is called (so it may be assigned after this definition).
    """
    return one_hot(y, label_count)
           

        
# Scratch check: element-wise equality followed by a sum counts the matching
# positions (2 of the 3 here).
np.sum(np.equal([1, 0, 1], [1, 0, 0]))

2

In [8]:
# Sizes of the word and label dictionaries. feature_x / feature_y read these
# globals at call time to size their one-hot vectors.
word_count = len(corpus.dictionary.x_dict)
label_count = len(corpus.dictionary.y_dict)

In [9]:
# Report the two dictionary sizes.
print("words:" + str(word_count))
print("labels:" + str(label_count))

words:17427
labels:12


In [10]:
def all_paths(logits_seq):
    """Enumerate every labelling of a sequence as a list of ``(label, logit)`` pairs.

    Args:
        logits_seq: Sequence with one entry per position; each entry is an
            iterable of scores whose index is the label id.

    Returns:
        A list of paths. Each path is a list with one ``(label, logit)`` tuple
        per position. Paths are ordered with the first position varying
        slowest. An empty ``logits_seq`` yields ``[]``.

    Note:
        The number of paths is the product of the per-position label counts,
        so this is only practical for tiny inputs.
    """
    # Build the paths back to front so that every step only has to prepend one
    # (label, logit) pair to the already enumerated suffixes. This avoids both
    # the recursion and the loop variable that used to shadow the `np` alias.
    suffixes = []
    for logits in reversed(logits_seq):
        labelled = list(enumerate(logits))
        if len(suffixes) > 0:
            suffixes = [[pair] + tail for pair in labelled for tail in suffixes]
        else:
            # Nothing enumerated yet: each pair starts a path of its own.
            suffixes = [[pair] for pair in labelled]
    return suffixes

# Sanity check: 2 labels over 3 positions should enumerate 2**3 = 8 paths.
all_paths([[1, 2], [3, 4], [4, 5]])


[[(0, 1), (0, 3), (0, 4)],
 [(0, 1), (0, 3), (1, 5)],
 [(0, 1), (1, 4), (0, 4)],
 [(0, 1), (1, 4), (1, 5)],
 [(1, 2), (0, 3), (0, 4)],
 [(1, 2), (0, 3), (1, 5)],
 [(1, 2), (1, 4), (0, 4)],
 [(1, 2), (1, 4), (1, 5)]]

In [11]:
def path_score(path):
    """Add up the logits along a path.

    Args:
        path: Iterable of ``(label, logit)`` pairs.

    Returns:
        The sum of the second element of every pair; ``0`` for an empty path.
    """
    total = 0
    for _label, logit in path:
        total = total + logit
    return total

def best_path(logits_seq):
    """Return the highest-scoring label sequence by exhaustive search.

    Every labelling produced by ``all_paths`` is scored with ``path_score``
    and the best one is kept. On ties the earliest enumerated path wins.

    Args:
        logits_seq: Sequence with one iterable of per-label scores per position.

    Returns:
        A list of label indices, one per position. If ``all_paths`` yields no
        path (for example for an empty ``logits_seq``), ``[]`` is returned.
    """
    possible_paths = all_paths(logits_seq)
    if not possible_paths:
        # Nothing to label; previously this crashed with an IndexError.
        return []

    # max() returns the first maximal element, which is the same path a stable
    # descending sort would place first, without sorting every path.
    winner = max(possible_paths, key=path_score)
    return [label for (label, _score) in winner]

# Expect [1, 0]: label 1 scores highest at position 0, label 0 at position 1.
best_path([[1, 2], [2, 1]])

[1, 0]

In [12]:
def top_paths(paths, top):
    """Keep the ``top`` highest-scoring paths.

    Args:
        paths: Iterable of paths, each accepted by ``path_score``.
        top: Number of paths to keep (used as a slice bound).

    Returns:
        A new list with at most ``top`` paths in descending ``path_score``
        order; paths with equal scores keep their input order.
    """
    ranked = list(paths)
    ranked.sort(key=path_score, reverse=True)
    return ranked[:top]

# Two identical paths (score 7 each); top=1 keeps just one of them.
top_paths([[(1, 3), (2, 4)], [(1, 3), (2, 4)]], 1)

[[(1, 3), (2, 4)]]

In [13]:
def convert_logits_seq(logits_seq):
    """Pair every logit with its label index.

    Args:
        logits_seq: Iterable with one iterable of logits per position.

    Returns:
        A list with one list per position, each holding ``(index, logit)``
        tuples in the original order.
    """
    return [list(enumerate(logits)) for logits in logits_seq]

# Sanity check: every logit should come back paired with its label index.
convert_logits_seq([[3, 4], [1, 3], [2, 4]])

[[(0, 3), (1, 4)], [(0, 1), (1, 3)], [(0, 2), (1, 4)]]

In [14]:
def beam_best_path(logits_seq, beam_size):
    """Return the best label sequence found by beam search.

    Args:
        logits_seq: Sequence with one iterable of per-label scores per position.
        beam_size: Number of partial paths kept after each position.

    Returns:
        A list of label indices, one per position; ``[]`` for an empty
        ``logits_seq``.
    """
    if len(logits_seq) == 0:
        # An empty sentence has an empty labelling; without this guard the
        # search below would index into an empty sequence.
        return []

    labelled = convert_logits_seq(logits_seq)
    survivors = beam_best_path_helper([], labelled, beam_size)
    top_path = top_paths(survivors, 1)[0]
    return [label for (label, _score) in top_path]

def beam_best_path_helper(prev_paths, logits_seq, beam_size):
    """Extend partial paths position by position, pruning to ``beam_size`` each step.

    Args:
        prev_paths: Partial paths found so far (lists of ``(label, logit)``
            pairs); pass ``[]`` to start a new search.
        logits_seq: Remaining positions, each a list of ``(label, logit)`` pairs.
        beam_size: Number of paths kept after every position.

    Returns:
        The surviving paths after the last position, as returned by
        ``top_paths``. If ``logits_seq`` is empty, ``prev_paths`` is returned
        unchanged.
    """
    beam = prev_paths
    for candidates in logits_seq:
        if len(beam) == 0:
            # No partial paths yet: every candidate starts its own path.
            extended = [[candidate] for candidate in candidates]
        else:
            extended = [path + [candidate] for path in beam for candidate in candidates]
        # Prune with the caller's beam width at every step. The earlier
        # recursive version passed a hard-coded beam_size=2 after the first
        # position, silently ignoring the requested width.
        beam = top_paths(extended, beam_size)
    return beam
    
    
# Expect [2, 0]: the highest-scoring label at each of the two positions.
beam_best_path([[1, 2, 3], [3, 2, 1]], beam_size=2)

[2, 0]

In [23]:
def predict(weights, sent, beam_size=10):
    """Predict a label sequence for one sentence.

    Each word is scored independently as ``weights @ feature_x(word)`` and the
    per-position scores are decoded with ``beam_best_path``.

    Args:
        weights: Weight matrix multiplied with each word's feature vector;
            the result holds one score per label.
        sent: Iterable of word indices accepted by ``feature_x``.
        beam_size: Beam width handed to ``beam_best_path``. Defaults to 10,
            the value that used to be hard-coded here.

    Returns:
        A list of predicted label indices, one per word.
    """
    logits_seq = [np.matmul(weights, feature_x(word)) for word in sent]
    return beam_best_path(logits_seq, beam_size=beam_size)

def compare_sequences(y_true, y_hat):
    """Count the positions at which two label sequences agree.

    Args:
        y_true: Reference labels.
        y_hat: Predicted labels, compared element-wise against ``y_true``.

    Returns:
        A ``(correct, total)`` tuple: the number of equal positions and
        ``len(y_true)``.
    """
    matches = np.equal(y_true, y_hat)
    return np.sum(matches), len(y_true)
    
def accuracy_dev(weights):
    """Compute token-level accuracy of ``weights`` on ``corpus.dev``.

    Args:
        weights: Weight matrix forwarded to ``predict``.

    Returns:
        Correctly labelled tokens divided by all tokens, accumulated over
        every sentence in ``corpus.dev``.
    """
    hits = 0
    seen = 0
    for sent in corpus.dev:
        predicted = predict(weights, sent.x)
        n_correct, n_tokens = compare_sequences(sent.y, predicted)
        hits += n_correct
        seen += n_tokens
    return hits / seen


In [24]:
# Baseline: dev accuracy of untrained, all-zero weights
# (one row per label, one column per word).
w = np.zeros(shape=(label_count, word_count))

print(accuracy_dev(w))

0.27510962622676965


In [25]:
# Training configuration.
N_EPOCHS = 5
SHUFFLE_SEED = 0

# Start from all-zero weights: one row per label, one column per word.
w = np.zeros(shape=(label_count, word_count))

# Baseline: dev accuracy before any update.
print(accuracy_dev(w))

# Dedicated generator so the visiting order is reproducible without touching
# NumPy's global random state.
rng = np.random.RandomState(SHUFFLE_SEED)
train = corpus.train

for epoch in range(N_EPOCHS):
    # Visit the sentences in a fresh random order each epoch. Indices are
    # shuffled instead of `corpus.train` itself, so the corpus keeps its
    # original order and cells that display corpus.train[0] / [1] give the
    # same result when re-run.
    for sent_idx in rng.permutation(len(train)):
        sent = train[int(sent_idx)]
        y_hat_seq = predict(w, sent.x)
        y_true_seq = sent.y

        for i, (y_hat, y_true) in enumerate(zip(y_hat_seq, y_true_seq)):
            if y_hat != y_true:
                # Mistake-driven update: add the word's features to the gold
                # label's row and subtract them from the predicted label's row.
                x = feature_x(sent.x[i])
                w[y_true] += x
                w[y_hat] -= x

    print("Accuracy: " + str(accuracy_dev(w)))


0.27510962622676965
Accuracy: 0.9031112967216538
Accuracy: 0.9006055543954896
Accuracy: 0.8830653581123408
Accuracy: 0.902276049279599
Accuracy: 0.9024848611401127
