In [1]:
from athnlp.readers.brown_pos_corpus import BrownPosTag
import numpy as np
from nltk.util import ngrams

In [2]:
# Load the Brown POS-tagging corpus, then report the vocabulary size, the
# number of sentences in each split, and the first sentence of each split.
corpus = BrownPosTag()
print("vocabulary size: ", len(corpus.dictionary.x_dict))
print(
    "train/dev/test set length: ",
    *(len(part) for part in (corpus.train, corpus.dev, corpus.test))
)
print("First train sentence: ", corpus.train[0])
print("First dev sentence: ", corpus.dev[0])
print("First test sentence: ", corpus.test[0])

vocabulary size:  17427
train/dev/test set length:  10000 1000 1000
First train sentence:  Merger/noun proposed/verb 
First dev sentence:  To/prt provide/verb service/noun of/adp local/adj origin/noun to/adp as/adv many/adj listeners/noun as/adp possible/adj ./. 
First test sentence:  For/adp example/noun :/. a/det sales/noun presentation/noun can/verb be/verb analyzed/verb and/conj evaluated/verb through/adp roleplaying/noun ./. 


In [3]:
# Display the second training sentence; as the cell's last expression it is
# rendered by the notebook using the sentence object's own repr.
corpus.train[1]

It/pron urged/verb that/adp the/det city/noun ``/. take/verb steps/noun to/prt remedy/verb ''/. this/det problem/noun ./. 

In [4]:
# Show the integer ids behind the second training sentence:
# first the word ids (.x), then the tag ids (.y), one list per line.
print(corpus.train[1].x, corpus.train[1].y, sep="\n")

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
[2, 1, 3, 4, 0, 5, 1, 0, 6, 1, 5, 4, 0, 5]


In [5]:
def one_hot(x, num):
    """Build a one-hot vector.

    Args:
        x: Position to switch on.
        num: Length of the resulting vector.

    Returns:
        A float numpy array of length ``num`` holding zeros everywhere
        except at position ``x``, which holds 1.
    """
    encoded = np.zeros(num)
    encoded[x] = 1
    return encoded

In [6]:
# Sanity check: in a length-5 vector only position 2 should be set to 1.
one_hot(2, 5)

array([0., 0., 1., 0., 0.])

In [7]:
# Sizes of the tag and word id spaces. The word space gets one extra slot
# beyond the dictionary: its last index (word_count - 1) is the padding
# symbol that ngrams_sent puts in front of each sentence.
label_count = len(corpus.dictionary.y_dict)
word_count = 1 + len(corpus.dictionary.x_dict)

In [8]:
# Report the sizes of the word id space (including padding) and the tag id space.
print("words:", word_count, sep="")
print("labels:", label_count, sep="")

words:17428
labels:12


In [9]:
def feature_x(x):
    """Encode word id ``x`` as a one-hot vector of length ``word_count``."""
    word_vector = one_hot(x, word_count)
    return word_vector

def feature_gram(gram):
    """Encode an n-gram of word ids as one flat feature vector.

    Each id in ``gram`` is turned into its one-hot vector with ``feature_x``;
    the vectors are stacked in order and flattened, so the result has
    ``len(gram) * word_count`` entries.
    """
    stacked = np.array([feature_x(word_id) for word_id in gram])
    return stacked.reshape(-1)


def feature_y(y):
    """Encode tag id ``y`` as a one-hot vector of length ``label_count``."""
    label_vector = one_hot(y, label_count)
    return label_vector

def predict(weights, x):
    """Pick the highest-scoring label for a feature vector.

    Args:
        weights: Weight matrix with one row per label.
        x: Feature vector whose length matches the number of weight columns.

    Returns:
        The index of the largest entry of ``weights @ x``; when several
        entries tie, ``np.argmax`` returns the first of them.
    """
    scores = np.matmul(weights, x)
    return np.argmax(scores, axis=0)
    

In [10]:
# Size of the context window: each token is paired with its predecessor.
grams_num = 2


def ngrams_sent(sent):
    """Split a sequence of word ids into left-padded n-grams of size ``grams_num``.

    The sequence is padded on the left only, using the reserved id
    ``word_count - 1`` as the pad symbol, so the first token also gets a
    full n-gram. The result is returned as a list of tuples.
    """
    pad_id = word_count - 1
    windows = ngrams(
        sent,
        grams_num,
        pad_left=True,
        pad_right=False,
        left_pad_symbol=pad_id,
    )
    return list(windows)

In [11]:
# Demo: print the one-hot vectors for every n-gram of a toy id sequence.
for gram in ngrams_sent([1, 2, 3, 4, 5]):
    x = [feature_x(word_id) for word_id in gram]
    print(x)

[array([0., 0., 0., ..., 0., 0., 1.]), array([0., 1., 0., ..., 0., 0., 0.])]
[array([0., 1., 0., ..., 0., 0., 0.]), array([0., 0., 1., ..., 0., 0., 0.])]
[array([0., 0., 1., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.])]
[array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.])]
[array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.])]


In [12]:


def accuracy_dev(weights, sentences=None):
    """Compute token-level tagging accuracy of a weight matrix.

    Every sentence is split into n-grams with ``ngrams_sent``; each n-gram is
    featurised with ``feature_gram``, scored with ``predict`` and compared
    against the gold tag id at the same position in ``sent.y``.

    Args:
        weights: Weight matrix passed unchanged to ``predict``.
        sentences: Iterable of sentence objects exposing ``.x`` (word ids)
            and ``.y`` (tag ids). Defaults to ``corpus.dev``, which keeps
            existing single-argument calls working; pass e.g. ``corpus.test``
            to score another split.

    Returns:
        Fraction of tokens whose predicted tag equals the gold tag, or
        ``0.0`` when there are no tokens to evaluate (instead of raising
        ``ZeroDivisionError``).
    """
    if sentences is None:
        sentences = corpus.dev

    correct = 0
    total = 0
    for sent in sentences:
        # ngrams_sent pads on the left only, so the i-th n-gram ends at the
        # i-th token and lines up with the i-th gold tag.
        for gram, y_true in zip(ngrams_sent(sent.x), sent.y):
            total += 1
            if predict(weights, feature_gram(gram)) == y_true:
                correct += 1

    if total == 0:
        return 0.0
    return correct / total


In [13]:
# Start from an all-zero weight matrix: one row per tag, one column per
# (n-gram position, word id) pair. With all scores equal, predict returns
# label 0 for every token, so this prints the accuracy of that constant guess.
w = np.zeros((label_count, grams_num * word_count))

print(accuracy_dev(w))

0.27510962622676965


In [14]:

# Dev accuracy before any update in this cell.
print(accuracy_dev(w))

# Multi-class perceptron training: five passes over the training sentences,
# updating w in place and reporting dev accuracy after each pass.
for epoch in range(5):
    for sent in corpus.train:
        grams = ngrams_sent(sent.x)
        for word_idx, gram in enumerate(grams):
            x = feature_gram(gram)
            y_true = sent.y[word_idx]
            y_hat = predict(w, x)
            if y_hat == y_true:
                # Correct prediction: the weights stay as they are.
                continue
            # Mistake: move the gold tag's row towards the features and the
            # wrongly predicted tag's row away from them.
            w[y_true] += x
            w[y_hat] -= x

    print("Accuracy: " + str(accuracy_dev(w)))


0.27510962622676965
Accuracy: 0.8821257047400293
Accuracy: 0.8924618918354562
Accuracy: 0.9025892670703696
Accuracy: 0.9015452077678012
Accuracy: 0.8993526832324076
