The cell below uses the treebank corpus from nltk

In [1]:
import nltk

tagged_sentences = nltk.corpus.treebank.tagged_sents()

print(tagged_sentences)
print("Tagged sentences: ", len(tagged_sentences))
print("Tagged words:", len(nltk.corpus.treebank.tagged_words()))


[[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...]
Tagged sentences:  3914
Tagged words: 100676


The cells below use the COMP 182 HW 6 corpus

In [2]:
def read_pos_file(filename):
    """
    Parses an input tagged text file.
    Input:
    filename --- the file to parse
    Returns: 
    The file represented as a list of tuples, where each tuple 
    is of the form (word, POS-tag).
    A list of unique words found in the file.
    A list of unique POS tags found in the file.
    """
    file_representation = []
    unique_words = set()
    unique_tags = set()
    f = open(str(filename), "r")
    for line in f:
        if len(line) < 2 or len(line.split("/")) != 2:
            continue
        word = line.split("/")[0].replace(" ", "").replace("\t", "").strip()
        tag = line.split("/")[1].replace(" ", "").replace("\t", "").strip()
        file_representation.append( (word, tag) )
        unique_words.add(word)
        unique_tags.add(tag)
    f.close()
    return file_representation, unique_words, unique_tags

In [3]:
def create_training_sentences(train_filepath):
    training_sentences = []
    with open(train_filepath) as fp:  
        line = fp.readline()
        cnt = 1
        sentence = []
        while line:   
            #if (cnt%1000 == 0):
                #print (cnt)
            word = line.split("/")[0].replace(" ", "").replace("\n", "").strip()
            tag = line.split("/")[1].replace(" ", "").replace("\n", "").strip()
            sentence.append((word, tag))
            if cnt < 100:
                print("Line {}: {}".format(cnt, line.strip()))
            line = fp.readline() #read next line 
            cnt += 1
            if line.isspace() == True: #if the line you are currently at is empty, then you have completed a sentence
                if cnt < 100:
                    print("Line {}: {}".format(cnt, line.strip()))
                #print (str(cnt+1) + ' space ass!')
                training_sentences.append(sentence) #append the sentence to the list of all training sentences
                line = fp.readline() #go to the next line
                #print (training_sentences[0])
                sentence = []
                
    return training_sentences


comp182_training = create_training_sentences('training.txt')

print (comp182_training[1]) #print the second sentence



Line 1: The / DT
Line 2: final / JJ
Line 3: major / JJ
Line 4: items / NNS
Line 5: of / IN
Line 6: New / NNP
Line 7: Deal / NNP
Line 8: legislation / NN
Line 9: were / VBD
Line 10: the / DT
Line 11: creation / NN
Line 12: of / IN
Line 13: the / DT
Line 14: United / NNP
Line 15: States / NNPS
Line 16: Housing / NNP
Line 17: Authority / NNP
Line 18: and / CC
Line 19: Farm / NNP
Line 20: Security / NNP
Line 21: Administration / NNP
Line 22: , / ,
Line 23: both / DT
Line 24: in / IN
Line 25: 1937 / CD
Line 26: , / ,
Line 27: and / CC
Line 28: the / DT
Line 29: Fair / NNP
Line 30: Labor / NNP
Line 31: Standards / NNP
Line 32: Act / NNP
Line 33: of / IN
Line 34: 1938 / CD
Line 35: , / ,
Line 36: which / WDT
Line 37: set / VBP
Line 38: maximum / NN
Line 39: hours / NNS
Line 40: and / CC
Line 41: minimum / NN
Line 42: wages / NNS
Line 43: for / IN
Line 44: most / JJS
Line 45: categories / NNS
Line 46: of / IN
Line 47: workers / NNS
Line 48: . / .
Line 49: 
Line 49: The / DT
Line 50: economic /

Data pre-processing

In [4]:
import numpy as np
 
sentences, sentence_tags =[], [] 
for tagged_sentence in comp182_training:        #tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(np.array(sentence))
    sentence_tags.append(np.array(tags))
 
# Let's see how a sequence looks
 
print(sentences[1])
print(sentence_tags[1])

['The' 'economic' 'downturn' 'of' '1937' '--' '38' ',' 'and' 'the'
 'bitter' 'split' 'between' 'the' 'AFL' 'and' 'CIO' 'labor' 'unions' 'led'
 'to' 'major' 'Republican' 'gains' 'in' 'Congress' 'in' '1938' '.']
['DT' 'JJ' 'NN' 'IN' 'CD' ':' 'CD' ',' 'CC' 'DT' 'JJ' 'NN' 'IN' 'DT' 'NNP'
 'CC' 'NNP' 'NN' 'NNS' 'VBD' 'TO' 'JJ' 'JJ' 'NNS' 'IN' 'NNP' 'IN' 'CD' '.']


In [5]:
from sklearn.model_selection import train_test_split 
(
    train_sentences, 
    test_sentences,
    train_tags, 
    test_tags) = train_test_split(sentences, sentence_tags, test_size=0.2)


In [6]:
#create word vocabulary dictionary

words = set([])
for sentence in train_sentences:
    for word in sentence:
        words.add(word.lower())
        
#create tag vocabulary dictionary

tags = set([])
for sentence_tag in sentence_tags:
    for tag in sentence_tag:
        tags.add(tag)
        

word_vocab = {} #dictionary mapping unique words to a unique integer
word_vocab['-PAD-'] = 0  # The special value used for padding
word_vocab['-OOV-'] = 1  # The special value used for OOVs
i = 2
for word in list(words):
    word_vocab[word] = i
    i+=1  

    
tag_vocab = {} #dictionary mapping unique tags to a unique integer
tag_vocab['-PAD-'] = 0
i = 1
for tag in list(tags):
    tag_vocab[tag] = i
    i+=1  

#print (tag_vocab)


    

In [7]:
train_sentences_X, test_sentences_X, train_tags_y, test_tags_y = [], [], [], []

for s in train_sentences:
    s_int = []
    for w in s:
        if w.lower() in word_vocab:
            s_int.append(word_vocab[w.lower()])
        else:
            s_int.append(word_vocab['-OOV-'])
    train_sentences_X.append(np.array(s_int))
    
for s in test_sentences:
    s_int = []
    for w in s:
        if w.lower() in word_vocab:
            s_int.append(word_vocab[w.lower()])
        else:
            s_int.append(word_vocab['-OOV-'])
    
    test_sentences_X.append(np.array(s_int))
 

for s in train_tags:
    s_int = []
    for t in s:
        s_int.append(tag_vocab[t])
    train_tags_y.append(np.array(s_int))

for s in test_tags:
    s_int = []
    for t in s:
        s_int.append(tag_vocab[t])
    test_tags_y.append(np.array(s_int))
 
 

train_sentences_X = (np.asarray(train_sentences_X))
test_sentences_X = (np.asarray(test_sentences_X))
train_tags_y = (np.asarray(train_tags_y))
test_tags_y = (np.asarray(test_tags_y))
print (train_sentences_X[0])
print (train_tags_y[0])


[31629 17698 22211 40055 74526 69586 21173 36599 56692 75689 18910  9279
 53261 55676  1790 21538 53261  6112 31925 41091  4637  1790 29783  9922
 49349 18910 63919 21538 36750 76133 69916 37629 51359 32823 35763 63919
 35763  3378  6112 15510 27971 61031  3799 65863 28947 46854 21173 39559
 36484]
[457 497 169 497 327 402  37 428  36 428  37 202 500 202 287 202 500 457
 169 497 497 287  37 497 402  37 202 202 318 640 402 513 457 497 414 202
 414  37 457 497 497 156 520 327 202 202  37 428 158]


In [8]:
lengths = []
for element in train_sentences:
    lengths.append(len(element))
MAX_LENGTH = max(lengths)
print(MAX_LENGTH)  # 271

1134


In [10]:
#pad sequences with 0s until length = MAX_LENGTH
import tensorflow
from keras.preprocessing.sequence import pad_sequences
 
train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')

print(train_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(test_tags_y[0])

[31629 17698 22211 ...     0     0     0]
[25396 40485 21173 ...     0     0     0]
[457 497 169 ...   0   0   0]
[457 497  37 ...   0   0   0]


Bi-directional LSTM model

In [11]:
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation, Input, Dropout
from keras.optimizers import Adam
 

def POS_LSTM():
    
    inputs = Input(shape = (MAX_LENGTH, ))
    X = Embedding(len(word_vocab), 24)(inputs)  
    X = Bidirectional(LSTM(64, return_sequences=True))(X)
    X = TimeDistributed(Dropout(0.8))(X)
    X = Bidirectional(LSTM(128, return_sequences=True))(X)
    X = TimeDistributed(Dropout(0.8))(X)
    X = Bidirectional(LSTM(64, return_sequences=True))(X)
    X = TimeDistributed(Dense(len(tag_vocab)))(X)
    outputs = Activation('softmax')(X)
    
    model = Model(inputs = inputs, outputs = outputs)
    
    model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy'])
    
    return model
 
model = POS_LSTM()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1134)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1134, 24)          1845624   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 1134, 128)         45568     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 1134, 128)         0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 1134, 256)         263168    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 1134, 256)         0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 1134, 128)         164352    
__________

In [None]:
def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.asarray(cat_sequences)

#print (len(tag_vocab))
cat_train_tags_y = to_categorical(train_tags_y, len(tag_vocab))
#print (cat_train_tags_y.shape)

In [162]:
#train the model
model.fit(train_sentences_X, to_categorical(train_tags_y, len(tag_vocab)), batch_size=128, epochs= 5, validation_split=0.2)

Train on 2504 samples, validate on 627 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x26b3f210a58>

In [165]:
#evaluate
scores = model.evaluate(test_sentences_X, to_categorical(test_tags_y, len(tag_vocab)))
print(f"{model.metrics_names[1]}: {scores[1] * 100}")   # acc: 99.09751977804825

acc: 91.49642094189484
