The cell below uses the treebank corpus from nltk

In [124]:
import nltk

tagged_sentences = nltk.corpus.treebank.tagged_sents()

print(tagged_sentences)
print("Tagged sentences: ", len(tagged_sentences))
print("Tagged words:", len(nltk.corpus.treebank.tagged_words()))


[[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...]
Tagged sentences:  3914
Tagged words: 100676


The cells below use the COMP 182 HW 6 corpus

In [152]:
def read_pos_file(filename):
    """
    Parses an input tagged text file.
    Input:
    filename --- the file to parse
    Returns: 
    The file represented as a list of tuples, where each tuple 
    is of the form (word, POS-tag).
    A list of unique words found in the file.
    A list of unique POS tags found in the file.
    """
    file_representation = []
    unique_words = set()
    unique_tags = set()
    f = open(str(filename), "r")
    for line in f:
        if len(line) < 2 or len(line.split("/")) != 2:
            continue
        if line.isspace() == True:
            print ("ass")
        word = line.split("/")[0].replace(" ", "").replace("\t", "").strip()
        tag = line.split("/")[1].replace(" ", "").replace("\t", "").strip()
        file_representation.append( (word, tag) )
        unique_words.add(word)
        unique_tags.add(tag)
    f.close()
    return file_representation, unique_words, unique_tags

In [153]:
training_data, unique_word, unique_tag = read_pos_file('training.txt')

print (training_data)

UnicodeDecodeError: 'charmap' codec can't decode byte 0x90 in position 5865: character maps to <undefined>

Data pre-processing

In [154]:
import numpy as np
 
sentences, sentence_tags =[], [] 
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(np.array(sentence))
    sentence_tags.append(np.array(tags))
 
# Let's see how a sequence looks
 
print(sentences[5])
print(sentence_tags[5])

['Lorillard' 'Inc.' ',' 'the' 'unit' 'of' 'New' 'York-based' 'Loews'
 'Corp.' 'that' '*T*-2' 'makes' 'Kent' 'cigarettes' ',' 'stopped' 'using'
 'crocidolite' 'in' 'its' 'Micronite' 'cigarette' 'filters' 'in' '1956'
 '.']
['NNP' 'NNP' ',' 'DT' 'NN' 'IN' 'JJ' 'JJ' 'NNP' 'NNP' 'WDT' '-NONE-' 'VBZ'
 'NNP' 'NNS' ',' 'VBD' 'VBG' 'NN' 'IN' 'PRP$' 'NN' 'NN' 'NNS' 'IN' 'CD'
 '.']


In [155]:
from sklearn.model_selection import train_test_split 
(
    train_sentences, 
    test_sentences,
    train_tags, 
    test_tags) = train_test_split(sentences, sentence_tags, test_size=0.2)


In [156]:
#create word vocabulary dictionary

words = set([])
for sentence in train_sentences:
    for word in sentence:
        words.add(word.lower())
        
#create tag vocabulary dictionary

tags = set([])
for sentence_tag in sentence_tags:
    for tag in sentence_tag:
        tags.add(tag)
        

word_vocab = {} #dictionary mapping unique words to a unique integer
word_vocab['-PAD-'] = 0  # The special value used for padding
word_vocab['-OOV-'] = 1  # The special value used for OOVs
i = 2
for word in list(words):
    word_vocab[word] = i
    i+=1  

    
tag_vocab = {} #dictionary mapping unique tags to a unique integer
tag_vocab['-PAD-'] = 0
i = 1
for tag in list(tags):
    tag_vocab[tag] = i
    i+=1  

print (tag_vocab)


    

{'-PAD-': 0, 'VBP': 1, 'IN': 2, '-LRB-': 3, 'NNS': 4, 'NNP': 5, ':': 6, 'WP$': 7, '#': 8, 'JJS': 9, '``': 10, 'NNPS': 11, 'RBR': 12, "''": 13, 'VBG': 14, 'WRB': 15, 'PRP$': 16, 'VBN': 17, 'VBD': 18, 'WP': 19, '$': 20, 'VBZ': 21, 'WDT': 22, 'JJR': 23, 'SYM': 24, 'DT': 25, 'POS': 26, 'VB': 27, 'TO': 28, 'FW': 29, 'LS': 30, 'RB': 31, 'CC': 32, 'RBS': 33, 'EX': 34, '-NONE-': 35, 'PRP': 36, '.': 37, ',': 38, 'NN': 39, 'UH': 40, 'JJ': 41, 'CD': 42, 'RP': 43, 'MD': 44, 'PDT': 45, '-RRB-': 46}


In [157]:
train_sentences_X, test_sentences_X, train_tags_y, test_tags_y = [], [], [], []

for s in train_sentences:
    s_int = []
    for w in s:
        if w.lower() in word_vocab:
            s_int.append(word_vocab[w.lower()])
        else:
            s_int.append(word_vocab['-OOV-'])
    train_sentences_X.append(np.array(s_int))
    
for s in test_sentences:
    s_int = []
    for w in s:
        if w.lower() in word_vocab:
            s_int.append(word_vocab[w.lower()])
        else:
            s_int.append(word_vocab['-OOV-'])
    
    test_sentences_X.append(np.array(s_int))
 

for s in train_tags:
    s_int = []
    for t in s:
        s_int.append(tag_vocab[t])
    train_tags_y.append(np.array(s_int))

for s in test_tags:
    s_int = []
    for t in s:
        s_int.append(tag_vocab[t])
    test_tags_y.append(np.array(s_int))
 
 

train_sentences_X = (np.asarray(train_sentences_X))
test_sentences_X = (np.asarray(test_sentences_X))
train_tags_y = (np.asarray(train_tags_y))
test_tags_y = (np.asarray(test_tags_y))
print (train_sentences_X[0])
print (train_tags_y[0])


[6193 4079  760 7038 1727 3884 6193 1727 9490 3352 5408  887 3884 7053
  687 8522 4312 4465 8525 7856 4917 5102 1956 1819 3884 9561 4912 2499]
[25 39 21  2 39 38 25 39  2 39  4 14 38 32 34 21 31 39 41 32 41  2 25 41
 38 41  4 37]


In [158]:
lengths = []
for element in train_sentences:
    lengths.append(len(element))
MAX_LENGTH = max(lengths)
print(MAX_LENGTH)  # 271

271


In [159]:
#pad sequences with 0s until length = MAX_LENGTH
from keras.preprocessing.sequence import pad_sequences
 
train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')

print(train_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(test_tags_y[0])

[6193 4079  760 7038 1727 3884 6193 1727 9490 3352 5408  887 3884 7053
  687 8522 4312 4465 8525 7856 4917 5102 1956 1819 3884 9561 4912 2499
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

Bi-directional LSTM model

In [160]:
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation, Input, Dropout
from keras.optimizers import Adam
 

def POS_LSTM():
    
    inputs = Input(shape = (MAX_LENGTH, ))
    X = Embedding(len(word_vocab), 24)(inputs)  
    X = Bidirectional(LSTM(64, return_sequences=True))(X)
    X = TimeDistributed(Dropout(0.8))(X)
    X = Bidirectional(LSTM(128, return_sequences=True))(X)
    X = TimeDistributed(Dropout(0.8))(X)
    X = Bidirectional(LSTM(64, return_sequences=True))(X)
    X = TimeDistributed(Dense(len(tag_vocab)))(X)
    outputs = Activation('softmax')(X)
    
    model = Model(inputs = inputs, outputs = outputs)
    
    model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy'])
    
    return model
 
model = POS_LSTM()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_20 (InputLayer)        (None, 271)               0         
_________________________________________________________________
embedding_18 (Embedding)     (None, 271, 24)           243288    
_________________________________________________________________
bidirectional_35 (Bidirectio (None, 271, 128)          45568     
_________________________________________________________________
time_distributed_24 (TimeDis (None, 271, 128)          0         
_________________________________________________________________
bidirectional_36 (Bidirectio (None, 271, 256)          263168    
_________________________________________________________________
time_distributed_25 (TimeDis (None, 271, 256)          0         
_________________________________________________________________
bidirectional_37 (Bidirectio (None, 271, 128)          164352    
__________

In [161]:
def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.asarray(cat_sequences)

print (len(tag_vocab))
cat_train_tags_y = to_categorical(train_tags_y, len(tag_vocab))
print (cat_train_tags_y.shape)

47
(3131, 271, 47)


In [162]:
#train the model
model.fit(train_sentences_X, to_categorical(train_tags_y, len(tag_vocab)), batch_size=128, epochs= 5, validation_split=0.2)

Train on 2504 samples, validate on 627 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x26b3f210a58>

In [165]:
#evaluate
scores = model.evaluate(test_sentences_X, to_categorical(test_tags_y, len(tag_vocab)))
print(f"{model.metrics_names[1]}: {scores[1] * 100}")   # acc: 99.09751977804825

acc: 91.49642094189484
