In [1]:
from keras.layers.core import Activation, Dense, Dropout, RepeatVector, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.layers.wrappers import TimeDistributed
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import collections
import nltk
import numpy as np
import os
from nltk.corpus import brown
from nltk.tokenize import RegexpTokenizer

Using TensorFlow backend.


In [6]:
fedata = open("treebank_sents.txt", "w")
ffdata = open("treebank_poss.txt", "w")
tokenizer = RegexpTokenizer(r'\w+')

for sent in brown.tagged_sents():
    words, poss = [], []
    for word, pos in sent:
        if len(tokenizer.tokenize(word)) != 1 :
            continue
        words.append(word)
        poss.append(pos)
    fedata.write("{:s}".format(" ".join(words)))
    fedata.write("\n")
    ffdata.write("{:s}".format(" ".join(poss)))
    ffdata.write("\n")

fedata.close()
ffdata.close()

In [17]:
fin = open("treebank_poss.txt", "rb")
diction = {}
for line in fin:
    line=str(line.decode('ascii',"ignore").strip())
    line.strip().split("\t")
    sentence = line.strip().split("\t")
    for tag in sentence:
        #print(tag)
        if tag in diction:
            diction[tag] += 1
        else:
            diction[tag] = 1   

In [18]:
def parse_sentences(filename):
    word_freqs = collections.Counter()
    num_recs, maxlen = 0, 0
    fin = open(filename, "rb")
    for line in fin:
        words = line.strip().lower().split()
        for word in words:
            word_freqs[word] += 1
        if len(words) > maxlen:
            maxlen = len(words)
        num_recs += 1
    fin.close()
    return word_freqs, maxlen, num_recs

s_wordfreqs, s_maxlen, s_numrecs = parse_sentences("treebank_sents.txt")
t_wordfreqs, t_maxlen, t_numrecs = parse_sentences("treebank_poss.txt")
print("Unique Words - {0}\n Max Sentence - {1}\n TotalWords - {2}\nUnique tags - {3}\n Max tag - {4}\n Totaltags - {5}\n".format(len(s_wordfreqs), s_maxlen, s_numrecs, len(t_wordfreqs), t_maxlen, t_numrecs))

Unique Words - 41845
 Max Sentence - 167
 TotalWords - 57340
Unique tags - 339
 Max tag - 167
 Totaltags - 57340



In [19]:
MAX_SEQLEN = 170
S_MAX_FEATURES = 55000
T_MAX_FEATURES = 340

In [20]:
s_vocabsize = min(len(s_wordfreqs), S_MAX_FEATURES) + 2
s_word2index = {x[0]:i+2 for i, x in enumerate(s_wordfreqs.most_common(S_MAX_FEATURES))}
s_word2index["PAD"] = 0
s_word2index["UNK"] = 1
s_index2word = {v:k for k, v in s_word2index.items()}

t_vocabsize = len(t_wordfreqs) + 1
t_word2index = {x[0]:i for i, x in enumerate(t_wordfreqs.most_common(T_MAX_FEATURES))}
t_word2index["PAD"] = 0
t_index2word = {v:k for k, v in t_word2index.items()}

In [21]:
def build_tensor(filename, numrecs, word2index, maxlen,make_categorical=False, num_classes=0):
    data = np.empty((s_numrecs, ), dtype=list)
    fin = open(filename, "rb")
    i = 0
    for line in fin:
        wids = []
        for word in line.strip().lower().split():
            if word in word2index:
                wids.append(word2index[word])
            else:
                wids.append(word2index["UNK"])
        if make_categorical:
            data[i] = np_utils.to_categorical(wids,num_classes=num_classes)
        else:
            data[i] = wids
        i += 1
    fin.close()
    pdata = sequence.pad_sequences(data, maxlen=maxlen)
    return pdata

X = build_tensor(("treebank_sents.txt"),s_numrecs, s_word2index, MAX_SEQLEN)
Y = build_tensor(("treebank_poss.txt"),t_numrecs, t_word2index, MAX_SEQLEN, True, t_vocabsize)


In [22]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)

In [25]:
EMBED_SIZE = 128
HIDDEN_SIZE = 64
BATCH_SIZE = 32
NUM_EPOCHS = 1
model = Sequential()
model.add(Embedding(s_vocabsize, EMBED_SIZE,input_length=MAX_SEQLEN))
#model.add(SpatialDropout1D(Dropout(0.2)))
model.add(GRU(HIDDEN_SIZE, dropout=0.2, recurrent_dropout=0.2))
model.add(RepeatVector(MAX_SEQLEN))
model.add(GRU(HIDDEN_SIZE, return_sequences=True))
model.add(TimeDistributed(Dense(t_vocabsize)))
model.add(Activation("softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam",metrics=["accuracy"])

In [26]:
model.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,validation_data=[Xtest, Ytest])
score, acc = model.evaluate(Xtest, Ytest, batch_size=BATCH_SIZE)
print("Test score: %.3f, accuracy: %.3f" % (score, acc))

Train on 45872 samples, validate on 11468 samples
Epoch 1/1
Test score: 0.339, accuracy: 0.914
