In [1]:
from IPython.core.display import display
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Bidirectional, LSTM, Dense, Activation, Embedding, Concatenate, Input
from keras.models import Model
from process import *

# Dimensionality of the GloVe vectors; it selects the embeddings file below and
# is reused as the Embedding layer's output size in build_model.
glove_len = 100
glove = Path(f'../glove.6B.{glove_len}d.txt')
# Pickled training corpus (TAC 2015 English, going by the path).
docfile = Path('../corpus/tac/lang/en/eng.2015.train.pickle')
# presumably a mapping word -> vector: later code calls .keys() and .items() on it
embed = load_glove(glove)

@pickled
def read_and_extract(path, fun):
    """Read all documents stored at `path` and return `fun` applied to them.

    The documents are materialised into a list first, and `fun` is called
    while the reader is still open.

    NOTE(review): `pickled` is defined elsewhere; presumably it caches the
    result on disk -- confirm before relying on that.
    """
    with DocumentIO.read(path) as reader:
        documents = list(reader)
        return fun(documents)

Using TensorFlow backend.


In [3]:
def inverted(a):
    """Return a new dict mapping each value of `a` back to its key.

    If several keys share a value, the key that comes last in iteration
    order wins.
    """
    return dict(zip(a.values(), a.keys()))

def build_indices(train, gold, glove):
    """Build the index -> word table covering the training data and GloVe.

    train: iterable of sentences, each an iterable of tokens with a "form" key.
    gold:  unused; kept so existing callers keep working.
    glove: mapping whose keys are the words that have a pretrained vector.

    Returns a dict {index: word}. Indices start at 2, which leaves 0 and 1
    free (elsewhere in this notebook 0 is the padding value and 1 is passed
    as the default for unknown words).

    The vocabulary is sorted before numbering so that the mapping is the same
    on every run; iterating the raw set would give an order that depends on
    string hashing and therefore changes between kernel restarts.
    """
    wordset = {word["form"] for sentence in train for word in sentence}
    wordset.update(glove.keys())
    return dict(enumerate(sorted(wordset), 2))

def emb_mat_init(glove, invind):
    """Create an initializer that seeds an embedding matrix with GloVe vectors.

    glove:  mapping word -> vector.
    invind: mapping word -> row index in the embedding matrix; it must
            contain every word in `glove`.

    The returned callable takes (shape, dtype=None), fills a matrix of that
    shape with uniform random values in [0, 1) and then overwrites the row of
    every GloVe word with its pretrained vector. `dtype` is accepted but not
    used.
    """
    def initializer(shape, dtype=None):
        weights = np.random.random_sample(shape)
        for word in glove:
            row = invind[word]
            weights[row, :] = glove[word]
        return weights

    return initializer


# Load the corpus and run the extraction helper over it.
core_nlp, docs = read_and_extract(docfile, lambda docs: get_core_nlp(docs, 'en'))
# NOTE(review): the "test" data is read from the same docfile as the training
# data above, so the evaluation further down is measured on the training set.
# Point this at a held-out file.
core_nlp_test, docs_test = read_and_extract(docfile, lambda docs: get_core_nlp(docs, 'en'))
train, lbl_sets, gold, out_categories, _, _ = docria_extract(core_nlp, docs)

# index -> word, numbered from 2 (see build_indices).
word_index = build_indices(train, gold, embed)
# index -> label tables for the POS and NE features, numbered from 0.
# NOTE(review): 0 is also the value pad_sequences pads with by default, so
# padding and label 0 get the same one-hot vector -- confirm this is intended.
pos_index = dict(enumerate(lbl_sets['pos']))
ne_index = dict(enumerate(lbl_sets['ne']))
# label -> index, used when encoding the inputs.
pos_inv = inverted(pos_index)
ne_inv = inverted(ne_index)
# presumably out_categories is label -> index, making out_index index -> label
# (it is indexed with an argmax result during evaluation).
out_index = inverted(out_categories)
# word -> index, used for encoding words and for placing GloVe rows.
word_inv = inverted(word_index)

In [4]:
from keras.preprocessing.sequence import pad_sequences

def build_sequence(l, invind, default=None):
    """Translate the items of `l` into indices using `invind`.

    l:       iterable of hashable items (words, tags, ...).
    invind:  mapping item -> index.
    default: index to use for items missing from `invind`. When it is None
             (the default) a missing item raises KeyError.

    Returns a list of indices, one per item of `l`.

    The check is `is not None` rather than truthiness so that a default of 0
    is honoured instead of being treated as "no default".
    """
    if default is not None:
        return [invind.get(w, default) for w in l]
    return [invind[w] for w in l]

def mapget(key, seq):
    """Lazily yield `collection[key]` for every collection in `seq`."""
    return map(lambda collection: collection[key], seq)

def conll_to_word(sentence):
    """Return the 'form' field of every token in `sentence` as a list."""
    forms = []
    for token in sentence:
        forms.append(token['form'])
    return forms

def to_categories(data, key, inv, default=None, categorical=True):
    """Encode one field of every sentence as a padded index or one-hot array.

    data:        iterable of sentences, each an iterable of token mappings.
    key:         field to read from every token (e.g. 'form', 'pos', 'ne').
    inv:         mapping field value -> integer index.
    default:     index used for values missing from `inv`; passed through to
                 build_sequence, where None means a missing value raises
                 KeyError.
    categorical: when true the padded indices are one-hot encoded.

    Returns the output of pad_sequences (categorical=False) or of
    to_categorical applied to it (categorical=True).

    The one-hot width is derived from the lookup table instead of being
    inferred from the largest index that happens to occur in `data`.
    Otherwise two datasets encoded with the same table (train and test) could
    come out with different widths and no longer match the model's inputs.
    """
    fields = (mapget(key, sentence) for sentence in data)
    cat_seq = [build_sequence(f, inv, default=default) for f in fields]
    padded = pad_sequences(cat_seq)
    if not categorical:
        return padded

    # Every index in `padded` is a value of `inv`, `default`, or the padding
    # value 0, so this width is always large enough.
    highest = max(inv.values(), default=0)
    if default is not None:
        highest = max(highest, default)
    return to_categorical(padded, num_classes=highest + 1)
    
# Words stay as integer index sequences (they feed the Embedding layer);
# words missing from word_inv are mapped to index 1.
x_word = to_categories(train, 'form', word_inv, default=1, categorical=False)
# POS and NE tags are one-hot encoded. No default is given, so a tag missing
# from the lookup raises KeyError.
x_pos = to_categories(train, 'pos', pos_inv)
x_ne = to_categories(train, 'ne', ne_inv)
# Targets are padded as they are; the shape printed further down suggests the
# items of gold are already per-token vectors -- TODO confirm.
y = pad_sequences(gold)

In [5]:
def build_model(max_len, embed, npos, nne):
    """Build and compile the bidirectional-LSTM tagging model.

    max_len: length of the padded input sequences.
    embed:   mapping word -> pretrained vector, used to seed the embedding
             matrix via emb_mat_init.
    npos:    width of the one-hot POS input.
    nne:     width of the one-hot NE input.

    The model takes three inputs, in the order [form, pos, ne]: word indices,
    one-hot POS tags and one-hot NE tags. Word embeddings are concatenated
    with the two one-hot features, run through a bidirectional LSTM that
    returns one output per time step, and classified per token with a softmax
    over len(out_categories) classes.

    Relies on the module-level names word_inv, glove_len and out_categories.

    Returns the compiled keras Model.
    """
    # Word indices start at 2 (see build_indices), so the embedding table
    # needs two rows more than there are known words.
    vocab_size = len(word_inv) + 2

    # The inputs are created in this order on purpose: it determines the
    # automatically assigned layer names shown by model.summary().
    pos = Input(shape=(max_len, npos))
    ne = Input(shape=(max_len, nne))
    form = Input(shape=(max_len,))

    # Trainability is a property of the layer, so it is set here; assigning
    # `.trainable` on the layer's output tensor has no effect.
    emb = Embedding(vocab_size,
                    glove_len,
                    embeddings_initializer=emb_mat_init(embed, word_inv),
                    mask_zero=True,
                    input_length=None,
                    trainable=True)(form)

    concat = Concatenate()([emb, pos, ne])

    # No input_shape here: in the functional API the shape comes from
    # `concat`, and the vocabulary size is not the feature width anyway.
    lstm = Bidirectional(LSTM(25, return_sequences=True))(concat)
    out = Dense(len(out_categories), activation='softmax')(lstm)
    model = Model(inputs=[form, pos, ne], outputs=out)
    model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['acc'])
    return model

In [6]:
# Check the dimensions of the padded target array before training.
print(y.shape)

(9886, 223, 50)


In [7]:
def make_model(max_len, x, y, embed, npos, nne, epochs=3, batch_size=128):
    """Build the tagging model, train it on (x, y) and print its summary.

    max_len, embed, npos and nne are forwarded unchanged to build_model;
    x and y are handed to fit together with `epochs` and `batch_size`.

    Returns the trained model.
    """
    network = build_model(max_len, embed, npos, nne)
    network.fit(x, y, batch_size=batch_size, epochs=epochs)
    network.summary()
    return network

print(x_word.shape)
# The padded word matrix supplies max_len; the one-hot input widths are taken
# from the sizes of the POS and NE lookup tables.
model = make_model(x_word.shape[1], [x_word, x_pos, x_ne], y, embed, len(pos_inv), len(ne_inv), epochs=10)

(9886, 223)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 223)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 223, 100)     40160200    input_3[0][0]                    
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 223, 45)      0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 223, 23)      0                               

In [8]:
# Encode the evaluation documents with the lookup tables built from the
# training data, then run the model on them.
test, _, gold_test, _, _, _ = docria_extract(core_nlp_test, docs_test)
x_word_test = to_categories(test, 'form', word_inv, default=1, categorical=False)
x_pos_test = to_categories(test, 'pos', pos_inv)
x_ne_test = to_categories(test, 'ne', ne_inv)
# Pad the evaluation targets; this used to pad the training `gold` by mistake.
y_test = pad_sequences(gold_test)
pred = model.predict([x_word_test, x_pos_test, x_ne_test])

In [9]:
from collections import Counter
def zip_from_end(a, b):
    """Pair up the trailing elements of two indexable sequences.

    Only the last min(len(a), len(b)) elements of each sequence are used; the
    pairs come out in left-to-right order, ending with (a[-1], b[-1]).
    Returns a lazy generator of (a_item, b_item) tuples.
    """
    overlap = min(len(a), len(b))
    offsets = range(-overlap, 0)
    return ((a[offset], b[offset]) for offset in offsets)

# Per-tag tallies: how many gold tokens carry each tag, and how many of those
# the model labelled correctly.
actual = Counter()
correct = Counter()
for p, g in zip(pred, gold_test):
    # Predictions come from padded inputs while the gold sentences are not
    # padded; pairing from the end aligns them (pad_sequences pads at the
    # front by default).
    for a, b in zip_from_end(p, g):
        gold_class = np.argmax(b)
        actual_tag = out_index[gold_class]
        actual[actual_tag] += 1
        if np.argmax(a) == gold_class:
            correct[actual_tag] += 1

# One line per gold tag: tag, share of its tokens predicted correctly, count.
for k in actual:
    print(k, correct[k]/actual[k], actual[k])

# Overall share of correctly labelled tokens, ignoring the "outside" tag.
outside = ('O', 'NOE', 'OUT')
corr_sum = sum(count for tag, count in correct.items() if tag != outside)
act_sum = sum(count for tag, count in actual.items() if tag != outside)
print(corr_sum/act_sum)

('O', 'NOE', 'OUT') 0.9968802271326701 181744
('B', 'NAM', 'PER') 0.9769230769230769 1170
('E', 'NAM', 'PER') 0.9845626072041166 1166
('S', 'NOM', 'PER') 0.7955625990491284 1262
('S', 'NAM', 'PER') 0.9800435413642961 2756
('S', 'NAM', 'GPE') 0.9831223628691983 2844
('S', 'NAM', 'ORG') 0.9250180245133381 1387
('B', 'NAM', 'ORG') 0.7574692442882249 569
('I', 'NAM', 'ORG') 0.7134328358208956 335
('E', 'NAM', 'ORG') 0.8068181818181818 616
('B', 'NAM', 'FAC') 0.44660194174757284 103
('E', 'NAM', 'FAC') 0.5583333333333333 120
('I', 'NAM', 'PER') 0.719626168224299 107
('B', 'NAM', 'GPE') 0.8421052631578947 209
('E', 'NAM', 'GPE') 0.8957345971563981 211
('B', 'NAM', 'LOC') 0.6454545454545455 110
('E', 'NAM', 'LOC') 0.4424778761061947 113
('S', 'NAM', 'LOC') 0.5989010989010989 182
('I', 'NAM', 'FAC') 0.0 39
('B', 'NOM', 'PER') 0.0 79
('E', 'NOM', 'PER') 0.0 80
('S', 'NAM', 'FAC') 0.0 52
('I', 'NAM', 'LOC') 0.0 11
('S', 'NAM', 'TTL') 0.0 8
('I', 'NOM', 'PER') 0.0 22
('I', 'NAM', 'GPE') 0.0 25
('

In [10]:
#from pickle import dump
#with Path('../model.pickle').open('w+b') as f:
    #dump((model,word_index,out_index), f)