In [1]:
from IPython.core.display import display
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Bidirectional, LSTM, Dense, Activation, Embedding
from keras.models import Sequential
from process import *

# Dimensionality of the GloVe vectors; it also selects which GloVe file is read
# and is the width of the embedding matrix built further down.
glove_len = 100
# GloVe text file matching `glove_len`, relative to the notebook directory.
glove = Path(f'../glove.6B.{glove_len}d.txt')
# Pickled training documents that are annotated and used for training below.
docfile = Path('../tac_docria/en/eng.2015.train.pickle')
# `load_glove` is not defined in this notebook (presumably star-imported from
# `process`). Its result is later used through .keys()/.items(), so it is
# presumably a word -> vector mapping — TODO confirm.
embed = load_glove(glove)

@pickled
def read_and_extract(path, fun):
    """Read all documents stored at `path` and return `fun` applied to them.

    The documents are materialised into a list first, and `fun` is called
    while the reader is still open.

    Args:
        path: location handed to `DocumentIO.read`.
        fun: callable receiving the list of documents.

    Returns:
        Whatever `fun` returns.

    NOTE(review): `pickled` is a project decorator not visible here;
    presumably it caches the result — TODO confirm how the cache is keyed.
    """
    with DocumentIO.read(path) as reader:
        documents = list(reader)
        return fun(documents)
    
def get_core_nlp(docs, lang):
    """Annotate the main text of every document through `langforia`.

    Args:
        docs: documents exposing ``texts['main']``.
        lang: language code forwarded to `langforia`.

    Returns:
        A pair ``(annotations, docs)``: one list of output lines per document
        (the `langforia` result split on newlines), and the `docs` argument
        itself, unchanged.
    """
    annotations = []
    for document in docs:
        main_text = str(document.texts['main'])
        annotations.append(langforia(main_text, lang).split('\n'))
    return annotations, docs

Using TensorFlow backend.


In [2]:
def doc_label_idx(doc):
    """Index the gold entity spans of one document.

    Walks the nodes of ``doc.layers['tac/entity/gold']`` in layer order and
    records a ``(type, label)`` pair for every span that is kept. Spans that
    start after the current enclosing span but lie inside it, or partially
    overlap it, are dropped.

    Args:
        doc: document exposing ``layers['tac/entity/gold']``. Each node needs
            ``fld.label``, ``fld.type`` and ``fld.text``, where ``fld.text``
            is either falsy or an object with ``start`` and ``stop``.

    Returns:
        A tuple ``(doc_index, labels, types)``. ``doc_index`` maps
        ``(start, stop)`` to ``(type, label)``; ``labels`` and ``types`` are
        the sets of every label / type seen, including those of dropped nodes.
    """
    doc_index = {}
    labels, types = set(), set()
    longest = None  # span currently treated as the enclosing entity
    for node in doc.layers['tac/entity/gold']:
        labels.add(node.fld.label)
        types.add(node.fld.type)
        entity = node.fld.text
        # ignore xml-only entities
        if not entity:
            continue
        span = (entity.start, entity.stop)
        if longest is None:
            longest = span
        elif span[0] == longest[0] and span[1] > longest[1]:
            # a longer span with the same start replaces the current one
            del doc_index[longest]
            longest = span
        elif span[0] > longest[0]:
            # Starting at or before the end of the current span means the new
            # span is nested in it or partially overlaps it; ending at or
            # before it means it is contained. Either way it is dropped.
            if span[0] <= longest[1] or span[1] <= longest[1]:
                continue
            # otherwise the span lies entirely after the current one
            longest = span
        # NOTE(review): a span with the same start as `longest` but an equal
        # or shorter end, or one starting before `longest`, is still indexed
        # here — confirm that this is intended.
        doc_index[span] = (node.fld.type, node.fld.label)
    return doc_index, labels, types

In [3]:
def extract_core_nlp(core_nlp, docs):
    """Turn tab-separated annotation rows into training sentences and gold tags.

    For each document, the first row is the header and its first column is
    dropped, as is the first column of every following row. An empty row
    closes the current sentence. The 'ne' column is rewritten into
    B-/I-/E-/S- prefixed tags.

    Args:
        core_nlp: one list of row strings per document.
        docs: the documents, aligned with `core_nlp`; passed to `doc_label_idx`.

    Returns:
        A tuple ``(train, lbl_sets, gold, out_categories)``:
        train -- list of sentences, each a list of per-token feature dicts;
        lbl_sets -- defaultdict mapping 'pos' and 'ne' to the set of values seen;
        gold -- per sentence, the ``(type, label)`` pair of the gold entity
            whose span equals the token's (start, end), or "O" if none does;
        out_categories -- maps every (type, label) combination to an index
            starting at 1, and 'O' to 0.
    """
    train, gold = [], []
    lbl_sets = defaultdict(set)
    labels, types = set(), set()

    def positional_tag(raw, open_entity):
        """Return the prefixed tag for `raw` and the entity left open after it."""
        if open_entity:
            if raw == ')':
                return 'E-' + open_entity, ''
            return 'I-' + open_entity, open_entity
        if raw[-1] == ')':
            return 'S-' + raw[1:-1], ''
        # NOTE(review): a one-character value that opens nothing comes out as
        # 'B-' with no entity left open — confirm against the input format.
        opened = raw[1:]
        return 'B-' + opened, opened

    for rows, doc in zip(core_nlp, docs):
        gold_std, doc_labels, doc_types = doc_label_idx(doc)
        labels |= doc_labels
        types |= doc_types

        row_iter = iter(rows)
        header = next(row_iter).split('\t')[1:]
        sentences, spans = [[]], [[]]
        open_entity = ""
        for row in row_iter:
            if not row:
                # blank row: start a new sentence
                sentences.append([])
                spans.append([])
                continue
            token = dict(zip(header, row.split('\t')[1:]))
            token['ne'], open_entity = positional_tag(token['ne'], open_entity)
            for column in ('pos', 'ne'):
                lbl_sets[column].add(token[column])
            sentences[-1].append(token)
            spans[-1].append((token['start'], token['end']))
        # drop the empty sentence left behind by a trailing blank row
        if not sentences[-1]:
            del sentences[-1]
            del spans[-1]
        train.extend(sentences)

        # offsets arrive as strings; gold spans are keyed by integer pairs
        gold.extend([
            [gold_std.get((int(start), int(end)), "O") for start, end in sentence]
            for sentence in spans
        ])

    out_categories = {}
    for index, pair in enumerate(product(types, labels), 1):
        out_categories[pair] = index
    out_categories['O'] = 0
    return train, lbl_sets, gold, out_categories

In [4]:
def inverted(a):
    """Return a new dict with the keys and values of `a` swapped.

    When several keys share a value, the key iterated last wins.
    """
    swapped = {}
    for key, value in a.items():
        swapped[value] = key
    return swapped

def build_indices(train, gold, glove):
    """Assign an integer index to every word of the corpus and of GloVe.

    Args:
        train: list of sentences, each a list of token dicts with a "form" key.
        gold: unused; kept so existing callers keep working.
        glove: mapping whose keys are the GloVe vocabulary.

    Returns:
        A dict mapping index -> word. Indices start at 2, leaving 0 and 1
        unassigned.
    """
    wordset = {word["form"] for sentence in train for word in sentence}
    wordset.update(glove.keys())
    # Sort before numbering: set iteration order for strings changes between
    # interpreter runs (hash randomisation), which would make the indices, and
    # anything saved with them, irreproducible.
    return dict(enumerate(sorted(wordset), 2))

def build_emb_mat(glove, invind, dim=None):
    """Build the embedding matrix for the vocabulary in `invind`.

    Every row starts out uniformly random in [0, 1); rows of words present in
    `glove` are then overwritten with their GloVe vector.

    Args:
        glove: mapping word -> vector of length `dim`.
        invind: mapping word -> row index; must contain every key of `glove`.
        dim: vector width. Defaults to the module-level `glove_len`.

    Returns:
        A pair ``(mat, size)`` where `mat` has shape ``(size, dim)`` and
        ``size == len(invind) + 2``.

    Raises:
        KeyError: if `glove` contains a word missing from `invind`.
    """
    if dim is None:
        dim = glove_len
    # two extra rows: the word indices in `invind` are expected to start at 2
    size = len(invind) + 2
    mat = np.random.rand(size, dim)
    for word, vector in glove.items():
        mat[invind[word], :] = vector
    return mat, size


# Annotate the training documents with the English pipeline.
core_nlp, docs = read_and_extract(docfile, lambda docs: get_core_nlp(docs, 'en'))
# NOTE(review): this reads the same `docfile` as the line above, so the "test"
# data is the training data; no visible cell uses these two names — confirm.
core_nlp_test, docs_test = read_and_extract(docfile, lambda docs: get_core_nlp(docs, 'en'))
train, lbl_sets, gold, out_categories = extract_core_nlp(core_nlp, docs)

# *_index tables map index -> value; the *_inv tables are their inverses
# (value -> index). `out_categories` already maps category -> index, so
# `out_index` is the index -> category direction.
word_index = build_indices(train, gold, embed)
pos_index = dict(enumerate(lbl_sets['pos']))
ne_index = dict(enumerate(lbl_sets['ne']))
pos_inv = inverted(pos_index)
ne_inv = inverted(ne_index)
out_index = inverted(out_categories)
word_inv = inverted(word_index)

# The result is only displayed as the cell output; build_model() builds its
# own matrix, so this call is not needed for training.
build_emb_mat(embed, word_inv)

(array([[ 0.36081542,  0.44203668,  0.22202288, ...,  0.57384442,
          0.36780209,  0.33160592],
        [ 0.31121769,  0.10917327,  0.42184412, ...,  0.00628286,
          0.90816114,  0.00834343],
        [ 0.19557001,  0.15968999, -0.35591999, ...,  0.098046  ,
         -0.44168001, -0.17889   ],
        ...,
        [-0.28209001, -0.47914001, -0.58642   , ...,  0.67146999,
         -0.741     , -0.41417   ],
        [-0.24793001, -0.6322    ,  0.017061  , ...,  0.20935   ,
         -0.32936999, -0.26133999],
        [ 0.34773001,  0.17393   ,  0.032938  , ..., -0.14029001,
          0.29622   , -0.66772997]]), 406034)

In [5]:
from keras.preprocessing.sequence import pad_sequences

def build_sequence(l, invind, default=None):
    """Translate the items of `l` into indices using `invind`.

    Args:
        l: iterable of keys to look up.
        invind: mapping key -> index.
        default: index used for keys missing from `invind`. Any value other
            than None counts as a fallback, including 0.

    Returns:
        A list with one index per item of `l`.

    Raises:
        KeyError: if `default` is None and an item is missing from `invind`.
    """
    # `is not None` rather than truthiness, so that a fallback index of 0 is
    # honoured instead of silently switching to the strict lookup.
    if default is not None:
        return [invind.get(w, default) for w in l]
    return [invind[w] for w in l]

def conll_to_word(sentence):
    """Return the 'form' value of every token dict in `sentence`, in order."""
    forms = []
    for token in sentence:
        forms.append(token['form'])
    return forms

# Word-index rows, one per training sentence; 1 is the fallback index for a
# word missing from `word_inv`. pad_sequences brings every row to a common
# length (Keras default: zeros added in front — see the Keras docs).
x = pad_sequences([build_sequence(conll_to_word(sentence), word_inv, 1) for sentence in train])
# Matching rows of output-category indices ('O' is category 0). No fallback:
# every gold value must be a key of `out_categories`.
y = pad_sequences([build_sequence(sentence, out_categories) for sentence in gold])

In [6]:
def build_model(embed):
    """Build and compile the embedding -> bidirectional LSTM -> softmax tagger.

    Reads the module-level `word_inv`, `glove_len` and `out_categories`.

    Args:
        embed: GloVe mapping handed to `build_emb_mat`.

    Returns:
        The compiled Keras `Sequential` model. Its summary is printed as a
        side effect.
    """
    # build_emb_mat already returns the number of rows (len(word_inv) + 2),
    # so it is used directly as the embedding's input dimension.
    embedding_matrix, vocab_size = build_emb_mat(embed, word_inv)
    model = Sequential()
    model.add(Embedding(vocab_size,
                        glove_len,
                        mask_zero=True,
                        input_length=None))

    # Start from the prepared matrix but keep the embeddings trainable.
    model.layers[0].set_weights([embedding_matrix])
    model.layers[0].trainable = True

    # No input_shape here: only the first layer of a Sequential model needs
    # one, and the value previously given (vocabulary size as the feature
    # dimension) did not match the embedding output anyway.
    model.add(Bidirectional(LSTM(25, return_sequences=True)))
    # one softmax over the output categories per time step
    model.add(Dense(len(out_categories), activation='softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['acc'])
    return model

In [7]:
def make_model(x, y, embed, epochs=10, batch_size=128):
    """Build the tagger and fit it on `x` / `y`.

    Args:
        x: padded word-index matrix.
        y: padded category-index matrix; one-hot encoded here over
            ``len(out_categories)`` classes (module-level `out_categories`).
        embed: GloVe mapping forwarded to `build_model`.
        epochs: number of training epochs.
        batch_size: training batch size.

    Returns:
        The fitted model.
    """
    n_classes = len(out_categories)
    targets = to_categorical(y, num_classes=n_classes)
    network = build_model(embed)
    network.fit(x, targets, epochs=epochs, batch_size=batch_size)
    return network

# Train with the defaults of make_model (10 epochs, batch size 128).
model = make_model(x, y, embed)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         40603400  
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 50)          25200     
_________________________________________________________________
dense_1 (Dense)              (None, None, 13)          663       
Total params: 40,629,263
Trainable params: 40,629,263
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
from pickle import dump
# Persist the trained model together with the index -> word and
# index -> category tables needed to use it later.
# NOTE(review): Keras documents model.save() for persisting models; confirm
# that a plain pickle of `model` round-trips with the Keras version in use.
with Path('../model.pickle').open('w+b') as f:
    dump((model,word_index,out_index), f)