In [2]:
import pandas as pd 
import numpy as np 
from data_reader import *
from evaluate_new import *
from nltk.tokenize import  SpaceTokenizer
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras.callbacks import EarlyStopping
from sklearn import metrics
import re
import os

In [65]:
threshold = 0.4
## prepare data for doc evaluation
def get_doc_test(gold, text):
    ## gold: gold data
    ## text: full text file
    test_labels = []
    test_doc = []
    with open(doc_dir+gold, 'r') as doc_labels, open(doc_dir+text, 'r') as doc_text:
        d_labels = doc_labels.readlines()
        d_text = doc_text.readlines()
        assert len(d_labels) == len(d_text), "Mismatch"
        for i in range(len(d_labels)):
            ## label: start_id end_id data_id pub_id
            test_labels.append(d_labels[i].strip())
            
            text = d_text[i].strip()
            text = re.sub('\d', '0', text)
            text = re.sub('[^ ]- ', '', text)
            
            test_doc.append(text)
    return test_labels, test_doc

## convert one doc data to (text, label) format
def read_doc(doc, labels):
    doc = doc.strip().split()
    labels = labels.strip().split('|')
    labels = [la.split() for la in labels]
    for i in range(len(labels)):
        for j in range(len(labels[i])):
            labels[i][j] = int(labels[i][j])

    res_labels = [0]*len(doc)
    for la in labels:
        if la[2]!=0:
            start = la[0]
            end = la[1]
            res_labels[start : end+1] = [1]*(end+1-start)
    return [(doc[i], str(res_labels[i])) for i in range(len(doc))]

## make prediction of one doc
## split into segments first then combine results
def doc_pred(model, doc, tokenizer, MAXLEN=40):
    splits = []
    for i in range(0, len(doc), MAXLEN):
        splits.append(doc[i : i+MAXLEN])
    splits = tokenizer.texts_to_sequences(splits)
    splits = pad_sequences(splits, maxlen=MAXLEN)
    preds = model.predict(splits)
    preds = np.squeeze(preds)
    preds = [1 if p>=threshold else 0 for pd in preds for p in pd]
    return preds


def sent2labels(sent):
    return [int(label) for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]


def prep_data(neg_ratio=0, val_ratio=0.05, data_dir='../../data/data_40/', maxlen=40, emb_dim=300):
    train_sents, val_sents = data_sampler(neg_ratio, val_ratio, data_dir)

    train_sents = get_sents(train_sents)
    val_sents = get_sents(val_sents)

    X_train = [sent2tokens(s) for s in train_sents]
    Y_train = [sent2labels(s) for s in train_sents]

    X_val = [sent2tokens(s) for s in val_sents]
    Y_val = [sent2labels(s) for s in val_sents]
    
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)
    word_index = tokenizer.word_index

    vocab_size = len(word_index)+1
    print ("Vocab size: ", vocab_size)

    all_embs = np.stack(embedding_index.values())
    emb_mean, emb_std = np.mean(all_embs), np.std(all_embs)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (vocab_size, emb_dim))
    counter = 0
    # embedding_matrix = np.zeros((vocab_size, emb_dim))
    for word, i in word_index.items():
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            counter += 1
        else:
            embedding_matrix[i] = np.random.randn(emb_dim)
    print ("{}/{} words covered in glove".format(counter, vocab_size))

    X_train = tokenizer.texts_to_sequences(X_train)
    X_val = tokenizer.texts_to_sequences(X_val)

    X_train = pad_sequences(X_train, maxlen=maxlen)
    X_val = pad_sequences(X_val, maxlen=maxlen)

    Y_train = np.asarray(Y_train)
    Y_val = np.asarray(Y_val)

    #labels need to be 3D
    Y_train = np.expand_dims(Y_train, axis=2)
    Y_val = np.expand_dims(Y_val, axis=2)

    return X_train, Y_train, X_val, Y_val, embedding_matrix, vocab_size, tokenizer


def run(X_train, Y_train, X_val, Y_val, embedding_matrix, vocab_size, maxlen=40, emb_dim=300, neg_ratio=0, hidden_dim=300, drop=0.2, r_drop=0.1):
    ##build model
    input = Input(shape=(maxlen,))
    model = Embedding(vocab_size, emb_dim, weights=[embedding_matrix], input_length=maxlen, trainable=False)(input)
    model = Dropout(drop)(model)
    model = Bidirectional(LSTM(hidden_dim, return_sequences=True, recurrent_dropout=r_drop))(model)
    model = Dropout(drop)(model)
    out = TimeDistributed(Dense(1, activation='sigmoid'))(model)

    model = Model(input, out)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    earlyStop = [EarlyStopping(monitor='val_loss', patience=1)]
    history = model.fit(X_train, Y_train, batch_size=64, epochs=10, validation_data=(X_val, Y_val), 
        callbacks=earlyStop) 


    pred = model.predict(X_val)
    Y_pred = np.squeeze(pred)
    test = [[1 if y>=threshold else 0 for y in x] for x in Y_pred]
    test_arr = np.asarray(test)
    test_arr = np.reshape(test_arr, (-1))
    target = np.reshape(Y_val, (-1))

    print (metrics.precision_recall_fscore_support(target, test_arr, average=None,
                                              labels=[0, 1]))

    
#     Y_pred_ = [[1 if y>=threshold else 0 for y in x] for x in Y_pred]
    Y_val_ = np.squeeze(Y_val)

    print ("Evaluate: dev seg exact")
    pred_out_dir = out_dir+'seg_'+str(neg_ratio)+'neg'
    gold_dir = '../../data/val_segs/'+'seg_'+str(neg_ratio)+'neg'
    p, r, f = seg_exact_match(test, Y_val_, pred_out_dir, gold_dir)
    
    return model, history, p, r, f


def doc_eval(model, tokenizer, doc_test, doc_out_dir, gold_dir, MAXLEN=40):
    doc_preds = [doc_pred(model, d, tokenizer, MAXLEN) for d in doc_test]
    doc_preds = [[int(a) for a in x] for x in doc_preds]

    with open(doc_out_dir, 'w') as fout:
        for i in range(len(doc_preds)):
            first = 0
            j = 0
            string = ''
            no_mention = True
            while j<len(doc_preds[i]):
                while j<len(doc_preds[i]) and doc_preds[i][j]== 0:
                    j+=1
                if j<len(doc_preds[i]) and doc_preds[i][j] == 1:
                    no_mention=False
                    start = j
                    while j+1<len(doc_preds[i]) and doc_preds[i][j+1]==1:
                        j+=1
                    end = j 
                    if first > 0:
                        string += " | "
                    string += (str(start)+' '+str(end))
                    j+=1
                    first += 1
            if no_mention:
                fout.write("-1 -1"+'\n')
            else:
                fout.write(string+'\n')
    print ('evaluating data from: ', doc_out_dir)
    print ('doc exact: ', doc_exact_match(doc_out_dir, gold_dir))
    print ('doc partial: ', doc_partial_match(doc_out_dir, gold_dir))

In [16]:
##load glove
embedding_index = {}
f = open('../../glove.6B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embedding_index[word] = coefs
f.close()


In [50]:
doc_dir = "../../data/all_test_docs/"
doc_test_y, doc_test_x = get_doc_test('test_doc_gold', 'test_docs')
doc_tests = [read_doc(doc_test_x[d], doc_test_y[d]) for d in range(len(doc_test_x))]
doc_tests = [sent2tokens(s) for s in doc_tests]

zero_shot_y, zero_shot_x = get_doc_test('zero_shot_doc_gold', 'zero_shot_docs')
zero_shot_tests = [read_doc(zero_shot_x[d], zero_shot_y[d]) for d in range(len(zero_shot_x))]
zero_shot_tests = [sent2tokens(s) for s in zero_shot_tests]

MAXLEN = 40
DIR = '../../data/data_40/'
out_dir = '../../outputs/'

if not os.path.exists(out):
        os.makedirs(out)



In [51]:
neg_ratio = 0.025

In [52]:
X_train, Y_train, X_val, Y_val, embedding_matrix, vocab_size, tokenizer = prep_data(neg_ratio=neg_ratio)

26407 pos data sampled
11589 neg data sampled
Vocab size:  68498
19964/68498 words covered in glove


In [None]:
# 0neg: 0.637
#0.025neg: 68.9

In [59]:
model, history, p, r, f = run(X_train, Y_train, X_val, Y_val, embedding_matrix, vocab_size, neg_ratio=neg_ratio)


Train on 36096 samples, validate on 1900 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
(array([0.98915476, 0.69591384]), array([0.98685166, 0.73552059]), array([0.98800186, 0.71516927]), array([73013,  2987]))
Evaluate: dev seg exact
Doc exact: 
 precision: 0.318686401480111, recall: 0.3626315789473684, f1: 0.3392417528311177


In [60]:
doc_eval(model, tokenizer, doc_tests, out_dir+'doc_40_'+str(neg_ratio)+'neg', '../../data/all_test_docs/test_doc_gold')
doc_eval(model, tokenizer, zero_shot_tests, out_dir+'zeroshot_40_'+str(neg_ratio)+'neg', '../../data/all_test_docs/zero_shot_doc_gold')


evaluating data from:  ../../outputs/doc_40_0.025neg
Doc exact: 
 precision: 0.11183673469387755, recall: 0.1454352441613588, f1: 0.12644208583294878
doc exact:  (0.11183673469387755, 0.1454352441613588, 0.12644208583294878)
Doc Token wise: 
 precision: 0.2749412782791445, recall: 0.35675328841835097, f1: 0.3105494658940166
doc partial:  (0.2749412782791445, 0.35675328841835097, 0.3105494658940166)
evaluating data from:  ../../outputs/zeroshot_40_0.025neg
Doc exact: 
 precision: 0.06515264333581534, recall: 0.07536606373815675, f1: 0.06988817891373801
doc exact:  (0.06515264333581534, 0.07536606373815675, 0.06988817891373801)
Doc Token wise: 
 precision: 0.23661183059152957, recall: 0.23126924392747178, f1: 0.23391003460207613
doc partial:  (0.23661183059152957, 0.23126924392747178, 0.23391003460207613)


In [63]:
for hidden_dim in [100, 200, 300]:
    for drop in [0.1, 0.2, 0.3]:
        print ('hidden dim:', hidden_dim)
        print ('drop:', drop)
        model, history, p, r, f = run(X_train, Y_train, X_val, Y_val, embedding_matrix, vocab_size, neg_ratio=neg_ratio, hidden_dim=hidden_dim, drop=drop)
        doc_eval(model, tokenizer, doc_tests, out_dir+'doc_40_'+str(neg_ratio)+'neg', '../../data/all_test_docs/test_doc_gold')
        doc_eval(model, tokenizer, zero_shot_tests, out_dir+'zeroshot_40_'+str(neg_ratio)+'neg', '../../data/all_test_docs/zero_shot_doc_gold')



hidden dim: 100
drop: 0.1
Train on 36096 samples, validate on 1900 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
(array([0.98952836, 0.68842822]), array([0.98620794, 0.74489454]), array([0.98786536, 0.71554912]), array([73013,  2987]))
Evaluate: dev seg exact
Doc exact: 
 precision: 0.31934731934731936, recall: 0.3605263157894737, f1: 0.33868974042027195
evaluating data from:  ../../outputs/doc_40_0.025neg
Doc exact: 
 precision: 0.11833658945018141, recall: 0.15003538570417552, f1: 0.13231393353097204
doc exact:  (0.11833658945018141, 0.15003538570417552, 0.13231393353097204)
Doc Token wise: 
 precision: 0.2958212113197567, recall: 0.35883862688482515, f1: 0.32429689765149317
doc partial:  (0.2958212113197567, 0.35883862688482515, 0.32429689765149317)
evaluating data from:  ../../outputs/zeroshot_40_0.025neg
Doc exact: 
 precision: 0.06567717996289425, recall: 0.07622739018087855, f1: 0.070560095674706
doc exact:  (0.06567717996289425, 0.07622739018087855, 0.070560095

Train on 36096 samples, validate on 1900 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
(array([0.98977921, 0.67897126]), array([0.98546834, 0.75125544]), array([0.98761907, 0.71328671]), array([73013,  2987]))
Evaluate: dev seg exact
Doc exact: 
 precision: 0.3185799907791609, recall: 0.3636842105263158, f1: 0.339641189481445
evaluating data from:  ../../outputs/doc_40_0.025neg
Doc exact: 
 precision: 0.09102960671674767, recall: 0.14578910120311395, f1: 0.11207834602829163
doc exact:  (0.09102960671674767, 0.14578910120311395, 0.11207834602829163)
Doc Token wise: 
 precision: 0.2360191504532953, recall: 0.3716714789862047, f1: 0.2887047535979067
doc partial:  (0.2360191504532953, 0.3716714789862047, 0.2887047535979067)
evaluating data from:  ../../outputs/zeroshot_40_0.025neg
Doc exact: 
 precision: 0.04631578947368421, recall: 0.07579672695951765, f1: 0.05749754982032016
doc exact:  (0.04631578947368421, 0.07579672695951765, 0.05749754982032016)
Doc Token wise: 
 precision: 0.1

In [None]:
## use hidden dim 100, dropout 0.1

In [66]:
hidden_dim = 50
drop = 0.1
r_drop=0.0
model, history, p, r, f = run(X_train, Y_train, X_val, Y_val, embedding_matrix, vocab_size, neg_ratio=neg_ratio, hidden_dim=hidden_dim, drop=drop, r_drop=r_drop)
doc_eval(model, tokenizer, doc_tests, out_dir+'doc_40_'+str(neg_ratio)+'neg', '../../data/all_test_docs/test_doc_gold')
doc_eval(model, tokenizer, zero_shot_tests, out_dir+'zeroshot_40_'+str(neg_ratio)+'neg', '../../data/all_test_docs/zero_shot_doc_gold')

    
    

Train on 36096 samples, validate on 1900 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
(array([0.98926208, 0.69470699]), array([0.98672839, 0.73819886]), array([0.98799361, 0.71579289]), array([73013,  2987]))
Evaluate: dev seg exact
Doc exact: 
 precision: 0.31529850746268656, recall: 0.35578947368421054, f1: 0.33432245301681507
evaluating data from:  ../../outputs/doc_40_0.025neg
Doc exact: 
 precision: 0.09567976568220649, recall: 0.1387119603680113, f1: 0.11324570273003032
doc exact:  (0.09567976568220649, 0.1387119603680113, 0.11324570273003032)
Doc Token wise: 
 precision: 0.256416054267948, recall: 0.3638113570741097, f1: 0.3008157039591485
doc partial:  (0.256416054267948, 0.3638113570741097, 0.3008157039591485)
evaluating data from:  ../../outputs/zeroshot_40_0.025neg
Doc exact: 
 precision: 0.055522740696987594, recall: 0.08096468561584841, f1: 0.06587245970567625
doc exact:  (0.055522740696987594, 0.08096468561584841, 0.06587245970567625)
Doc Token wise: 
 precision: 0

In [67]:
hidden_dim = 100
drop = 0.1
r_drop=0.0
model, history, p, r, f = run(X_train, Y_train, X_val, Y_val, embedding_matrix, vocab_size, neg_ratio=neg_ratio, hidden_dim=hidden_dim, drop=drop, r_drop=r_drop)
doc_eval(model, tokenizer, doc_tests, out_dir+'doc_40_'+str(neg_ratio)+'neg', '../../data/all_test_docs/test_doc_gold')
doc_eval(model, tokenizer, zero_shot_tests, out_dir+'zeroshot_40_'+str(neg_ratio)+'neg', '../../data/all_test_docs/zero_shot_doc_gold')

    
    

Train on 36096 samples, validate on 1900 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
(array([0.9898223 , 0.68256379]), array([0.98568748, 0.75225979]), array([0.98775056, 0.71571906]), array([73013,  2987]))
Evaluate: dev seg exact
Doc exact: 
 precision: 0.3219871205151794, recall: 0.3684210526315789, f1: 0.34364261168384874
evaluating data from:  ../../outputs/doc_40_0.025neg
Doc exact: 
 precision: 0.09178321678321678, recall: 0.14861995753715498, f1: 0.11348284247500674
doc exact:  (0.09178321678321678, 0.14861995753715498, 0.11348284247500674)
Doc Token wise: 
 precision: 0.23396150761828388, recall: 0.37439846005774785, f1: 0.2879703886489821
doc partial:  (0.23396150761828388, 0.37439846005774785, 0.2879703886489821)
evaluating data from:  ../../outputs/zeroshot_40_0.025neg
Doc exact: 
 precision: 0.04813664596273292, recall: 0.08010335917312661, f1: 0.060135790494665366
doc exact:  (0.04813664596273292, 0.08010335917312661, 0.060135790494665366)
Doc Token wise: 
 precis

In [68]:
threshold = 0.5
hidden_dim = 100
drop = 0.1
r_drop=0.0
model, history, p, r, f = run(X_train, Y_train, X_val, Y_val, embedding_matrix, vocab_size, neg_ratio=neg_ratio, hidden_dim=hidden_dim, drop=drop, r_drop=r_drop)
doc_eval(model, tokenizer, doc_tests, out_dir+'doc_40_'+str(neg_ratio)+'neg', '../../data/all_test_docs/test_doc_gold')
doc_eval(model, tokenizer, zero_shot_tests, out_dir+'zeroshot_40_'+str(neg_ratio)+'neg', '../../data/all_test_docs/zero_shot_doc_gold')

Train on 36096 samples, validate on 1900 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
(array([0.98729005, 0.72711205]), array([0.98942654, 0.68865082]), array([0.98835714, 0.70735901]), array([73013,  2987]))
Evaluate: dev seg exact
Doc exact: 
 precision: 0.2980392156862745, recall: 0.32, f1: 0.3086294416243655
evaluating data from:  ../../outputs/doc_40_0.025neg
Doc exact: 
 precision: 0.11169284467713787, recall: 0.13588110403397027, f1: 0.12260536398467432
doc exact:  (0.11169284467713787, 0.13588110403397027, 0.12260536398467432)
Doc Token wise: 
 precision: 0.3036953136061164, recall: 0.3440808469682387, f1: 0.3226291644731895
doc partial:  (0.3036953136061164, 0.3440808469682387, 0.3226291644731895)
evaluating data from:  ../../outputs/zeroshot_40_0.025neg
Doc exact: 
 precision: 0.06393380970289582, recall: 0.07321274763135228, f1: 0.068259385665529
doc exact:  (0.06393380970289582, 0.07321274763135228, 0.068259385665529)
Doc Token wise: 
 precision: 0.2386012

In [69]:
threshold = 0.5
hidden_dim = 80
drop = 0.1
r_drop=0.0
model, history, p, r, f = run(X_train, Y_train, X_val, Y_val, embedding_matrix, vocab_size, neg_ratio=neg_ratio, hidden_dim=hidden_dim, drop=drop, r_drop=r_drop)
doc_eval(model, tokenizer, doc_tests, out_dir+'doc_40_'+str(neg_ratio)+'neg', '../../data/all_test_docs/test_doc_gold')
doc_eval(model, tokenizer, zero_shot_tests, out_dir+'zeroshot_40_'+str(neg_ratio)+'neg', '../../data/all_test_docs/zero_shot_doc_gold')

Train on 36096 samples, validate on 1900 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
(array([0.98707474, 0.72633452]), array([0.98946763, 0.68329428]), array([0.98826973, 0.70415732]), array([73013,  2987]))
Evaluate: dev seg exact
Doc exact: 
 precision: 0.3002008032128514, recall: 0.31473684210526315, f1: 0.30729701952723537
evaluating data from:  ../../outputs/doc_40_0.025neg
Doc exact: 
 precision: 0.11581920903954802, recall: 0.1305732484076433, f1: 0.12275449101796407
doc exact:  (0.11581920903954802, 0.1305732484076433, 0.12275449101796407)
Doc Token wise: 
 precision: 0.2966831333803811, recall: 0.3371831889637472, f1: 0.31563931226068026
doc partial:  (0.2966831333803811, 0.3371831889637472, 0.31563931226068026)
evaluating data from:  ../../outputs/zeroshot_40_0.025neg
Doc exact: 
 precision: 0.06412478336221837, recall: 0.06373815676141258, f1: 0.06393088552915767
doc exact:  (0.06412478336221837, 0.06373815676141258, 0.06393088552915767)
Doc Token wise: 
 

In [1]:
threshold = 0.5
hidden_dim = 300
drop = 0.3
r_drop=0.1
model, history, p, r, f = run(X_train, Y_train, X_val, Y_val, embedding_matrix, vocab_size, neg_ratio=neg_ratio, hidden_dim=hidden_dim, drop=drop, r_drop=r_drop)
doc_eval(model, tokenizer, doc_tests, out_dir+'doc_40_'+str(neg_ratio)+'neg', '../../data/all_test_docs/test_doc_gold')
doc_eval(model, tokenizer, zero_shot_tests, out_dir+'zeroshot_40_'+str(neg_ratio)+'neg', '../../data/all_test_docs/zero_shot_doc_gold')

1