In [3]:
import pandas as pd 
import numpy as np 
from data_reader import *
from evaluate_new import *
from sklearn_crfsuite import CRF 
from sklearn_crfsuite import metrics

In [6]:
import re

## Directory that holds the doc-level test data
doc_dir = "../../../data_doc/"


def get_doc_test(labels="golden_data", text="test_file"):
    """Load the doc-level test set: one label line and one text line per document.

    Both files are read from ``doc_dir``; line ``i`` of the label file and
    line ``i`` of the text file describe the same document.

    Args:
        labels: name of the label file inside ``doc_dir``.
        text: name of the document text file inside ``doc_dir``.

    Returns:
        ``(test_labels, test_doc)``: the stripped label lines and the
        normalised document lines, both in file order.

    Raises:
        AssertionError: if the two files do not have the same number of lines.
    """
    with open(doc_dir + labels, 'r') as doc_labels, open(doc_dir + text, 'r') as doc_text:
        d_labels = doc_labels.readlines()
        d_text = doc_text.readlines()
    assert len(d_labels) == len(d_text), "Mismatch"

    test_labels = [line.strip() for line in d_labels]

    test_doc = []
    for line in d_text:
        # Every digit is replaced by '0'.
        normalised = re.sub(r'\d', '0', line.strip())
        # Joins words split by "- ".
        # NOTE(review): the pattern also consumes the character right before
        # the hyphen ("exam- ple" -> "exaple"); presumably this mirrors how the
        # training data was preprocessed -- confirm before changing it.
        normalised = re.sub(r'[^ ]- ', '', normalised)
        test_doc.append(normalised)
    return test_labels, test_doc
            

In [7]:
# Load the doc-level label lines (y) and the normalised document texts (x)
# using get_doc_test's default file names.
doc_test_y, doc_test_x = get_doc_test()

In [8]:
# convert doc data to (text, label)
def read_doc(doc, labels):
    """Turn one document and its label line into per-token ``(token, label)`` pairs.

    Args:
        doc: whitespace-separated document text.
        labels: ``|``-separated segments, each holding at least three
            whitespace-separated integers ``start end flag``. Tokens
            ``start`` through ``end`` (inclusive) are marked ``"1"`` when
            ``flag`` is non-zero.

    Returns:
        A list with one ``(token, label)`` tuple per token, where ``label``
        is the string ``"0"`` or ``"1"``.

    Raises:
        ValueError: if a field in ``labels`` is not an integer.
        IndexError: if a non-blank segment has fewer than three fields.
    """
    tokens = doc.strip().split()
    flags = [0] * len(tokens)

    for segment in labels.strip().split('|'):
        fields = [int(field) for field in segment.split()]
        if not fields:
            # A blank segment (empty label line, stray '|') annotates nothing.
            continue
        start, end, mention = fields[0], fields[1], fields[2]
        if mention != 0:
            flags[start:end + 1] = [1] * (end + 1 - start)

    # A span running past the last token can lengthen ``flags``; zip keeps
    # exactly one label per token.
    return [(token, str(flag)) for token, flag in zip(tokens, flags)]

In [9]:
# Expand every test document into its per-token (token, label) pairs by
# matching doc_test_x[d] with the label line doc_test_y[d].
doc_tests = [read_doc(doc_test_x[d], doc_test_y[d]) for d in range(len(doc_test_x))]

In [17]:
# predict one doc
def doc_pred(model, doc, MAXLEN):
    """Predict labels for one document by feeding it to ``model`` in windows.

    The document is cut into consecutive slices of at most ``MAXLEN`` items,
    all slices are passed to ``model.predict`` in a single call, and the
    per-slice predictions are concatenated back into one flat list.
    """
    windows = [doc[offset:offset + MAXLEN] for offset in range(0, len(doc), MAXLEN)]

    flat_labels = []
    for window_labels in model.predict(windows):
        flat_labels.extend(window_labels)
    return flat_labels

In [10]:
def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: sequence of items whose first element is the token string.
        i: index of the token to describe.

    Returns:
        A dict with the token's own surface features, the same features for
        the previous and next token when they exist, and ``'BOS'`` / ``'EOS'``
        flags at the sequence boundaries.
    """
    word = sent[i][0]

    ## you may add more features
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit()
    }

    if i > 0:
        prev_word = sent[i-1][0]
        features.update({
            '-1:word.lower()': prev_word.lower(),
            '-1:word.istitle()': prev_word.istitle(),
            '-1:word.isupper()': prev_word.isupper(),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        # Only the token text of the neighbour is used; its second element
        # (the label) is deliberately never read here.
        next_word = sent[i+1][0]
        features.update({
            '+1:word.lower()': next_word.lower(),
            '+1:word.istitle()': next_word.istitle(),
            '+1:word.isupper()': next_word.isupper(),
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    """Return the ``word2features`` dict of every position in ``sent``, in order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

## CRF takes strings as labels
def sent2labels(sent):
    """Collect the label (second element) of every ``(token, label)`` pair in ``sent``."""
    collected = []
    for _token, tag in sent:
        collected.append(tag)
    return collected

def sent2tokens(sent):
    """Collect the token (first element) of every ``(token, label)`` pair in ``sent``."""
    collected = []
    for word, _tag in sent:
        collected.append(word)
    return collected



In [20]:
def _format_mention_spans(flags):
    """Render each maximal run of 1s in ``flags`` as "start end", joined by " | ".

    Returns "0 0" when ``flags`` contains no 1. Any value other than 1 is
    treated as "not a mention", so unexpected labels cannot stall the scan.
    """
    spans = []
    run_start = None
    for idx, flag in enumerate(flags):
        if flag == 1:
            if run_start is None:
                run_start = idx
        elif run_start is not None:
            spans.append((run_start, idx - 1))
            run_start = None
    if run_start is not None:
        # The document ends inside a mention.
        spans.append((run_start, len(flags) - 1))

    if not spans:
        return "0 0"
    return " | ".join(str(first) + ' ' + str(last) for first, last in spans)


def run(train_dir, doc_out_dir, gold_dir, maxlen=30):
    """Train a CRF on ``train_dir``, predict the doc-level test set and print scores.

    The test documents come from the module-level ``doc_tests`` list, so the
    cells that build it must have been executed first.

    Args:
        train_dir: path passed to ``get_sents`` to load the training
            sentences (``get_sents`` is presumably provided by the
            ``data_reader`` star import -- confirm).
        doc_out_dir: path of the file the predicted spans are written to,
            one line per document.
        gold_dir: path of the gold file handed to the evaluation functions.
        maxlen: size of the windows each document is cut into for
            prediction; defaults to the previously hard-coded 30.
    """
    train_sents = get_sents(train_dir)

    ## labels are strings
    X_train = [sent2features(s) for s in train_sents]
    Y_train = [sent2labels(s) for s in train_sents]
    # Features are computed over the whole document; doc_pred only windows
    # the already-built feature sequence.
    X_doc_test = [sent2features(s) for s in doc_tests]

    crf = CRF(algorithm='lbfgs',
              c1=0.1,
              c2=0.1,
              max_iterations=100,
              all_possible_transitions=False)

    crf.fit(X_train, Y_train)

    doc_preds = [doc_pred(crf, d, maxlen) for d in X_doc_test]
    doc_preds = [[int(a) for a in x] for x in doc_preds]

    ## record the predicted start and end index of every mention, one line
    ## per document ("0 0" if the document has no mention)
    with open(doc_out_dir, 'w') as fout:
        for flags in doc_preds:
            fout.write(_format_mention_spans(flags) + '\n')

    print ('doc exact: ', doc_exact_match(doc_out_dir, gold_dir))
    print ('doc partial: ', doc_partial_match(doc_out_dir, gold_dir))
#     print ('fragment exact: ', discovery_exact_match(doc_out_dir, gold_dir))
#     print ('fragment exact: ', discovery_partial_match(doc_out_dir, gold_dir))


In [19]:
# Train on the "0.1%neg" training file and evaluate on the doc-level gold data.
# NOTE(review): the output recorded for this cell is four unlabeled numbers,
# while run() prints two labeled lines ('doc exact: ', 'doc partial: ');
# presumably it was produced by an earlier version of run -- re-run to refresh.
run(train_dir="../../../data_30_0.1%neg/train.txt", doc_out_dir='../../../doc_outputs_30/CRF_0.1_preds', 
    gold_dir='../../../data_doc_30/golden_data')

0.0026109660574412533
0.004351610095735422
0.0017528483786152498
0.0017528483786152498


In [21]:
# Train on the "1%neg" training file and evaluate on the doc-level gold data.
run(train_dir="../../../data_30_1%neg/train.txt", doc_out_dir='../../../doc_outputs_30/CRF_1_preds', 
    gold_dir='../../../data_doc_30/golden_data')

doc exact:  0.010291595197255575
doc partial:  0.012006861063464836


In [22]:
# Train on the "10%neg" training file and evaluate on the doc-level gold data.
run(train_dir="../../../data_30_10%neg/train.txt", doc_out_dir='../../../doc_outputs_30/CRF_10_preds', 
    gold_dir='../../../data_doc_30/golden_data')

doc exact:  0.08270676691729323
doc partial:  0.08270676691729323
