In [None]:
# This code is based on the treelstm code
# https://github.com/tensorflow/fold/blob/master/tensorflow_fold/g3doc/sentiment.ipynb
import codecs
import os
import zipfile
import time
import datetime
from gensim import models
from VocabProcessor import VocabProcessor
import re
import itertools
from collections import Counter
from sklearn import metrics
from nltk.tokenize import sexpr
import numpy as np
from six.moves import urllib
import tensorflow as tf
sess = tf.InteractiveSession()
import tensorflow_fold as td
import pickle
import sys
sys.setrecursionlimit(2000)

# Model Hyperparameters
# ==================================================
# LEARNING_RATE and KEEP_PROB are fractional values, so they are declared as
# float flags: an integer flag cannot parse a command-line override such as
# --KEEP_PROB=0.5.
tf.flags.DEFINE_integer("embedding_dim", 200, "Dimensionality of the embedding (default: 200)")
tf.flags.DEFINE_float("LEARNING_RATE", 0.0008, "Learning rate of the optimizer (default: 0.0008)")
tf.flags.DEFINE_float("KEEP_PROB", 0.75, "Dropout keep probability used during training (default: 0.75)")
tf.flags.DEFINE_integer("BATCH_SIZE", 100, "Number of training examples per batch (default: 100)")
tf.flags.DEFINE_integer("EPOCHS", 100, "Number of training epochs (default: 100)")
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()

# pre-trained word embedding is trained with separate words.
# Original code was taken & modified from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
# ==================================================
def clean_str(string):
    """Split common English clitics off their host word and lower-case the text.

    A space is inserted in front of each occurrence of 's, 've, n't, 're, 'd
    and 'll (in that order). Afterwards every non-overlapping pair of spaces
    is replaced by one space; this is a single pass, so runs of three or more
    spaces are shortened but not fully collapsed.

    Args:
        string: the text to normalize.

    Returns:
        The normalized, lower-cased text.
    """
    # (pattern, replacement) pairs, applied one after another.
    rewrites = (
        (r"\'s", " \'s"),
        (r"\'ve", " \'ve"),
        (r"n\'t", " n\'t"),
        (r"\'re", " \'re"),
        (r"\'d", " \'d"),
        (r"\'ll", " \'ll"),
        (r"  ", " "),
    )
    for pattern, replacement in rewrites:
        string = re.sub(pattern, replacement, string)
    return string.lower()

def separateFeatures(string):
    """Yield the two tree-related columns of every tab-separated record.

    Each line is normalized with clean_str() and then split on tabs once.

    Column layout of a record (by index): 0 pid, 1 sentence, 2 ddiCheck,
    3 ddiType, 4 drug1, 5 drug1Name, 6 drug1Type, 7 drug2, 8 drug2Name,
    9 drug2Type, 10 binary parsed tree, 11 parsed whole sentence.
    Only columns 10 and 11 are used.

    Args:
        string: iterable of raw record lines (despite the name, not a single
            string).

    Yields:
        (binaryParsedTree, parsedWholeSen): column 10 with surrounding
        whitespace stripped, and column 11 exactly as split (not stripped).

    Raises:
        IndexError: if a line has fewer than 12 tab-separated columns.
    """
    for line in string:
        # Split once and index the result instead of re-splitting per column.
        columns = clean_str(line).split("\t")
        binaryParsedTree = columns[10].strip()
        parsedWholeSen = columns[11]

        yield binaryParsedTree, parsedWholeSen

# Loads the DDI challenge '13 data from files.
# The data must already be preprocessed.
# ==================================================
def load_data_and_labels(string):
    """Read a preprocessed data file and return its tree features.

    Args:
        string: path of the tab-separated data file (despite the name, a
            path, not file content).

    Returns:
        A list with one (binaryParsedTree, parsedWholeSen) tuple per line, as
        produced by separateFeatures().
    """
    # The context manager closes the file even if parsing a line fails.
    with open(string, "r") as data_file:
        samples = data_file.readlines()
        # separateFeatures() is a generator; materialize it before returning.
        return list(separateFeatures(samples))

# Directory used for the model checkpoint files.
data_dir = "./runs/testModel"
print('saving model files to %s' % data_dir)
# Each entry is a (binary parsed tree, parsed whole sentence) tuple.
TrainFeatures = load_data_and_labels("data/DDItrain_recur")
TestFeatures = load_data_and_labels("data/DDItest_recur")

# Keep only the tree column of every entry.
train_trees = [Tf[0] for Tf in TrainFeatures]
test_trees = [Tf[0] for Tf in TestFeatures]

# Shuffle the training trees; the fixed seed makes the permutation reproducible.
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(TrainFeatures)))
# Indexing with the permutation turns train_trees into a numpy array.
train_trees = np.array(train_trees)[shuffle_indices]

# Report the number of training and test trees.
print(len(train_trees))
print(len(test_trees))

# Build the vocabulary from the parsed sentences of both the train and test set.
allSens1 = [Tf[1] for Tf in TrainFeatures] + [Tf[1] for Tf in TestFeatures]
splitted1 = [sentence.split(", ") for sentence in allSens1]
# Longest ", "-separated token list over both sets; passed to VocabProcessor as
# its first argument.
max_document_length1 = max(len(s) for s in splitted1)
vocab_proc1 = VocabProcessor(max_document_length1, tokenizer_fn="splitComma")
# The transformed arrays are discarded; presumably these calls are made only to
# populate vocab_proc1's vocabulary.
# NOTE(review): fit_transform is called once per split -- confirm that the
# second call extends the vocabulary rather than being ignored or resetting it.
np.array(list(vocab_proc1.fit_transform([Tf[1] for Tf in TrainFeatures])))
np.array(list(vocab_proc1.fit_transform([Tf[1] for Tf in TestFeatures])))
vocab_proc1.vocabulary_.freeze()

# The original word embedding is too large and makes training too slow, so
# only the words that appear in the DDI'13 corpus are used.
# The selected word-embedding entries were saved beforehand with pickle.
# NOTE: pickle.load can execute arbitrary code; only load a file that was
# produced by a trusted source.
# The context manager closes the file even if unpickling fails.
with open('shorten_ensemble/pubpmc.pickle', 'rb') as ft1:
    embedding_for_given_index1 = pickle.load(ft1)

print(len(embedding_for_given_index1))

# Aliases used by the model definition.
weight_matrix = embedding_for_given_index1
word_idx = vocab_proc1.vocabulary_

In [None]:
#    This model is based on the model of 'Improved Semantic
#    Representations From Tree-Structured Long Short-Term Memory
#    Networks' <http://arxiv.org/pdf/1503.00075.pdf>, with recurrent
#    dropout as described in 'Recurrent Dropout without Memory Loss'
#    <http://arxiv.org/pdf/1603.05118.pdf>.   
#    Originally, this code is based on the tensorflow fold library.
#
#=====================
class BinaryTreeLSTMCell(tf.contrib.rnn.RNNCell):
    """Binary Tree-LSTM cell that also consumes three per-node feature vectors.

    Besides the node input and the states of the two children, every call
    receives a context vector and two entity vectors; all of them are
    concatenated and fed through one fully connected layer that produces the
    five gate pre-activations.

    Args:
        num_units: int, the number of units in the LSTM cell.
        keep_prob: keep probability for dropout; a Python float or a scalar
            tensor.
    """

    def __init__(self, num_units, keep_prob=1.0):
        super(BinaryTreeLSTMCell, self).__init__()
        self._keep_prob = keep_prob
        self._num_units = num_units

    @property
    def state_size(self):
        """Sizes of the (cell state, hidden state) pair kept per child."""
        return (self._num_units, self._num_units)

    @property
    def output_size(self):
        """Size of the output vector returned by a call."""
        return self._num_units

    def __call__(self, inputs, state, contextVec, ent1Vec, ent2Vec, scope=None):
        """Combine the two child states and the node features into a new state.

        Args:
            inputs: input vector of the node.
            state: pair (lhs, rhs); each element is the (cell, hidden) state
                of one child.
            contextVec: context feature vector of the node.
            ent1Vec: feature vector for the first entity.
            ent2Vec: feature vector for the second entity.
            scope: optional variable scope; defaults to the class name.

        Returns:
            (resultH, [new_c0, new_h0]): the hidden state after dropout, and
            the new cell/hidden state pair without that final dropout.
        """
        with tf.variable_scope(scope or type(self).__name__):
            inputs = tf.nn.dropout(inputs, self._keep_prob)
            lhs, rhs = state
            c_0, h_0 = lhs  # cell and hidden states from left child
            c_1, h_1 = rhs  # cell and hidden states from right child

            # NOTE(review): fully_connected applies a ReLU activation by
            # default, whereas the TF Fold reference cell uses a linear layer
            # here -- confirm this is intended before changing it, since it
            # affects already trained checkpoints.
            concat0 = tf.contrib.layers.fully_connected(
              tf.concat([contextVec, ent1Vec, ent2Vec, inputs, h_0, h_1], 1), 5 * self._num_units, trainable=True)

            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            i_0, j_0, f_00, f_01, o_0 = tf.split(value=concat0, num_or_size_splits=5, axis=1)
            j_0 = tf.tanh(j_0)
            # Dropout on the candidate values. It is skipped only when
            # keep_prob is a Python float >= 1; a tensor keep_prob always adds
            # the dropout op.
            if not isinstance(self._keep_prob, float) or self._keep_prob < 1:
                j_0 = tf.nn.dropout(j_0, self._keep_prob)

            # One forget gate per child; 1.0 is added to each forget gate
            # pre-activation before the sigmoid.
            new_c0 = (c_0 * tf.sigmoid(f_00 + 1.0) +
                      c_1 * tf.sigmoid(f_01 + 1.0) +
                      tf.sigmoid(i_0) * j_0)
            new_h0 = tf.tanh(new_c0) * tf.sigmoid(o_0)

            resultH = tf.concat([new_h0], 1)
            resultH = tf.nn.dropout(resultH, self._keep_prob)
            return resultH, [new_c0, new_h0]
# Dropout keep probability, with a default of 1 (for eval).
# The placeholder is created exactly once, so the graph holds no orphaned copy.
keep_prob_ph = tf.placeholder_with_default(1.0, [])

lstm_num_units = 128   # Tai et al. used 150
tree_lstm = td.ScopedLayer(
    BinaryTreeLSTMCell(lstm_num_units, keep_prob=keep_prob_ph),
    name_or_scope='tree_lstm')
NUM_CLASSES = 2  # number of classes
# Fully connected output layer without activation; it produces the class logits.
output_layer = td.FC(NUM_CLASSES, activation=None, name='output_layer')

# The pre-trained word embedding is kept frozen (trainable=False) for the
# DDI'13 corpus: fine-tuning it raises an overfitting issue.
word_embedding = td.Embedding(
    *weight_matrix.shape, initializer=weight_matrix, name='word_embedding', trainable = False)

# Forward declaration that makes the recursive (tree-shaped) model possible.
embed_subtree = td.ForwardDeclaration(name='embed_subtree')

#convert subtree containment (context) feature to vector
def makeContextMat(input1):
    """Encode the subtree containment (context) feature as a 10-element list.

    Args:
        input1: the feature value; anything accepted by int().

    Returns:
        A new list of ten 1s when the value equals 0, otherwise ten 0s.
    """
    fill = 1 if int(input1) == 0 else 0
    return [fill] * 10

#convert position feature to vector
def makeEntPositMat(givenInput):
    """Encode a position feature value as a fixed 10-element 0/1 list.

    The value selects one row of a 19-row lookup table:
      * rows 0-8:  a leading 1, then `row` zeros, then ones up to the end;
      * rows 9-18: zeros followed by `row - 9` trailing ones.

    Args:
        givenInput: the row index; anything accepted by int(). Negative
            values index from the end of the table, as with any Python list.

    Returns:
        The selected row (a list of ten ints).

    Raises:
        IndexError: if the index lies outside the 19-row table.
    """
    width = 10
    leading_one_rows = [
        [1] + [0] * gap + [1] * (width - 1 - gap) for gap in range(width - 1)
    ]
    trailing_one_rows = [
        [0] * (width - ones) + [1] * ones for ones in range(width)
    ]
    table = leading_one_rows + trailing_one_rows
    return table[int(givenInput)]

def logits_and_state():
    """Creates a block that goes from tokens to (logits, state) tuples.

    The block input is a 4-tuple: item 0 is the token list of the node,
    item 1 the context feature, items 2 and 3 the two entity position
    features. A token list of length 1 is handled as a leaf (word), one of
    length 2 as an internal node with two subtrees.

    Returns:
        A block whose output is (logits, state): the tree-LSTM output passed
        through the output layer, and the tree-LSTM state passed on unchanged.
    """
    # NOTE(review): no fallback index is passed to get(), so how
    # out-of-vocabulary words are mapped depends entirely on word_idx.get()
    # -- confirm against the vocabulary class.
    lookup_word = lambda word: word_idx.get(word)
    word2vec = (td.GetItem(0) >> td.GetItem(0) >> td.InputTransform(lookup_word) >>
                td.Scalar('int32') >> word_embedding)#<td.Pipe>: None -> TensorType((200,), 'float32')

    # Separate block instances of each feature for the leaf case (suffix 1)
    # and the internal-node case (suffix 2).
    context2vec1 = td.GetItem(1) >> td.InputTransform(makeContextMat) >> td.Vector(10)
    context2vec2 = td.GetItem(1) >> td.InputTransform(makeContextMat) >> td.Vector(10)
    ent1posit1 = td.GetItem(2) >> td.InputTransform(makeEntPositMat) >> td.Vector(10)
    ent1posit2 = td.GetItem(2) >> td.InputTransform(makeEntPositMat) >> td.Vector(10)
    ent2posit1 = td.GetItem(3) >> td.InputTransform(makeEntPositMat) >> td.Vector(10)
    ent2posit2 = td.GetItem(3) >> td.InputTransform(makeEntPositMat) >> td.Vector(10)

    pair2vec = td.GetItem(0) >> (embed_subtree(), embed_subtree())
    # Trees are binary, so the tree layer takes two states as its input_state.
    zero_state = td.Zeros((tree_lstm.state_size,) * 2)
    # Zero-initialized input vector for internal nodes, which have no word.
    zero_inp = td.Zeros(word_embedding.output_type.shape[0])#word_embedding.output_type.shape[0] == 200

    # Leaf: embedded word + zero child states. Internal node: zero input +
    # the states of the two embedded subtrees.
    word_case = td.AllOf(word2vec, zero_state, context2vec1, ent1posit1, ent2posit1)
    pair_case = td.AllOf(zero_inp, pair2vec, context2vec2, ent1posit2, ent2posit2)
    # Dispatch on the length of the token list: 1 -> word case, 2 -> pair case.
    tree2vec = td.OneOf(lambda pair: len(pair[0]), [(1, word_case), (2, pair_case)])

    return tree2vec >> tree_lstm >> (output_layer, td.Identity())#logits and lstm result states

#Define a per-node loss function for training.
def tf_node_loss(logits, labels):
    """Per-node weighted sparse softmax cross-entropy loss.

    Examples whose ground-truth class is `1` get weight 3 (2 * 1 + 1), all
    other examples get weight 1, so a positive instance counts three times
    as much as a negative one.

    When test data do not have labels, give 0 to all test labels and compute
    the results from the logits.

    Args:
        logits: unnormalized class scores.
        labels: integer class labels.

    Returns:
        The cross-entropy loss of every example multiplied by its weight.
    """
    is_positive = tf.cast(tf.equal(labels, 1), tf.float32)
    class_weight = tf.multiply(2.0, is_positive) + 1
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
    return cross_entropy * class_weight

def tf_hits(logits, labels):
    """Return 1.0 where the arg-max class of `logits` equals `labels`, else 0.0."""
    predicted_class = tf.argmax(logits, 1)
    predicted_class = tf.cast(predicted_class, tf.int32)
    is_hit = tf.equal(predicted_class, labels)
    return tf.cast(is_hit, tf.float32)
def tf_pred(logits):
    """Return the arg-max class along axis 1 of `logits` as int32."""
    best_class = tf.argmax(logits, 1)
    return tf.cast(best_class, tf.int32)
def tf_logits(logits):
    """Identity function: return `logits` unchanged."""
    return logits
def tf_label(labels):
    """Identity function: return `labels` unchanged."""
    return labels

def add_metrics(is_root):
    """Build the block that records metrics for a node and passes its state on.

    The block input is a pair: item 0 is the label, item 1 is itself a pair
    of (logits, state).

    Args:
        is_root: when true, every metric is additionally recorded under its
            `root_*` name.

    Returns:
        A td.Composition whose output is the state element of its input.
    """
    c = td.Composition(
      name='predict(is_root=%s)' % (is_root))
    with c.scope():
        labels = c.input[0]
        logits = td.GetItem(0).reads(c.input[1])        
        state = td.GetItem(1).reads(c.input[1])

        # Each Function block below is wired to its inputs once; the root
        # metric then reads the output of that same, already wired block.
        loss = td.Function(tf_node_loss)
        td.Metric('all_loss').reads(loss.reads(logits, labels))#do not need the loss of all nodes
        if is_root: td.Metric('root_loss').reads(loss)

        #save logits for ensemble
        result_logits = td.Function(tf_logits)
        td.Metric('all_logits').reads(result_logits.reads(logits))
        if is_root: td.Metric('root_logits').reads(result_logits)
        #save pred and labels for validation
        pred = td.Function(tf_pred)
        td.Metric('all_pred').reads(pred.reads(logits))
        if is_root: td.Metric('root_pred').reads(pred)
        answer = td.Function(tf_label)
        td.Metric('all_labels').reads(answer.reads(labels))
        if is_root: td.Metric('root_label').reads(answer)

        # Only the state leaves the block; everything else is exposed as metrics.
        c.output.reads(state)
    return c

#separate labels, features and contents
def tokenize(s):
    """Split one tree string into its label, features and child expressions.

    The first and last character of `s` are dropped. The remainder consists
    of a header, whitespace, and the tree content; the header has the form
    label/outerContext/ent1Posit/ent2Posit.

    Args:
        s: the string of one tree node.

    Returns:
        (label, (tokens, outerContext, ent1Posit, ent2Posit)) where `label`
        is '0' if the header label is '0' and '1' otherwise, and `tokens` is
        the tree content tokenized with sexpr.sexpr_tokenize().
    """
    header, treeContent = s[1:-1].split(None, 1)
    label, outerContext, ent1Posit, ent2Posit = header.split("/")
    features = (sexpr.sexpr_tokenize(treeContent), outerContext, ent1Posit, ent2Posit)
    # Detection: every label other than '0' is collapsed into '1'.
    # For classification, `label` would be returned unchanged instead.
    binary_label = '0' if label == '0' else '1'
    return binary_label, features

def embed_tree(is_root):
    """Build the block that maps one tree string to its tree-LSTM state.

    Args:
        is_root: forwarded to add_metrics(); selects whether the root metrics
            are recorded.

    Returns:
        The pipeline: tokenize -> (int32 label, logits_and_state()) ->
        add_metrics(is_root).
    """
    tokenizer = td.InputTransform(tokenize)
    label_and_tree = (td.Scalar('int32'), logits_and_state())
    parsed = tokenizer >> label_and_tree
    return parsed >> add_metrics(is_root)

# The top-level model is built with is_root=True.
model = embed_tree(is_root=True)
# Resolve the forward declaration for embedding subtrees (the non-root case) with a second call to embed_tree.
embed_subtree.resolve_to(embed_tree(is_root=False))
#print('input type: %s' % model.input_type)
#print('output type: %s' % model.output_type)
# Compile the block structure; the compiler is the source of the metric
# tensors and feed dicts used for training and testing.
compiler = td.Compiler.create(model)
# End of model construction.
#==================

In [None]:
# Root-level metric tensors collected by the compiled model.
pred = compiler.metric_tensors['root_pred']   #validation
labels = compiler.metric_tensors['root_label']#validation
result_logits = compiler.metric_tensors['root_logits']#ensemble

# During training the dropout placeholder is fed with the configured keep
# probability; the batch input is added to this dict in train_step().
train_feed_dict = {keep_prob_ph: FLAGS.KEEP_PROB}
loss = tf.reduce_sum(compiler.metric_tensors['root_loss'])#only root loss is calculated
opt = tf.train.AdamOptimizer(FLAGS.LEARNING_RATE)#adam optimizer is effective

# Gradients are computed and applied in two explicit steps.
grads_and_vars = opt.compute_gradients(loss)
train_op = opt.apply_gradients(grads_and_vars)

# The saver is created before the variables are initialized.
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

def train_step(batch):
    """Run one optimization step on `batch` and return the batch loss.

    The batch is stored in the module-level train_feed_dict under the
    compiler's loom input tensor, so the dict is mutated by every call.
    """
    train_feed_dict[compiler.loom_input_tensor] = batch
    fetched = sess.run([train_op, loss], train_feed_dict)
    # fetched[0] is the result of train_op; only the loss is of interest.
    return fetched[1]

def train_epoch(train_set_shuffled):
    """Train on every batch of the given epoch data and return the summed loss.

    The data is grouped into batches of FLAGS.BATCH_SIZE examples.
    """
    epoch_loss = 0
    for batch in td.group_by_batches(train_set_shuffled, FLAGS.BATCH_SIZE):
        epoch_loss = epoch_loss + train_step(batch)
    return epoch_loss

train_set = compiler.build_loom_inputs(train_trees)
test_feed_dict = compiler.build_feed_dict(test_trees)

# Run the main training loop and save the model after the designated epoch.
# Saver.save() does not create a missing parent directory, so make sure the
# checkpoint directory exists before the (long) training starts.
if not os.path.isdir(data_dir):
    os.makedirs(data_dir)
save_path = os.path.join(data_dir, 'DDI_finding_model')
for epoch, shuffled in enumerate(td.epochs(train_set, FLAGS.EPOCHS), 1):
    train_loss = train_epoch(shuffled)
    print("epoch %s finished. train_loss : %s" % (epoch, train_loss))
    # NOTE(review): the checkpoint epoch is hard-coded to 100; with a
    # different FLAGS.EPOCHS no model is saved at the end of training. Keep it
    # in sync with the checkpoint name restored by the test code.
    if epoch == 100:
        checkpoint_path = saver.save(sess, save_path, global_step=epoch)
        print('model saved in file: %s' % checkpoint_path)


In [None]:
# Test the saved model (single model).
# The checkpoint name carries the global step (epoch 100) as its suffix.
saver.restore(sess, "runs/testModel/DDI_finding_model-100")
_test_loss = tf.reduce_sum(compiler.metric_tensors['root_loss'])
_test_logits = compiler.metric_tensors['root_logits']
# The test feed dict does not feed keep_prob_ph, so its default value is used.
test_loss, test_pred, test_labels, test_logits = sess.run([_test_loss, pred, labels, _test_logits],
                                             test_feed_dict)

# Rough result test.
# Note that this should not be the final result.
# Remember that in the preprocessing phase, 8 positive instances are filtered as negative.
# Among the 8 positive instances, 4 are the int type, 1 is the mechanism type, and 3 are the effect type.
# We report the exact score with our own f1_score calculation.
# average=None yields one score per class.
f1score = metrics.f1_score(test_labels, test_pred, average=None)
precision = metrics.precision_score(test_labels, test_pred, average=None)
recall = metrics.recall_score(test_labels, test_pred, average=None)
# NOTE(review): the message says "test_loss_avg", but test_loss is the summed
# (reduce_sum) root loss, not an average.
print('!!test!! : test_loss_avg: %.3e, test_f1score: [%s]\n, test_prec : [%s], test_recall : [%s]'
      % (test_loss, f1score, precision, recall))

# Save the logits for later use (e.g. result calculation, ensemble).
# One line is written per test example; the context manager closes the file
# even if a write fails.
with open("runs/testModel/logits_1", "w") as fpred:
    for example_logits in test_logits:
        fpred.write(str(example_logits))
        fpred.write("\n")