In [2]:
import pandas as pd
import tensorflow as tf
import numpy as np
import re
from gensim.models import Word2Vec, Doc2Vec
from tensorflow.python.layers.core import Dense
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import KFold



In [3]:
# Raw table: kept untouched in `df` because the later cells build on it.
df = pd.read_csv('sentence_support_v2.tsv', sep='\t')
df.head(n=10)  # not the last expression of the cell, so nothing is rendered

# Second, cleaned copy (duplicates removed, NaNs replaced by a single space);
# it is only used in this cell to preview a 4-fold split.
sentence_support_df = (
    pd.read_csv('sentence_support_v2.tsv', sep='\t')
    .drop_duplicates()
    .fillna(" ")
)
kf = KFold(n_splits=4, shuffle=True, random_state=5)

# KFold yields positional indices into the frame it is given.
for train_index, test_index in kf.split(sentence_support_df):
    print("TRAIN:", train_index, "TEST:", test_index)
#     X_train, X_test = sentence_support_df.loc[train_index], sentence_support_df.loc[test_index]
sentence_support_df.head()

TRAIN: [    0     1     2 ..., 42319 42320 42321] TEST: [    3     5     9 ..., 42288 42308 42316]
TRAIN: [    0     2     3 ..., 42317 42318 42320] TEST: [    1     6     7 ..., 42315 42319 42321]
TRAIN: [    1     2     3 ..., 42319 42320 42321] TEST: [    0    11    15 ..., 42311 42314 42318]
TRAIN: [    0     1     3 ..., 42318 42319 42321] TEST: [    2     4    10 ..., 42312 42317 42320]


Unnamed: 0,pathwayA,pathwayB,crosstalk,pmid,sentenceFromPaper,label
0,Adipocytokine signaling pathway,ErbB signaling pathway,yes,23228483,"In the present study, we demonstrate that lept...",1
1,Adipocytokine signaling pathway,ErbB signaling pathway,yes,18945363,"In summary, our results suggest the existence ...",1
2,Adipocytokine signaling pathway,Estrogen signaling pathway,yes,20410173,These observations support the notion that the...,1
3,Adipocytokine signaling pathway,Estrogen signaling pathway,yes,23357303,The crosstalk between leptin and estrogen resc...,1
4,Adipocytokine signaling pathway,Estrogen signaling pathway,yes,22178935,The study supports the existence of a crosstal...,1


In [4]:
# A token is any maximal run of characters that is neither whitespace nor one of . ? , : ;
tokenizer = RegexpTokenizer(r'[^.?,:;\n\r\s]+')

# Lower-cased token lists for every distinct sentence, and their token counts.
sentences = [tokenizer.tokenize(text.lower()) for text in df.sentenceFromPaper.unique()]
seqlens = list(map(len, sentences))

# Sentence length measured in whitespace-separated words (not tokenizer tokens).
df["sen_length"] = df.sentenceFromPaper.map(lambda text: len(text.split()))
# df["pa_length"] = [len(x.split()) for x in df.pathwayA]
# df["pb_length"] = [len(x.split()) for x in df.pathwayB]
df.head()

# Keep sentences with 1..131 words.
df_fine = df[(df.sen_length > 0) & (df.sen_length < 132)]

# Positional 80/20 split in file order (no shuffling happens here).
splt = int(0.8*len(df_fine))
train_df = df_fine.iloc[:splt]
test_df = df_fine.iloc[splt:]
# df.at[20294,'sentenceFromPaper']
# df.at[4278,'sentenceFromPaper']
# Note: discard sentences longer than 120/150 words
# Note: replace numbers with n-digit int, float or fraction
srtd = df_fine.sort_values('sen_length')
# srtd = df_fine.sort_values(by= 'pb_length', ascending=False)
srtd
# sentences[:5]

Unnamed: 0,pathwayA,pathwayB,crosstalk,pmid,sentenceFromPaper,label,sen_length
23337,Jak-STAT signaling pathway,Insulin signaling pathway,no,19319849,other direction,0,2
23538,GnRH signaling pathway,Neurotrophin signaling pathway,unclear,15948150,GnRH analogs,0,2
23325,Jak-STAT signaling pathway,Prolactin signaling pathway,no,20826756,other direction,0,2
277,Adipocytokine signaling pathway,TNF signaling pathway,yes,19183933,share receptor,1,2
23272,Jak-STAT signaling pathway,ErbB signaling pathway,no,19088723,other direction,0,2
290,HIF-1 signaling pathway,MAPK signaling pathway,yes,14978738,supports other direction,1,3
322,ErbB signaling pathway,Insulin signaling pathway,yes,19034632,supports other direction,1,3
137,ErbB signaling pathway,Notch signaling pathway,yes,23542173,HER2 inhibition activates Notch1,1,4
38163,NF-kappa B signaling pathway,Apoptosis,no,25967949,"pathways linked, but not crosstalk",0,5
136,ErbB signaling pathway,Notch signaling pathway,yes,23542173,"Interestingly, EGFR signaling inhibits Notch1 ...",1,7


In [147]:
# Run this cell only once: it trains the word2vec model and saves it as "w2v.mdl". Keep it commented out afterwards.

# corpus_root = "C:\\Users\pc1\\Downloads\\deep learning project\\pmcids"

# corpus = PlaintextCorpusReader(corpus_root, r'.*raw_text.txt')
# tokenizer2 = RegexpTokenizer(r'[^.?,:;\n\r]+')
# corpus.fileids()
# r = corpus.raw()

# r = re.sub(r'\d','d',r)

# all_sentences = [tokenizer.tokenize(sent.lower()) for sent in tokenizer2.tokenize(r)
#                      if len(sent)>1]

# all_sentences +=sentences
# # len(all_sentences)
# word_model = Word2Vec(all_sentences, size=100, window=5, min_count=1, workers=4)
# word_model.save("w2v.mdl")


In [5]:
# Load the word2vec model that the one-off training cell saved to disk.
w2v_mdl = Word2Vec.load("w2v.mdl")
# Sanity check of the embedding. The query goes through the KeyedVectors object
# (`.wv`), like every other lookup in this notebook; calling `most_similar` on
# the model itself is the deprecated form.
w2v_mdl.wv.most_similar(positive=["results","suggest"], negative= ["study"])


[('indicate', 0.7018297910690308),
 ('indicating', 0.6676725149154663),
 ('suggesting', 0.6627146601676941),
 ('show', 0.6587152481079102),
 ('demonstrate', 0.651054859161377),
 ('demonstrating', 0.5896004438400269),
 ('suggests', 0.5773453712463379),
 ('suggested', 0.5762311816215515),
 ('showing', 0.5728246569633484),
 ('demonstrates', 0.5626481771469116)]

In [13]:
# shuffled_df = df_fine.sample(frac = 1)
# shuffled_df.head()


Unnamed: 0,pathwayA,pathwayB,crosstalk,pmid,sentenceFromPaper,label,sen_length
18650,mTOR signaling pathway,Hippo signaling pathway,yes,25843706,"However, direct crosstalk between two pathways...",0,11
36403,Adherens junction,TNF signaling pathway,no,25065623,"Venkatesh M(1), Mukherjee S(1), Wang H(1), Li ...",0,39
40071,Estrogen signaling pathway,HIF-1 signaling pathway,yes,23382692,The precise contribution of HIF-1a in the adap...,0,14
8142,Hedgehog signaling pathway,VEGF signaling pathway,no,25141859,BACKGROUND: Researchers in recent studies have...,0,24
34298,Estrogen signaling pathway,Apoptosis,unclear,26345254,One paradigmatic example of this coupling is t...,0,11


In [102]:
# NOTE(review): `word_model` is only assigned in commented-out code in the visible
# notebook (the line below and the one-off training cell), so this cell raises
# NameError on a fresh kernel unless one of those assignments is re-enabled.
# word_model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
# Ask for the words closest to the two positive pathway terms and away from the negative one.
word_model.most_similar(positive=["Adipocytokine","GnRH"], negative= ["Notch"])
# word_model.wv["result"]

[('Thyroid', 0.9255616664886475),
 ('[PubMed]', 0.9210473895072937),
 ('Progesterone-mediated', 0.9150570631027222),
 ('pathwayThyroid', 0.908361554145813),
 ('synthesisInsulin', 0.9067593812942505),
 ('process]', 0.9056727290153503),
 ('[PubMed', 0.9045209288597107),
 ('PMID:', 0.9000983834266663),
 ('PMCID:', 0.8999356031417847),
 ('MEDLINE]', 0.8983396291732788)]

In [10]:
# NOTE(review): `df2` is not created in any visible cell; judging by the output columns it
# presumably holds a separate annotation table -- confirm and load it explicitly before this call.
df2.describe()

Unnamed: 0,Pathway A,Pathway B,Pubmed Query,PMID,Crosstalk,Transcriptional,Regulation type,Molecule A,Molecule A Identifier,Molecule A Source,Molecule B,Molecule B Identifier,Molecule B Source,Species,Tissue Name,BTO ID,Condition,Sentence from paper,Additional notes,Misleading sentences
count,3474,3474,3439,3474,3413,2698,737,613,613,613,599,599,599,698,701,597,378,735,2613,92
unique,26,26,758,1612,3,3,3,254,268,5,292,313,5,22,248,208,139,666,1459,71
top,Hedgehog signaling pathway,Hedgehog signaling pathway,Hedgehog HIF-1 Pathway,NO_RESULTS_FOR_PUBMED_QUERY,no,no,activating,LEP,hormone,UNIPROT,HIF1A,Q16665,UNIPROT,Homo sapiens,breast,BTO:0000149,cancer,"Indeed, VEGF is shown to upregulate both prese...",The full text was not read because thyroid hor...,This effect takes place in a Wnt-independent m...
freq,229,232,28,33,2685,2377,461,28,31,571,22,22,590,432,35,35,75,4,56,3


In [16]:
# tokenizer = RegexpTokenizer(r'[^.,:;\n\r\s]+')

def get_batch(start, batch_size, data_set):
    """Cut one mini-batch of `batch_size` rows out of `data_set`.

    When `start` is 0 the frame is reshuffled first (a new epoch begins). If the
    requested window runs past the end, the batch wraps around and is completed
    with rows from the beginning of the frame.

    Returns a pair `(batch, data_set)`; the second item is the (possibly
    reshuffled) frame the caller should pass in on the next call.
    """
    if start == 0:
        data_set = data_set.sample(frac=1)

    stop = start + batch_size
    overflow = stop - len(data_set)
    if overflow <= 0:
        return data_set[start:stop], data_set

    # Window exceeds the frame: take the tail, then top up from the head.
    wrapped = pd.concat([data_set[start:], data_set[:overflow]])
    return wrapped, data_set

def crossmap(out):
    """Map a crosstalk annotation to its class id: 'yes' -> 2, 'no' -> 0, anything else -> 1."""
    if out == 'yes':
        return 2
    return 0 if out == 'no' else 1
    
def format_sent(sentence, max_length):
    """Trim or zero-pad a sequence of row vectors to exactly `max_length` rows.

    Parameters
    ----------
    sentence : 2-D array-like, one row per token.
    max_length : number of rows the result must have.

    Returns
    -------
    `sentence[:max_length]` when the input already has at least `max_length`
    rows; otherwise a NumPy array with the original rows followed by all-zero
    rows.

    The row width is read from the array shape rather than from `sentence[0]`,
    so an empty `(0, dim)` array is padded to `(max_length, dim)` instead of
    raising IndexError.
    """
    n_rows = len(sentence)
    if n_rows >= max_length:
        return sentence[:max_length]

    rows = np.asarray(sentence)
    # Still raises IndexError for input without a second axis (e.g. a bare []),
    # because the width of the padding rows cannot be known in that case.
    n_cols = rows.shape[1]
    return np.concatenate((rows, np.zeros([max_length - n_rows, n_cols])))

def att_enc_dec_inputs(batch, max_lengths):
    """Embed the `abstract`, `variable` and `value` columns of `batch`.

    Each text is lower-cased, tokenized with the module-level `tokenizer`,
    looked up in `w2v_mdl.wv` and trimmed/padded with `format_sent`.
    `max_lengths[0]`, `[1]` and `[2]` are the row limits for the three columns
    in that order. Abstracts are passed through `str()` first; the other two
    columns are used as they are.

    Returns three lists: (abstract embeddings, variable embeddings, value embeddings).
    """
    def embed(text, limit):
        # text -> tokens -> word vectors -> fixed number of rows
        return format_sent(w2v_mdl.wv[tokenizer.tokenize(text.lower())], limit)

    w2v_sents = [embed(str(abstract), max_lengths[0]) for abstract in batch.abstract]
    w2v_query = [embed(variable, max_lengths[1]) for variable in batch.variable]
    w2v_answer = [embed(value, max_lengths[2]) for value in batch.value]
#     start = [[w2v_mdl.wv['-']]]*len(batch)
#     w2v_answer_shifted = np.array([format_sent(np.concatenate((s,a)),max_lengths[2]) 
#               for s,a in zip(start,w2v_answer)])
#     return w2v_sents, w2v_query, w2v_answer_shifted, w2v_answer
    return w2v_sents, w2v_query, w2v_answer

def enc_dec_inputs(batch, max_lengths):
    """Build the inputs of the multi-RNN models from a batch of rows.

    The `sentenceFromPaper`, `pathwayA` and `pathwayB` columns are lower-cased,
    tokenized with the module-level `tokenizer`, looked up in `w2v_mdl.wv` and
    trimmed/padded by `format_sent` to `max_lengths[0]`, `max_lengths[1]` and
    `max_lengths[2]` rows respectively. The `crosstalk` column is turned into
    class ids with `crossmap`.

    Returns (sentence embeddings, pathway A embeddings, pathway B embeddings, class ids),
    each as a list with one entry per row of `batch`.
    """
    def embed(text, limit):
        # text -> tokens -> word vectors -> fixed number of rows
        return format_sent(w2v_mdl.wv[tokenizer.tokenize(text.lower())], limit)

    w2v_sents = [embed(sentence, max_lengths[0]) for sentence in batch.sentenceFromPaper]
    w2v_pathA = [embed(name, max_lengths[1]) for name in batch.pathwayA]
    w2v_pathB = [embed(name, max_lengths[2]) for name in batch.pathwayB]
#     w2v_cross = [int(s) for s in batch.label]
    w2v_cross = [crossmap(answer) for answer in batch.crosstalk]
    return w2v_sents, w2v_pathA, w2v_pathB, w2v_cross

def single_rnn_inputs(batch, max_length):
    """Build the input of the single-RNN model from a batch of rows.

    For every row the word vectors of the sentence, pathway A and pathway B are
    joined into one sequence `sentence, '-', pathwayA, '-', pathwayB` (the '-'
    entries are the embedding of the literal '-' token), and that sequence is
    trimmed/padded to `max_length` rows with `format_sent`.

    Returns `(in_str, w2v_cross)`: a NumPy array stacking the per-row
    sequences, and the list of class ids produced by `crossmap` from the
    `crosstalk` column.
    """
    def lookup(text):
        # text -> lower-cased tokens -> word vectors (no padding at this stage)
        return w2v_mdl.wv[tokenizer.tokenize(text.lower())]

    w2v_sents = [lookup(sentence) for sentence in batch.sentenceFromPaper]
    w2v_pathA = [lookup(name) for name in batch.pathwayA]
    w2v_pathB = [lookup(name) for name in batch.pathwayB]
    w2v_cross = [crossmap(answer) for answer in batch.crosstalk]
#     w2v_cross = [int(s) for s in batch.label]

    # One-row block holding the '-' vector; reused as the separator for every row.
    dash = [w2v_mdl.wv['-']]
    in_str = np.array([
        format_sent(np.concatenate((sent, dash, path_a, dash, path_b)), max_length)
        for sent, path_a, path_b in zip(w2v_sents, w2v_pathA, w2v_pathB)
    ])
#     in_str = [np.concatenate((a,s,b,s,t)).tolist() for t,s,a,b in zip(w2v_sents,separator,w2v_pathA,w2v_pathB) ]
    return in_str, w2v_cross



In [None]:
def reset_graph():
    """Close the global `sess`, if one exists and is truthy, then reset TensorFlow's default graph."""
    has_session = 'sess' in globals()
    if has_session and sess:
        sess.close()
    tf.reset_default_graph()
    
def encode_sent_lengths(sentences): # sentences should be in the format batch_size * max_sentence_length * embedding_length
    """Return, as an int32 tensor, the number of non-zero feature vectors in each sequence.

    A time step counts as real when at least one component of its feature
    vector is non-zero; all-zero (padding) steps contribute nothing.
    """
    # the max returns zero when the feature vector is a zero vector
    largest_magnitude = tf.reduce_max(tf.abs(sentences), 2)
    # sign() turns every positive maximum into 1 and leaves zeros at 0
    is_real_step = tf.sign(largest_magnitude)
    step_counts = tf.reduce_sum(is_real_step, 1)
    return tf.cast(step_counts, tf.int32)

# may not use these two methods

def cost(output, target):
    """Masked cross entropy, averaged per sequence and then over the batch.

    `output` and `target` are indexed as [batch, frame, class]: the reductions
    below run over axis 2 (classes) and then axis 1 (frames). Frames whose
    target vector is all zeros are treated as padding and do not contribute.

    A sequence made up only of padding frames contributes 0 to the batch mean:
    its divisor is clamped to 1, where it previously produced 0/0 = NaN and
    turned the whole loss into NaN.
    """
    # Compute cross entropy for each frame (1e-10 keeps log() away from zero).
    cross_entropy = target * tf.log(output + 1e-10)
    cross_entropy = -tf.reduce_sum(cross_entropy, 2)
    # 1 for frames with a non-zero target vector, 0 for padding frames.
    mask = tf.sign(tf.reduce_max(tf.abs(target), 2))
    cross_entropy *= mask
    # Average over actual sequence lengths.
    cross_entropy = tf.reduce_sum(cross_entropy, 1)
    # The mask entries are 0 or 1, so any sequence with a real frame has a
    # length >= 1 and is unaffected by the clamp.
    lengths = tf.reduce_sum(mask, 1)
    cross_entropy /= tf.maximum(lengths, 1.0)
    return tf.reduce_mean(cross_entropy)


In [None]:
''' 
nested rnns
Experiment Sen_QA_QB: a first GRU encodes the sentence; its final state initialises a second GRU
that reads pathway A, whose final state initialises a third GRU that reads pathway B.
The last state is projected onto the three crosstalk classes (0=No, 1=Unk, 2=yes).
Loss is softmax cross-entropy over those classes.
'''

max_sentence_length = 132
max_query_length = 6
# num_layers = 2
truncate_length= 5
max_gradient_norm = 5
lrate = 0.001
num_neurons = 64 #128
embedding_length = 100
batch_size = 32 # 1
num_classes = 3 
dropout_keep_prob = 0.5  # keep probability applied to the RNN states while training


graph1 = tf.Graph()
with graph1.as_default():

    # Inputs: zero-padded word2vec sequences for both pathway names and the sentence, plus class ids.
    a_queries = tf.placeholder(tf.float32, [None, max_query_length, embedding_length])
    b_queries = tf.placeholder(tf.float32, [None, max_query_length, embedding_length])
    text = tf.placeholder(tf.float32, [None, max_sentence_length, embedding_length])
    true_outputs = tf.placeholder(tf.int32,[None],name='yhat')
    # Defaults to 1.0 (dropout off) so evaluation is deterministic; the training
    # loop feeds dropout_keep_prob explicitly.
    keep_prob = tf.placeholder_with_default(1.0, shape=[], name='keep_prob')

    w_out = tf.Variable(tf.truncated_normal([num_neurons, num_classes], stddev=0.1),name="w_out")
    b_out = tf.Variable(tf.truncated_normal([num_classes]),name="b_out")

    
    encoder_cell = tf.contrib.rnn.GRUCell(num_neurons)
    decoder_a_cell = tf.contrib.rnn.GRUCell(num_neurons)
    decoder_b_cell = tf.contrib.rnn.GRUCell(num_neurons)
    
#         encoder_cell = tf.contrib.rnn.BasicLSTMCell(num_neurons)
#         decoder_a_cell = tf.contrib.rnn.BasicLSTMCell(num_neurons)
#         decoder_b_cell = tf.contrib.rnn.BasicLSTMCell(num_neurons)

    # Chain of three RNNs; each one starts from the (dropped-out) final state of the previous one.
    _, encoder_state = tf.nn.dynamic_rnn(encoder_cell,text,dtype=tf.float32,
                                         sequence_length=encode_sent_lengths(text), scope = 'rnn1')
    encoder_state = tf.nn.dropout(encoder_state,keep_prob)

    _, middle_state = tf.nn.dynamic_rnn(decoder_a_cell,a_queries,dtype=tf.float32,
                                        sequence_length=encode_sent_lengths(a_queries),initial_state=encoder_state, scope = 'rnn2')
    middle_state = tf.nn.dropout(middle_state,keep_prob)
    _, decoder_state = tf.nn.dynamic_rnn(decoder_b_cell,b_queries,dtype=tf.float32,
                                         sequence_length=encode_sent_lengths(b_queries),initial_state=middle_state, scope = 'rnn3')
    decoder_state = tf.nn.dropout(decoder_state,keep_prob)

    # Classification head and metrics.
    reshaped_rnn_output = tf.one_hot(true_outputs,num_classes)
    yout = tf.matmul(decoder_state, w_out) + b_out
    train_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=yout,labels=reshaped_rnn_output))
    labels = tf.argmax(reshaped_rnn_output,1)
    preds = tf.argmax(yout,1)
    correct_pred = tf.equal(preds, labels)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    precesion = tf.metrics.precision(labels, preds)
    recall = tf.metrics.recall(labels, preds)
    
    # Optimization: Adam on gradients clipped by global norm.
    optimizer = tf.train.AdamOptimizer(lrate)
    params = tf.trainable_variables()
#     gradients = optimizer.compute_gradients(train_loss, params)
    gradients = tf.gradients(train_loss, params)
    clipped_gradients, _ = tf.clip_by_global_norm(gradients, max_gradient_norm)
    update_step = optimizer.apply_gradients(zip(clipped_gradients, params))
    
    
with tf.Session(graph=graph1) as sess:
    # Entering the session makes graph1 the default graph, so the initialisers are
    # created in graph1 and run by this same session. No second tf.Session() is
    # opened: it would shadow `sess` and never be closed.
    init = tf.global_variables_initializer()
    init_l = tf.local_variables_initializer()
    sess.run(init)
    sess.run(init_l)

    av_acc = 0.0
    av_err = 0.0
    epochs = 20
    train_size = len(train_df)
    
    for j in range(epochs):
        av_acc = 0.0
        av_err = 0.0    
        i=0
        while i< train_size:
            # get_batch reshuffles train_df at the start of every epoch (i == 0).
            train_data, train_df = get_batch(i, batch_size, train_df )
            i += batch_size
            w2v_sents, w2v_pathA, w2v_pathB, w2v_cross = enc_dec_inputs(train_data,
                                                                        [max_sentence_length, max_query_length, max_query_length])
            _, acc, ls, prediction = sess.run([update_step,accuracy,train_loss,yout],
                                              {a_queries:w2v_pathA, b_queries:w2v_pathB, 
                                               true_outputs:w2v_cross , text:w2v_sents,
                                               keep_prob:dropout_keep_prob })
            av_err += ls
            av_acc += acc
        # i/batch_size is the number of batches processed in this epoch.
        av_acc /= (i/batch_size)
        av_err /= (i/batch_size)
        print("epoch " + str(j+1) + " :")
        print("epoch " + str(j+1) + ": finished epoch training with average loss = " 
                     + str(av_err) + " , training accuracy = " + str(av_acc))
       
        
    print()
    print("--------------------------Evaluation on Test Data---------------------------------------------------")
    print()
    
    # keep_prob is not fed here, so it takes its default of 1.0 and dropout is disabled.
    w2v_sents, w2v_pathA, w2v_pathB, w2v_cross = enc_dec_inputs(test_df,
                                                                        [max_sentence_length, max_query_length, max_query_length])
    prediction = sess.run([yout],{a_queries:w2v_pathA, b_queries:w2v_pathB, 
                                               true_outputs:w2v_cross , text:w2v_sents })
    
    p,r,f,_ = precision_recall_fscore_support(w2v_cross, np.argmax(prediction[0],axis=1))
    confusion_mat = confusion_matrix(w2v_cross, np.argmax(prediction[0],axis=1))
    
    # acc, prf, conf = evaluate(prediction,w2v_cross)
    # print("accuracy = " + str(acc))
    print("Precision for the three classes (0=No, 1=Unk, 2=yes):")
    print(p)
    print("Recall for the three classes (0=No, 1=Unk, 2=yes):")
    print(r)
    print("F1-measure for the three classes (0=No, 1=Unk, 2=yes):")
    print(f)

    print("confusion matrix:")
    print(confusion_mat)


epoch 1 :
epoch 1: finished epoch training with average loss = 0.477762165925 , training accuracy = 0.820284193841
epoch 2 :
epoch 2: finished epoch training with average loss = 0.382818020165 , training accuracy = 0.854704483696
epoch 3 :
epoch 3: finished epoch training with average loss = 0.332904200987 , training accuracy = 0.866536458333
epoch 4 :
epoch 4: finished epoch training with average loss = 0.299243961144 , training accuracy = 0.880066802536
epoch 5 :
epoch 5: finished epoch training with average loss = 0.27235787373 , training accuracy = 0.889804121377
epoch 6 :
epoch 6: finished epoch training with average loss = 0.24996087661 , training accuracy = 0.896795742754


In [20]:
# # Add ops to save and restore all the variables.
# saver = tf.train.Saver()

# # Later, launch the model, initialize the variables, do some work, and save the
# # variables to disk.
# with tf.Session() as sess:
#     save_path = saver.save(sess, "/tmp/nested_model.ckpt")


    
    
# # Add ops to save and restore all the variables.
# saver = tf.train.Saver()

# # Later, launch the model, use the saver to restore variables from disk, and
# # do some work with the model.
# with tf.Session() as sess:
#   # Restore variables from disk.
#   saver.restore(sess, "/tmp/nested_model.ckpt")


(8825,)
(8825,)
Precision for the three classes (0=No, 1=Unk, 2=yes):
[ 0.75966851  0.          0.3412177 ]
Recall, recall and f1-measure for the three classes (0=No, 1=Unk, 2=yes):
[ 0.77150873  0.          0.36131725]
F1-measure for the three classes (0=No, 1=Unk, 2=yes):
[ 0.76554284  0.          0.35097995]
confusion matrix:


In [None]:
''' 
single network
Experiment Sen_QA_QB (single RNN): the sentence and both pathway names are concatenated into one
sequence, separated by the '-' token vector, and read by a single GRU. Its final state is projected
onto the three crosstalk classes (0=No, 1=Unk, 2=yes).
Loss is softmax cross-entropy over those classes.
'''


max_sentence_length = 132
max_query_length = 6
# sentence + two pathway names + the two '-' separators
max_input_length = max_sentence_length + 2 * max_query_length + 2
# num_layers = 2
truncate_length= 5
max_gradient_norm = 5
lrate = 0.001
num_neurons = 64 #128
embedding_length = 100
batch_size = 32 # 1
num_classes = 3 

graph11 = tf.Graph()

with graph11.as_default():

    # Inputs: one zero-padded word2vec sequence per example, plus class ids.
    rnn_input = tf.placeholder(tf.float32, [None, max_input_length, embedding_length])
    true_outputs = tf.placeholder(tf.int32,[None],name='yhat')
  
    w_out = tf.Variable(tf.truncated_normal([num_neurons, num_classes], stddev=0.1),name="w_out")
    b_out = tf.Variable(tf.truncated_normal([num_classes]),name="b_out")

    
    encoder_cell = tf.contrib.rnn.GRUCell(num_neurons)
#         encoder_cell = tf.contrib.rnn.BasicLSTMCell(num_neurons)
#         decoder_a_cell = tf.contrib.rnn.BasicLSTMCell(num_neurons)
#         decoder_b_cell = tf.contrib.rnn.BasicLSTMCell(num_neurons)

    _, out_state = tf.nn.dynamic_rnn(encoder_cell,rnn_input,dtype=tf.float32,
                                         sequence_length=encode_sent_lengths(rnn_input))
  
    # Classification head and metrics.
    reshaped_rnn_output = tf.one_hot(true_outputs,num_classes)
    yout = tf.matmul(out_state, w_out) + b_out
#     yout = tf.nn.softmax(logit)
    train_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=yout,labels=reshaped_rnn_output))
    labels = tf.argmax(reshaped_rnn_output,1)
    preds = tf.argmax(yout,1)
    correct_pred = tf.equal(preds, labels)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    precesion = tf.metrics.precision(labels, preds)
    recall = tf.metrics.recall(labels, preds)
    
    # Optimization: Adam on gradients clipped by global norm.
    optimizer = tf.train.AdamOptimizer(lrate)
    params = tf.trainable_variables()
#     gradients = optimizer.compute_gradients(train_loss, params)
    gradients = tf.gradients(train_loss, params)
    clipped_gradients, _ = tf.clip_by_global_norm(gradients, max_gradient_norm)
    update_step = optimizer.apply_gradients(zip(clipped_gradients, params))
    
with tf.Session(graph=graph11) as sess:
    # Entering the session makes graph11 the default graph, so the initialisers are
    # created in graph11 and run by this same session. No second tf.Session() is
    # opened: it would shadow `sess` and never be closed.
    init = tf.global_variables_initializer()
    init_l = tf.local_variables_initializer()
    sess.run(init)
    sess.run(init_l)


    av_acc = 0.0
    av_err = 0.0
    epochs = 20
    train_size = len(train_df)
    
    for j in range(epochs):
        print("epoch " + str(j+1) + " :")
        av_acc = 0.0
        av_err = 0.0    
        i=0
        while i< train_size:
#             print(i)
            # get_batch reshuffles train_df at the start of every epoch (i == 0).
            train_data,train_df = get_batch(i, batch_size, train_df)
            i += batch_size
#             print(w2v_cross)
#             print(w2v_sents)
            w2v_sents, w2v_cross = single_rnn_inputs(train_data, max_input_length)
            _, acc, ls, prediction = sess.run([update_step,accuracy,train_loss,yout],
                                              {true_outputs:w2v_cross , rnn_input:w2v_sents })
            av_err += ls
            av_acc += acc
        # i/batch_size is the number of batches processed in this epoch.
        av_acc /= (i/batch_size)
        av_err /= (i/batch_size)
        print("epoch " + str(j+1) + ": finished epoch training with average loss = " 
                     + str(av_err) + " , training accuracy = " + str(av_acc))
       
        
    print()
    print("--------------------------Evaluation on Test Data---------------------------------------------------")
    print()
    
    w2v_sents, w2v_cross =  single_rnn_inputs(test_df, max_input_length)
    prediction = sess.run([yout],{true_outputs:w2v_cross , rnn_input:w2v_sents })
    
    
    p,r,f,_ = precision_recall_fscore_support(w2v_cross, np.argmax(prediction[0],axis=1))
    confusion_mat = confusion_matrix(w2v_cross, np.argmax(prediction[0],axis=1))

    # acc, prf, conf = evaluate(prediction,w2v_cross)
    # print("accuracy = " + str(acc))
    print("Precision for the three classes (0=No, 1=Unk, 2=yes):")
    print(p)
    print("Recall for the three classes (0=No, 1=Unk, 2=yes):")
    print(r)
    print("F1-measure for the three classes (0=No, 1=Unk, 2=yes):")
    print(f)

    print("confusion matrix:")
    print(confusion_mat)

In [49]:
# `prediction` may be the logits array itself or the one-element list returned by
# sess.run([yout], ...). Unwrap the list form so argmax always runs over the class
# axis; on the wrapped form axis=1 would be the example axis and give wrong labels.
pred_scores = np.asarray(prediction)
if pred_scores.ndim == 3 and pred_scores.shape[0] == 1:
    pred_scores = pred_scores[0]
pred_classes = np.argmax(pred_scores, axis=1)

p,r,f,_ = precision_recall_fscore_support(w2v_cross, pred_classes)
confusion_mat = confusion_matrix(w2v_cross, pred_classes)
    
    
# Number of predicted vs. true examples of class 2 ("yes").
print(np.sum(pred_classes==2))
print(np.sum(np.array(w2v_cross)==2))
# acc, prf, conf = evaluate(prediction,w2v_cross)
# print("accuracy = " + str(acc))
print("Precision for the three classes (0=No, 1=Unk, 2=yes):")
print(p)
print("Recall for the three classes (0=No, 1=Unk, 2=yes):")
print(r)
print("F1-measure for the three classes (0=No, 1=Unk, 2=yes):")
print(f)

print("confusion matrix:")
confusion_mat

1960
2156
Precision for the three classes (0=No, 1=Unk, 2=yes):
[ 0.90417036  0.57281553  0.84030612]
Recall, recall and f1-measure for the three classes (0=No, 1=Unk, 2=yes):
[ 0.95293017  0.23320158  0.76391466]
F1-measure for the three classes (0=No, 1=Unk, 2=yes):
[ 0.92791015  0.33146067  0.80029155]
confusion matrix:


array([[6114,   31,  271],
       [ 152,   59,   42],
       [ 496,   13, 1647]], dtype=int64)

In [None]:
''' 
Experiment Sen-Qu: the sentence, pathway A and pathway B are encoded by three independent GRUs;
their final states are concatenated and projected onto the three crosstalk classes
(0=No, 1=Unk, 2=yes).
Loss is softmax cross-entropy over those classes.
'''


max_sentence_length = 131
max_query_length = 20
# num_layers = 2
truncate_length= 5
max_gradient_norm = 5
lrate = 0.001
num_neurons = 64 #128
embedding_length = 100
batch_size = 32 # 1
num_classes = 3 # initially to answer the crosstalk question, but to be changed to address more questions

graph3 = tf.Graph()
with graph3.as_default():

    # Inputs: zero-padded word2vec sequences for both pathway names and the sentence, plus class ids.
    a_queries = tf.placeholder(tf.float32, [None, max_query_length, embedding_length])
    b_queries = tf.placeholder(tf.float32, [None, max_query_length, embedding_length])
    text = tf.placeholder(tf.float32, [None, max_sentence_length, embedding_length])
    true_outputs = tf.placeholder(tf.int32,[None],name='yhat')

    # The projection takes the three concatenated encoder states, hence 3*num_neurons rows.
    w_out = tf.Variable(tf.truncated_normal([3*num_neurons, num_classes], stddev=0.1),name="w_out")
    b_out = tf.Variable(tf.truncated_normal([num_classes]),name="b_out")

    sen_encoder_cell = tf.contrib.rnn.GRUCell(num_neurons)
    encoder_a_cell = tf.contrib.rnn.GRUCell(num_neurons)
    encoder_b_cell = tf.contrib.rnn.GRUCell(num_neurons)
    
#         sen_encoder_cell = tf.contrib.rnn.BasicLSTMCell(num_neurons)
#         encoder_a_cell = tf.contrib.rnn.BasicLSTMCell(num_neurons)
#         encoder_b_cell = tf.contrib.rnn.BasicLSTMCell(num_neurons)

    _, s_encoder_state = tf.nn.dynamic_rnn(sen_encoder_cell,text,dtype=tf.float32,
                                         sequence_length=encode_sent_lengths(text), scope = 'rnn1')

    _, a_encoder_state = tf.nn.dynamic_rnn(encoder_a_cell,a_queries,dtype=tf.float32,
                                        sequence_length=encode_sent_lengths(a_queries), scope = 'rnn2')
    _, b_encoder_state = tf.nn.dynamic_rnn(encoder_b_cell,b_queries,dtype=tf.float32,
                                         sequence_length=encode_sent_lengths(b_queries), scope = 'rnn3')

# initial_state=encoder_state,
    
    # Classification head and metrics.
    decoder_init_state = tf.concat([s_encoder_state, a_encoder_state, b_encoder_state],1)
    reshaped_rnn_output = tf.one_hot(true_outputs,num_classes)
    yout = tf.matmul(decoder_init_state, w_out) + b_out
    train_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=yout,labels=reshaped_rnn_output))
    labels = tf.argmax(reshaped_rnn_output,1)
    preds = tf.argmax(yout,1)
    correct_pred = tf.equal(preds, labels)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    precesion = tf.metrics.precision(labels, preds)
    recall = tf.metrics.recall(labels, preds)
    

    # Optimization: Adam on gradients clipped by global norm.
    optimizer = tf.train.AdamOptimizer(lrate)
    params = tf.trainable_variables()
#     gradients = optimizer.compute_gradients(train_loss, params)
    gradients = tf.gradients(train_loss, params)
    clipped_gradients, _ = tf.clip_by_global_norm(gradients, max_gradient_norm)
    update_step = optimizer.apply_gradients(zip(clipped_gradients, params))
    

    # This is the multi layer part, need to think how to combine
#     reshaped_rnn_output = tf.one_hot(y,word_vec_size)
#     x = tf.reshape(x, [-1, sequence_len])
#     time_stamps = tf.split(x,sequence_len,1)
#     multi_layer_cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.BasicLSTMCell(num_neurons) for _ in range(num_layers)])
# #     print(multi_layer_cell.state_size)
#     output , _ = tf.contrib.rnn.static_rnn(multi_layer_cell, time_stamps, dtype=tf.float32)

#     loss = 0.0
# #     for time_step_input in time_stamps:
# #         output, state = multi_layer_cell(time_step_input, state)
# #     final_state = state
#     yout = tf.matmul(output[-1], new_w_out) + new_b_out
# #     yout = tf.nn.softmax(logit)
#     loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=yout,labels=reshaped_rnn_output))
#     correct_pred = tf.equal(tf.argmax(yout,1), tf.argmax(reshaped_rnn_output,1))
#     accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    
#     opt = tf.train.AdamOptimizer(learning_rate=0.1).minimize(loss)
#     opt = tf.train.AdagradOptimizer(learning_rate=0.1).minimize(loss)
    #opt = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)
#     opt = tf.train.RMSPropOptimizer(learning_rate=0.001).minimize(loss)
    
with tf.Session(graph=graph3) as sess:
    # Entering the session makes graph3 the default graph, so the initialisers are
    # created in graph3 and run by this same session. No second tf.Session() is
    # opened: it would shadow `sess` and never be closed.
    init = tf.global_variables_initializer()
    init_l = tf.local_variables_initializer()
    sess.run(init)
    sess.run(init_l)

    av_acc = 0.0
    av_err = 0.0
    epochs = 10
    train_size = len(train_df)
    
    for j in range(epochs):
        av_acc = 0.0
        av_err = 0.0    
        i=0
        while i< train_size:
            # get_batch reshuffles train_df at the start of every epoch (i == 0).
            train_data, train_df = get_batch(i, batch_size, train_df )
            i += batch_size
            w2v_sents, w2v_pathA, w2v_pathB, w2v_cross = enc_dec_inputs(train_data,
                                                                        [max_sentence_length, max_query_length, max_query_length])
            _, acc, ls, prediction = sess.run([update_step,accuracy,train_loss,yout],
                                              {a_queries:w2v_pathA, b_queries:w2v_pathB, 
                                               true_outputs:w2v_cross , text:w2v_sents })
            av_err += ls
            av_acc += acc
        # i/batch_size is the number of batches processed in this epoch.
        av_acc /= (i/batch_size)
        av_err /= (i/batch_size)
        print("epoch " + str(j+1) + " :")
        print("epoch " + str(j+1) + ": finished epoch training with average loss = " 
                     + str(av_err) + " , training accuracy = " + str(av_acc))
       
        
    print()
    print("--------------------------Evaluation on Test Data---------------------------------------------------")
    print()
    
    w2v_sents, w2v_pathA, w2v_pathB, w2v_cross = enc_dec_inputs(test_df,
                                                                        [max_sentence_length, max_query_length, max_query_length])
    prediction = sess.run([yout],{a_queries:w2v_pathA, b_queries:w2v_pathB, 
                                               true_outputs:w2v_cross , text:w2v_sents })
    
    p,r,f,_ = precision_recall_fscore_support(w2v_cross, np.argmax(prediction[0],axis=1))
    confusion_mat = confusion_matrix(w2v_cross, np.argmax(prediction[0],axis=1))
    
    # acc, prf, conf = evaluate(prediction,w2v_cross)
    # print("accuracy = " + str(acc))
    print("Precision for the three classes (0=No, 1=Unk, 2=yes):")
    print(p)
    print("Recall for the three classes (0=No, 1=Unk, 2=yes):")
    print(r)
    print("F1-measure for the three classes (0=No, 1=Unk, 2=yes):")
    print(f)

    print("confusion matrix:")
    print(confusion_mat)



epoch 1 :
epoch 1: finished epoch training with average loss = 0.489649592534 , training accuracy = 0.817736639493
epoch 2 :
epoch 2: finished epoch training with average loss = 0.454774797543 , training accuracy = 0.829427083333
epoch 3 :
epoch 3: finished epoch training with average loss = 0.435156128437 , training accuracy = 0.835116621377
epoch 4 :
epoch 4: finished epoch training with average loss = 0.411892075092 , training accuracy = 0.841995018116
epoch 5 :
epoch 5: finished epoch training with average loss = 0.388499312656 , training accuracy = 0.847684556159
epoch 6 :
epoch 6: finished epoch training with average loss = 0.363966335327 , training accuracy = 0.858582427536
epoch 7 :
epoch 7: finished epoch training with average loss = 0.344277458488 , training accuracy = 0.865177762681
epoch 8 :
epoch 8: finished epoch training with average loss = 0.325167374787 , training accuracy = 0.870669157609
epoch 9 :
epoch 9: finished epoch training with average loss = 0.308830730688 , 