In [1]:
#! /usr/bin/env python

import tensorflow as tf
import numpy as np
import re
import os
import time
import datetime
import gc
from input_helpers import InputHelper
from siamese_network import SiameseLSTM
from siamese_network_semantic import SiameseLSTMw2v
from tensorflow.contrib import learn
import gzip
from random import random
# Model definition
# ==================================================

class SiameseLSTMw2v(object):
    """
    A LSTM based deep Siamese network for text similarity.

    Both inputs are looked up in one embedding matrix ``W`` (created filled
    with zeros; the notebook assigns the pre-trained word2vec matrix to it
    after variable initialisation), each side is encoded by a 3-layer stacked
    (unidirectional) LSTM, and the two encodings are compared with a
    normalised Euclidean distance trained through a contrastive loss.

    The two encoders are built under different variable scopes
    ("fwside1" / "fwside2"), so they do not share LSTM weights.

    NOTE: this definition shadows the ``SiameseLSTMw2v`` imported from
    ``siamese_network_semantic`` at the top of this cell.
    """

    def stackedRNN(self, x, dropout, scope, embedding_size, sequence_length, hidden_units):
        """
        Encode a batch of embedded sequences with a 3-layer stacked LSTM.

        Args:
            x: embedded input tensor; it is unstacked along axis 1 into the
                per-time-step list that `static_rnn` expects.
            dropout: keep probability applied to the output of every layer.
            scope: suffix used for the name/variable scope ("fw" + scope).
            embedding_size: unused; kept for interface compatibility.
            sequence_length: unused; kept for interface compatibility.
            hidden_units: number of units of each LSTM layer.

        Returns:
            The output of the top LSTM layer at the last time step.
        """
        n_hidden=hidden_units
        n_layers=3
        # Prepare data shape to match `static_rnn` function requirements
        x = tf.unstack(x,axis=1)

        # Define lstm cells with tensorflow
        # Forward direction cell
        with tf.name_scope("fw"+scope),tf.variable_scope("fw"+scope):
            stacked_rnn_fw = []
            for _ in range(n_layers):
                fw_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True)
                lstm_fw_cell = tf.contrib.rnn.DropoutWrapper(fw_cell,output_keep_prob=dropout)
                stacked_rnn_fw.append(lstm_fw_cell)
            lstm_fw_cell_m = tf.nn.rnn_cell.MultiRNNCell(cells=stacked_rnn_fw, state_is_tuple=True)
            outputs, _ = tf.nn.static_rnn(lstm_fw_cell_m, x, dtype=tf.float32)
            print ("output shape is "+ str(outputs[-1].shape))
        return outputs[-1]

    def contrastive_loss(self, y,d,batch_size):
        """
        Contrastive loss with margin 1.

        Args:
            y: labels; the first term is weighted by ``y`` and the second by
                ``1 - y``.
            d: distances between the two encodings.
            batch_size: divisor applied to the summed loss. It is the value
                given at construction time, not the actual size of the fed
                batch.

        Returns:
            ``sum(y * d^2 + (1 - y) * max(1 - d, 0)^2) / batch_size / 2``.
        """
        # Pulls the pair together: grows with the distance when y == 1.
        tmp= y *tf.square(d)
        # Hinge term: pushes the pair apart until the distance reaches 1 when y == 0.
        tmp2 = (1-y) *tf.square(tf.maximum((1 - d),0))
        return tf.reduce_sum(tmp +tmp2)/batch_size/2

    def __init__(
        self, sequence_length, vocab_size, embedding_size, hidden_units, l2_reg_lambda, batch_size, trainableEmbeddings):
        """
        Build the graph: placeholders, embedding lookup, the two encoders,
        the distance, the loss and the accuracy.

        Args:
            sequence_length: number of token ids per input row.
            vocab_size: number of rows of the embedding matrix.
            embedding_size: number of columns of the embedding matrix.
            hidden_units: LSTM layer size.
            l2_reg_lambda: accepted but not used by this implementation.
            batch_size: divisor used by the contrastive loss.
            trainableEmbeddings: whether ``W`` is a trainable variable.
        """

        # Placeholders for input, output and dropout
        self.input_x1 = tf.placeholder(tf.int32, [None, sequence_length], name="input_x1")
        self.input_x2 = tf.placeholder(tf.int32, [None, sequence_length], name="input_x2")
        self.input_y = tf.placeholder(tf.float32, [None], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # Placeholder for an l2 regularization term; it is never added to the loss.
        l2_loss = tf.constant(0.0, name="l2_loss")

        # Embedding layer
        with tf.name_scope("embedding"):
            self.W = tf.Variable(
                tf.constant(0.0, shape=[vocab_size, embedding_size]),
                trainable=trainableEmbeddings,name="W")
            # dim: (batch, sequence_length, embedding_size)
            self.embedded_words1 = tf.nn.embedding_lookup(self.W, self.input_x1)
            self.embedded_words2 = tf.nn.embedding_lookup(self.W, self.input_x2)
        print ("embeded shape is "+str(self.embedded_words1.shape ))

        # Encode both sides and compute the normalised distance between them.
        with tf.name_scope("output"):
            self.out1=self.stackedRNN(self.embedded_words1, self.dropout_keep_prob, "side1", embedding_size, sequence_length, hidden_units)
            self.out2=self.stackedRNN(self.embedded_words2, self.dropout_keep_prob, "side2", embedding_size, sequence_length, hidden_units)
            # ||out1 - out2|| / (||out1|| + ||out2||); `keepdims` replaces the
            # deprecated `keep_dims` argument.
            # NOTE(review): the ratio is 0/0 when both encodings are all zeros.
            diff_norm = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(self.out1,self.out2)),1,keepdims=True))
            norm_sum = tf.add(
                tf.sqrt(tf.reduce_sum(tf.square(self.out1),1,keepdims=True)),
                tf.sqrt(tf.reduce_sum(tf.square(self.out2),1,keepdims=True)))
            self.distance = tf.div(diff_norm, norm_sum)
            self.distance = tf.reshape(self.distance, [-1], name="distance")
        with tf.name_scope("loss"):
            self.loss = self.contrastive_loss(self.input_y,self.distance, batch_size)
        # Accuracy: the predicted label is 1 - round(distance), i.e. the
        # distance is thresholded at 0.5.
        with tf.name_scope("accuracy"):
            self.temp_sim = tf.subtract(tf.ones_like(self.distance),tf.rint(self.distance), name="temp_sim") #auto threshold 0.5
            correct_predictions = tf.equal(self.temp_sim, self.input_y)
            self.accuracy=tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")




In [2]:


# Model parameters
is_char_based=False
# Raw string: the Windows path contains backslashes ("\s", "\w") that are
# invalid escape sequences in a normal string literal. The value is unchanged.
word2vec_model=r"D:\simple_vec\wiki.simple.bin"
word2vec_format="bin"
embedding_dim=300
dropout_keep_prob=1.0
l2_reg_lambda=0.0
hidden_units=50

# Training parameters
batch_size=64
num_epochs=300
evaluate_every=1000
checkpoint_every=1000

# Misc Parameters
allow_soft_placement=True
log_device_placement=False
trainableEmbeddings=False

training_files="train_snli.txt"

max_document_length=100
inpH = InputHelper()
# NOTE(review): the literal 10 is presumably the dev-set percentage (the run
# output shows a roughly 90/10 train/dev split) - confirm against InputHelper.
train_set, dev_set, vocab_processor,sum_no_of_batches = inpH.getDataSets(training_files, 10,max_document_length,batch_size, is_char_based)
if is_char_based:
    word2vec_model = False

inpH.loadW2V(word2vec_model, word2vec_format)


# Embedding matrix: start from random uniform values, then overwrite every
# row whose vocabulary word has a pre-trained vector.
initW = np.random.uniform(-0.25,0.25,(len(vocab_processor.vocabulary_), embedding_dim))
print("initializing initW with pre-trained word2vec embeddings")


for w in vocab_processor.vocabulary_._mapping:
    arr=[]
    # Word stripped of every non-alphanumeric character.
    s = re.sub('[^0-9a-zA-Z]+', '', w)
    # Lookup order: exact word, lower-cased word, stripped word, and finally
    # the vector of "zero" for purely numeric tokens.
    if w in inpH.pre_emb:
        arr=inpH.pre_emb[w]
    elif w.lower() in inpH.pre_emb:
        arr=inpH.pre_emb[w.lower()]
    elif s in inpH.pre_emb:
        arr=inpH.pre_emb[s]
    elif s.isdigit() and "zero" in inpH.pre_emb:
        # Guarded so that a model without a "zero" entry keeps the random
        # row instead of raising KeyError.
        arr=inpH.pre_emb["zero"]
    if len(arr)>0:
        idx = vocab_processor.vocabulary_.get(w)
        initW[idx]=np.asarray(arr).astype(np.float32)




Loading training data from train_snli.txt
Building vocabulary
Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.
Length of loaded vocabulary =31337
dumping validation 0
Train/Dev split for train_snli.txt: 330635/36738
Loading W2V data...
loaded word2vec len  111051
initializing initW with pre-trained word2vec embeddings


In [3]:
# Training
# ==================================================

# Create the session and build the model inside it.
session_conf = tf.ConfigProto(
  allow_soft_placement=allow_soft_placement,
  log_device_placement=log_device_placement)
sess = tf.Session(config=session_conf)
print("started session")
with sess.as_default():
    if is_char_based:
        siameseModel = SiameseLSTM(
            sequence_length=max_document_length,
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=embedding_dim,
            hidden_units=hidden_units,
            l2_reg_lambda=l2_reg_lambda,
            batch_size=batch_size
        )
    else:
        siameseModel = SiameseLSTMw2v(
            sequence_length=max_document_length,
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=embedding_dim,
            hidden_units=hidden_units,
            l2_reg_lambda=l2_reg_lambda,
            batch_size=batch_size,
            trainableEmbeddings=trainableEmbeddings
        )
    # Define Training procedure
    global_step = tf.Variable(0, name="global_step", trainable=False)
    optimizer = tf.train.AdamOptimizer(1e-3)
    print("initialized siameseModel object")

grads_and_vars=optimizer.compute_gradients(siameseModel.loss)
tr_op_set = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
print("defined training_ops")


# Output directory for models and summaries
timestamp = str(int(time.time()))
out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
print("Writing to {}\n".format(out_dir))

#Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
checkpoint_prefix = os.path.join(checkpoint_dir, "model")
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)
saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)

# Initialize all variables

sess.run(tf.global_variables_initializer())

print("init all variables")
graph_def = tf.get_default_graph().as_graph_def()
graphpb_txt = str(graph_def)


# Load the embedding matrix that the configuration cell already built
# (`initW`, including its fallback for numeric tokens). It used to be rebuilt
# here from scratch with a copy of that loop that lacked the numeric fallback,
# which threw the first matrix away.
print("Done assigning intiW. len="+str(len(initW)))
# The pre-trained vectors are no longer needed once initW exists.
inpH.deletePreEmb()
gc.collect()
sess.run(siameseModel.W.assign(initW))

def train_step(x1_batch, x2_batch, y_batch):
    """
    Run one optimisation step on a batch and print its loss and accuracy.

    With probability one half the two inputs are fed in swapped order.
    """
    # Same coin flip as before: > 0.5 keeps the order, otherwise swap sides.
    if random()>0.5:
        left, right = x1_batch, x2_batch
    else:
        left, right = x2_batch, x1_batch
    feed = {
        siameseModel.input_x1: left,
        siameseModel.input_x2: right,
        siameseModel.input_y: y_batch,
        siameseModel.dropout_keep_prob: dropout_keep_prob,
    }
    fetches = [
        tr_op_set,
        global_step,
        siameseModel.loss,
        siameseModel.accuracy,
        siameseModel.distance,
        siameseModel.temp_sim,
    ]
    results = sess.run(fetches, feed)
    # Only the step, loss and accuracy are reported.
    step, loss, accuracy = results[1], results[2], results[3]
    stamp = datetime.datetime.now().isoformat()
    print("TRAIN {}: step {}, loss {:g}, acc {:g}".format(stamp, step, loss, accuracy))

def dev_step(x1_batch, x2_batch, y_batch):
    """
    Evaluate the model on one dev batch (no parameter update).

    With probability one half the two inputs are fed in swapped order.
    Dropout is disabled (keep probability 1.0) on both paths; previously one
    of the two random branches evaluated with a keep probability of 0.9,
    which made the reported dev accuracy noisy.

    Returns:
        The accuracy of the model on this batch.
    """
    if random()>0.5:
        first, second = x1_batch, x2_batch
    else:
        first, second = x2_batch, x1_batch
    feed_dict = {
        siameseModel.input_x1: first,
        siameseModel.input_x2: second,
        siameseModel.input_y: y_batch,
        siameseModel.dropout_keep_prob: 1.0,
    }
    step, loss, accuracy, sim= sess.run([global_step, siameseModel.loss, siameseModel.accuracy, siameseModel.temp_sim],  feed_dict)
    time_str = datetime.datetime.now().isoformat()
    print("DEV {}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
    return accuracy

# Generate batches

started session
embeded shape is (?, 100, 300)
output shape is (?, 50)
output shape is (?, 50)
Instructions for updating:
keep_dims is deprecated, use keepdims instead
initialized siameseModel object
defined training_ops
Writing to C:\Users\pc\deep-siamese-text-similarity-master\runs\1532596722

init all variables
initializing initW with pre-trained word2vec embeddings
Done assigning intiW. len=31337


In [None]:
# Training loop: one optimisation step per batch, periodic evaluation on the
# dev set and checkpointing.
batches=inpH.batch_iter(
            list(zip(train_set[0], train_set[1], train_set[2])), batch_size, num_epochs)

max_validation_acc=0.0
# zip() stops cleanly if the batch generator yields fewer batches than
# expected, instead of raising StopIteration from an explicit __next__() call.
for nn, batch in zip(range(sum_no_of_batches*num_epochs), batches):
    if len(batch)<1:
        continue
    x1_batch,x2_batch, y_batch = zip(*batch)

    if len(y_batch)<1:
        continue
    train_step(x1_batch, x2_batch, y_batch)
    current_step = tf.train.global_step(sess, global_step)
    # Sum of the per-batch dev accuracies of this step's evaluation (if any).
    sum_acc=0.0
    if current_step % evaluate_every == 0:
        print("\nEvaluation:")
        dev_batches = inpH.batch_iter(list(zip(dev_set[0],dev_set[1],dev_set[2])), batch_size, 1)
        for db in dev_batches:
            if len(db)<1:
                continue
            x1_dev_b,x2_dev_b,y_dev_b = zip(*db)
            if len(y_dev_b)<1:
                continue
            acc = dev_step(x1_dev_b, x2_dev_b, y_dev_b)
            sum_acc = sum_acc + acc
        print("")
    if current_step % checkpoint_every == 0:
        # sum_acc is reset on every step, so this comparison is only
        # meaningful on steps that also ran the evaluation above.
        if sum_acc >= max_validation_acc:
            max_validation_acc = sum_acc
            saver.save(sess, checkpoint_prefix, global_step=current_step)
            tf.train.write_graph(sess.graph.as_graph_def(), checkpoint_prefix, "graph"+str(nn)+".pb", as_text=False)
            print("Saved model {} with sum_accuracy={} checkpoint to {}\n".format(nn, max_validation_acc, checkpoint_prefix))



[[array([   1,    2,    3, 1533,    9,    1, 1535,  318,   41,    2, 2963,
        145,  528,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0], dtype=int64)
  array([   28,     2,     3, 22763,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     

TRAIN 2018-07-26T17:19:48.846366: step 1, loss 0.139178, acc 0.546875
TRAIN 2018-07-26T17:19:49.642676: step 2, loss 0.12708, acc 0.484375
TRAIN 2018-07-26T17:19:49.861427: step 3, loss 0.122953, acc 0.578125
TRAIN 2018-07-26T17:19:50.085293: step 4, loss 0.13272, acc 0.515625
TRAIN 2018-07-26T17:19:50.286599: step 5, loss 0.12469, acc 0.546875
TRAIN 2018-07-26T17:19:50.497568: step 6, loss 0.122795, acc 0.578125
TRAIN 2018-07-26T17:19:50.698529: step 7, loss 0.124881, acc 0.515625
TRAIN 2018-07-26T17:19:50.890072: step 8, loss 0.124837, acc 0.53125
TRAIN 2018-07-26T17:19:51.132081: step 9, loss 0.124921, acc 0.515625
TRAIN 2018-07-26T17:19:51.359791: step 10, loss 0.12411, acc 0.578125
TRAIN 2018-07-26T17:19:51.587254: step 11, loss 0.125158, acc 0.515625
TRAIN 2018-07-26T17:19:51.773696: step 12, loss 0.123935, acc 0.546875
TRAIN 2018-07-26T17:19:51.972753: step 13, loss 0.126123, acc 0.515625
TRAIN 2018-07-26T17:19:52.176720: step 14, loss 0.119639, acc 0.625
TRAIN 2018-07-26T17:19:

TRAIN 2018-07-26T17:20:14.448832: step 119, loss 0.125, acc 0.5
TRAIN 2018-07-26T17:20:14.681893: step 120, loss 0.124898, acc 0.53125
TRAIN 2018-07-26T17:20:14.946961: step 121, loss 0.125028, acc 0.5
TRAIN 2018-07-26T17:20:15.158733: step 122, loss 0.126392, acc 0.375
TRAIN 2018-07-26T17:20:15.401795: step 123, loss 0.124247, acc 0.609375
TRAIN 2018-07-26T17:20:15.656862: step 124, loss 0.125311, acc 0.46875
TRAIN 2018-07-26T17:20:15.929935: step 125, loss 0.124906, acc 0.515625
TRAIN 2018-07-26T17:20:16.162961: step 126, loss 0.124907, acc 0.515625
TRAIN 2018-07-26T17:20:16.412892: step 127, loss 0.125543, acc 0.4375
TRAIN 2018-07-26T17:20:16.603894: step 128, loss 0.125293, acc 0.4375
TRAIN 2018-07-26T17:20:16.805612: step 129, loss 0.125044, acc 0.484375
TRAIN 2018-07-26T17:20:16.988150: step 130, loss 0.125033, acc 0.5
TRAIN 2018-07-26T17:20:17.189808: step 131, loss 0.125082, acc 0.5
TRAIN 2018-07-26T17:20:17.392745: step 132, loss 0.124372, acc 0.546875
TRAIN 2018-07-26T17:20:1

TRAIN 2018-07-26T17:20:38.761051: step 234, loss 0.125128, acc 0.484375
TRAIN 2018-07-26T17:20:39.029704: step 235, loss 0.124709, acc 0.53125
TRAIN 2018-07-26T17:20:39.315779: step 236, loss 0.123378, acc 0.609375
TRAIN 2018-07-26T17:20:39.519860: step 237, loss 0.126457, acc 0.453125
TRAIN 2018-07-26T17:20:39.713877: step 238, loss 0.124968, acc 0.515625
TRAIN 2018-07-26T17:20:39.914335: step 239, loss 0.126039, acc 0.484375
TRAIN 2018-07-26T17:20:40.117273: step 240, loss 0.126084, acc 0.484375
TRAIN 2018-07-26T17:20:40.328538: step 241, loss 0.125525, acc 0.5
TRAIN 2018-07-26T17:20:40.540045: step 242, loss 0.123098, acc 0.578125
TRAIN 2018-07-26T17:20:40.727205: step 243, loss 0.124512, acc 0.53125
TRAIN 2018-07-26T17:20:41.023562: step 244, loss 0.124982, acc 0.515625
TRAIN 2018-07-26T17:20:41.286631: step 245, loss 0.12359, acc 0.5625
TRAIN 2018-07-26T17:20:41.546005: step 246, loss 0.125931, acc 0.484375
TRAIN 2018-07-26T17:20:41.771611: step 247, loss 0.12314, acc 0.578125
TRA

TRAIN 2018-07-26T17:21:03.322725: step 349, loss 0.12475, acc 0.5625
TRAIN 2018-07-26T17:21:03.521018: step 350, loss 0.123783, acc 0.625
TRAIN 2018-07-26T17:21:03.731630: step 351, loss 0.126369, acc 0.4375
TRAIN 2018-07-26T17:21:03.910836: step 352, loss 0.124536, acc 0.53125
TRAIN 2018-07-26T17:21:04.142689: step 353, loss 0.125873, acc 0.484375
TRAIN 2018-07-26T17:21:04.372152: step 354, loss 0.122523, acc 0.59375
TRAIN 2018-07-26T17:21:04.583461: step 355, loss 0.125087, acc 0.515625
TRAIN 2018-07-26T17:21:04.793500: step 356, loss 0.122703, acc 0.578125
TRAIN 2018-07-26T17:21:05.000843: step 357, loss 0.125257, acc 0.515625
TRAIN 2018-07-26T17:21:05.196645: step 358, loss 0.12247, acc 0.578125
TRAIN 2018-07-26T17:21:05.404411: step 359, loss 0.124671, acc 0.53125
TRAIN 2018-07-26T17:21:05.608954: step 360, loss 0.129541, acc 0.4375
TRAIN 2018-07-26T17:21:05.822945: step 361, loss 0.127839, acc 0.46875
TRAIN 2018-07-26T17:21:06.026795: step 362, loss 0.122433, acc 0.578125
TRAIN 2