In [1]:
import pandas as pd
import numpy as np

In [2]:
import tensorflow as tf

# import tensorflow.compat.v1 as tf
from tensorflow.python.framework import constant_op
import sys
import json
import time
import random
from itertools import chain
import os
import math
#from model import LSTMDSSM, _START_VOCAB
import csv

# Seed Python's `random` module, which the training cell uses to shuffle the data each epoch.
# NOTE(review): NumPy's RNG is not seeded anywhere in the visible code, yet build_vocab
# draws the initial embeddings with np.random.normal - runs are presumably not fully reproducible.
random.seed(1229)
# Dummy flag named 'f'. Presumably it absorbs the `-f <connection file>` argument that
# Jupyter passes to the kernel so that flag parsing does not fail - TODO confirm.
# NOTE(review): re-running this cell in a live kernel will likely raise a duplicate-flag
# error because the flags are already registered - confirm.
tf.flags.DEFINE_string('f', '', 'kernel')
tf.flags.DEFINE_boolean("is_train", True, "Set to False to inference.")
tf.flags.DEFINE_boolean("read_graph", False, "Set to False to build graph.")
# `symbols` is only a default: build_vocab overwrites FLAGS.symbols with the real vocabulary size.
tf.flags.DEFINE_integer("symbols", 400000, "vocabulary size.")
tf.flags.DEFINE_integer("epoch", 25, "Number of epoch.")
tf.flags.DEFINE_integer("embed_units", 300, "Size of word embedding.")
tf.flags.DEFINE_integer("units", 512, "Size of each model layer.")
tf.flags.DEFINE_integer("batch_size", 20, "Batch size to use during training.")
tf.flags.DEFINE_string("data_dir", "./data", "Data directory")
# Checkpoints and the summary log directory are read from / written to train_dir.
tf.flags.DEFINE_string("train_dir", "./data", "Training directory.")
tf.flags.DEFINE_boolean("log_parameters", True, "Set to True to show the parameters")
tf.flags.DEFINE_string("time_log_path", 'time_log.txt', "record training time")
# Number of negative documents per query; the docs are laid out in groups of neg_num + 1.
tf.flags.DEFINE_integer("neg_num", 1, "negative sample number")

# Shared handle through which the rest of the notebook reads (and build_vocab writes) flag values.
FLAGS = tf.flags.FLAGS



In [3]:
import tensorflow as tf


class SimpleLSTMCell(tf.contrib.rnn.RNNCell):
    """
    LSTM cell without a forget gate: the previous cell state is always carried
    over unchanged (equivalent to a forget gate fixed at 1), according to the paper.

    The state is the pair ``(c, h)``; the per-step output is ``h``.
    """

    def __init__(self, num_units, forget_bias=1.0, activation=tf.tanh, reuse=None):
        self._num_units = num_units
        # Stored for interface compatibility; nothing in this class reads it.
        self._forget_bias = forget_bias
        self._activation = activation
        self._reuse = reuse

    @property
    def state_size(self):
        # One entry for c and one for h, both num_units wide.
        return (self._num_units, self._num_units)

    @property
    def output_size(self):
        return self._num_units

    def __call__(self, inputs, state, scope=None):
        """Advance the cell by one step and return ``(new_h, (new_c, new_h))``."""
        with tf.variable_scope(scope or "simple_lstm_cell", reuse=self._reuse):
            c, h = state

            # The weights are created on the first call only and cached on the
            # instance, so calling the same cell again reuses them.
            if not hasattr(self, '_wi'):
                kernel_shape = [inputs.get_shape()[-1] + h.get_shape()[-1], self._num_units]
                bias_shape = [self._num_units]
                self._wi = tf.get_variable('simple_lstm_cell_wi', dtype=tf.float32,
                                           shape=kernel_shape,
                                           initializer=tf.orthogonal_initializer())
                self._bi = tf.get_variable('simple_lstm_cell_bi', dtype=tf.float32,
                                           shape=bias_shape,
                                           initializer=tf.constant_initializer(0.0))
                self._wo = tf.get_variable('simple_lstm_cell_wo', dtype=tf.float32,
                                           shape=kernel_shape,
                                           initializer=tf.orthogonal_initializer())
                self._bo = tf.get_variable('simple_lstm_cell_bo', dtype=tf.float32,
                                           shape=bias_shape,
                                           initializer=tf.constant_initializer(0.0))
                self._wc = tf.get_variable('simple_lstm_cell_wc', dtype=tf.float32,
                                           shape=kernel_shape,
                                           initializer=tf.orthogonal_initializer())
                self._bc = tf.get_variable('simple_lstm_cell_bc', dtype=tf.float32,
                                           shape=bias_shape,
                                           initializer=tf.constant_initializer(0.0))

            # Every gate sees the same [inputs, h] concatenation.
            gate_inputs = tf.concat([inputs, h], 1)
            input_gate = tf.nn.sigmoid(tf.matmul(gate_inputs, self._wi) + self._bi)
            output_gate = tf.nn.sigmoid(tf.matmul(gate_inputs, self._wo) + self._bo)
            candidate = self._activation(tf.matmul(gate_inputs, self._wc) + self._bc)

            # remove forget gate according to the paper
            new_c = c + input_gate * candidate
            new_h = output_gate * self._activation(new_c)

            return new_h, (new_c, new_h)


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [4]:
import tensorflow as tf

from tensorflow.python.ops.nn import dynamic_rnn
from tensorflow.contrib.lookup.lookup_ops import MutableHashTable
import math

# Ids of the two special tokens. They equal the positions of '_PAD' and '_UNK'
# in _START_VOCAB, which build_vocab places at the front of the vocabulary.
PAD_ID = 0
UNK_ID = 1  # also the default_value returned by LSTMDSSM.word2index for unknown keys
_START_VOCAB = ['_PAD', '_UNK']  # '_PAD' is the filler token appended by gen_batch_data


class LSTMDSSM(object):
    """
    The LSTM-DSSM model, following the paper: Deep Sentence Embedding Using Long Short-Term Memory Networks: Analysis and Application to Information Retrieval.
    Paper available at: https://arxiv.org/abs/1502.06922

    The constructor builds the whole graph: string placeholders, a word -> id
    hash table, a shared embedding matrix, one SimpleLSTMCell for queries and one
    for documents, cosine similarities, a softmax over the documents of each
    query, the loss, the optimizer update op and a Saver.
    """

    def __init__(self,
                 num_lstm_units,
                 embed,
                 neg_num=1,
                 gradient_clip_threshold=5.0):
        """Build the graph.

        Args:
            num_lstm_units: number of units passed to both SimpleLSTMCell instances.
            embed: initial value of the 'embed' variable (presumably a float32
                array of shape [vocab_size, embed_dim] - confirm against build_vocab).
            neg_num: the doc placeholders have a leading dimension of neg_num + 1.
            gradient_clip_threshold: global-norm threshold handed to tf.clip_by_global_norm.
        """

        self.queries = tf.placeholder(dtype=tf.string, shape=[None, None])  # shape: batch*len
        self.queries_length = tf.placeholder(dtype=tf.int32, shape=[None])  # shape: batch
        self.docs = tf.placeholder(dtype=tf.string, shape=[neg_num + 1, None, None])  # shape: (neg_num + 1)*batch*len
        self.docs_length = tf.placeholder(dtype=tf.int32, shape=[neg_num + 1, None])  # shape: (neg_num + 1)*batch

        # Token string -> int64 id; keys that were never inserted map to UNK_ID.
        # checkpoint=True asks for the table content to be saved with the model.
        self.word2index = MutableHashTable(
            key_dtype=tf.string,
            value_dtype=tf.int64,
            default_value=UNK_ID,
            #shared_name="in_table",
            name="in_table",
            checkpoint=True
        )

        # Non-trainable bookkeeping variables. They are unnamed, so their graph
        # names depend on the order in which they are created here.
        self.learning_rate = tf.Variable(0.003, trainable=False, dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)  # run once per finished epoch
        self.momentum = tf.Variable(0.9, trainable=False, dtype=tf.float32)

        self.index_queries = self.word2index.lookup(self.queries)  # batch*len
        self.index_docs = [self.word2index.lookup(doc) for doc in tf.unstack(self.docs)]  # neg_num + 1 tensors, each batch*len

        # A single embedding matrix is shared by queries and documents.
        self.embed = tf.get_variable('embed', dtype=tf.float32, initializer=embed)
        self.embed_queries = tf.nn.embedding_lookup(self.embed, self.index_queries)
        

        self.embed_docs = [tf.nn.embedding_lookup(self.embed, index_doc) for index_doc in self.index_docs]



        with tf.variable_scope('query_lstm'):
            self.cell_q = SimpleLSTMCell(num_lstm_units)
        with tf.variable_scope('doc_lstm'):
            self.cell_d = SimpleLSTMCell(num_lstm_units)

        # dynamic_rnn returns (outputs, final_state); SimpleLSTMCell's state is
        # (c, h), so [1][1] selects the final h, used as the sentence vector.
        self.states_q = dynamic_rnn(self.cell_q, self.embed_queries, self.queries_length, dtype=tf.float32,
                                         scope="simple_lstm_cell_query")[1][1]  # shape: batch*num_units
        # The same cell_d instance is applied to every document slot.
        self.states_d = [dynamic_rnn(self.cell_d, self.embed_docs[i], self.docs_length[i], dtype=tf.float32,
                                            scope="simple_lstm_cell_doc")[1][1] for i in range(neg_num + 1)]  # shape: (neg_num + 1)*batch*num_units
        # L2 norms and dot products, combined below into cosine similarities.
        self.queries_norm = tf.sqrt(tf.reduce_sum(tf.square(self.states_q), axis=1))
        self.docs_norm = [tf.sqrt(tf.reduce_sum(tf.square(self.states_d[i]), axis=1)) for i in range(neg_num + 1)]
        self.prods = [tf.reduce_sum(tf.multiply(self.states_q, self.states_d[i]), axis=1) for i in range(neg_num + 1)]
        
        # NOTE(review): nothing guards the denominator; an all-zero state vector
        # (presumably what a length-0 sequence produces) would give 0/0 = NaN - confirm.
        self.sims = [(self.prods[i] / (self.queries_norm * self.docs_norm[i])) for i in range(neg_num + 1)]  # shape: (neg_num + 1)*batch
        self.sims = tf.convert_to_tensor(self.sims)
        self.gamma = tf.Variable(initial_value=1.0, expected_shape=[], dtype=tf.float32)  # scaling factor according to the paper
        self.origin_sims = self.sims  # unscaled cosine similarities
        self.sims = self.sims * self.gamma
        # Softmax across the neg_num + 1 documents of each query.
        # NOTE(review): `dim` is presumably the deprecated spelling of `axis` in TF 1.x - confirm before upgrading.
        self.prob = tf.nn.softmax(self.sims, dim=0)  # shape: (neg_num + 1)*batch
        # Probability assigned to document slot 0 (presumably the positive
        # document - verify against how callers order the docs).
        self.hit_prob = tf.transpose(self.prob[0])

        # Mean negative log-probability of document slot 0.
        self.loss = -tf.reduce_mean(tf.log(self.hit_prob))

        self.params = tf.trainable_variables()
        opt = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=self.momentum, use_nesterov=True)  # use Nesterov's method, according to the paper
        gradients = tf.gradients(self.loss, self.params)
  
        # Rescale the gradients so their global norm does not exceed the threshold.
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(gradients, gradient_clip_threshold)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params), global_step=self.global_step)
        #self.update = None
        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=3, pad_step_number=True, keep_checkpoint_every_n_hours=1.0)

    def print_parameters(self):
        """Print the name and static shape of every trainable variable in self.params."""
        for item in self.params:
            print('%s: %s' % (item.name, item.get_shape()))

    def train_step(self, session, queries, docs):
        """Run one optimisation step on a batch.

        Args:
            session: the tf.Session to run in.
            queries: dict with 'texts' and 'texts_length', fed to the query placeholders.
            docs: dict with 'texts' and 'texts_length', fed to the doc placeholders
                (leading dimension neg_num + 1).

        Returns:
            The list returned by session.run for output_feed, in this order:
            loss, the result of the update op, states_q, states_d, queries_norm,
            docs_norm, prods, sims, prob, hit_prob.
        """
        input_feed = {self.queries: queries['texts'],
                      self.queries_length: queries['texts_length'],
                      self.docs: docs['texts'],
                      self.docs_length: docs['texts_length']}

        # self.update is fetched so that the parameters are actually updated.
        output_feed = [self.loss, self.update, self.states_q, self.states_d, self.queries_norm, self.docs_norm, self.prods, self.sims, self.prob, self.hit_prob]
        #output_feed = [self.loss, self.states_q, self.states_d, self.queries_norm, self.docs_norm, self.prods, self.sims, self.prob, self.hit_prob,self.embed_queries,self.embed, self.index_queries]
        #gradients = tf.gradients(self.loss, self.params)
  
        return session.run(output_feed, input_feed)
    

    def test_step(self, session, queries, docs):
        """Compute query embeddings for a batch without updating any parameter.

        Args:
            session: the tf.Session to run in.
            queries: dict with 'texts' and 'texts_length'.
            docs: dict with 'texts' and 'texts_length'; it is fed, although none
                of the fetched tensors is computed from the doc placeholders.

        Returns:
            [embed_queries, embed, states_q]: the embedded query tokens, the
            embedding matrix, and the final query LSTM h state.
        """
        input_feed = {self.queries: queries['texts'],
                      self.queries_length: queries['texts_length'],
                      self.docs: docs['texts'],
                      self.docs_length: docs['texts_length']}
                      
        output_feed = [self.embed_queries, self.embed, self.states_q]
        return session.run(output_feed, input_feed)
   




In [5]:

def load_data(path, fname):
    """Read ``path/fname`` and return one list of whitespace-separated tokens per line.

    Blank lines are kept and yield an empty list.
    """
    print('Creating dataset...')
    with open('%s/%s' % (path, fname)) as f:
        return [raw_line.strip('\n').split() for raw_line in f]


def build_vocab(path, data):
    """Build the vocabulary and its initial embedding matrix.

    Args:
        path: directory that contains the pretrained vector file ``vector.txt``
            (each line: a word followed by its vector components).
        data: iterable of token lists; every non-empty token becomes a
            vocabulary entry.

    Returns:
        ``(vocab_list, embed)``. ``vocab_list`` is ``_START_VOCAB`` followed by
        the distinct tokens (in set iteration order). ``embed`` is a float32
        array of shape ``[len(vocab_list), FLAGS.embed_units]``: rows start as
        random normal values and are overwritten with the pretrained vector for
        every vocabulary word found in ``vector.txt``.

    Side effects:
        Sets ``FLAGS.symbols`` to the vocabulary size.
    """
    print("Creating vocabulary...")
    words = set()
    for line in data:
        for word in line:
            if len(word) == 0:
                continue
            words.add(word)
    vocab_list = _START_VOCAB + list(words)
    FLAGS.symbols = len(vocab_list)

    # word -> row lookup, so each vector line costs O(1) instead of a scan of
    # the whole vocabulary. setdefault keeps the first position of a word, which
    # is what list.index returned.
    word_to_row = {}
    for row, word in enumerate(vocab_list):
        word_to_row.setdefault(word, row)

    print("Loading word vectors...")
    embed = np.random.normal(0.0, np.sqrt(1. / (FLAGS.embed_units)), [len(vocab_list), FLAGS.embed_units])
    # Read the vectors from the directory the caller passed in rather than a
    # hard-coded './data' (identical for the default data_dir).
    with open(os.path.join(path, 'vector.txt')) as fp:
        for line in fp:
            info = line.split()
            if not info:
                # blank line: nothing to parse
                continue
            row = word_to_row.get(info[0])
            if row is None:
                # word not used by this dataset
                continue
            embed[row] = [float(num) for num in info[1:]]
    embed = np.array(embed, dtype=np.float32)
    return vocab_list, embed


def gen_batch_data(data):
    """Pad every sentence in ``data`` to the length of the longest one.

    Returns a dict with:
        'texts': array of the sentences, right-padded with '_PAD' tokens;
        'texts_length': int32 array of the original (unpadded) lengths.
    """
    longest = max(len(sentence) for sentence in data)

    # Build (padded sentence, original length) pairs in a single pass.
    padded_pairs = [(sentence + ['_PAD'] * (longest - len(sentence)), len(sentence))
                    for sentence in data]
    padded_texts = [padded for padded, _ in padded_pairs]
    original_lengths = [size for _, size in padded_pairs]

    return {'texts': np.array(padded_texts),
            'texts_length': np.array(original_lengths, dtype=np.int32)}


def train(model, sess, queries, docs):
    """Run one training epoch and return the mean loss over its batches.

    ``docs`` is read in groups of ``FLAGS.neg_num + 1`` consecutive entries per
    query, in the same order as ``queries``. After the last batch the model's
    epoch counter is incremented.
    """
    docs_per_query = FLAGS.neg_num + 1
    num_queries = len(queries)
    begin, end = 0, 0  # half-open interval of the current batch
    total_loss, num_batches = .0, 0

    while end < num_queries:
        begin, end = end, min(end + FLAGS.batch_size, num_queries)
        batch_queries = gen_batch_data(queries[begin:end])
        batch_docs = gen_batch_data(docs[begin * docs_per_query:end * docs_per_query])

        # De-interleave the flat doc batch: slot k collects the k-th document
        # of every query in the batch.
        flat_texts = batch_docs['texts']
        flat_lengths = batch_docs['texts_length']
        batch_docs['texts'] = [flat_texts[k::docs_per_query] for k in range(docs_per_query)]
        batch_docs['texts_length'] = [flat_lengths[k::docs_per_query] for k in range(docs_per_query)]

        batch_loss = model.train_step(sess, batch_queries, batch_docs)[0]
        num_batches += 1

        # debug
        if math.isnan(batch_loss):
            print('nan detected. ')
        if math.isinf(batch_loss):
            print('inf detected. ')

        total_loss += batch_loss

    sess.run([model.epoch_add_op])

    return total_loss/num_batches






In [6]:
def test(model, sess, queries, docs):
    """Run ``model.test_step`` over ``queries`` batch by batch.

    ``docs`` is read in groups of ``FLAGS.neg_num + 1`` consecutive entries per
    query, in the same order as ``queries``.

    Returns:
        ``(embed_queries_list, embed, state_list)``:
        embed_queries_list - the first value returned by each batch;
        embed - the second value returned by the last batch, or ``None`` when
        ``queries`` is empty (the original raised on an undefined ``outputs``);
        state_list - the third value returned by each batch.
    """
    docs_per_query = FLAGS.neg_num + 1
    lq = len(queries)
    embed_queries_list = []
    state_list = []
    embed = None  # stays None if the loop below never runs
    st, ed = 0, 0  # half-open interval of the current batch

    while ed < lq:
        st, ed = ed, min(ed + FLAGS.batch_size, lq)
        batch_queries = gen_batch_data(queries[st:ed])
        batch_docs = gen_batch_data(docs[st * docs_per_query:ed * docs_per_query])

        # De-interleave the flat doc batch: slot i collects the i-th document
        # of every query in the batch.
        flat_texts = batch_docs['texts']
        flat_lengths = batch_docs['texts_length']
        batch_docs['texts'] = [flat_texts[i::docs_per_query] for i in range(docs_per_query)]
        batch_docs['texts_length'] = [flat_lengths[i::docs_per_query] for i in range(docs_per_query)]

        outputs = model.test_step(sess, batch_queries, batch_docs)
        embed_queries_list.append(outputs[0])
        embed = outputs[1]
        state_list.append(outputs[2])

    return embed_queries_list, embed, state_list

In [7]:

# Train the LSTM-DSSM model (or restore it from FLAGS.train_dir), then run the
# tweets in nem_words_for_vec.txt through the query LSTM.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
    if FLAGS.is_train:
        print(FLAGS.__flags)
        docs_per_query = FLAGS.neg_num + 1

        data_queries = load_data(FLAGS.data_dir, 'nem_queries.txt')
        data_docs = load_data(FLAGS.data_dir, 'nem_docs.txt')
        vocab, embed = build_vocab(FLAGS.data_dir, data_queries + data_docs)

        # tweets data
        tweets = load_data(FLAGS.data_dir, 'nem_words_for_vec.txt')
        # test() expects docs_per_query documents per query, so each tweet is
        # repeated that many times. A plain comprehension is used instead of
        # np.repeat, which flattens its input and so would split the token lists
        # whenever every line has the same number of tokens.
        docs = [tweet for tweet in tweets for _copy in range(docs_per_query)]

        model = LSTMDSSM(
            FLAGS.units,
            embed,
            FLAGS.neg_num)
        if FLAGS.log_parameters:
            model.print_parameters()

        if tf.train.get_checkpoint_state(FLAGS.train_dir):
            print("Reading model parameters from %s" % FLAGS.train_dir)
            model.saver.restore(sess, tf.train.latest_checkpoint(FLAGS.train_dir))
        else:
            print("Created model with fresh parameters.")
            tf.global_variables_initializer().run()
            # Fill the word -> id table: each word's id is its position in vocab.
            op_in = model.word2index.insert(constant_op.constant(vocab),
                                              constant_op.constant(list(range(FLAGS.symbols)), dtype=tf.int64))
            sess.run(op_in)

        summary_writer = tf.summary.FileWriter('%s/log' % FLAGS.train_dir, sess.graph)
        pre_losses = [1e18] * 3  # losses of the last three epochs
        best_val_loss = 100  # lowest training loss seen so far

        while model.epoch.eval() < FLAGS.epoch:
            epoch = model.epoch.eval()

            if len(data_docs) != len(data_queries) * docs_per_query:
                raise ValueError("expected %d docs per query, got %d docs for %d queries"
                                 % (docs_per_query, len(data_docs), len(data_queries)))

            # Shuffle the queries and move each query's group of documents with
            # it. list(...) is required: random.shuffle cannot shuffle a Python 3
            # range object. Plain slicing replaces np.reshape, which mis-shapes
            # (or rejects) lists of token lists.
            random_idxs = list(range(len(data_queries)))
            random.shuffle(random_idxs)
            data_queries = [data_queries[i] for i in random_idxs]
            shuffled_docs = []
            for i in random_idxs:
                shuffled_docs.extend(data_docs[i * docs_per_query:(i + 1) * docs_per_query])
            data_docs = shuffled_docs

            loss = train(model, sess, data_queries, data_docs)

            # Log every epoch (previously only improving epochs were written,
            # leaving gaps in the curves).
            summary = tf.Summary()
            summary.value.add(tag='loss/train', simple_value=loss)
            cur_lr = model.learning_rate.eval()
            summary.value.add(tag='lr/train', simple_value=cur_lr)
            summary_writer.add_summary(summary, epoch)

            # Checkpoint only when the epoch loss improves.
            if loss < best_val_loss:
                best_val_loss = loss
                best_epoch = epoch
                print("best epoch is: %d" % best_epoch)
                model.saver.save(sess, '%s/checkpoint' % FLAGS.train_dir, global_step=model.global_step)
            print("epoch %d learning rate %.10f loss %.8f " % (
                epoch, cur_lr, loss))

            # Halve the learning rate when the loss is worse than each of the
            # previous three epochs.
            if loss > max(pre_losses):
                op = tf.assign(model.learning_rate, cur_lr * 0.5)
                sess.run(op)
            pre_losses = pre_losses[1:] + [loss]

        # Flush pending summaries and release the event file.
        summary_writer.close()

        embed_queries, embed, state_q = test(model, sess, tweets, docs)
        #print(embed)



#with open(os.path.join('./data', 'word2vec.txt'), 'w') as f:
#        f.writelines(embeding)

{'read_graph': <absl.flags._flag.BooleanFlag object at 0x10cc39d90>, 'showprefixforinfo': <absl.flags._flag.BooleanFlag object at 0x11e8680d0>, 'stderrthreshold': <absl.logging._StderrthresholdFlag object at 0x11e868050>, 'time_log_path': <absl.flags._flag.Flag object at 0x12290dd90>, 'symbols': <absl.flags._flag.Flag object at 0x10cc39710>, 'op_conversion_fallback_to_while_loop': <absl.flags._flag.BooleanFlag object at 0x1208d8650>, 'test_randomize_ordering_seed': <absl.flags._flag.Flag object at 0x1222720d0>, 'neg_num': <absl.flags._flag.Flag object at 0x12290dd10>, 'data_dir': <absl.flags._flag.Flag object at 0x12290dc10>, 'alsologtostderr': <absl.flags._flag.BooleanFlag object at 0x11e861dd0>, 'embed_units': <absl.flags._flag.Flag object at 0x12290dad0>, 'logtostderr': <absl.flags._flag.BooleanFlag object at 0x11e861cd0>, 'epoch': <absl.flags._flag.Flag object at 0x10cc39750>, 'log_dir': <absl.flags._flag.Flag object at 0x11e861e90>, 'test_tmpdir': <absl.flags._flag.Flag object at 

In [8]:
# Flatten the per-batch results into one flat list with an entry per query.
state_list = list(chain.from_iterable(state_q))

In [9]:
# Write the query state vectors to nem_state_q.csv, one CSV row per entry of
# state_list. The file is opened by the `with` statement itself so it is always
# closed, and the handle is no longer called `file`, which shadowed the
# Python 2 builtin of that name.
# NOTE(review): on Python 3 the csv module recommends opening the file with
# newline='' - not added here because Python 2's open() does not accept it.
with open('nem_state_q.csv', 'w+') as csv_file:
    csv.writer(csv_file).writerows(state_list)

In [10]:
"""
with open('state_q.csv') as f:
    reader = csv.reader(f)
    state = list(reader)
"""

"\nwith open('state_q.csv') as f:\n    reader = csv.reader(f)\n    state = list(reader)\n"

In [16]:
#