In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc

import tensorflow as tf
import tensorflow.contrib.seq2seq as seq2seq
from tensorflow.contrib.rnn import LSTMCell, LSTMStateTuple, GRUCell
from tensorflow.python.layers import core as layers_core
import math

In [2]:
class Seq2SeqModel():
    """Sequence-to-sequence model built with the TensorFlow 1.x graph API.

    Construction wires up, in order: input placeholders, source/target
    embedding matrices, a two-level bidirectional LSTM encoder, an LSTM
    decoder (teacher-forced when ``mode == "train"``, greedy or beam-search
    decoding otherwise) and, for non-beam-search training models, a summed
    cross-entropy loss with an Adam update op.

    Every variable scope is opened with ``reuse=tf.AUTO_REUSE``; the intent
    appears to be that several instances built in the same graph share their
    variables (e.g. one training and one inference instance).

    Args:
        mode: ``"train"`` selects the teacher-forced decoder; any other value
            selects the inference decoder.
        src_vocab_size: number of rows of the encoder embedding matrix.
        tgt_vocab_size: number of rows of the decoder embedding matrix and
            width of the output projection.
        embedding_size: number of columns of both embedding matrices.
        batch_size: number of start tokens created for inference decoding.
        learning_rate: learning rate passed to the Adam optimizer.
        beam_search: when True (and ``mode`` is not ``"train"``) decode with
            ``BeamSearchDecoder``; also disables creation of the optimizer.
        beam_width: beam width used by the beam-search decoder.
    """

    def __init__(self, mode, src_vocab_size, tgt_vocab_size, embedding_size, batch_size, learning_rate, beam_search = False,
                 beam_width = 5):
        self.mode = mode
        self.beam_search = beam_search
        self.beam_width = beam_width
        self.learning_rate = learning_rate
        # Graph construction order matters: each step reads attributes set by
        # the previous ones.
        self._init_placeholders()
        self._init_embeddings(src_vocab_size, tgt_vocab_size, embedding_size)
        self._init_bidirectional_encoder()
        self._init_decoder(tgt_vocab_size, batch_size)
        self._init_optimizer(batch_size)

    def foo(self):
        """Return the decoder logits tensor (only set when ``mode == "train"``)."""
        return self.logits

    def _init_debug(self):
        """Replace some inputs with small hard-coded tensors for debugging.

        Not called by ``__init__``. Note that it sets ``encoder_inputs`` and
        ``encoder_input_length``, which the rest of the class does not read
        (the encoder uses ``encoder1_inputs`` / ``encoder2_inputs``).
        """
        # Imported locally so the method does not depend on a later notebook
        # cell having imported numpy first.
        import numpy as np

        self.encoder_inputs = tf.Variable(np.array([[3, 3, 3, 3],[3, 3, 0, 0],[3, 0, 0, 0]]),dtype=np.int32)
        self.encoder_input_length = tf.constant(np.array([4,2,1]),dtype=np.int32)

        self.decoder_inputs = tf.Variable(np.array([[1, 3, 4, 2, 0],[1, 4, 2, 0, 0],[1, 3, 3, 3, 2]]),dtype=np.int32)
        self.decoder_input_length = tf.constant(np.array([4,3,5]),dtype=np.int32)
        self.decoder_outputs = tf.Variable(np.array([[1, 3, 4, 2, 0],[1, 4, 2, 0, 0],[1, 3, 3, 3, 2]]),dtype=np.int32)

    def _init_placeholders(self):
        """Create the feedable inputs of the graph.

        All token placeholders are rank-2 int32 with unknown dimensions; the
        RNN calls below use ``time_major=True``, so the first axis is treated
        as time.
        """
        # NOTE(review): both encoder placeholders request the same op name
        # 'encoder_inputs'; they are always fed through these attributes, not
        # by name, so the names are kept unchanged here.
        self.encoder1_inputs = tf.placeholder(
            shape=(None, None),
            dtype=tf.int32,
            name='encoder_inputs',
        )
        self.encoder2_inputs = tf.placeholder(
            shape=(None, None),
            dtype=tf.int32,
            name='encoder_inputs',
        )
        self.decoder_inputs = tf.placeholder(
            shape=(None, None),
            dtype=tf.int32,
            name='decoder_inputs',
        )
        self.decoder_input_length = tf.placeholder(
            shape=(None,),
            dtype=tf.int32,
            name='decoder_input_length',
        )
        self.decoder_outputs = tf.placeholder(
            shape=(None,None),
            dtype=tf.int32,
            name='decoder_outputs',
        )
        # Boolean switch consumed by the dropout layers in the encoder.
        self.training=tf.placeholder(dtype=tf.bool)

    def _init_embeddings(self, src_vocab_size, tgt_vocab_size, embedding_size):
        """Create encoder/decoder embedding matrices and embed the decoder inputs."""
        with tf.variable_scope("embedding", reuse=tf.AUTO_REUSE) as scope:

            # Uniform(-sqrt(3), sqrt(3)) has variance=1.
            sqrt3 = math.sqrt(3)
            initializer = tf.random_uniform_initializer(-sqrt3, sqrt3)

            self.encoder_embedding_matrix = tf.get_variable(
                name="encoder_embedding_matrix",
                shape=[src_vocab_size, embedding_size],
                initializer=initializer,
                dtype=tf.float32)

            self.decoder_embedding_matrix = tf.get_variable(
                name="decoder_embedding_matrix",
                shape=[tgt_vocab_size, embedding_size],
                initializer=initializer,
                dtype=tf.float32)

            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                self.decoder_embedding_matrix, self.decoder_inputs)

    def _init_bidirectional_encoder(self):
        """Build the two-level bidirectional encoder.

        Level 1 runs a bidirectional LSTM over each of 20 consecutive 10-row
        slices of ``encoder1_inputs``; the concatenated final cell state of
        each slice is joined with the embedding of row ``i`` of
        ``encoder2_inputs`` and passed through dropout. Level 2 runs a second
        bidirectional LSTM over the stack of those 20 vectors; its final state
        becomes ``self.encoder_state`` (the decoder's initial state) and its
        outputs are stored in ``self.encoder3_outputs``.
        """
        with tf.variable_scope("BidirectionalEncoder", reuse=tf.AUTO_REUSE) as scope:

            encoder_cell1 = LSTMCell(512)
            encoder_cell2 = LSTMCell(512)

            self.encoder_states = []

            # The counts 20 (slices) and 10 (rows per slice) are hard-coded;
            # presumably they match the record layout of the input data.
            for i in range(20):
                self.encoder1_inputs_embedded = tf.nn.embedding_lookup(
                    self.encoder_embedding_matrix, self.encoder1_inputs[i*10:(i*10)+10,:])

                self.encoder2_inputs_embedded = tf.nn.embedding_lookup(
                    self.encoder_embedding_matrix, self.encoder2_inputs[i,:])

                # NOTE(review): the same cell object is passed as both the
                # forward and the backward cell - confirm this is intended.
                ((encoder_fw_outputs,
                  encoder_bw_outputs),
                 (encoder_fw_state,
                  encoder_bw_state)) = (
                    tf.nn.bidirectional_dynamic_rnn(cell_fw=encoder_cell1,
                                                    cell_bw=encoder_cell1,
                                                    inputs=self.encoder1_inputs_embedded,
                                                    time_major=True,
                                                    dtype=tf.float32)
                    )

                if isinstance(encoder_fw_state, LSTMStateTuple):

                    encoder_state_c = tf.concat(
                        (encoder_fw_state.c, encoder_bw_state.c), 1, name='bidirectional_concat_c')
                    encoder_state_h = tf.concat(
                        (encoder_fw_state.h, encoder_bw_state.h), 1, name='bidirectional_concat_h')
                    self.encoder_state = LSTMStateTuple(c=encoder_state_c, h=encoder_state_h)

                elif isinstance(encoder_fw_state, tf.Tensor):
                    # Not reached with the LSTM cells above; the code below
                    # this branch also requires an LSTMStateTuple (`.c`).
                    self.encoder_state = tf.concat((encoder_fw_state, encoder_bw_state), 2, name='bidirectional_concat')

                # Slice summary = [fw.c ; bw.c ; field embedding], with dropout
                # active only when the `training` placeholder is fed True.
                self.encoder_states.append(tf.layers.dropout(tf.concat((self.encoder_state.c, self.encoder2_inputs_embedded), 1),
                                                            training=self.training,rate=0.5))

            self.encoder_states = tf.stack(self.encoder_states)
            print(self.encoder_states.get_shape())

        with tf.variable_scope("BidirectionalEncoder1", reuse=tf.AUTO_REUSE) as scope:

            ((encoder_fw_outputs,
              encoder_bw_outputs),
             (encoder_fw_state,
              encoder_bw_state)) = (
                tf.nn.bidirectional_dynamic_rnn(cell_fw=encoder_cell2,
                                                cell_bw=encoder_cell2,
                                                inputs=self.encoder_states,
                                                time_major=True,
                                                dtype=tf.float32)
                )

            if isinstance(encoder_fw_state, LSTMStateTuple):

                encoder_state_c = tf.concat(
                    (encoder_fw_state.c, encoder_bw_state.c), 1, name='bidirectional_concat_c')
                encoder_state_h = tf.concat(
                    (encoder_fw_state.h, encoder_bw_state.h), 1, name='bidirectional_concat_h')
                self.encoder_state = LSTMStateTuple(c=encoder_state_c, h=encoder_state_h)

            elif isinstance(encoder_fw_state, tf.Tensor):
                self.encoder_state = tf.concat((encoder_fw_state, encoder_bw_state), 2, name='bidirectional_concat')
            self.encoder3_outputs = tf.concat((encoder_fw_outputs, encoder_bw_outputs), 2)

    def _init_decoder(self, tgt_vocab_size, batch_size):
        """Build the decoder and expose its predictions as ``self.op``.

        In training mode the decoder is teacher-forced with
        ``decoder_inputs_embedded`` and also exposes ``self.logits``. In any
        other mode it decodes for at most 100 steps, either greedily
        (``sample_id``) or with beam search (``predicted_ids``).
        """
        with tf.variable_scope("Decoder", reuse=tf.AUTO_REUSE) as scope:
            self.output_layer = layers_core.Dense(
                                tgt_vocab_size, use_bias=False)
            # 1024 units = 2 x 512, matching the concatenated fw/bw encoder state.
            decoder_cell = LSTMCell(1024)

            if self.mode == "train":
                helper = tf.contrib.seq2seq.TrainingHelper(
                            self.decoder_inputs_embedded, self.decoder_input_length, time_major=True)

                # Decoder
                decoder = tf.contrib.seq2seq.BasicDecoder(
                    decoder_cell, helper, self.encoder_state, output_layer=self.output_layer)
                # Dynamic decoding
                outputs, _ , _ = tf.contrib.seq2seq.dynamic_decode(decoder)
                self.logits = outputs.rnn_output
                self.op = outputs.sample_id

            else:
                # Start token id 0 and end token id 1 are hard-coded. The
                # helper is only used by the greedy (non-beam) decoder below.
                helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                            self.decoder_embedding_matrix,
                            start_tokens=tf.fill([batch_size], 0),
                            end_token=1)

                # Decoder
                if self.beam_search:
                    # Beam search needs the initial state repeated once per beam.
                    self.encoder_state = tf.contrib.seq2seq.tile_batch(
                        self.encoder_state, multiplier=self.beam_width)

                    # Define a beam-search decoder
                    decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                            cell=decoder_cell,
                            embedding=self.decoder_embedding_matrix,
                            start_tokens=tf.fill([batch_size], 0),
                            end_token=1,
                            initial_state=self.encoder_state,
                            beam_width=self.beam_width,
                            output_layer=self.output_layer,
                            length_penalty_weight=0.0)

                else:
                    decoder = tf.contrib.seq2seq.BasicDecoder(
                        decoder_cell, helper, self.encoder_state,
                        output_layer=self.output_layer)
                # Dynamic decoding
                outputs, _ , _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder, maximum_iterations=100)
                if self.beam_search:
                    self.op = outputs.predicted_ids
                else:
                    self.op = outputs.sample_id

    def _init_optimizer(self,batch_size):
        """Create ``self.loss`` and ``self.train_op`` for training models.

        Nothing is created when beam search is enabled or the mode is not
        ``'train'``. ``batch_size`` is accepted for interface compatibility
        but not used.
        """
        if not self.beam_search and self.mode == 'train':
            with tf.variable_scope("Optimizer", reuse=tf.AUTO_REUSE) as scope:
                crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.decoder_outputs, logits=self.logits)
                # No sequence mask is applied: every position of
                # `decoder_outputs`, padding included, contributes to the sum.
                self.loss = tf.reduce_sum(crossent)
                # Honour the learning rate given to the constructor instead of
                # silently falling back to Adam's default.
                self.train_op = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate).minimize(self.loss)

In [3]:
import numpy as np

def readFile(fileName,word2id):
    """Tokenise a text file on whitespace and extend a vocabulary with its words.

    Args:
        fileName: path of the text file to read.
        word2id: dict mapping word -> integer id; updated in place. Each
            unseen word receives the next id, counting up from the number of
            entries the dict had on entry.

    Returns:
        (content, word2id): ``content`` is a list with one list of tokens per
        line of the file; ``word2id`` is the same dict object that was passed in.
    """
    with open(fileName) as handle:
        tokenized = [raw_line.split() for raw_line in handle]

    for tokens in tokenized:
        for token in tokens:
            if token not in word2id:
                # The dict grows by one per new word, so its current size is
                # exactly the next free id.
                word2id[token] = len(word2id)

    return tokenized, word2id

def sequence_converter(content, word2id):
    """Replace tokens by their ids and return the result as a transposed array.

    Args:
        content: list of token lists; it is modified in place, every token
            being overwritten by ``word2id[token]``.
        word2id: dict mapping token -> integer id.

    Returns:
        ``np.array(content).T``. With equal-length rows this is a 2-D array
        whose first axis indexes the position within a row and whose second
        axis indexes the rows.

    Raises:
        KeyError: if a token is missing from ``word2id``.
    """
    # Convert in place: callers keep using the same `content` object.
    for row in content:
        for position, token in enumerate(row):
            row[position] = word2id[token]

    return np.array(content).T

In [5]:
# Build one vocabulary shared by both encoder inputs: readFile extends the
# dict it is given, so the second call continues numbering after the first.
encoder_vocab = {}

encoder1_input, encoder_vocab = readFile('WeatherGov/train/train.proc',encoder_vocab)

encoder2_input, encoder_vocab = readFile('WeatherGov/train/train.field',encoder_vocab)

# Convert tokens to ids and transpose; the names are rebound from token lists
# to id arrays, so this cell cannot be re-run on its own output.
encoder1_input = sequence_converter(encoder1_input, encoder_vocab)
encoder2_input = sequence_converter(encoder2_input, encoder_vocab)

In [6]:
def d_readFile(fileName):
    """Tokenise a target-side text file and build its vocabulary from scratch.

    The vocabulary always starts with the reserved entries ``'sos'`` -> 0 and
    ``'eos'`` -> 1; every other word gets the next free id in order of first
    appearance.

    Args:
        fileName: path of the text file to read.

    Returns:
        (content, word2id): list of token lists (one per line) and the
        word -> id dict.
    """
    with open(fileName) as handle:
        sentences = [raw_line.split() for raw_line in handle]

    vocab = {'sos': 0, 'eos': 1}
    for sentence in sentences:
        for token in sentence:
            if token not in vocab:
                # Current size == next unused id (ids are assigned densely).
                vocab[token] = len(vocab)

    return sentences, vocab

def d_sequence_converter(content, word2id, decoder_inputs = False, no_pad = False):
    """Convert target token lists to ids, optionally adding 'sos' and 'eos' padding.

    ``content`` is modified in place (tokens inserted/appended, then replaced
    by ids).

    Args:
        content: list of token lists.
        word2id: dict mapping token -> id; must contain 'sos' / 'eos' when
            those are inserted.
        decoder_inputs: when True, 'sos' is inserted at the front of every row.
        no_pad: when False, every row shorter than or equal to the longest
            original row is extended with 'eos' up to (longest + 1) tokens.

    Returns:
        ``(ids, input_lengths)``. ``input_lengths`` is a float array holding
        each row's token count measured before any 'sos'/'eos' was added.
        ``ids`` is ``content`` itself (list of id lists) when ``no_pad`` is
        True, otherwise ``np.array(content).T``.
    """
    # Lengths are taken up front so they reflect the raw rows only.
    row_sizes = [len(row) for row in content]
    input_lengths = np.array(row_sizes, dtype=np.float64)
    longest = max([0] + row_sizes)

    for row in content:
        if decoder_inputs:
            row.insert(0, 'sos')
        if not no_pad:
            # A non-positive repeat count yields an empty list, i.e. no padding.
            row.extend(['eos'] * (longest + 1 - len(row)))

    for row in content:
        for position, token in enumerate(row):
            row[position] = word2id[token]

    if no_pad:
        return content, input_lengths
    return np.array(content).T, input_lengths

# Targets: unpadded id lists plus the per-sentence token counts.
decoder_op, decoder_vocab = d_readFile('WeatherGov/train/summaries.txt')

decoder_target, decoder_target_lengths = d_sequence_converter(decoder_op, decoder_vocab, no_pad = True)

# The converter rewrites its input in place, so the file is read a second
# time to obtain fresh tokens for the 'sos'-prefixed, 'eos'-padded decoder
# inputs. The vocabulary from the first read is reused.
decoder_op, _ = d_readFile('WeatherGov/train/summaries.txt')
decoder_input, _ = d_sequence_converter(decoder_op, decoder_vocab, True)

# Inverse vocabulary (id -> word) used to turn predictions back into text.
# `items()` works on both Python 2 and 3, unlike `iteritems()`.
decoder_id2word = {}
for i,j in decoder_vocab.items():
    decoder_id2word[j] = i

In [7]:
def v_readFile(fileName):
    """Read a text file and return its lines as whitespace-split token lists.

    Unlike ``readFile`` this does not touch any vocabulary.

    Args:
        fileName: path of the text file to read.

    Returns:
        A list with one list of tokens per line of the file.
    """
    with open(fileName) as handle:
        return [raw_line.split() for raw_line in handle]

def v_sequence_converter(content, word2id):
    """Replace tokens by their ids and return the result as a transposed array.

    Args:
        content: list of token lists; it is modified in place, every token
            being overwritten by ``word2id[token]``.
        word2id: dict mapping token -> integer id.

    Returns:
        ``np.array(content).T``.

    Raises:
        KeyError: if a token is missing from ``word2id`` (e.g. a word that
            was never added to the vocabulary).
    """
    # Convert in place: callers keep using the same `content` object.
    for row in content:
        for position, token in enumerate(row):
            row[position] = word2id[token]

    return np.array(content).T

In [8]:
# Load the dev split; no vocabulary is built here.
v_encoder1_input = v_readFile('WeatherGov/dev/dev.proc')

v_encoder2_input = v_readFile('WeatherGov/dev/dev.field')

# Map tokens to ids with the vocabulary built from the training files; a
# token absent from `encoder_vocab` raises KeyError in the converter.
v_encoder1_input = v_sequence_converter(v_encoder1_input, encoder_vocab)
v_encoder2_input = v_sequence_converter(v_encoder2_input, encoder_vocab)

In [8]:
from main import evaluate

def validate_bleu(model1, v_encoder1_input, v_encoder2_input, decoder_input, decoder_target_lengths, epoch):
    """Decode with ``model1``, write the hypotheses to disk and score them.

    Besides its arguments the function reads these notebook globals: ``bs``
    (batch size), ``decoder_target`` (list of target id lists), ``sess``
    (the TensorFlow session) and ``decoder_id2word`` (id -> word dict).

    Steps:
      1. Run ``model1.op`` on examples 0..19 in batches of ``bs``.
      2. Write beam 0 of every prediction to ``op.txt``, one sentence per
         line, stopping at the first id equal to -1 or 1.
      3. Copy at most the first 3528 lines (stripped) to
         ``op_epoch<epoch>.txt``.
      4. Call ``evaluate`` on that file against 'WeatherGov/dev/summaries.txt'.

    Args:
        model1: model whose ``op`` yields a 3-D array indexed as
            ``[sentence, position, beam]``.
        v_encoder1_input, v_encoder2_input: encoder id arrays, sliced by
            column to form batches.
        decoder_input: decoder input id array, sliced by column.
        decoder_target_lengths: per-example target lengths.
        epoch: value embedded in the name of the per-epoch output file.

    Returns:
        Whatever ``evaluate`` returns.
    """
    bi = []
    # NOTE: the upper bound 20 is hard-coded, so only the first 20 examples
    # are decoded regardless of the size of the inputs.
    for b in range(0,20,bs):
        max_len = np.max(decoder_target_lengths[b:b+bs])
        # Right-pad every target in the batch with id 1 up to the batch maximum.
        x = [a + [1] * int(max_len - len(a)) for a in decoder_target[b:b+bs]]
        x = np.array(x).T
        feed_dict = {model1.encoder1_inputs: v_encoder1_input[:,b:b+bs],
                     model1.encoder2_inputs: v_encoder2_input[:,b:b+bs],
                     model1.decoder_inputs: decoder_input[:,b:b+bs],
                     model1.decoder_input_length: decoder_target_lengths[b:b+bs],
                     model1.decoder_outputs: x,
                     model1.training: False
                    }
        bi.append(sess.run(model1.op, feed_dict=feed_dict))

    # `with` guarantees the files are closed even if an id is missing from
    # decoder_id2word and the loop raises.
    with open('op.txt','w+') as f:
        for b in bi:
            for hypothesis in b[:, :, 0]:
                for token_id in hypothesis:
                    if token_id == -1 or token_id == 1:
                        break
                    f.write(decoder_id2word[token_id]+' ')
                f.write('\n')

    epoch_file = 'op_epoch'+str(epoch)+'.txt'
    with open("op.txt") as src, open(epoch_file,'w+') as f:
        for i, lines in enumerate(src):
            if i >= 3528:
                break
            f.write(lines.strip()+'\n')

    return evaluate(epoch_file, 'WeatherGov/dev/summaries.txt', 1)

In [9]:
# Batch size: passed to Seq2SeqModel as `batch_size` and used as the step of
# the batching loops in the training cell and in validate_bleu.
bs = 10

In [10]:
# Training graph: teacher-forced decoder plus loss / train_op.
model = Seq2SeqModel(
                     mode = "train",
                     src_vocab_size=len(encoder_vocab), 
                     tgt_vocab_size=len(decoder_vocab), 
                     embedding_size=256, 
                     batch_size=bs, 
                     learning_rate = 0.001
                    )

# Inference graph with beam search. The mode must not be "train": in train
# mode Seq2SeqModel builds the teacher-forced decoder and never reaches its
# beam-search branch, while validate_bleu indexes the prediction as
# [sentence, position, beam].
model1 = Seq2SeqModel(
                     mode = "infer",
                     src_vocab_size=len(encoder_vocab), 
                     tgt_vocab_size=len(decoder_vocab), 
                     embedding_size=256, 
                     batch_size=bs, 
                     learning_rate = 0.001,
                     beam_search = True,
                     beam_width = 5
                    )

In [11]:
# Create the session used by every later cell (validate_bleu reads `sess`
# as a global) and initialise all variables of the graphs built above.
config = tf.ConfigProto(allow_soft_placement = True)
sess = tf.Session(config = config)
sess.run(tf.global_variables_initializer())

In [38]:
# Baseline evaluation before training; 0 is used as the epoch tag in the
# name of the output file written by validate_bleu.
print(validate_bleu(model1, v_encoder1_input, v_encoder2_input, decoder_input, decoder_target_lengths, 0))

OSError: [Errno 2] No such file or directory

In [32]:
# Training: for every epoch, run the train op over examples 0..19 in batches
# of `bs` (the bound 20 is hard-coded), print the summed loss, then evaluate.
epochs = 2
for e in range(epochs):
    e_loss = 0
    for b in range(0,20,bs):
        max_len = np.max(decoder_target_lengths[b:b+bs])
        # Pad every target of the batch with id 1 up to the batch maximum and
        # concatenate them into one flat list (note `+=`, not `append`).
        x = []
        for a in decoder_target[b:b+bs]:
            x+=((a + [1] * int((max_len - len(a)))))
        x= np.array(x).T
        # Column vector of shape (batch * max_len, 1).
        x = np.reshape(x,(x.shape[0],1))
        feed_dict = {model.encoder1_inputs: encoder1_input[:,b:b+bs],
                     model.encoder2_inputs: encoder2_input[:,b:b+bs],
                     model.decoder_inputs: decoder_input[:,b:b+bs],
                     model.decoder_input_length: decoder_target_lengths[b:b+bs],
                     model.decoder_outputs: x,
                     model.training: True
                    }
        # a = logits (unused afterwards), b1 = batch loss, c = train-op result.
        a, b1, c = sess.run([model.logits,model.loss,model.train_op], feed_dict=feed_dict)
        e_loss += b1
    print(e_loss)
    print(validate_bleu(model1, v_encoder1_input, v_encoder2_input, decoder_input, decoder_target_lengths, e))

OSError: [Errno 2] No such file or directory

In [14]:
# Testing
v_encoder1_input = v_readFile('WeatherGov/WeatherGov/test/test.proc')

v_encoder2_input = v_readFile('WeatherGov/WeatherGov/test/test.field')

v_encoder1_input = v_sequence_converter(v_encoder1_input, encoder_vocab)
v_encoder2_input = v_sequence_converter(v_encoder2_input, encoder_vocab)

(20, ?, 1280)


In [29]:
# Write beam 0 of every prediction batch in `bi` to op.txt, one sentence per
# line, cutting each sentence at the first id equal to -1 or 1.
# NOTE(review): `bi` is not assigned at notebook level in the visible cells
# (it is a local of validate_bleu) - confirm where it is expected to come from.
# `with` closes the file even if an id is missing from decoder_id2word.
with open('op.txt','w+') as f:
    for b in bi:
        for i in range(len(b[:,:,0])):
            for j in range(len(b[i,:,0])):
                if b[i,j,0] == -1 or b[i,j,0] == 1:
                    break
                else:
                    f.write(decoder_id2word[b[i,j,0]]+' ')
            f.write('\n')

In [30]:
# Copy at most the first 3528 lines of op.txt (whitespace-stripped) to
# op_processed_15.txt. Both files are opened with `with` so the reader handle
# is closed as well as the writer.
with open("op.txt") as src, open('op_processed_15.txt','w+') as f:
    for i, lines in enumerate(src):
        if i >= 3528:
            break
        f.write(lines.strip()+'\n')