In [1]:
import sys
import os
import time
import random
import re
import json
import pickle
from typing import List, Tuple, Dict, Callable, Optional, Any, Sequence, Mapping, NamedTuple

In [2]:
import tensorflow as tf
from tensorflow import Tensor
import numpy as np
import matplotlib as plt

  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
from utils.config import Config
#from data_loader.docomo_datasource import DocomoDataSource

In [4]:
# Hyper-parameter grid: one Config per (layers, units, learning rate)
# combination, iterated with layers outermost and learning rate innermost.
units = [256, 512, 1024]
layers = [2, 4, 6]
lrs = [0.01, 0.001, 0.0001]
configs = [
    Config(num_layers=n_layers, num_units=n_units, learning_rate=rate, log_dir='./logs/rnn/')
    for n_layers in layers
    for n_units in units
    for rate in lrs
]

In [5]:
# Show the first generated Config as a quick sanity check of its fields.
configs[0]

Config(num_units=256, num_layers=2, num_heads=8, num_outputs=10000, batch_size=128, max_length=50, dropout_in_rate=0.1, dropout_out_rate=0.2, learning_rate=0.01, grad_clip=5.0, is_layer_norm=False, data_path='./data/', log_dir='./logs/rnn/', scheduled_sampling_rate=0.0)

In [6]:
class DocomoDataSource:
    """Loads the Docomo dialogue pairs and turns them into model feed dicts.

    Both the dialogue file (``docomo-weight.json``) and the vocabulary
    (``vocab.pkl``) are read from ``config.data_path``. ``config.max_length``
    is the width every id sequence is zero-padded to.
    """

    def __init__(self, config):
        """Read the dialogue pairs and the vocabulary described by ``config``.

        :param config: object exposing ``data_path`` and ``max_length``.
        """
        self._config = config
        self._ask, self._res = self.load_data()
        self._create_tokenizer()

    def load_data(self):
        """Return ``(ask, res)``: the 'a' and 'r' entries of every dialogue item.

        :raises OSError: if the dialogue file cannot be opened.
        :raises KeyError: if the JSON lacks 'dialogue', 'a' or 'r'.
        """
        path = os.path.join(self._config.data_path, 'docomo-weight.json')
        # Context manager so the handle is closed even when parsing fails.
        with open(path, 'r') as f:
            data = json.load(f)
        dialogue = data['dialogue']
        ask = [s['a'] for s in dialogue]
        res = [s['r'] for s in dialogue]
        return ask, res

    def feed_dict(self, model, batch_size: int):
        """Build one feed dict per full batch, in the current data order.

        Only ``len(data) // batch_size`` batches are produced, so a trailing
        partial batch is dropped.

        :param model: object exposing the placeholders used as dict keys
            (see ``_create_dict``).
        :param batch_size: number of dialogue pairs per feed dict.
        :return: list of feed dicts (empty when there is less than one batch).
        """
        num_batch = len(self._ask) // batch_size
        feed_dicts = []
        for i in range(num_batch):
            start_index = batch_size * i
            end_index = start_index + batch_size
            feed_dicts.append(
                self._create_dict(
                    model,
                    self._ask[start_index:end_index],
                    self._res[start_index:end_index],
                )
            )
        return feed_dicts

    def shuffle(self):
        """Shuffle the (ask, res) pairs in unison; a no-op on empty data."""
        pairs = list(zip(self._ask, self._res))
        if not pairs:
            return
        random.shuffle(pairs)
        # Unzip back into two plain lists so the elements keep their original
        # Python types (no NumPy string-array conversion).
        ask, res = zip(*pairs)
        self._ask, self._res = list(ask), list(res)

    def _create_dict(self, model, inputs: List[str], targets: List[str]):
        """Encode one batch of sentences and map it onto ``model``'s placeholders.

        Inputs get an ``<eos>`` suffix. Targets are encoded twice: with a
        ``<bos>`` prefix (fed as ``encoder_targets``) and with an ``<eos>``
        suffix (fed as ``decoder_targets``).

        NOTE(review): every ``*_length`` value is measured after ``batch`` has
        padded the row, so for rows shorter than ``max_length`` it equals
        ``max_length`` rather than the unpadded length. Left unchanged because
        downstream graph code may rely on it -- confirm before changing.
        """
        inputs = self.batch(inputs, suffix=[self.eos_id])
        inputs_length = [len(row) for row in inputs]
        encoder_targets = self.batch(targets, prefix=[self.bos_id])
        decoder_targets = self.batch(targets, suffix=[self.eos_id])
        encoder_targets_length = [len(row) for row in encoder_targets]
        decoder_targets_length = [len(row) for row in decoder_targets]
        return {
            model.inputs: inputs,
            model.inputs_length: inputs_length,
            model.encoder_targets: encoder_targets,
            model.encoder_targets_length: encoder_targets_length,
            model.decoder_targets: decoder_targets,
            model.decoder_targets_length: decoder_targets_length
        }

    def _create_tokenizer(self):
        """Load the ``(word_to_id, id_to_word)`` pair from ``vocab.pkl``.

        The file is looked up under ``config.data_path``, like the dialogue
        file read by ``load_data``.
        """
        vocab_path = os.path.join(self._config.data_path, 'vocab.pkl')
        # SECURITY: pickle.load can execute arbitrary code; only point
        # data_path at vocabulary files you trust.
        with open(vocab_path, 'rb') as f:
            self._word_to_id, self._id_to_word = pickle.load(f)

    def id_list_to_sentence(self, id_list: List[int]):
        """Map each id to its vocabulary entry and return them as a list."""
        return [self._id_to_word[idx] for idx in id_list]

    def sentence_to_id_list(self, sentence: str):
        """Map each element obtained by iterating ``sentence`` to its id.

        :raises KeyError: if an element is not in the vocabulary.
        """
        return [self._word_to_id[word] for word in sentence]

    def batch(self, batch: List[str], prefix=None, suffix=None):
        """Encode sentences to id lists and right-pad them with 0.

        :param batch: sentences to encode.
        :param prefix: ids prepended to every sentence (default: none).
        :param suffix: ids appended to every sentence (default: none).
        :return: list of id lists. Rows shorter than ``config.max_length`` are
            padded with 0 up to that length; rows already at or beyond it are
            returned as they are (neither padded nor truncated).
        """
        prefix = prefix or []
        suffix = suffix or []
        max_length = self._config.max_length
        encoded = [prefix + self.sentence_to_id_list(sentence) + suffix for sentence in batch]
        # A negative pad count multiplies to an empty list, so long rows pass through.
        return [ids + [0] * (max_length - len(ids)) for ids in encoded]

    @property
    def eos_id(self):
        """Id of the ``<eos>`` token."""
        return self._word_to_id['<eos>']

    @property
    def bos_id(self):
        """Id of the ``<bos>`` token."""
        return self._word_to_id['<bos>']

    @property
    def vocab_size(self):
        """Number of entries in the word-to-id mapping."""
        return len(self._word_to_id)
                       
            

In [7]:
# Build a data source from the first grid config; the constructor reads the
# dialogue JSON and the vocabulary pickle.
ds = DocomoDataSource(configs[0])

In [8]:
class RNN:
    """GRU encoder-decoder graph built with the TensorFlow 1.x API.

    The encoder is a bidirectional multi-layer GRU; its forward and backward
    final states are concatenated per layer and used as the initial state of
    a multi-layer GRU decoder of twice the width. Constructing an instance
    adds placeholders, the model, ``loss`` and ``accuracy`` to the current
    default graph.
    """

    def __init__(self, config: Config, vocab_size):
        """Build the whole graph.

        :param config: hyper-parameters read via attribute access (e.g.
            ``max_length``, ``num_layers``, ``num_units``, dropout rates).
        :param vocab_size: size of the embedding table and output projection.
        """
        self.config = config
        self.vocab_size = vocab_size
        self._create_placeholder()
        self._create_model()
        self.loss = self._create_loss()
        self.accuracy = self._create_acc()

    def _create_placeholder(self):
        """Create the input placeholders; sequences are ``[batch, max_length]`` int32."""
        max_length = self.config.max_length
        # NOTE(review): is_training is created but not read anywhere in this
        # class, so dropout is applied whenever the graph is run.
        self.is_training = tf.placeholder(shape=(), dtype=tf.bool, name='is_training')
        self.inputs = tf.placeholder(shape=[None, max_length], dtype=tf.int32, name='inputs')
        self.inputs_length = tf.placeholder(shape=[None], dtype=tf.int32, name='inputs_length')
        self.encoder_targets = tf.placeholder(shape=[None, max_length], dtype=tf.int32, name='encoder_targets')
        self.encoder_targets_length = tf.placeholder(shape=[None], dtype=tf.int32, name='encoder_targets_length')
        self.decoder_targets = tf.placeholder(shape=[None, max_length], dtype=tf.int32, name='decoder_targets')
        self.decoder_targets_length = tf.placeholder(shape=[None], dtype=tf.int32, name='decoder_targets_length')

    def _create_model(self):
        """Wire embedding -> encoder -> decoder and expose logits and predicted ids."""
        self.global_step = tf.train.get_or_create_global_step()
        embedded_inputs = self._embedding(self.inputs)
        encoder_outputs, encoder_state = self._encode(embedded_inputs)
        # Attention is currently disabled; the decoder receives None.
        #attention_mechanism = self._prepare_encoder_attention(encoder_outputs, self.inputs_length)
        attention_mechanism = None
        # encoder_targets / encoder_targets_length are fed to the decoder as
        # its input sequence and lengths.
        self.outputs_logits = self._train_decoder(
            encoder_state, self.encoder_targets, self.encoder_targets_length, attention_mechanism
        )
        self.predicted_id = tf.to_int32(tf.argmax(self.outputs_logits, axis=-1))

    def _create_loss(self):
        """Mean label-smoothed softmax cross-entropy against ``decoder_targets``.

        The mean is taken over every position; no length mask is applied.
        """
        target_ids_one_hot = tf.one_hot(self.decoder_targets, self.vocab_size)
        target_ids_smoothed = self._label_smoothing(target_ids_one_hot)
        cross_ent = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=self.outputs_logits, labels=target_ids_smoothed
        )
        return tf.reduce_mean(cross_ent)

    def _create_acc(self):
        """Fraction of positions where ``predicted_id`` equals ``decoder_targets`` (unmasked)."""
        matches = tf.equal(self.decoder_targets, self.predicted_id)
        return tf.reduce_mean(tf.to_float(matches))

    def _create_perplexity(self):
        """Return a scalar tensor holding the perplexity of ``decoder_targets``.

        Computed as ``exp`` of the mean per-position negative log-likelihood
        of the target ids under ``outputs_logits``. Like ``_create_loss`` it
        averages over every position without a length mask. Built purely from
        TensorFlow ops so it works with the symbolic batch dimension.
        """
        token_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.decoder_targets, logits=self.outputs_logits
        )
        return tf.exp(tf.reduce_mean(token_nll))

    def _embedding(self, inputs):
        """Create the shared ``lookup_table`` variable and embed ``inputs`` with it."""
        # NOTE(review): relies on config.embedding_size being available -- confirm
        # that Config provides it.
        self.lookup_table = tf.get_variable(
            'lookup_table',
            shape=[self.vocab_size, self.config.embedding_size],
            dtype=tf.float32
        )
        return tf.nn.embedding_lookup(self.lookup_table, inputs)

    def _encode(self, embedded_inputs):
        """Run the bidirectional encoder; returns ``(outputs, final_state)``."""
        return self._bidirectional_cell(
            embedded_inputs,
            self.config.num_layers,
            self.config.num_units,
            self.config.dropout_in_rate,
            self.config.dropout_out_rate
        )

    def _train_decoder(self, encoder_state: Tuple[Tensor, Tensor],
                      inputs_data: Tensor,
                      inputs_length: Tensor,
                      attention_mechanism: Tensor) -> Tensor:
        '''
        Build the training-time decoder and return its logits.

        :params encoder_state: per-layer encoder final state, used as the
            decoder's initial state
        :params [batch_size, num_text_id] inputs_data: decoder input ids
        :params [batch_size] inputs_length: lengths passed to the helper
        :params attention_mechanism: currently unused (attention is disabled)
        :return: ``rnn_output`` of the dynamic decode after the bias-free
            dense projection to ``vocab_size``
        '''
        # Decoder cells are twice as wide because the encoder state is the
        # concatenation of the forward and backward states.
        multi_cells = self._gru(
            self.config.num_layers,
            self.config.num_units * 2,
            self.config.dropout_in_rate,
            self.config.dropout_out_rate,
            'decoder_cell'
        )
        #attention_cells = self._prepare_decoder_attention(attention_mechanism, multi_cells, attention_size=self.config.num_units*2)
        output_layer = tf.layers.Dense(self.vocab_size, use_bias=False, name="output_layer")

        decoder_inputs = tf.nn.embedding_lookup(self.lookup_table, inputs_data)
        # Only needed by the disabled attention path below.
        this_batch_size, _ = tf.unstack(tf.shape(inputs_data))
        #decoder_initial_state = attention_cells.zero_state(batch_size=this_batch_size, dtype=tf.float32)
        #decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
        decoder_initial_state = encoder_state

        helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
            inputs=decoder_inputs,
            sequence_length=tf.cast(inputs_length, dtype=tf.int32),
            embedding=self.lookup_table,
            sampling_probability=self.config.scheduled_sampling_rate
        )

        decoder = tf.contrib.seq2seq.BasicDecoder(
            multi_cells, helper, decoder_initial_state,
            output_layer=output_layer
        )
        outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, swap_memory=True, scope='decoder')
        return outputs.rnn_output

    def _bidirectional_cell(self, inputs, num_layers, num_units, dropout_in_rate, dropout_out_rate):
        """Run a bidirectional multi-layer GRU over ``inputs``.

        :return: ``(outputs, final_state)`` where ``outputs`` concatenates the
            forward and backward outputs on the last axis and ``final_state``
            is a tuple with one concatenated forward/backward state per layer.
        """
        cell_fw = self._gru(num_layers, num_units, dropout_in_rate, dropout_out_rate, name='cell_fw')
        cell_bw = self._gru(num_layers, num_units, dropout_in_rate, dropout_out_rate, name='cell_bw')
        (fw_outputs, bw_outputs), (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=cell_fw,
            cell_bw=cell_bw,
            inputs=inputs,
            sequence_length=self.inputs_length,
            dtype=tf.float32,
            scope='bidirectional_cells')
        outputs = tf.concat([fw_outputs, bw_outputs], axis=-1)
        final_state = tuple(
            tf.concat([fw_state[i], bw_state[i]], axis=-1) for i in range(num_layers)
        )
        return outputs, final_state

    def _gru(self, num_layers: int, num_units: int, dropout_in_rate: float, dropout_out_rate: float, name: str):
        """Stack ``num_layers`` ReLU GRU cells into a ``MultiRNNCell``.

        Input dropout wraps the first layer and output dropout wraps the last
        layer; with a single layer both wrappers are applied to it.
        """
        cells = []
        last_layer = num_layers - 1
        for layer_idx in range(num_layers):
            cell = tf.nn.rnn_cell.GRUCell(
                num_units,
                tf.nn.relu,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                name=name + '_{}'.format(layer_idx)
            )
            if layer_idx == 0:
                cell = tf.nn.rnn_cell.DropoutWrapper(cell, input_keep_prob=1-dropout_in_rate)
            if layer_idx == last_layer:
                cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=1-dropout_out_rate)
            cells.append(cell)
        return tf.nn.rnn_cell.MultiRNNCell(cells)

    def _prepare_encoder_attention(self, encoder_outputs: Tensor, inputs_length: Tensor) -> Tensor:
        """Create a Luong attention mechanism over the encoder outputs (not used at present)."""
        return tf.contrib.seq2seq.LuongAttention(
            self.config.num_units*2, encoder_outputs, memory_sequence_length=inputs_length
        )

    def _prepare_decoder_attention(self, attention: Tensor, cells: Tensor, attention_size: int) -> Tensor:
        """Wrap ``cells`` with ``attention`` in an ``AttentionWrapper`` (not used at present)."""
        return tf.contrib.seq2seq.AttentionWrapper(
            cells, attention, attention_layer_size=attention_size
        )

    def _label_smoothing(self, inputs, epsilon: float=0.1):
        """Smooth one-hot ``inputs``: scale by ``1 - epsilon`` and add ``epsilon / K``.

        ``K`` is the static size of the last axis of ``inputs``.
        """
        feature_dim = inputs.get_shape().as_list()[-1]
        return (1-epsilon) * inputs + (epsilon / feature_dim)

In [9]:
# Number of passes over the training data for every config in the grid.
num_epochs = 100

In [None]:
# Train one model per grid configuration, each in its own graph and session.
for config in configs:
    with tf.Graph().as_default():
        ds = DocomoDataSource(config)
        rnn = RNN(config, ds.vocab_size)
        # rnn.loss is used as-is; no extra length masking is applied here.
        train_loss = rnn.loss

        # Gradient and Optimization
        global_step = tf.train.get_or_create_global_step()
        params = tf.trainable_variables()
        gradients = tf.gradients(train_loss, params)
        # Clip by the configured global norm instead of a hard-coded constant.
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, rnn.config.grad_clip)
        optimizer = tf.train.AdamOptimizer(rnn.config.learning_rate)
        train_op = optimizer.apply_gradients(zip(clipped_gradients, params), global_step=global_step)

        with tf.name_scope('summary'):
            tf.summary.scalar('loss', train_loss)
            tf.summary.scalar('acc', rnn.accuracy)
            merged_summary = tf.summary.merge_all()

        with tf.Session() as sess:
            writer = tf.summary.FileWriter(rnn.config.to_log_dir(), sess.graph)
            try:
                sess.run(tf.global_variables_initializer())
                for epoch in range(num_epochs):
                    ds.shuffle()
                    batch_list = ds.feed_dict(rnn, rnn.config.batch_size)
                    for fd in batch_list:
                        # One optimisation step; the summary is logged at the
                        # global step returned by the same run call.
                        _, step, smr = sess.run([train_op, global_step, merged_summary], feed_dict=fd)
                        writer.add_summary(smr, step)
                    print('epoch {}/{} finished.'.format(epoch, num_epochs))
            finally:
                # Flush pending events and release the file handle even if
                # training is interrupted.
                writer.close()

Instructions for updating:
seq_dim is deprecated, use seq_axis instead
Instructions for updating:
batch_dim is deprecated, use batch_axis instead


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


epoch 0/100 finished.
epoch 1/100 finished.
epoch 2/100 finished.
epoch 3/100 finished.
epoch 4/100 finished.
epoch 5/100 finished.
epoch 6/100 finished.
epoch 7/100 finished.
epoch 8/100 finished.
epoch 9/100 finished.
epoch 10/100 finished.
epoch 11/100 finished.
epoch 12/100 finished.
epoch 13/100 finished.
epoch 14/100 finished.
epoch 15/100 finished.
epoch 16/100 finished.
epoch 17/100 finished.
epoch 18/100 finished.
epoch 19/100 finished.
epoch 20/100 finished.
epoch 21/100 finished.
epoch 22/100 finished.
epoch 23/100 finished.
epoch 24/100 finished.
epoch 25/100 finished.
epoch 26/100 finished.
epoch 27/100 finished.
epoch 28/100 finished.
epoch 29/100 finished.
epoch 30/100 finished.
epoch 31/100 finished.
epoch 32/100 finished.
epoch 33/100 finished.
epoch 34/100 finished.
epoch 35/100 finished.
epoch 36/100 finished.
epoch 37/100 finished.
epoch 38/100 finished.
epoch 39/100 finished.
epoch 40/100 finished.
epoch 41/100 finished.
epoch 42/100 finished.
epoch 43/100 finished