In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import os
from collections import Counter
from typing import NamedTuple, List, Dict, Tuple
import random

In [3]:
max_length = 50
batch_size = 128
learning_rate = 0.01
num_classes = 2
num_layers = 4
num_units = 256
vocab_size = 3000
embedding_size = 256
dropout_in_rate = 0.1
dropout_out_rate = 0.2

# config

In [4]:
class Config(NamedTuple):
    num_units: int = 256
    num_layers: int = 4
    num_classes: int = 2
    vocab_size: int = 3000
    embedding_size = 256
    batch_size: int = 128
    max_length: int = 50
    dropout_in_rate: float = 0.1
    dropout_out_rate: float = 0.2
    learning_rate: float = 0.001
    data_path: str = './data/'

In [5]:
config = Config()

# data

In [6]:
class PTBDataSource:
    
    def __init__(self, config: Config):
        self.config = config
        train_path = os.path.join(self.config.data_path, 'ptb.train.txt')
        test_path = os.path.join(self.config.data_path, 'ptb.test.txt')
        valid_path = os.path.join(self.config.data_path, 'ptb.valid.txt')
        
        self._word_to_id = self._create_tokenizer(train_path)
        self._id_to_word = {v: k for k, v in self._word_to_id.items()}
        
        self.train = self._create_data(train_path)
        self.test = self._create_data(test_path)
        self.valid = self._create_data(valid_path)
        
    def shuffle(self):
        random.shuffle(self.train)
        
    def feed_dict_list(self, model):
        num_batch = len(self.train) // self.config.batch_size
        data_list = []
        batch_list = []
        inputs = []
        inputs_length = []
        target_ids = []
        
        # まず全部feedの形にする
        for (i, sentence) in enumerate(self.train):
            for j in range(len(sentence)-1):
                inputs_words = sentence[:j+1][-self.config.max_length:] 
                inputs.append(inputs_words + [0] * (self.config.max_length - len(inputs_words)))
                inputs_length.append(len(inputs_words))
                target_ids.append(sentence[j+1])
        inputs = np.array(inputs)
        inputs_length = np.array(inputs_length)
        target_ids = np.array(target_ids)
        
        # batch_sizeに分ける
        for i in range(num_batch):
            index_from = i * self.config.batch_size
            index_to = (i + 1) * self.config.batch_size
            batch_range = range(index_from, index_to)
            fd = {
                model.inputs: inputs[batch_range],
                model.inputs_length: inputs_length[batch_range],
                model.target_ids: target_ids[batch_range]
            }
            batch_list.append(fd)
        return batch_list
            
    def _read_all_words(self, path) -> List[str]:
        with open(path, 'r') as f:
            return f.read().replace('\n', '<eos>').split()
        
    def _read_sentences(self, path) -> List[List[str]]:
        with open(path, 'r') as f:
            sentences = f.read().split('\n')
            return [sentence.split() for sentence in sentences]

    def _create_tokenizer(self, path: str):
        data = self._read_all_words(path)
        counter = Counter(data)
        sorted_counter = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
        words, _ = list(zip(*sorted_counter))
        word_to_id = dict(zip(words, range(1, len(words)+1)))
        return word_to_id
        
    def _get_id_from_word(self, word: str) -> int:
        return self._word_to_id.get(word, self.unk_id)
    
    def _sentence_to_id_list(self, sentence: List[str]) -> List[int]:
        return [self._get_id_from_word(word) for word in sentence]
    
    def _get_word_from_id(self, word_id: int) -> str:
        return self._id_to_word.get(word_id, self.unk_str)
    
    def _create_data(self, path: str):
        return [sentence for sentence in self._read_sentences(path)]
        #return [self._sentence_to_id_list(sentence) for sentence in self._read_sentences(path)]
    
    @property
    def vocab_size(self) -> int:
        return len(self._word_to_id)
    
    @property
    def pad_id(self) -> int:
        return 0
    
    @property
    def unk_id(self) -> int:
        return self._word_to_id.get('<unk>', self.pad_id)
    
    @property
    def eos_id(self) -> int:
        return self._word_to_id.get('<eos>', self.pad_id)

In [7]:
datasource = PTBDataSource(config)

# model

In [8]:
class RNN:
    
    def __init__(self, config: Config, vocab_size):
        self.config = config
        self.vocab_size = vocab_size
        self._create_placeholder()
        self._create_model()
        self.loss = self._create_loss()
        self.accuracy = self._create_acc()
    
    def _create_placeholder(self):
        self.is_training = tf.placeholder(shape=(), dtype=tf.bool, name='is_training')
        self.inputs = tf.placeholder(shape=[None, self.config.max_length], dtype=tf.int32, name='inputs')
        self.inputs_length = tf.placeholder(shape=[None], dtype=tf.int32, name='inputs_length')
        self.target_ids = tf.placeholder(shape=[None], dtype=tf.int32, name='target_ids')
    
    def _create_model(self):
        self.global_step = tf.train.get_or_create_global_step()
        embedded_inputs = self._embedding(self.inputs)
        _, encoder_state = self._encode(embedded_inputs)
        # encoder_state = tf.layers.dense(encoder_state, num_units, activation=tf.nn.relu, name='hidden_layer')
        self.outputs_logits = tf.layers.dense(encoder_state, self.vocab_size, activation=tf.nn.softmax, name='outputs_layer')
        self.predicted_id = tf.to_int32(tf.argmax(self.outputs_logits, axis=-1))
        
    def _create_loss(self):
        is_target = tf.to_float(tf.not_equal(self.target_ids, 0))
        target_ids_one_hot = tf.one_hot(self.target_ids, self.vocab_size)
        target_ids_smoothed = self._label_smoothing(target_ids_one_hot)
        cross_ent = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.outputs_logits, labels=target_ids_smoothed)
        return tf.reduce_sum(cross_ent * is_target) / tf.reduce_sum(is_target)
        
    def _create_acc(self):
        return tf.reduce_mean(tf.to_float(tf.equal(self.target_ids, self.predicted_id)))
    
    def _embedding(self, inputs):
        lookup_table = tf.get_variable('lookup_table', shape=[self.vocab_size, self.config.embedding_size], dtype=tf.float32)
        embedded_inputs = tf.nn.embedding_lookup(lookup_table, inputs)
        return embedded_inputs
    
    def _encode(self, embedded_inputs):
        outputs, final_state = self._bidirectional_cell(
            embedded_inputs,
            self.config.num_layers,
            self.config.num_units,
            self.config.dropout_in_rate,
            self.config.dropout_out_rate
        )
        return outputs, final_state
    
    def _bidirectional_cell(self, inputs, num_layers, num_units, dropout_in_rate, dropout_out_rate):
        cell_fw = self._gru(num_layers, num_units, dropout_in_rate, dropout_out_rate, name='cell_fw')
        cell_bw = self._gru(num_layers, num_units, dropout_in_rate, dropout_out_rate, name='cell_bw')
        (fw_outputs, bw_outputs), (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=cell_fw,
            cell_bw=cell_bw,
            inputs=inputs,
            sequence_length=self.inputs_length,
            dtype=tf.float32,
            scope='bidirectional_cells')
        outputs = tf.concat([fw_outputs, bw_outputs], axis=-1)
        final_state = tf.reduce_sum([fw_state, bw_state], axis=0)
        final_state = tf.concat(tf.unstack(final_state, axis=0), axis=-1)
        print('final_state: ', final_state.shape)
        return outputs, final_state
    
    def _gru(self, num_layers: int, num_units: int, dropout_in_rate: float, dropout_out_rate: float, name: str):
        cells = []
        for l in range(num_layers):
            cell = tf.nn.rnn_cell.GRUCell(num_units, tf.nn.relu, kernel_initializer=tf.orthogonal_initializer, name=name)
            if l == 0:
                cell = tf.nn.rnn_cell.DropoutWrapper(cell, input_keep_prob=1-dropout_in_rate)
            if l == num_layers-1:
                cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=1-dropout_out_rate)
            cells.append(cell)
        return tf.nn.rnn_cell.MultiRNNCell(cells)
    
    def _label_smoothing(self, inputs, epsilon: float=0.1):
        feature_dim = inputs.get_shape().as_list()[-1]
        return (1-epsilon) * inputs + (epsilon / feature_dim)

In [9]:
rnn = RNN(config, datasource.vocab_size)

Instructions for updating:
seq_dim is deprecated, use seq_axis instead
Instructions for updating:
batch_dim is deprecated, use batch_axis instead
final_state:  (?, 1024)


In [10]:
optimizer = tf.train.AdamOptimizer(config.learning_rate)
train_op = optimizer.minimize(rnn.loss, global_step=rnn.global_step)

In [11]:
num_epoch = 5

In [17]:
d = datasource.feed_dict_list(rnn)
d

[{<tf.Tensor 'inputs:0' shape=(?, 50) dtype=int32>: array([[ 929,    0,    0, ...,    0,    0,    0],
         [ 929,    7,    0, ...,    0,    0,    0],
         [ 929,    7,    2, ...,    0,    0,    0],
         ...,
         [  50, 8230,   27, ...,    0,    0,    0],
         [  50, 8230,   27, ...,    0,    0,    0],
         [  50, 8230,   27, ...,    0,    0,    0]]),
  <tf.Tensor 'inputs_length:0' shape=(?,) dtype=int32>: array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,
         13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,  1,  2,  3,  4,
          5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
         22, 23, 24, 25, 26, 27, 28, 29,  1,  2,  3,  4,  5,  6,  7,  8,  9,
         10, 11, 12, 13, 14, 15, 16, 17,  1,  2,  3,  4,  5,  6,  7,  8,  9,
         10, 11, 12, 13, 14,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,
         13, 14,  1,  2,  3, 

In [26]:
d[0][rnn.inputs][20:24]

array([[ 929,    7,    2,  272,  684,   23,  251, 3997,    8,  118,    4,
           4,    9,    4, 6285, 2930,  942,    2,    7,    2, 9309,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0],
       [ 929,    7,    2,  272,  684,   23,  251, 3997,    8,  118,    4,
           4,    9,    4, 6285, 2930,  942,    2,    7,    2, 9309,  143,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0],
       [3106,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,   

In [27]:
d[0][rnn.target_ids][20:24]

array([ 143, 3870,   16,    1])

In [14]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for i in range(num_epoch):
        datasource.shuffle()
        batch_list = datasource.feed_dict_list(rnn)
        losses = []
        accuracies = []
        for (j, fd) in enumerate(batch_list):
            loss, acc, _ = sess.run([rnn.loss, rnn.accuracy, train_op], feed_dict=fd)
            losses.append(loss)
            accuracies.append(acc)
            if j % 100 == 0:
                print('loss: {:.3f}, acc: {:.3f}'.format(loss, acc))
                print(acc)
        print('epoch {}/{} finished. average loss: {:.3f}, average accuracy: {:.3f}'.format(i, num_epoch, np.average(losses), np.average(accuracies)))

loss: 9.211, acc: 0.000
0.0
loss: 9.134, acc: 0.086
0.0859375
loss: 9.141, acc: 0.078
0.078125
loss: 9.155, acc: 0.062
0.0625
epoch 327/5 finished. average loss: 9.173, average accuracy: 0.051
loss: 9.162, acc: 0.055
0.0546875
loss: 9.183, acc: 0.031
0.03125
loss: 9.176, acc: 0.039
0.0390625
loss: 9.162, acc: 0.055
0.0546875
epoch 327/5 finished. average loss: 9.165, average accuracy: 0.051
loss: 9.155, acc: 0.062
0.0625
loss: 9.141, acc: 0.078
0.078125
loss: 9.162, acc: 0.055
0.0546875


KeyboardInterrupt: 