In [62]:
from distutils.version import LooseVersion

In [63]:
import tensorflow as tf
from tensorflow.python.layers.core import Dense

#check tensorflow version
assert LooseVersion(tf.__version__) >= LooseVersion('1.1'),'please use tensorflow version 1.1 or newer'
print('tensorflow version: {}'.format(tf.__version__))

tensorflow version: 1.3.0


In [64]:
import numpy as np
import time

with open('data/letters_source.txt','r',encoding='utf8') as f:
    source_data = f.read()

with open('data/letters_target.txt','r',encoding='utf8') as f:
    target_data = f.read()

In [65]:
# data preview
source_data.split('\n')[:10]

['bsaqq',
 'npy',
 'lbwuj',
 'bqv',
 'kial',
 'tddam',
 'edxpjpg',
 'nspv',
 'huloz',
 'kmclq']

In [66]:
target_data.split('\n')[:10]

['abqqs',
 'npy',
 'bjluw',
 'bqv',
 'aikl',
 'addmt',
 'degjppx',
 'npsv',
 'hlouz',
 'cklmq']

In [67]:
#data preprocess
def extract_character_vocab(data):
    '''
    construct mapping table
    '''
    special_words = ['<PAD>','<UNK>','<GO>','<EOS>']
    
    set_words = list(set([character for line in data.split('\n') for character in line]))
    # add the four special words into the vocabulary
    int_to_vocab = {idx:word for idx,word in enumerate(special_words+set_words)}
    vocab_to_int = {word:idx for idx,word in int_to_vocab.items()}
    
    return int_to_vocab,vocab_to_int

In [68]:
#construct the mapping table
source_int_to_letter,source_letter_to_int = extract_character_vocab(source_data)
target_int_to_letter,target_letter_to_int = extract_character_vocab(target_data)

#transform the raw data
source_int = [[source_letter_to_int.get(letter,source_letter_to_int['<UNK>'])
              for letter in line] for line in source_data.split('\n')]
target_int = [[target_letter_to_int.get(letter,target_letter_to_int['<UNK>'])
              for letter in line] for line in target_data.split('\n')]

In [69]:
#check the result of the transformation
source_int[:10]

[[26, 25, 13, 19, 19],
 [18, 4, 9],
 [20, 26, 28, 11, 5],
 [26, 19, 6],
 [24, 10, 13, 20],
 [17, 29, 29, 13, 23],
 [12, 29, 8, 4, 5, 4, 27],
 [18, 25, 4, 6],
 [21, 11, 20, 14, 15],
 [24, 23, 16, 20, 19]]

In [70]:
target_int[:10]

[[13, 26, 19, 19, 25],
 [18, 4, 9],
 [26, 5, 20, 11, 28],
 [26, 19, 6],
 [13, 10, 24, 20],
 [13, 29, 29, 23, 17],
 [29, 12, 27, 5, 4, 4, 8],
 [18, 4, 25, 6],
 [21, 20, 14, 11, 15],
 [16, 24, 20, 23, 19]]

In [71]:
#construct the model

In [72]:
#input layer
def get_inputs():
    '''
    model input tensor
    '''
    inputs = tf.placeholder(tf.int32,[None,None],name='inputs')
    targets = tf.placeholder(tf.int32,[None,None],name='targets')
    learning_rate = tf.placeholder(tf.float32,name='learning_rate')
    
    # define the maximum length of the sequence
    target_sequence_length = tf.placeholder(tf.int32,(None,),name='target_sequence_length')
    max_target_sequence_length = tf.reduce_max(target_sequence_length,name='max_target_len')
    source_sequence_length = tf.placeholder(tf.int32,(None,),name='source_sequence_length')
    
    return inputs,targets,learning_rate,target_sequence_length,max_target_sequence_length,source_sequence_length

In [73]:
def get_encoder_layer(input_data,rnn_size,num_layers,
                     source_sequence_length,source_vocab_size,
                     encoding_embedding_size):
    '''
    construct Encoder layer
    
    params definition:
    - input_data: input tensor
    - rnn_size: rnn hidden units number
    - num_layers: the number of layers
    - source_sequence_length: the size of source data
    - encoding_embedding_size: the size of embedding
    '''
    # Encoder embedding
    encoder_embed_input = tf.contrib.layers.embed_sequence(input_data,source_vocab_size,
                                                          encoding_embedding_size)
    
    # RNN cell
    def get_lstm_cell(rnn_size):
        lstm_cell = tf.contrib.rnn.LSTMCell(rnn_size,initializer=tf.random_uniform_initializer(-0.1,0.1,seed=2))
        return lstm_cell
    
    cell = tf.contrib.rnn.MultiRNNCell([get_lstm_cell(rnn_size) for _ in range(num_layers)])
    
    encoder_output,encoder_state = tf.nn.dynamic_rnn(cell,encoder_embed_input,
                                                    sequence_length=source_sequence_length,dtype=tf.float32)
    return encoder_output,encoder_state

In [74]:
def process_decoder_input(data,vocab_to_int,batch_size):
    '''
    supply <GO>, remove the last character
    '''
    # cut the last character
    # tf.strided_slice(input_, start, end, strides)
    ending = tf.strided_slice(data,[0,0],[batch_size,-1],[1,1])
    decoder_input = tf.concat([tf.fill([batch_size,1],vocab_to_int['<GO>']),ending],1)
    
    return decoder_input

In [75]:
def decoding_layer(target_letter_to_int,decoding_embedding_size,num_layers,rnn_size,
                  target_sequence_length,max_target_sequence,encoder_state,decoder_input):
    '''
    construct the decoder layer
    
    @params:
    - target_letter_to_int: target data
    - decoding_embedding_size: embed size
    - num_layers: the number of RNN layers
    - rnn_size: the number of rnn units
    - target_sequence_length: target data length
    - max_target_sequence_length: the maximum length of target data sequences
    - encoder_state: the state vector of encoder RNN
    - decoder_input: decoder input
    '''
    # 1. Embedding
    target_vocab_size = len(target_letter_to_int)
    decoder_embeddings = tf.Variable(tf.random_uniform([target_vocab_size,decoding_embedding_size]))
    decoder_embed_input = tf.nn.embedding_lookup(decoder_embeddings,decoder_input)
    
    # 2. construct the rnn cell of decoder
    def get_decoder_cell(rnn_size):
        decoder_cell = tf.contrib.rnn.LSTMCell(rnn_size,initializer=tf.random_uniform_initializer(-0.1,0.1,seed=2))
        return decoder_cell
    cell = tf.contrib.rnn.MultiRNNCell([get_decoder_cell(rnn_size) for _ in range(num_layers)])
    
    # 3. construct the fully connected layer
    output_layer = Dense(target_vocab_size,kernel_initializer=tf.truncated_normal_initializer(mean=0.0,stddev=0.1))
    
    # 4. training decoder
    with tf.variable_scope('decoder'):
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_embed_input,
                                                            sequence_length=target_sequence_length,time_major=False)
        # construct decoder
        training_decoder = tf.contrib.seq2seq.BasicDecoder(cell,training_helper,encoder_state,
                                                          output_layer)
        training_decoder_output, _ = tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                                       impute_finished=True,
                                                                       maximum_iterations=max_target_sequence_length)
    
    # 5. Predicting decoder
    # share parameters with training
    with tf.variable_scope('decoder',reuse=True):
        # construct a constant tensor which has the size of batch_size
        start_tokens = tf.tile(tf.constant([target_letter_to_int['<GO>']],dtype=tf.int32),[batch_size],name='start_tokens')
        predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embeddings,
                                                                    start_tokens,
                                                                    target_letter_to_int['<EOS>'])
        predicting_decoder = tf.contrib.seq2seq.BasicDecoder(cell,predicting_helper,
                                                            encoder_state,
                                                            output_layer)
        predicting_decoder_output,_ = tf.contrib.seq2seq.dynamic_decode(predicting_decoder,
                                                                       impute_finished=True,
                                                                       maximum_iterations=max_target_sequence_length)
        
    return training_decoder_output,predicting_decoder_output        

In [76]:
def seq2seq_model(input_data,targets,lr,target_sequence_length,
                 max_target_sequence_length,source_sequence_length,
                 source_vocab_size,target_vocab_size,encoder_embedding_size,
                 decoder_embedding_size,rnn_size,num_layers):
    
    # get the state from encoder
    _,encoder_state = get_encoder_layer(input_data,rnn_size,num_layers,
                                       source_sequence_length,source_vocab_size,
                                       encoding_embedding_size)
    # the inputs of decoder
    decoder_input = process_decoder_input(targets,target_letter_to_int,batch_size)
    
    # pass the state vector to decoder
    training_decoder_output,predicting_decoder_output = decoding_layer(target_letter_to_int,
                                                                      decoding_embedding_size,
                                                                      num_layers,
                                                                      rnn_size,
                                                                      target_sequence_length,
                                                                      max_target_sequence_length,
                                                                      encoder_state,
                                                                      decoder_input)
    return training_decoder_output,predicting_decoder_output

In [77]:
# hyperparams
# number of epochs
epochs = 60
# batch size
batch_size = 128
# rnn size
rnn_size = 50
# number of layers
num_layers = 2
# embedding_size
encoding_embedding_size = 15
decoding_embedding_size = 15
# learning rate
learning_rate = 0.001

In [78]:
#construct graph
train_graph = tf.Graph()

with train_graph.as_default():
    
    # get the inputs of the model
    input_data,targets,lr,target_sequence_length,max_target_sequence_length,source_sequence_length = get_inputs()
    
    training_decoder_output,predicting_decoder_output = seq2seq_model(input_data,
                                                                     targets,
                                                                     lr,
                                                                     target_sequence_length,
                                                                     max_target_sequence_length,
                                                                     source_sequence_length,
                                                                     len(source_letter_to_int),
                                                                     len(target_letter_to_int),
                                                                     encoding_embedding_size,
                                                                     decoding_embedding_size,
                                                                     rnn_size,
                                                                     num_layers)
    
    training_logits = tf.identity(training_decoder_output.rnn_output,'logits')
    predicting_logits = tf.identity(predicting_decoder_output.sample_id,name='predictions')
    
    masks = tf.sequence_mask(target_sequence_length,max_target_sequence_length,dtype=tf.float32,
                            name='masks')
    
    with tf.name_scope('optimization'):
        
        # loss function
        cost = tf.contrib.seq2seq.sequence_loss(training_logits,
                                               targets,masks)
        
        # optimizer
        optimizer = tf.train.AdamOptimizer(lr)
        
        # gradient clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad,-5.,5.),var) for grad,var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

ValueError: too many values to unpack (expected 2)