# A Basic seq2seq Model 

Input seq: titles

Target seq: Abstract 

Code Source: https://github.com/NELSONZHAO/zhihu/tree/master/basic_seq2seq?1521452873816

In [1]:
# Load data
import numpy as np
import time
import tensorflow as tf

with open("/Users/jingyunyang/Desktop/OneDrive - Dalhousie University/Courses/CSCI 4190 Spec Topics-Computer Science/Project/dlproject/titles.txt", 'r', encoding='utf-8') as f:
    source_data = f.read()

with open("/Users/jingyunyang/Desktop/OneDrive - Dalhousie University/Courses/CSCI 4190 Spec Topics-Computer Science/Project/dlproject/abstract.txt", 'r', encoding='utf-8') as f:
    target_data = f.read()

  from ._conv import register_converters as _register_converters


In [2]:
from distutils.version import LooseVersion
import tensorflow as tf
from tensorflow.python.layers.core import Dense


# Check TensorFlow Version
assert LooseVersion(tf.__version__) >= LooseVersion('1.1'), 'Please use TensorFlow version 1.1 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))


TensorFlow Version: 1.8.0


In [3]:
# Preview of data
source_data.split('\n')[:10]


["['A Double Layer Electromagnetic Cloak And GL EM Modeling']",
 "['Extremal problems for the central projection']",
 "['On the spectral theory of groups of affine transformations of compact\\n  nilmanifolds']",
 "['Doubly Special Relativity with a minimum speed and the Uncertainty\\n  Principle']",
 "['Single Mode Approximation for sub-Ohmic Spin-Boson Model: Adiabatic\\n  Limit and Critical Properties']",
 "['Asymptotics for the Late Arrivals Problem']",
 "['Monoidal ring and coring structures obtained from wreaths and cowreaths']",
 "['Constructing numerically stable Kalman filter-based algorithms for\\n  gradient-based adaptive filtering']",
 "['Random subgraphs make identification affordable']",
 "['A theory of minimal K-types for flat G-bundles']"]

In [4]:
target_data.split('\n')[:10]

["['  We present wide-field JHKs-band photometric observations of the three compact\\nHII regions G48.9-0.3, G49.0-0.3, and G49.2-0.3 in the active star-forming\\nregion W51B. The star clusters inside the three compact HII regions show the\\nexcess number of stars in the J-Ks histograms compared with reference fields.\\nWhile the mean color excess ratio E(J-H)/E(H-Ks) of the three compact HII\\nregions are similar to ~ 2.07, the visual extinctions toward them are somewhat\\ndifferent: ~ 17 mag for G48.9-0.3 and G49.0-0.3; ~ 23 mag for G49.2-0.3. Based\\non their sizes and brightnesses, we suggest that the age of each compact HII\\nregion is =< 2 Myr. The inferred total stellar mass, ~ 1.4 x 10^4 M_sun, of\\nW51B makes it one of the most active star forming regions in the Galaxy with\\nthe star formation efficiency of ~ 10 %.\\n', 'Comment: 12 pages, 10 eps figures, uses jkas.sty']",
 "['  A charge injection technique is applied to the X-ray CCD camera, XIS (X-ray\\nImaging Spectrometer

### Preprocessing Data


In [5]:
def extract_character_vocab(data):
    '''
    Mapping table
    '''
    special_words = ['<PAD>', '<UNK>', '<GO>',  '<EOS>']

    set_words = list(set([character for line in data.split('\n') for character in line]))
    # Add to Dict 
    int_to_vocab = {idx: word for idx, word in enumerate(special_words + set_words)}
    vocab_to_int = {word: idx for idx, word in int_to_vocab.items()}

    return int_to_vocab, vocab_to_int

In [6]:
# Mapping table
source_int_to_letter, source_letter_to_int = extract_character_vocab(source_data)
target_int_to_letter, target_letter_to_int = extract_character_vocab(target_data)

# transfer characters 
source_int = [[source_letter_to_int.get(letter, source_letter_to_int['<UNK>']) 
               for letter in line] for line in source_data.split('\n')]
target_int = [[target_letter_to_int.get(letter, target_letter_to_int['<UNK>']) 
               for letter in line] + [target_letter_to_int['<EOS>']] for line in target_data.split('\n')]

In [7]:
# Result 
source_int[:10]


[[34,
  20,
  73,
  41,
  36,
  64,
  95,
  35,
  76,
  46,
  41,
  21,
  79,
  28,
  46,
  58,
  41,
  45,
  76,
  46,
  33,
  42,
  58,
  64,
  96,
  79,
  51,
  29,
  46,
  42,
  5,
  33,
  41,
  75,
  76,
  64,
  79,
  54,
  41,
  73,
  29,
  16,
  41,
  86,
  21,
  41,
  45,
  15,
  41,
  15,
  64,
  16,
  46,
  76,
  5,
  29,
  51,
  20,
  89],
 [34,
  20,
  45,
  39,
  42,
  58,
  46,
  96,
  79,
  76,
  41,
  94,
  58,
  64,
  35,
  76,
  46,
  96,
  87,
  41,
  66,
  64,
  58,
  41,
  42,
  55,
  46,
  41,
  33,
  46,
  29,
  42,
  58,
  79,
  76,
  41,
  94,
  58,
  64,
  43,
  46,
  33,
  42,
  5,
  64,
  29,
  20,
  89],
 [34,
  20,
  74,
  29,
  41,
  42,
  55,
  46,
  41,
  87,
  94,
  46,
  33,
  42,
  58,
  79,
  76,
  41,
  42,
  55,
  46,
  64,
  58,
  28,
  41,
  64,
  66,
  41,
  51,
  58,
  64,
  95,
  94,
  87,
  41,
  64,
  66,
  41,
  79,
  66,
  66,
  5,
  29,
  46,
  41,
  42,
  58,
  79,
  29,
  87,
  66,
  64,
  58,
  96,
  79,
  42,
  5,
  64,
  29,
  87,
 

In [8]:
target_int[:10]


[[34,
  20,
  41,
  41,
  55,
  46,
  41,
  94,
  58,
  46,
  87,
  46,
  29,
  42,
  41,
  11,
  5,
  16,
  46,
  81,
  66,
  5,
  46,
  76,
  16,
  41,
  82,
  97,
  69,
  87,
  81,
  35,
  79,
  29,
  16,
  41,
  94,
  56,
  64,
  42,
  64,
  96,
  46,
  42,
  58,
  5,
  33,
  41,
  64,
  35,
  87,
  46,
  58,
  71,
  79,
  42,
  5,
  64,
  29,
  87,
  41,
  64,
  66,
  41,
  42,
  56,
  46,
  41,
  42,
  56,
  58,
  46,
  46,
  41,
  33,
  64,
  96,
  94,
  79,
  33,
  42,
  8,
  29,
  97,
  84,
  84,
  41,
  58,
  46,
  51,
  5,
  64,
  29,
  87,
  41,
  86,
  47,
  62,
  48,
  52,
  81,
  91,
  48,
  77,
  78,
  41,
  86,
  47,
  52,
  48,
  91,
  81,
  91,
  48,
  77,
  78,
  41,
  79,
  29,
  16,
  41,
  86,
  47,
  52,
  48,
  40,
  81,
  91,
  48,
  77,
  41,
  5,
  29,
  41,
  42,
  56,
  46,
  41,
  79,
  33,
  42,
  5,
  71,
  46,
  41,
  87,
  42,
  79,
  58,
  81,
  66,
  64,
  58,
  96,
  5,
  29,
  51,
  8,
  29,
  58,
  46,
  51,
  5,
  64,
  29,
  41,
  55,
  25,
  3

## Model 

#### Input layer 

In [9]:
def get_inputs():
    '''
    Model input tensor
    '''
    inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    learning_rate = tf.placeholder(tf.float32, name='learning_rate')
    
    # Def max target seq length（target_sequence_length AND source_sequence_length as input of feed_dict）
    target_sequence_length = tf.placeholder(tf.int32, (None,), name='target_sequence_length')
    max_target_sequence_length = tf.reduce_max(target_sequence_length, name='max_target_len')
    source_sequence_length = tf.placeholder(tf.int32, (None,), name='source_sequence_length')
    
    return inputs, targets, learning_rate, target_sequence_length, max_target_sequence_length, source_sequence_length

### Encoder
1. Input embedding 
2. Pass embedding vector to RNN 

In [10]:
def get_encoder_layer(input_data, rnn_size, num_layers,
                   source_sequence_length, source_vocab_size, 
                   encoding_embedding_size):

    '''
    Encoder Layer
    
    Inputs：
    - input_data: input tensor
    - rnn_size: # of rnn  hidden layers
    - num_layers: # of stacked rnn cell
    - source_sequence_length: length of source seq
    - source_vocab_size: dict size of sourse seq
    - encoding_embedding_size: size of embedding matrix
    '''
    # Encoder embedding
    encoder_embed_input = tf.contrib.layers.embed_sequence(input_data, source_vocab_size, encoding_embedding_size)

    # RNN cell
    def get_lstm_cell(rnn_size):
        lstm_cell = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return lstm_cell

    cell = tf.contrib.rnn.MultiRNNCell([get_lstm_cell(rnn_size) for _ in range(num_layers)])
    
    encoder_output, encoder_state = tf.nn.dynamic_rnn(cell, encoder_embed_input, 
                                                      sequence_length=source_sequence_length, dtype=tf.float32)
    
    return encoder_output, encoder_state

### Decoder

#### Preprocessing to target 

In [11]:
def process_decoder_input(data, vocab_to_int, batch_size):
    '''
    supplement <GO>，remove last character
    '''
    # cut last char
    ending = tf.strided_slice(data, [0, 0], [batch_size, -1], [1, 1])
    decoder_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)

    return decoder_input

#### Embedding to targets 
Embedding to targets s.t. can be passed to rnn of Decoder

In [22]:
def decoding_layer(target_letter_to_int, decoding_embedding_size, num_layers, rnn_size,
                   target_sequence_length, max_target_sequence_length, encoder_state, decoder_input):
    '''
    构造Decoder层
    
    参数：
    - target_letter_to_int: target mapping table
    - decoding_embedding_size: embed size
    - num_layers: # of stakced RNN neuron 
    - rnn_size: # of hidden states of RNN neurons
    - target_sequence_length: length of target seq
    - max_target_sequence_length: max length of target seq 
    - encoder_state: embedded encoder state vector 
    - decoder_input: input from decoder
    '''
    # 1. Embedding
    target_vocab_size = len(target_letter_to_int)
    decoder_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    decoder_embed_input = tf.nn.embedding_lookup(decoder_embeddings, decoder_input)

    # 2. construct Decoder RNN neurons
    def get_decoder_cell(rnn_size):
        decoder_cell = tf.contrib.rnn.LSTMCell(rnn_size,
                                           initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return decoder_cell
    cell = tf.contrib.rnn.MultiRNNCell([get_decoder_cell(rnn_size) for _ in range(num_layers)])
     
    # 3. Output fully connected
    output_layer = Dense(target_vocab_size,
                         kernel_initializer = tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))


    # 4. Training decoder
    with tf.variable_scope("decode"):
        # get object of help
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_embed_input,
                                                            sequence_length=target_sequence_length,
                                                            time_major=False)
        # construct decoder
        training_decoder = tf.contrib.seq2seq.BasicDecoder(cell,
                                                           training_helper,
                                                           encoder_state,
                                                           output_layer) 
        training_decoder_output, _,_ = tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                                       impute_finished=True,
                                                                       maximum_iterations=max_target_sequence_length)
    # 5. Predicting decoder
    # training parameter sharing 
    with tf.variable_scope("decode", reuse=True):
        # const tensor <- batch_size
        start_tokens = tf.tile(tf.constant([target_letter_to_int['<GO>']], dtype=tf.int32), [batch_size], 
                               name='start_tokens')
        predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embeddings,
                                                                start_tokens,
                                                                target_letter_to_int['<EOS>'])
        predicting_decoder = tf.contrib.seq2seq.BasicDecoder(cell,
                                                        predicting_helper,
                                                        encoder_state,
                                                        output_layer)
        predicting_decoder_output, _,_ = tf.contrib.seq2seq.dynamic_decode(predicting_decoder,
                                                            impute_finished=True,
                                                            maximum_iterations=max_target_sequence_length)
    
    return training_decoder_output, predicting_decoder_output
        

## seq2seq

In [23]:
def seq2seq_model(input_data, targets, lr, target_sequence_length, 
                  max_target_sequence_length, source_sequence_length,
                  source_vocab_size, target_vocab_size,
                  encoder_embedding_size, decoder_embedding_size, 
                  rnn_size, num_layers):
    
    # get output pf encoder state 
    _, encoder_state = get_encoder_layer(input_data, 
                                  rnn_size, 
                                  num_layers, 
                                  source_sequence_length,
                                  source_vocab_size, 
                                  encoding_embedding_size)
    
    
    # preprocessed decoder input 
    decoder_input = process_decoder_input(targets, target_letter_to_int, batch_size)
    
    # pass satte vector and inputs to decoder
    training_decoder_output, predicting_decoder_output = decoding_layer(target_letter_to_int, 
                                                                       decoding_embedding_size, 
                                                                       num_layers, 
                                                                       rnn_size,
                                                                       target_sequence_length,
                                                                       max_target_sequence_length,
                                                                       encoder_state, 
                                                                       decoder_input) 
    
    return training_decoder_output, predicting_decoder_output

In [24]:
len(target_data.split('\n')[3])

1513

In [28]:
# Hyperparameters 
# Number of Epochs
epochs = 60
# Batch Size
batch_size = 128
# RNN Size
rnn_size = 50
# Number of Layers
num_layers = 2
# Embedding Size
encoding_embedding_size = 10
decoding_embedding_size = 2000
# Learning Rate
learning_rate = 0.001

### Computation Graph


In [29]:
# Graph
train_graph = tf.Graph()

with train_graph.as_default():
    
    # Model input    
    input_data, targets, lr, target_sequence_length, max_target_sequence_length, source_sequence_length = get_inputs()
    
    training_decoder_output, predicting_decoder_output = seq2seq_model(input_data, 
                                                                      targets, 
                                                                      lr, 
                                                                      target_sequence_length, 
                                                                      max_target_sequence_length, 
                                                                      source_sequence_length,
                                                                      len(source_letter_to_int),
                                                                      len(target_letter_to_int),
                                                                      encoding_embedding_size, 
                                                                      decoding_embedding_size, 
                                                                      rnn_size, 
                                                                      num_layers)    
    
    training_logits = tf.identity(training_decoder_output.rnn_output, 'logits')
    predicting_logits = tf.identity(predicting_decoder_output.sample_id, name='predictions')
    
    masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(lr)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)


## Batches

In [30]:
def pad_sentence_batch(sentence_batch, pad_int):
    '''
    complement the seq in batch s.t. each line in bacth has same sequence_length
    
    Inputs：
    - sentence batch
    - pad_int: index of corresponding <PAD> 
    '''
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [pad_int] * (max_sentence - len(sentence)) for sentence in sentence_batch]

In [31]:
def get_batches(targets, sources, batch_size, source_pad_int, target_pad_int):
    '''
    Get batch size
    '''
    for batch_i in range(0, len(sources)//batch_size):
        start_i = batch_i * batch_size
        sources_batch = sources[start_i:start_i + batch_size]
        targets_batch = targets[start_i:start_i + batch_size]
        # make up the seq
        pad_sources_batch = np.array(pad_sentence_batch(sources_batch, source_pad_int))
        pad_targets_batch = np.array(pad_sentence_batch(targets_batch, target_pad_int))
        
        # record length of seq
        targets_lengths = []
        for target in targets_batch:
            targets_lengths.append(len(target))
        
        source_lengths = []
        for source in sources_batch:
            source_lengths.append(len(source))
        
        yield pad_targets_batch, pad_sources_batch, targets_lengths, source_lengths

## Train

In [None]:
# shffile to train and validation
train_source = source_int[batch_size:]
train_target = target_int[batch_size:]
# save one batch for validation
valid_source = source_int[:batch_size]
valid_target = target_int[:batch_size]
(valid_targets_batch, valid_sources_batch, valid_targets_lengths, valid_sources_lengths) = next(get_batches(valid_target, valid_source, batch_size,
                           source_letter_to_int['<PAD>'],
                           target_letter_to_int['<PAD>']))

display_step = 50 # print every 50steps loss

checkpoint = "trained_model.ckpt" 
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
        
    for epoch_i in range(1, epochs+1):
        for batch_i, (targets_batch, sources_batch, targets_lengths, sources_lengths) in enumerate(
                get_batches(train_target, train_source, batch_size,
                           source_letter_to_int['<PAD>'],
                           target_letter_to_int['<PAD>'])):
            
            _, loss = sess.run(
                [train_op, cost],
                {input_data: sources_batch,
                 targets: targets_batch,
                 lr: learning_rate,
                 target_sequence_length: targets_lengths,
                 source_sequence_length: sources_lengths})

            if batch_i % display_step == 0:
                
                # compute validation loss
                validation_loss = sess.run(
                [cost],
                {input_data: valid_sources_batch,
                 targets: valid_targets_batch,
                 lr: learning_rate,
                 target_sequence_length: valid_targets_lengths,
                 source_sequence_length: valid_sources_lengths})
                 
                print('Epoch {:>3}/{} Batch {:>4}/{} - Training Loss: {:>6.3f}  - Validation loss: {:>6.3f}'
                      .format(epoch_i,
                              epochs, 
                              batch_i, 
                              len(train_source) // batch_size, 
                              loss, 
                              validation_loss[0]))

    
    
    # Save Model 
    saver = tf.train.Saver()
    saver.save(sess, checkpoint)
    print('Model Trained and Saved')

Epoch   1/60 Batch    0/402 - Training Loss:  4.619  - Validation loss:  4.529


## Prediction

In [None]:
def source_to_seq(text):
    '''
    transfer the source data
    '''
    sequence_length = 7
    return [source_letter_to_int.get(word, source_letter_to_int['<UNK>']) for word in text] + [source_letter_to_int['<PAD>']]*(sequence_length-len(text))

In [None]:
# Input a title
input_word = 'common'
text = source_to_seq(input_word)

checkpoint = "./trained_model.ckpt"

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load Model 
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)

    input_data = loaded_graph.get_tensor_by_name('inputs:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    source_sequence_length = loaded_graph.get_tensor_by_name('source_sequence_length:0')
    target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_length:0')
    
    answer_logits = sess.run(logits, {input_data: [text]*batch_size, 
                                      target_sequence_length: [len(input_word)]*batch_size, 
                                      source_sequence_length: [len(input_word)]*batch_size})[0] 


pad = source_letter_to_int["<PAD>"] 

print('Initial input:', input_word)

print('\nSource')
print('  Word index:    {}'.format([i for i in text]))
print('  Input Words: {}'.format(" ".join([source_int_to_letter[i] for i in text])))

print('\nTarget')
print('  Word index:       {}'.format([i for i in answer_logits if i != pad]))
print('  Response Words: {}'.format(" ".join([target_int_to_letter[i] for i in answer_logits if i != pad])))