# COMP5329 - Deep Learning 

## Tutorial 7 - LSTM and GRU

**Semester 1, 2018**

**Objectives:**

* How to implement LSTM and GRU in tensorflow
* How to use LSTM, GRU, RNN cell in tensorflow
* How to process sequence data with deep learning


**Instructions:**

* Learn to count letters using an RNN, an LSTM and a GRU

Lecturers: Dalu Guo, Jiayan Qiu, Chaoyue Wang, Xinyuan Chen, Zheyu Feng and Sanjeev Sharma

# Generate sequence data

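**Data description**
* Characters: A-Z, a-z, 0-9
* Each line repeats one letter in lowercase and in uppercase, with random digits mixed in as noise. The label is the number of lowercase letters minus the number of uppercase letters (the generator stores it shifted by +29 so that it is a non-negative class index).
* 80000 lines of data: 64000 for training and 16000 for validation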

**Example** (labels shown before the +29 offset applied by `generate_line`)
* aAA304     -1
* bbB234BbB   0
* ccccccC     5


In [None]:
import random

def generate_line(input_char):
    """Build one shuffled sample for a single letter.

    The sample contains 1-30 copies of the lowercase letter, 1-30 copies of
    the matching uppercase letter and 0-100 random digits, all shuffled
    together.

    Args:
        input_char: code point of a lowercase ASCII letter (``ord('a')`` to
            ``ord('z')``).

    Returns:
        A ``(text, label)`` tuple where ``label`` is
        ``lowercase_count - uppercase_count + 29``; the +29 shift keeps the
        label in the range 0..58.
    """
    lower_count = random.randint(1, 30)
    upper_count = random.randint(1, 30)
    lower = chr(input_char)
    upper = chr(input_char - 32)  # ASCII uppercase is 32 code points below lowercase
    symbols = [lower] * lower_count + [upper] * upper_count

    # Append the digit noise, then mix everything so position carries no signal.
    digit_count = random.randint(0, 100)
    symbols += [str(random.randint(0, 9)) for _ in range(digit_count)]
    random.shuffle(symbols)

    label = lower_count - upper_count + 29
    return ''.join(symbols), label
    

def generate_data(size, filename):
    """Write ``size`` unique samples to ``filename``, one per line.

    Each line is ``<sequence>\\t<label>`` as produced by ``generate_line``.
    A sample is rejected when its sequence, or the reverse of its sequence,
    has already been written.

    Args:
        size: number of lines to write.
        filename: path of the output file; it is overwritten.
    """
    seen = set()
    count = 0
    # The context manager guarantees the file is flushed and closed even if
    # generation fails part-way through.
    with open(filename, "w") as f:
        while count < size:
            c = random.randint(ord('a'), ord('z'))
            src, target = generate_line(c)
            if src in seen or src[::-1] in seen:
                continue
            seen.add(src)
            count += 1
            if count % 10000 == 0:
                print ("generate %d line" % count)
            f.write('\t'.join([src, str(target)]))
            f.write('\n')
    
if __name__ == "__main__":
    # Script entry point: write the 80000-line corpus used by the training cell.
    generate_data(size=80000, filename="seq.txt")

# Model Training
* Create the train/validation split
* Zero-pad the data and map letters to numbers
* Embed the input into vectors
* Feed the embedded sequence to the RNN/LSTM/GRU
* Predict the label from the last state of the sequence

## Create model and train data

In [None]:
import os
import random
import seq_label
import time
import tensorflow as tf
import sys
import gc
import numpy as np

# Training-loop settings; all intervals are counted in optimisation steps.
MAX_ITERATIONS = 100000  # last step number the training loop runs
VAL_INTERVAL = 1000  # evaluate on the validation split every this many steps
MODEL_SAVE_INTERVAL = 3000  # write a checkpoint every this many steps
PRINT_INTERVAL = 100  # print averaged step time and loss every this many steps
batch_size = 64  # number of samples per training and validation batch

def read_dataset(file_name, train_size=64000):
    """Load the tab-separated dataset, shuffle it and split it in two.

    Each non-blank line must look like ``<sequence>\\t<integer label>``.

    Args:
        file_name: path of the dataset file.
        train_size: number of shuffled samples placed in the training split;
            the remainder becomes the validation split. Defaults to 64000.

    Returns:
        ``(train, val)``: two lists of ``[sequence, label]`` pairs.
    """
    samples = []
    # Iterate the file lazily and close it deterministically.
    with open(file_name) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline) instead of
                # failing on the missing label column.
                continue
            fields = line.split('\t')
            samples.append([fields[0], int(fields[1])])

    random.shuffle(samples)
    return samples[:train_size], samples[train_size:]

#map letter to number 
#a-z -> 1 - 26
#A-Z -> 27 - 52
#0-9 -> 53 - 62
def create_maps():
    """Return the character-to-index vocabulary.

    Indices start at 1 and follow the order a-z, A-Z, 0-9, so index 0 is
    never assigned to a character.
    """
    alphabet = []
    for first, last in (('a', 'z'), ('A', 'Z'), ('0', '9')):
        alphabet.extend(chr(code) for code in range(ord(first), ord(last) + 1))

    return {symbol: index for index, symbol in enumerate(alphabet, start=1)}
        
def create_model(is_train, session, model_path, func_type):
    """Build a ``SeqLabel`` model and load or initialise its variables.

    Args:
        is_train: forwarded to ``SeqLabel``.
        session: TensorFlow session used to restore or initialise variables.
        model_path: directory searched for an existing checkpoint.
        func_type: recurrent cell type forwarded to ``SeqLabel``.

    Returns:
        The constructed model.
    """
    model = seq_label.SeqLabel(func_type, is_train)
    ckpt = tf.train.get_checkpoint_state(model_path)
    has_checkpoint = ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path)

    if not has_checkpoint:
        # Create a new model: nothing to resume from.
        print("Created model with fresh parameters.")
        session.run(tf.global_variables_initializer())
        return model

    # Restore the model from the most recent checkpoint.
    latest = ckpt.model_checkpoint_path
    print("Reading model parameters from %s" % latest)
    model.saver.restore(session, latest)
    return model

def create_batch(datas, maps):
    """Convert ``[sequence, label]`` pairs into padded integer arrays.

    Args:
        datas: list of ``[sequence, label]`` pairs.
        maps: character-to-index dictionary.

    Returns:
        ``(seqs, labels)``: an int32 array of shape
        ``(len(datas), seq_label.input_length)`` holding the mapped
        characters right-aligned with zeros on the left, and an int32 array
        of the labels.
    """
    count = len(datas)
    width = seq_label.input_length
    seqs = np.zeros((count, width), dtype = np.int32)
    labels = np.zeros(count, dtype = np.int32)

    for row, sample in enumerate(datas):
        labels[row] = sample[1]
        text = sample[0]
        offset = width - len(text)  # number of leading zero-padding slots
        for col, symbol in enumerate(text, start = offset):
            seqs[row][col] = maps[symbol]

    return seqs, labels

def train_model(maps, func_type):
    """Train a sequence model on ``seq.txt`` with periodic validation.

    Checkpoints are written to the folder ``model_<func_type>``; if that
    folder already holds a checkpoint, training resumes from its global step.

    Args:
        maps: character-to-index dictionary passed on to ``create_batch``.
        func_type: recurrent cell type forwarded to ``create_model``.
    """
    folder = "model_" + func_type
    if not os.path.exists(folder):
        os.makedirs(folder)
        
    checkpoint_path = os.path.join(folder, "seq.ckpt")
    train_data, val_data = read_dataset("seq.txt")
    # Index of the first sample of the next training batch.
    pointer = 0
    
    with tf.Session() as sess:
        model = create_model(True, sess, folder, func_type)
        # Accumulators for the averages printed every PRINT_INTERVAL steps.
        step_time, loss = 0.0, 0.0
        # Start one past the stored global step so a restored run continues counting.
        for step in range(model.global_step.eval() + 1, MAX_ITERATIONS + 1):
            start_time = time.time()
            # Reshuffle and restart once a full batch no longer fits.
            # NOTE(review): because of ">=", a batch that would end exactly at
            # the end of train_data is skipped as well - confirm this is intended.
            if pointer + batch_size >= len(train_data):
                random.shuffle(train_data)
                pointer = 0
            datas = train_data[pointer:pointer + batch_size]
            pointer += batch_size               
            input_seq, label = create_batch(datas, maps)
            step_loss, learning_rate = model.step(sess, "train", input_seq, label)
            end_time = time.time()
            step_time += (end_time - start_time)
            loss += step_loss
            if step % MODEL_SAVE_INTERVAL == 0:
                model.saver.save(sess, checkpoint_path, global_step = step)
            if step % PRINT_INTERVAL == 0:
                # Report the mean over the last PRINT_INTERVAL steps, then reset.
                step_time = step_time / PRINT_INTERVAL
                loss = loss / PRINT_INTERVAL
                print ("step %d, time %.3f, loss %.3f, rate %.5f" % (model.global_step.eval(), step_time, loss, learning_rate))
                step_time, loss = 0.0, 0.0
                sys.stdout.flush()
            if step % VAL_INTERVAL == 0:
                val_rate = val_model(sess, model, val_data, maps)
                print ("val accuracy is %.3f" % (val_rate))


def val_model(sess, model, dataset, maps):
    """Compute classification accuracy of ``model`` over ``dataset``.

    The dataset is evaluated in consecutive slices of ``batch_size`` samples;
    the last slice may be shorter.

    Args:
        sess: TensorFlow session the model lives in.
        model: object whose ``step(sess, "test", seqs, labels)`` returns
            per-class scores for each sample.
        dataset: list of ``[sequence, label]`` pairs.
        maps: character-to-index dictionary passed on to ``create_batch``.

    Returns:
        Fraction of samples whose highest-scoring class equals the label, or
        ``0.0`` when ``dataset`` is empty.
    """
    total_samples = len(dataset)
    if total_samples == 0:
        # Nothing to evaluate (e.g. the data file was smaller than the
        # training split); avoid dividing by zero.
        return 0.0

    correct = 0
    for start in range(0, total_samples, batch_size):
        datas = dataset[start:start + batch_size]
        input_seq, label = create_batch(datas, maps)
        answers = model.step(sess, "test", input_seq, label)
        answer_ids = np.argmax(answers, axis = -1)
        correct += np.sum(label == answer_ids)
    return 1.0 * correct / total_samples

if __name__=="__main__":
    # Script entry point: expects the cell type (rnn, lstm or gru) as argv[1].
    maps = create_maps()
    if len(sys.argv) < 2:
        print ("python train.py rnn/lstm/gru")
        # sys.exit is always available (the bare exit() helper is only
        # installed by the site module) and a non-zero status reports the
        # usage error to the calling shell.
        sys.exit(1)

    func_type = sys.argv[1]
    train_model(maps, func_type)

## Define Sequence Model

In [None]:
import tensorflow as tf
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import array_ops, embedding_ops
from tensorflow.contrib.layers.python.layers import initializers

# Model hyper-parameters.
gru_size = 50  # width of the recurrent state
embedding_size = 20  # width of each character embedding vector
input_length = 160  # fixed (padded) sequence length fed to the model
vob_size = 52 + 10 + 1  # 52 letters + 10 digits + 1 extra index (0 is used for padding)
output_size = 60  # number of output classes scored by the final projection
gru_keep_prob = 0.5  # NOTE(review): not referenced by the model code in this cell
# Optimiser settings.
init_lr_rate = 0.001  # initial learning rate
decay_step = 4000  # steps between learning-rate decays
decay_rate = 0.5  # multiplicative learning-rate decay factor
max_gradient_norm = 3  # global-norm threshold for gradient clipping

class SeqLabel(object):
    """Sequence classifier: embedding -> recurrent encoder -> linear projection.

    The graph is built in the constructor. ``input_seq`` is an int32
    placeholder of shape ``[batch, input_length]`` and ``label`` an int32
    placeholder of shape ``[batch]``. The final recurrent state is projected
    to ``output_size`` scores, exposed as ``predict``.
    """

    def __init__(self, func_type, is_train = True, dtype = tf.float32):
        """Build the graph.

        Args:
            func_type: recurrent cell to use: "rnn", "lstm" or "gru".
            is_train: when true, also build the loss, the decayed learning
                rate and the clipped-gradient update op.
            dtype: floating point type used for the recurrent encoder.
        """
        self.global_step = tf.Variable(0, trainable = False)
        self.gru_size = gru_size
        self.input_seq = tf.placeholder(tf.int32, [None, input_length])
        self.label = tf.placeholder(tf.int32, [None])
        
        input_embed = self.model_embedding(self.input_seq, vob_size)
        _, state = self.model_seq(input_embed, func_type, dtype)
        self.predict = self.model_answer(state)
        
        if is_train:
            params = tf.trainable_variables()
            self.loss = self.model_loss(self.predict, self.label)
            # The last argument enables staircase decay.
            self.learning_rate = tf.train.exponential_decay(init_lr_rate, self.global_step, decay_step, decay_rate, True)
            opt = tf.train.AdamOptimizer(self.learning_rate)
            gradients = tf.gradients(self.loss, params)
            # Rescale gradients so their global norm never exceeds max_gradient_norm.
            clipped_gradients, self.norm = tf.clip_by_global_norm(gradients, max_gradient_norm)
            self.updates = opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)
        
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep = 3)
    
    def step(self, session, func, input_seq, label):
        """Run one training or inference step.

        Args:
            session: TensorFlow session to run in.
            func: "train" to apply one optimiser update, "test" to only
                compute predictions.
            input_seq: values fed to the ``input_seq`` placeholder.
            label: values fed to the ``label`` placeholder.

        Returns:
            For "train": ``(loss, learning_rate)``. For "test": the
            prediction scores.

        Raises:
            ValueError: if ``func`` is neither "train" nor "test".
        """
        feed_dict = {}
        feed_dict[self.input_seq] = input_seq
        feed_dict[self.label] = label
        if func == "train":
            output_feeds = [self.loss, self.learning_rate, self.updates, self.norm]
            outputs = session.run(output_feeds, feed_dict)
            return outputs[0], outputs[1]
        elif func == "test":
            output_feeds = [self.predict]
            outputs = session.run(output_feeds, feed_dict)
            return outputs[0]
        else:
            # Fail loudly instead of silently returning None.
            raise ValueError("unknown step mode %r (expected 'train' or 'test')" % (func,))

    def fully_connected(self, inputs, out_size, scope = "fully_connected", reuse = False, dtype = tf.float32):
        """Apply an affine layer ``inputs @ weight + bias``.

        Args:
            inputs: rank-2 tensor whose second dimension is statically known.
            out_size: number of output units.
            scope: variable scope holding "weight" and "bias".
            reuse: whether to reuse existing variables in ``scope``.
            dtype: default dtype of the variable scope.

        Returns:
            Tensor of shape ``[batch, out_size]``.
        """
        with variable_scope.variable_scope(scope, reuse = reuse, dtype = dtype):
            input_depth = inputs.get_shape()[1].value
            w = tf.get_variable("weight", shape=[input_depth, out_size])
            b = tf.get_variable("bias", shape=[out_size], initializer = init_ops.zeros_initializer())
            r = tf.matmul(inputs, w) + b
        return r
    
    #word embedding
    def model_embedding(self, seq, vob_size, dtype = tf.float32):
        """Look up a trainable ``embedding_size``-wide vector for every index in ``seq``."""
        with variable_scope.variable_scope("embedding", dtype = dtype):
            init = initializers.xavier_initializer()
            word_embedding = tf.get_variable("word_embedding", shape = [vob_size, embedding_size], initializer = init, dtype = dtype)
            embeddings = embedding_ops.embedding_lookup(word_embedding, seq)
            return embeddings        
    
    def model_seq(self, seq, func_type, dtype = tf.float32):
        """Encode the embedded sequence with the selected recurrent cell.

        Args:
            seq: rank-3 tensor with the batch on axis 0 and time on axis 1.
            func_type: "rnn", "lstm" or "gru".
            dtype: floating point type of the recurrent state.

        Returns:
            ``(outputs, state)`` as returned by the chosen cell implementation.

        Raises:
            ValueError: if ``func_type`` is not one of the supported names.
        """
        # Move time to the leading axis and split into one tensor per timestep.
        seq = tf.transpose(seq, [1, 0, 2])
        input_list = tf.unstack(seq, axis = 0)
        if func_type == "rnn":
            outputs, state = self.model_rnn(input_list, dtype)
        elif func_type == "lstm":
            outputs, state = self.model_lstm(input_list, dtype)
        elif func_type == "gru":
            outputs, state = self.model_gru(input_list, dtype)
        else:
            raise ValueError("Nothing to do: unknown func_type %r (expected 'rnn', 'lstm' or 'gru')" % (func_type,))
         
        return outputs, state        
    
    def model_rnn(self, input_list, dtype = tf.float32):
        """Unrolled vanilla RNN: ``state = tanh(W [x_t, state] + b)``.

        Args:
            input_list: list with one ``[batch, features]`` tensor per timestep.
            dtype: floating point type of the state.

        Returns:
            ``(outputs, state)``: the list of states after every timestep and
            the final state.
        """
        first_input = input_list[0]
        input_shape = first_input.get_shape()
        fixed_batch_size = input_shape[0]
        # Use the static batch size when known, otherwise the dynamic one.
        if fixed_batch_size.value:
            batch_size = fixed_batch_size.value
        else:
            batch_size = array_ops.shape(first_input)[0]
        state = tf.zeros([batch_size, gru_size], dtype)
            
        outputs = []
        for index in range(len(input_list)):
            with variable_scope.variable_scope("rnn", dtype = dtype):
                # Variables are created on the first timestep and shared afterwards.
                reuse = index > 0
                concat = tf.concat([input_list[index], state], axis = 1)
                state = tf.tanh(self.fully_connected(concat, gru_size, reuse = reuse))
                outputs.append(state)
        return outputs, state 
    
    def model_lstm(self, input_list, dtype = tf.float32):
        """Exercise stub: must return ``(outputs, state)`` like ``model_rnn``."""
        #fill lstm code here
        raise NotImplementedError("model_lstm is an exercise stub; implement it to use func_type 'lstm'")
    
    def model_gru(self, input_list, dtype = tf.float32):
        """Exercise stub: must return ``(outputs, state)`` like ``model_rnn``."""
        #fill gru code here
        raise NotImplementedError("model_gru is an exercise stub; implement it to use func_type 'gru'")
    
    def model_tf_lstm(self, input_list, dtype = tf.float32):
        """Exercise stub for a version built on TensorFlow's own GRU/LSTM cells."""
        #fill tensorflow gru, lstm code here
        raise NotImplementedError("model_tf_lstm is an exercise stub")
    
    #project last state to answer 
    def model_answer(self, state, dtype = tf.float32):
        """Project the final recurrent state to ``output_size`` class scores."""
        with variable_scope.variable_scope("model_answer", dtype = dtype):
            predict = self.fully_connected(state, output_size)
            return predict
        
    #softmax with cross entropy 
    def model_loss(self, predict, labels):
        """Return the mean sparse softmax cross-entropy between ``predict`` and ``labels``."""
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits = predict, labels = labels)
        loss = tf.reduce_mean(loss)
        return loss  

# Exercise 
* Implement LSTM and GRU using basic TensorFlow functions
* Try TensorFlow's high-level LSTM and GRU functions
* Try adjusting the model's parameters to achieve better performance
* Try implementing attention to improve performance