# COMP5329 - Deep Learning 

## Tutorial 8 - LSTM and GRU

**Semester 1, 2022**

**Objectives:**

* How to implement LSTM and GRU in PyTorch
* How to use LSTM, GRU and RNN cells in PyTorch
* How to process sequence data with deep learning


**Instructions:**

* Learn to count letters using RNN, LSTM and GRU


# Generate sequence data

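**Data description**
* Characters: A-Z, a-z, 0-9
* Label: the number of lowercase letters minus the number of uppercase letters in the line; the digits are noise. (The generated file stores this difference plus 29 so that every label is a non-negative class index.)
* 80000 lines of data: 64000 for training, 16000 for validation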

**Example** (sequence, followed by lowercase count minus uppercase count)
* aAA304     -1
* bbB234BbB   0
* ccccccC     5


In [2]:
import random

def generate_line(input_char):
    """Build one shuffled sample for a single letter.

    ``input_char`` is the code point of a lowercase letter; its uppercase
    form is taken to be the character 32 code points below it. The sample
    holds 1-30 lowercase copies, 1-30 uppercase copies and 0-100 random
    digits (noise), all shuffled together.

    Returns a ``(text, label)`` tuple where ``label`` equals
    ``lowercase_count - uppercase_count + 29``. With both counts in 1..30
    the label therefore lies in 0..58.
    """
    # Draw the three counts first so the random stream is consumed in a
    # fixed order: lowercase count, uppercase count, noise count, digits.
    lower_count = random.randint(1, 30)
    upper_count = random.randint(1, 30)
    noise_count = random.randint(0, 100)

    lower_char = chr(input_char)
    upper_char = chr(input_char - 32)
    chars = [lower_char] * lower_count + [upper_char] * upper_count
    chars += [str(random.randint(0, 9)) for _ in range(noise_count)]
    random.shuffle(chars)

    label = lower_count - upper_count + 29
    return ''.join(chars), label
    

def generate_data(size, filename):
    """Write ``size`` distinct samples to ``filename``, one per line.

    Each line is ``<sequence>\\t<label>``, where the pair comes from
    ``generate_line`` called with a random letter in ``a``..``z``. A sample
    is rejected when the same sequence, or its reverse, has already been
    written. Progress is printed every 10000 accepted samples.

    The file is opened in a ``with`` block so it is closed even if
    generation raises part-way through.
    """
    seen = set()
    count = 0
    with open(filename, "w") as f:
        while count < size:
            c = random.randint(ord('a'), ord('z'))
            src, target = generate_line(c)
            # Skip duplicates and mirror images of earlier samples.
            if src in seen or src[::-1] in seen:
                continue
            count += 1
            if count % 10000 == 0:
                print("generate %d line" % count)
            seen.add(src)
            f.write('\t'.join([src, str(target)]))
            f.write('\n')
    
# Generate the 80000-line data set that read_dataset() later loads from "seq.txt".
generate_data(80000, "seq.txt")

generate 10000 line
generate 20000 line
generate 30000 line
generate 40000 line
generate 50000 line
generate 60000 line
generate 70000 line
generate 80000 line


# Model Training
* Create the train/validation data split
* Zero-pad each sequence and map every character to an integer index
* Embed the input indices as vectors
* Feed the embedded sequence to the RNN/LSTM/GRU
* Predict the label from the last state of the sequence

## Import Pytorch library, and setup global hyperparameters

In [3]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F

gru_size = 50  # hidden-state width passed to LSTMModel / GRUModel in train_model
hidden_size = 50  # hidden-state width passed to RNNModel in train_model
embedding_size = 20  # length of each character embedding vector
input_length = 160  # padded sequence length; generate_line emits at most 30 + 30 + 100 characters
vob_size = 52 + 10 + 1  # 52 letters + 10 digits + index 0, which create_batch leaves as padding
output_size = 60  # number of output classes; labels from generate_line lie in 0..58
gru_keep_prob = 0.5  # NOTE(review): not referenced anywhere in the visible code
init_lr_rate = 0.001  # learning rate handed to the Adam optimizer
# decay_step = 4000
# decay_rate = 0.5
max_gradient_norm = 3  # max_norm argument for clip_grad_norm_ in train_model

  from .autonotebook import tqdm as notebook_tqdm


## Define an RNN Model

In [None]:
class RNNModel(nn.Module):
  """Plain tanh RNN built from one linear layer, plus a linear classifier.

  At every time step the current input is concatenated with the previous
  hidden state, passed through ``fully_connected`` and squashed with tanh
  to give the next hidden state.

  Args:
    in_feature: size of each input vector.
    hidden_size: size of the hidden state.
    n_class: number of output classes produced by ``predict``.
  """

  def __init__(self, in_feature, hidden_size, n_class):
    super(RNNModel, self).__init__()
    self.in_feature = in_feature
    self.hidden_size = hidden_size
    self.n_class = n_class
    self.fully_connected = nn.Linear(in_feature+self.hidden_size, self.hidden_size)
    self.pred_layer = nn.Linear(self.hidden_size, self.n_class)
    self.tanh = nn.Tanh()

  def forward(self, input, dtype=torch.float):
    """Run the recurrence over a time-major sequence.

    Args:
      input: tensor indexed as ``[time, batch, feature]``.
      dtype: dtype of the initial state and of the ``outputs`` buffer.

    Returns:
      ``(outputs, state)``: ``outputs`` has shape
      ``(T, batch, hidden_size)`` and holds every hidden state;
      ``state`` is the hidden state after the last step.
    """
    T = input.shape[0]
    batch_size = input.shape[1]
    # Allocate on the input's device so the model also works off-CPU.
    device = input.device
    outputs = torch.zeros(size=(T, batch_size, self.hidden_size), dtype=dtype, device=device)
    state = torch.zeros(size=(batch_size, self.hidden_size), dtype=dtype, device=device)

    for t in range(T):
      concat = torch.cat([input[t], state], dim=1)
      state = self.tanh(self.fully_connected(concat))
      outputs[t] = state
    return outputs, state

  def predict(self, input_state, dtype=torch.float):
    """Return class logits of shape ``(batch, n_class)`` from the last state.

    The logits are returned unsquashed: train_model feeds them to
    ``nn.CrossEntropyLoss``, which applies its own log-softmax, and the
    LSTM/GRU models in this file return raw logits as well.
    """
    _, last_state = self.forward(input_state, dtype=dtype)
    return self.pred_layer(last_state)

## Define an LSTM Model

In [4]:
class LSTMModel(nn.Module):
  """LSTM built from one fused linear layer, plus a linear classifier.

  ``fully_connected`` maps ``[input, h]`` to ``4 * hidden_size`` values
  that are split, in this order, into the input gate ``i``, forget gate
  ``f``, output gate ``o`` and candidate ``g``.

  Args:
    in_feature: size of each input vector.
    hidden_size: size of the hidden and cell states.
    n_class: number of output classes produced by ``predict``.
  """

  def __init__(self, in_feature, hidden_size, n_class):
    super(LSTMModel, self).__init__()
    self.in_feature = in_feature
    self.hidden_size = hidden_size
    self.n_class = n_class
    self.fully_connected = nn.Linear(self.in_feature+self.hidden_size, 4 * self.hidden_size)
    self.pred_layer = nn.Linear(self.hidden_size, self.n_class)
    self.tanh = nn.Tanh()
    self.sigmoid = nn.Sigmoid()

  def forward(self, input, dtype=torch.float):
    """Run the LSTM recurrence over a time-major sequence.

    Args:
      input: tensor indexed as ``[time, batch, feature]``.
      dtype: dtype of the initial states and of the ``outputs`` buffer.

    Returns:
      ``(outputs, h)``: ``outputs`` has shape ``(T, batch, hidden_size)``
      and holds every hidden state; ``h`` is the last hidden state.
    """
    T = input.shape[0]
    batch_size = input.shape[1]
    # Create all state on the input's device and in the requested dtype;
    # previously c/h were always default-dtype CPU tensors.
    device = input.device
    outputs = torch.zeros(size=(T, batch_size, self.hidden_size), dtype=dtype, device=device)
    c = torch.zeros(size=(batch_size, self.hidden_size), dtype=dtype, device=device)
    h = torch.zeros_like(c)

    for t in range(T):
      concat = torch.cat([input[t], h], dim=1)
      concat = self.fully_connected(concat)
      i, f, o, g = torch.split(concat, self.hidden_size, dim=1)
      i, f, o, g = self.sigmoid(i), self.sigmoid(f), self.sigmoid(o), self.tanh(g)
      c = f * c + i * g          # forget part of the old cell, add gated candidate
      h = o * self.tanh(c)       # expose a gated view of the cell
      outputs[t] = h

    return outputs, h

  def predict(self, input_state, dtype=torch.float):
    """Return raw class logits of shape ``(batch, n_class)`` from the last hidden state."""
    _, last_state = self.forward(input_state, dtype=dtype)
    predict = self.pred_layer(last_state)
    return predict


## Define a GRU Model

In [None]:
class GRUModel(nn.Module):
  """GRU built from two linear layers, plus a linear classifier.

  ``fully_connected_1`` maps ``[input, h]`` to ``2 * hidden_size`` values
  that are split, in this order, into the reset gate ``r`` and the update
  gate ``z``. ``fully_connected_2`` maps ``[input, r * h]`` to the
  candidate state.

  Args:
    in_feature: size of each input vector.
    hidden_size: size of the hidden state.
    n_class: number of output classes produced by ``predict``.
  """

  def __init__(self, in_feature, hidden_size, n_class):
    super(GRUModel, self).__init__()
    self.in_feature = in_feature
    self.hidden_size = hidden_size
    self.n_class = n_class
    self.fully_connected_1 = nn.Linear(in_feature+self.hidden_size, 2 * self.hidden_size)
    self.fully_connected_2 = nn.Linear(in_feature+self.hidden_size, self.hidden_size)
    self.pred_layer = nn.Linear(self.hidden_size, self.n_class)
    self.tanh = nn.Tanh()
    self.sigmoid = nn.Sigmoid()

  def forward(self, input, dtype=torch.float):
    """Run the GRU recurrence over a time-major sequence.

    Per step::

      r, z = sigmoid(fully_connected_1([x_t, h]))
      n    = tanh(fully_connected_2([x_t, r * h]))
      h    = (1 - z) * n + z * h

    Args:
      input: tensor indexed as ``[time, batch, feature]``.
      dtype: dtype of the initial state and of the ``outputs`` buffer.

    Returns:
      ``(outputs, state)``: ``outputs`` has shape
      ``(T, batch, hidden_size)`` and holds every hidden state;
      ``state`` is the hidden state after the last step.
    """
    T = input.shape[0]
    batch_size = input.shape[1]
    device = input.device
    outputs = torch.zeros(size=(T, batch_size, self.hidden_size), dtype=dtype, device=device)
    state = torch.zeros(size=(batch_size, self.hidden_size), dtype=dtype, device=device)

    for t in range(T):
      gates = self.sigmoid(self.fully_connected_1(torch.cat([input[t], state], dim=1)))
      reset, update = torch.split(gates, self.hidden_size, dim=1)
      # The reset gate decides how much of the old state feeds the candidate.
      candidate = self.tanh(self.fully_connected_2(torch.cat([input[t], reset * state], dim=1)))
      # The update gate interpolates between the old state and the candidate.
      state = (1 - update) * candidate + update * state
      outputs[t] = state

    return outputs, state

  def predict(self, input_state, dtype=torch.float):
    """Return raw class logits of shape ``(batch, n_class)`` from the last hidden state."""
    _, last_state = self.forward(input_state, dtype=dtype)
    predict = self.pred_layer(last_state)
    return predict

## Setup Dataset, Train and Evaluate Model

In [5]:
import os
import random
import time
import sys
import gc
import numpy as np

MAX_ITERATIONS = 100000  # train_model runs steps 0..MAX_ITERATIONS inclusive
VAL_INTERVAL = 1000  # run val_model every this many steps
PRINT_INTERVAL = 10  # print averaged time/loss every this many steps
batch_size = 64  # samples per training and validation batch

def read_dataset(file_name, train_size=64000):
    """Load the tab-separated data set and split it into train/validation.

    Each non-blank line must look like ``<sequence>\\t<integer label>``.
    The samples are shuffled with ``random.shuffle`` before splitting.

    Args:
        file_name: path of the file to read.
        train_size: number of shuffled samples placed in the training
            split; the remainder becomes the validation split.

    Returns:
        ``(train, val)``: two lists of ``[sequence, label]`` pairs.
    """
    samples = []
    # The with-block closes the file; the original left the handle open.
    with open(file_name) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            fields = line.split('\t')
            samples.append([fields[0], int(fields[1])])

    random.shuffle(samples)
    return samples[:train_size], samples[train_size:]


#map letter to number 
#a-z -> 1 - 26
#A-Z -> 27 - 52
#0-9 -> 53 - 62
def create_maps():
    """Return the character -> index lookup used to encode sequences.

    Indices are assigned consecutively starting at 1: ``a``-``z`` get
    1-26, ``A``-``Z`` get 27-52 and ``0``-``9`` get 53-62. Index 0 is never
    assigned to a character.
    """
    symbols = [
        chr(code)
        for first, last in (('a', 'z'), ('A', 'Z'), ('0', '9'))
        for code in range(ord(first), ord(last) + 1)
    ]
    return {symbol: index for index, symbol in enumerate(symbols, start=1)}

# One embedding table per (vocabulary size, embedding width), created lazily.
_EMBEDDING_TABLES = {}


def word_embedding(input_seq, vob_size, dtype=torch.float, embed_dim=None):
    """Look up an embedding vector for every index in ``input_seq``.

    The original code built a new, randomly initialised ``nn.Embedding`` on
    every call, so the same character received a different vector at each
    training step and again at validation time. The table is now created
    once per ``(vob_size, embed_dim)`` and reused, which makes the mapping
    stable for the life of the process.

    The table is held here rather than inside a model, so an optimizer
    built only from ``model.parameters()`` does not update it.

    Args:
        input_seq: integer tensor of indices in ``[0, vob_size)``.
        vob_size: number of rows in the embedding table.
        dtype: unused; kept so existing callers keep working.
        embed_dim: width of each embedding vector. Defaults to the
            file-level ``embedding_size``.

    Returns:
        Tensor shaped like ``input_seq`` with a trailing ``embed_dim`` axis.
    """
    if embed_dim is None:
        embed_dim = embedding_size
    key = (vob_size, embed_dim)
    table = _EMBEDDING_TABLES.get(key)
    if table is None:
        table = nn.Embedding(vob_size, embed_dim)
        _EMBEDDING_TABLES[key] = table
    return table(input_seq.long())

def create_batch(datas, maps):
    """Encode a list of ``[sequence, label]`` samples as NumPy arrays.

    Every sequence is translated character by character through ``maps``
    and right-aligned in a row of ``input_length`` int32 values; the unused
    leading positions stay 0.

    Returns:
        ``(seqs, labels)``: ``seqs`` has shape ``(len(datas), input_length)``
        and ``labels`` has shape ``(len(datas),)``, both int32.
    """
    count = len(datas)
    seqs = np.zeros((count, input_length), dtype=np.int32)
    labels = np.zeros(count, dtype=np.int32)
    for row, sample in enumerate(datas):
        labels[row] = sample[1]
        text = sample[0]
        # Left-pad with zeros: the text occupies the last len(text) columns.
        offset = input_length - len(text)
        for col, char in enumerate(text, start=offset):
            seqs[row][col] = maps[char]

    return seqs, labels


def train_model(maps, func_type):
  """Train the selected recurrent model on "seq.txt" and report progress.

  Args:
    maps: character -> index lookup passed on to ``create_batch``.
    func_type: ``'rnn'``, ``'lstm'`` or ``'gru'``. Any other value prints
      a message and returns without training.

  The loop runs ``MAX_ITERATIONS + 1`` steps with Adam and cross-entropy
  loss, clipping the gradient norm to ``max_gradient_norm``. The mean step
  time and loss are printed every ``PRINT_INTERVAL`` steps and validation
  accuracy every ``VAL_INTERVAL`` steps.
  """
  train_data, val_data = read_dataset("seq.txt")
  pointer = 0
  if func_type == 'rnn':
    model = RNNModel(embedding_size, hidden_size, output_size)
  elif func_type == 'lstm':
    model = LSTMModel(embedding_size, gru_size, output_size)
  elif func_type == 'gru':
    model = GRUModel(embedding_size, gru_size, output_size)
  else:
    print('Please specify a valid model!')
    return
  model.train()
  optimizer = optim.Adam(model.parameters(), lr=init_lr_rate)
  model_loss = nn.CrossEntropyLoss()
  step_time, loss, steps_since_print = 0.0, 0.0, 0
  for step in range(MAX_ITERATIONS + 1):
    start_time = time.time()
    # Reshuffle only when a full batch no longer fits; the original ">="
    # skipped the last full batch of every epoch.
    if pointer + batch_size > len(train_data):
      random.shuffle(train_data)
      pointer = 0
    datas = train_data[pointer:pointer + batch_size]
    pointer += batch_size
    input_seq, label = create_batch(datas, maps)
    input_seq, label = torch.from_numpy(input_seq), torch.from_numpy(label)
    seq_emb = word_embedding(input_seq, vob_size)
    # Models expect time-major input: (batch, time, emb) -> (time, batch, emb).
    input_seq_emb = seq_emb.permute(1,0,2)
    out = model.predict(input_seq_emb)
    step_loss = model_loss(out, label.long())
    optimizer.zero_grad()
    step_loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_gradient_norm)
    optimizer.step()
    end_time = time.time()
    step_time += (end_time - start_time)
    # .item() yields a plain float, so the running sum holds no autograd
    # tensors between prints.
    loss += step_loss.item()
    steps_since_print += 1

    if step % PRINT_INTERVAL == 0:
      # Average over the steps actually accumulated (only one at step 0).
      mean_time = step_time / steps_since_print
      mean_loss = loss / steps_since_print
      print("step %d, time %.3f, loss %.3f" % (step, mean_time, mean_loss))
      step_time, loss, steps_since_print = 0.0, 0.0, 0

    if step % VAL_INTERVAL == 0:
      model.eval()
      val_rate = val_model(model, val_data, maps)
      model.train()
      print("val accuracy is %.3f " % (val_rate))

def val_model(model, dataset, maps):
  """Return the fraction of ``dataset`` samples that ``model`` classifies correctly.

  The data set is processed in consecutive slices of ``batch_size`` samples
  with gradient tracking disabled; the predicted class is the arg-max of
  ``model.predict``. The caller is responsible for switching the model
  between eval and train mode.
  """
  correct = 0
  with torch.no_grad():
    for begin in range(0, len(dataset), batch_size):
      chunk = dataset[begin: begin + batch_size]
      seqs, labels = create_batch(chunk, maps)
      seqs, labels = torch.from_numpy(seqs), torch.from_numpy(labels)
      # (batch, time, emb) -> (time, batch, emb), as the models expect.
      embedded = word_embedding(seqs, vob_size).permute(1,0,2)
      scores = model.predict(embedded)
      predicted = np.argmax(scores.detach().numpy(), axis = -1)
      correct += np.sum(labels.detach().numpy() == predicted)
    return 1.0 * correct / len(dataset)

# Build the character -> index lookup, pick a model type and start training.
maps = create_maps()
func_type = "lstm" # rnn/lstm/gru
train_model(maps, func_type)
    
    

  

NameError: name 'RNNModel' is not defined

# Exercise 
* Implement RNN, LSTM and GRU using basic PyTorch functions
* Try adjusting the model's hyperparameters to achieve better performance