In [2]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import os
import re
import pandas as pd
%matplotlib inline

In [3]:
def vocab2vec(vocab_size, vocab_length=10**7):
    f = open("Combined_String.txt", "r")
    s = f.read()
    f.close()
    D = 'abcdefghijklmnopqrstuvwxyz .,\'1234567890";'
    res = []
    for i in range(vocab_length):
        c = s[i].lower()
        v = np.zeros((vocab_size))
        try:
            idx = D.index(c)
            v[idx] = 1
            res.append(v)
        except (ValueError, IndexError) as e:
            pass
        
        
    ret = np.array(res) # A list of shape (vocab_length,) one-hot encoded characters
    print ("shape is: {}".format(ret.shape))
    return ret

#vocab2vec(40)

## Gated Recurrent Unit ##

- Tutorial [here](http://www.wildml.com/2015/10/recurrent-neural-network-tutorial-part-4-implementing-a-grulstm-rnn-with-python-and-theano/)
- How the seq2seq NMT model is set up here:
  Update gate:  
  $$z = \sigma ([x_t, C_{t-1}] W_z)$$
  Information retain gate:  
  $$i = \sigma ([x_t, C_{t-1}] W_r)$$
  Update cell state:  
  $$C_t = (1-z) h + z C_{t-1}$$
  Output gate:  
  $$h = tanh ([x_r, C_{t-1}, i] W_h)$$

In [7]:
# My implementation of GRU on character reading - based on the equation above

class GRU:
    def __init__(self, vocab_size, cell_size, batch_size, continue_training = False, global_step = -1):
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.cell_size = cell_size
        self.global_step = global_step
        
        self.global_step = global_step
        self.MODEL_NAME = "./model/GRU"
        self.TEST_SAMPLE_SEQ_LENGTH = 100
        self._construct_networks(vocab_size, cell_size, batch_size, continue_training, global_step)
        
        
        
    def _weight(self, shape, dtype=tf.float32, name=None):
        m = 0
        s = 1
        return tf.Variable(tf.random_normal(shape=shape, mean=m, stddev=s, dtype=dtype), dtype, name=name)
    
    def _const(self, shape, name, dtype=tf.float32):
        d0 = shape[0]
        d1 = shape[1]
        tmp = np.zeros(shape=shape)
        tmp[:, 0] = np.ones(shape=[d0, 1])
        return tf.constant(tmp, dtype=dtype, name=name)
    
    def _ohe2char(self, ohe_vec): # takes only the first row in ohe_vec
        assert ohe_vec.shape[1] == self.vocab_size
        chars = 'abcdefghijklmnopqrstuvwxyz .,\'1234567890";'
        choice_id = np.random.choice(self.vocab_size, p=ohe_vec[0,:].ravel())
        return chars[choice_id]
            
    def _clip_if_not_none(self, grad, grad_lo, grad_hi):
        if grad == None:
            return grad
        else:
            return tf.clip_by_value(grad, grad_lo, grad_hi)
        
    def _construct_networks(self, vocab_size, cell_size, batch_size, continue_training, global_step):
        graph = tf.Graph()
        with graph.as_default():
            x = tf.placeholder(tf.float32, [batch_size, vocab_size], name="x")
            y = tf.placeholder(tf.float32, [batch_size, vocab_size], name="y")
            init_C = tf.placeholder(tf.float32, [batch_size, vocab_size], name="init_C")
        
            if not continue_training:
                # Update gate
                Wz = self._weight([2 * vocab_size, cell_size], name="Wz")              
                z = tf.nn.softmax(tf.matmul(tf.concat([init_C, x], axis=1), Wz), dim=1)

                # Reset gate
                Wi = self._weight([2 * vocab_size, cell_size], name="Wi")
                i = tf.nn.softmax(tf.matmul(tf.concat([init_C, x], axis=1), Wi), dim=1)

                # Output layers
                Wo = self._weight([3 * vocab_size, vocab_size], name="Wo")
                h_ = tf.tanh(tf.matmul(tf.concat([init_C, x, i], axis=1), Wo))
                h = tf.nn.dropout(h_, keep_prob = 0.9)
                hs = tf.nn.softmax(h, dim=1)
                
                # Update cell state
                C = (1 - z) * h + z * init_C

                # Loss function, etc.
                loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=h, name="loss"))
                optimizer = tf.train.AdamOptimizer()
                grad_limit = tf.constant(5.0, dtype=tf.float32, name="grad_limit")
                grads_and_vars = optimizer.compute_gradients(loss)
                clipped_grads_and_vars = []
                for grad, var in grads_and_vars:
                    clipped_grad = self._clip_if_not_none(grad, -grad_limit, grad_limit)
                    clipped_grads_and_vars.append((clipped_grad, var))
                train_step = optimizer.apply_gradients(clipped_grads_and_vars, name="train")

                # Session, Saver, etc.
                saver = tf.train.Saver()
                sess = tf.Session()
                sess.run(tf.global_variables_initializer())
                saver.save(sess, self.MODEL_NAME,global_step=0)

            else:
                sess = tf.Session()

                saver = tf.train.import_meta_graph(self.MODEL_NAME + "-{}.meta".format(global_step))
                saver.restore(sess,tf.train.latest_checkpoint('./'))

                graph = tf.get_default_graph()

                x = graph.get_tensor_by_name("x:0")
                y = graph.get_tensor_by_name("y:0")
                init_C = graph.get_tensor_by_name("init_C:0")
                C = graph.get_tensor_by_name("C:0")
                h = graph.get_tensor_by_name("h:0")
                Wz = graph.get_tensor_by_name("Wz:0")
                Wi = graph.get_tensor_by_name("Wi:0")
               
                Wo = graph.get_tensor_by_name("Wo:0")
                loss = graph.get_tensor_by_name("loss:0")

                train_step = graph.get_tensor_by_name("train:0")
            
        
        # After creation, save to class variables
        self.x = x
        self.y = y
        self.init_C = init_C
        self.C = C
        self.h = h
        self.hs = hs
        self.loss = loss
        self.train_step = train_step
        self.saver = saver
        self.sess = sess
        
        

        
    def train(self, steps, training_data, sample = True, sample_every = 200000, save_per_step = 1000):
        save_per_steps = 10
        batch_size = self.batch_size
        vocab_size = self.vocab_size
        cell_size = self.cell_size
        
        
        for stp in range(steps):
            next_C = np.random.rand(batch_size, cell_size)
            
            p = 0
            while p < (len(training_data) - batch_size - 1):
                prev_C = next_C
                
                fdata = {self.init_C: prev_C,
                         self.x: training_data[p : p + batch_size], 
                         self.y: training_data[p+1 : p+1+batch_size]
                         }
                _, next_C, loss = self.sess.run([self.train_step, self.C, self.loss], feed_dict = fdata)
                
                p += batch_size
                
            
                if sample and p % sample_every == 0 and stp % 100 == 0:
                    # Perform a trial of sample run 
                    words_outputs = ""
                    for i in range(self.TEST_SAMPLE_SEQ_LENGTH):
                        fdata = {self.init_C: prev_C,
                                self.x: training_data[p : p + batch_size],
                                 self.y: training_data[p+1 : p+1+batch_size]
                                }
                        vec_ohe, loss = self.sess.run([self.hs, self.loss], feed_dict = fdata)
                        words_outputs += self._ohe2char(vec_ohe)
                    
                    print ("--- n = {}, p = {}, loss = {} ---".format(self.global_step+1, p, loss))
                    print ("{}\n".format(words_outputs))
                    
                    
            if self.global_step % save_per_steps == 0:
                self.saver.save(self.sess, self.MODEL_NAME, global_step = self.global_step)
        
            self.global_step += 1
    
    


In [5]:
if __name__ == "__main__":
    print ("Started!")
    gru = GRU(vocab_size = 40,
                    cell_size = 40,
                    batch_size = 1000,
                    continue_training = False,
                    global_step = -1)
    training_words = vocab2vec(40, 10 ** 6)-
    gru.train(steps = 10**5, training_data = training_words, sample = True, sample_every = 500000, save_per_step = 100000)
# Used 0.00001 as the standard deviation of the weights

Started!
shape is: (992170, 40)
--- n = -1, p = 500000, loss = 2.9279861450195312 ---
qfsls rdcw62s , srtr'jaks'anscn8unb pso56dsfxi m  xtayt bib3ovv ethe 2iuc sa6epksmezhppedt.li aafkc.

--- n = 99, p = 500000, loss = 2.756561517715454 ---
otoux'q4csfts46e'tllo s1nnsocdndrohdckdc3pq0ts5,xtwnnr0 notgf sntu etd5nnfs8nop ytotwlctgs9v ougissw

--- n = 199, p = 500000, loss = 2.757103204727173 ---
tsc'nvdclgsttn2vsgpntt7sxf3ostvnd4aos tt0ptytve5rstenc mslty,gto esn've8tdttrn8 yqomc uttc s3sh5slrs

--- n = 299, p = 500000, loss = 2.7569212913513184 ---
vyc0uraqz8g ostltolns, u9gfenp21sgedctdtgsznetn4lkpnsclrwgl1tstijttzgbtstsn'ndls5ena9.tgsnetws.tnur8

--- n = 399, p = 500000, loss = 2.7572059631347656 ---
vxftcijhyhtnxwgmokl1gfbe.fkc52s3tn3oel6astizr tlg0llnscy7ytiqs05rn0xgdz8cimhecatneftlpto00v'tgclopf'

--- n = 499, p = 500000, loss = 2.757526159286499 ---
u8cs we cso nlthqsna7ae9cnzh3nm'7dazod rnet da'twccldkfv 7cgnnfasttsmnsosynfn0ossmnnonnwstn2c.tw mt'

--- n = 599, p = 500000, loss =

KeyboardInterrupt: 

In [7]:
if __name__ == "__main__":
    print ("Started!")
    gru = GRU(vocab_size = 40,
                    cell_size = 40,
                    batch_size = 1000,
                    continue_training = False,
                    global_step = -1)
    training_words = vocab2vec(40, 10 ** 6)
    gru.train(steps = 10**5, training_data = training_words, sample = True, sample_every = 500000, save_per_step = 100000)
# Used 1 as the standard deviation of the weights

Started!
shape is: (992170, 40)
--- n = -1, p = 500000, loss = 2.9273436069488525 ---
wq v 57gp8n34sm weao'nt coeottbgf y e ltgtcraef3dfralgfnskln  ccn3meet778qos7s9,hb8 d3nja3tt z,ztdeb

--- n = 99, p = 500000, loss = 2.7560577392578125 ---
ezven96,'id'nttt,e,yc nt. setrtr'znny7 zdtrfsnfh6lua1lgcvcnh5ov7ftjs6o 'c,'0ecd6kcunnl1 nsrsvtd  tss

--- n = 199, p = 500000, loss = 2.7572073936462402 ---
rnsc9cagc68en0sugtcv5oensn'avo9,94kgg34snttntymwenx26mcyltcjn8f'dnfkddgs yar1nl3st60dac29s,tgxlnek'c

--- n = 299, p = 500000, loss = 2.7574381828308105 ---
t5f862s eczts'rs jl oadd3enp  n88n,9cle8 svvl 8vsnmn8vtddhedfo6aue6cxvvbtjjdvzg. gsit98ssgmncj9nssss

--- n = 399, p = 500000, loss = 2.7579586505889893 ---
aonmuwdt tttdlsndgxccvscsg,t'nso7lon0cd9.stn 0c00,ylhnneg dtsnss'ndq95g no,otdct8cnntn6pv5avog.eqjzt

--- n = 499, p = 500000, loss = 2.7577033042907715 ---
vmorcpl8.1rnosovn5tsctyeesf8ttnu f.lqdcdk 3ncv8m sccmg tvdw1ystxnascom0n5ginconvnn nsllusnhn3yvwog 2

--- n = 599, p = 500000, los

KeyboardInterrupt: 

In [8]:
# Applied drouput layer beore h
if __name__ == "__main__":
    print ("Started!")
    gru = GRU(vocab_size = 40,
                    cell_size = 40,
                    batch_size = 1000,
                    continue_training = False,
                    global_step = -1)
    training_words = vocab2vec(40, 10 ** 6)
    gru.train(steps = 10**5, training_data = training_words, sample = True, sample_every = 500000, save_per_step = 100000)
# Used 1 as the standard deviation of the weights

Started!
shape is: (992170, 40)
--- n = 0, p = 500000, loss = 3.7101454734802246 ---
munk..i.0xx2'ht  hg,jksih9j9 utx2m.motukyhidnn.i.5nkfu9.uperim6'wsozekme2ceouy8tco9jkkmnw89h2xbtjs1.

--- n = 100, p = 500000, loss = 2.804236888885498 ---
k.4bt4scaeyauxo tdrlutkc5txmtc4ead.t4 sssd7ennfstcwnd ers sj9'zltztsfy6wg2s138ybscc6d.1htc.clp1snqjo

--- n = 200, p = 500000, loss = 2.80615234375 ---
en7ein 57sn scnh3xcet9 ccv9ssnkzcqohe2c 00 gofedo9. vqg.e8etn8t5gq9n lng6cn wa noo st tnt tvn'vutnn4

--- n = 300, p = 500000, loss = 2.8105077743530273 ---
0nsc vrdnldicov8,scts3l.m.ncgsgnss,mez.eccysfseyshlpsjulce bhlv 'stvoetqip0ougnmxedpnnnnjascrgb1kmam

--- n = 400, p = 500000, loss = 2.7862095832824707 ---
t qrjdpsttgdct 9ut3 0vz4m9gecwotsemmtbtnsbgpd'dnbnsdsl5dsdgn2nc1irsnssnkrovstcnkcujucyuskogjdtctfnqe

--- n = 500, p = 500000, loss = 2.7972395420074463 ---
nht0iksv0s14casn5lqb2tnncttnfurst'antc16pdl94xnwnxssdsesnwstls2eensocnjcrctwtvetg tbjokttfcocjsplfnk

--- n = 600, p = 500000, loss = 2.

KeyboardInterrupt: 