In [1]:
import utils
import numpy as np
import edf
from time import time
import pickle
import os

train_data, trcnt = utils.load_data_onechar('data/ptb.train.txt')
valid_data, vacnt = utils.load_data_onechar('data/ptb.valid.txt')
test_data, tecnt = utils.load_data_onechar('data/ptb.test.txt')

#train_data, trcnt = utils.load_data_onechar('data/ptb.train.short.txt')
#valid_data, vacnt = utils.load_data_onechar('data/ptb.valid.short.txt')
#test_data, tecnt = utils.load_data_onechar('data/ptb.test.short.txt')

In [2]:
hidden_dim = 200 #hidden size
n_vocab = utils.n_vocab
batch = 50
parameters = []
model = 'model_LSTM.pkl'
eta = 0.5
decay = 0.9

inp = edf.Value()
np.random.seed(0)


edf.params = []
# LSTM parameters
# input embedding
C2V = edf.Param(edf.xavier((n_vocab, hidden_dim)))
# forget gate
Wf = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bf = edf.Param(np.zeros((hidden_dim)))
# input gate
Wi = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bi = edf.Param(np.zeros((hidden_dim)))
# carry cell
Wc = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bc = edf.Param(np.zeros((hidden_dim)))
# output cell
Wo = edf.Param(edf.xavier((2*hidden_dim, hidden_dim)))
bo = edf.Param(np.zeros((hidden_dim)))
# output embedding
V = edf.Param(edf.xavier((hidden_dim, n_vocab)))
# for sake of saving
parameters.extend([C2V, Wf, bf, Wi, bi, Wc, bc, Wo, bo, V])
#https://gist.github.com/karpathy/d4dee566867f8291f086
#Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden. C2V
#Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden Wi,Wf,Wc,Wo
#Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output  V

# load the trained model if exist
if os.path.exists(model):
    with open(model, 'rb') as f:
        p_value = pickle.load(f)
        idx = 0
        for p in p_value:
            parameters[idx].value = p
            idx += 1
                    

# Please finish your LSTM cell in this function. it will build the model given the input inp, it should
# return loss and prob score
def LSTM(x,cprev, hprev): #previous c and previous h
    #reference: CS231 LSTM
    #print(x.shape)
    #print(hprev.shape)
    f = edf.Sigmoid(edf.Add(bf,edf.VDot(edf.ConCat(x,hprev),Wf))) #should this be vdot?
    i = edf.Sigmoid(edf.Add(bi,edf.VDot(edf.ConCat(x,hprev),Wi))) #should this be vdot? and the order of multiplication
    g = edf.Tanh(edf.Add(bc,edf.VDot(edf.ConCat(x,hprev),Wc))) # g here is carry. should this be sigmoid?
    o = edf.Sigmoid(edf.Add(bo,edf.VDot(edf.ConCat(x,hprev),Wo))) #should this be vdot?
    #print(i.x.shape,g.x.shape)
    c = edf.Add(edf.Mul(f,cprev),edf.Mul(i,g))
    h = edf.Mul(o,edf.Tanh(c))
    
    return c,h

def BuildModel():
 
    edf.components = []
    score = []
    loss = edf.Value(edf.DT(0))
    B,T = inp.value.shape #batch size, sentence length
    
    c0 = edf.Value(np.random.randn(B,hidden_dim))
    h0 = edf.Value(np.random.randn(B,hidden_dim))
    
    #c = []
    #h = []
    for i in range(T-1):
        x = edf.Reshape(edf.Embed(edf.Value(inp.value[:,i]), C2V),[-1, hidden_dim])
        c0,h0 = LSTM(x,c0,h0)
        #print(xt.value.shape)
        #print(c0.value.shape)
        #print(h0.value.shape)
        
        prediction =  edf.SoftMax(edf.VDot(h0, V)) #Dot or Mul
        
        #loss func reference: mingda chen.
        score.append(prediction)
        mask = np.zeros(B * n_vocab)
        idx = np.int32(inp.value[:,i+1])
        mask_mask = [i*n_vocab+j for (i, j) in zip(range(B),idx) if j!=0]
        mask[mask_mask] = 1
        mask = edf.Value(mask.reshape(B, n_vocab))
        loss = edf.Add(edf.MeanwithMask(edf.LogLoss(prediction),mask),loss)
        
    loss = edf.Mul(loss,edf.Value(1./T))
    #wrand = np.random.randn(*score.shape)
    #loss = np.sum(score * wrand)
        
    return loss, score
    
    
# calculate the perplexity         
def CalPerp(score):
    
    prob = [p.value for p in score]
    prob = np.transpose(np.stack(prob, axis = 0),(1,0,2))
    
    B = prob.shape[0]
    T = prob.shape[1]
    V = prob.shape[2]
    
    masks = np.zeros((B, T), dtype=np.int32)
    masks[inp.value[:,1:] != 0] = 1
    
    prob = prob.reshape(-1)
    idx = np.int32(inp.value[:,1:].reshape(-1))
    outer_dim = len(idx)
    inner_dim = len(prob)/outer_dim
    pick = np.int32(np.array(range(outer_dim))*inner_dim + idx)
    prob = prob[pick].reshape(B, T)
        
    return -np.sum(np.log(prob[np.nonzero(prob*masks)]))


# predict the sequence
def Predict(max_step, prefix):
   
    edf.components = []

    T = max_step       
    h = edf.Value(np.zeros((1, hidden_dim))) 
    c = edf.Value(np.zeros((1, hidden_dim))) 
    
    prediction = []

    for t in range(T):
   
        if t < len(prefix):
            pred = edf.Value(prefix[t])
            prediction.append(pred)              
        else:
            prediction.append(pred)

        wordvec = edf.Embed(pred, C2V)
        xt = edf.Reshape(wordvec, [-1, hidden_dim])
        h_next,c_next = LSTM(xt, h, c)
        p = edf.SoftMax(edf.VDot(h_next, V))
        pred = edf.ArgMax(p)
        h = h_next
        c = c_next   
            
    edf.Forward()
    
    idx = [pred.value for pred in prediction]
    stop_idx = utils.to_index('}')
    
    if stop_idx in idx:
        return idx[0:idx.index(stop_idx)+1]
    else:
        return idx

def Eval(data, cnt):
    
    perp = 0.
    avg_loss = 0.
    test_batches = range(0, len(data), batch)
    test_minbatches = [data[idx:idx+batch] for idx in test_batches]
    
    for minbatch in test_minbatches:
        
        x_padded = utils.make_mask(minbatch)
        inp.set(x_padded)
        loss, score = BuildModel()
        edf.Forward()
        avg_loss += loss.value
        perp += CalPerp(score)
           
    perp = np.exp(perp/cnt)
    avg_loss /= len(test_batches)
    return perp, avg_loss


############################################### training loop #####################################################

batches = range(0, len(train_data), batch)
minbatches = [train_data[idx:idx+batch] for idx in batches]

epoch = 30

# initial Perplexity and loss
perp, loss = Eval(valid_data, vacnt)
print("Initial: Perplexity: %0.5f Avg loss = %0.5f" % (perp, loss))    
best_loss = loss
prefix = 'the agreements bring'  
generation = Predict(400, utils.to_idxs(prefix))
print("Initial generated sentence ")
print (utils.to_string(generation))
    
    
for ep in range(epoch):

    perm = np.random.permutation(len(minbatches)).tolist() 
    stime=time()
    
    for k in range(len(minbatches)):
        
        minbatch = minbatches[perm[k]]
        x_padded = utils.make_mask(minbatch)
        inp.set(x_padded)
        loss, score = BuildModel();
        #print(inp.shape)
        edf.Forward()
        edf.Backward(loss)
        edf.GradClip(10)
        edf.SGD(eta)
       
    duration = (time() - stime)/60.
    
    perp, loss = Eval(valid_data, vacnt)
    print("Epoch %d: Perplexity: %0.5f Avg loss = %0.5f [%.3f mins]" % (ep, perp, loss, duration))
    
    # generate some text given the prefix and trained model
    prefix = 'the agreements bring'  
    generation = Predict(400, utils.to_idxs(prefix))
    print("Epoch %d: generated sentence " % ep)
    print (utils.to_string(generation)) 

    if loss < best_loss:
        
        best_loss = loss
        # save the model
        f = open(model, 'wb')
        p_value = []
        for p in parameters:
            p_value.append(p.value)
        pickle.dump(p_value, f)
        
    else:
        
        # load the last best model and decay the learning rate
        eta *= decay
        with open(model, 'rb') as f:
            p_value = pickle.load(f)
            idx = 0
            for p in p_value:
                parameters[idx].value = p
                idx += 1

Initial: Perplexity: 49.16116 Avg loss = 3.88498
Initial generated sentence 
the agreements bringwx*//659e&@$$vl77#*b8bffdx<p3u7dm77dmx3h77dmal77#*b/655zssd<>yyy.wxp37dm77dmx3h77dmal77#*b/655zssd<>yyy.wxp37dm77dmx3h77dmal77#*b/655zssd<>yyy.wxp37dm77dmx3h77dmal77#*b/655zssd<>yyy.wxp37dm77dmx3h77dmal77#*b/655zssd<>yyy.wxp37dm77dmx3h77dmal77#*b/655zssd<>yyy.wxp37dm77dmx3h77dmal77#*b/655zssd<>yyy.wxp37dm77dmx3h77dmal77#*b/655zssd<>yyy.wxp37dm77dmx3h77dmal77#*b/655zssd<>yyy.wxp
Epoch 0: Perplexity: 8.42353 Avg loss = 2.15851 [17.282 mins]
Epoch 0: generated sentence 
the agreements bringe to the are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are are
Epoch 1: Perplexity: 6.80625 A

KeyboardInterrupt: 