In [3]:
import os
import trax
import trax.fastmath.numpy as np
import pickle
import itertools
import numpy
import random as rnd
from trax import fastmath
from trax import layers as tl

# Set random seeds. Batch sampling and Gumbel noise in this notebook draw from
# numpy.random, which random.seed() alone does not affect, so seed both.
rnd.seed(32)
numpy.random.seed(32)

In [4]:
# Read every file in `dirname` into one list of stripped, lower-cased,
# non-empty lines.
dirname = 'data/'
lines = []
# os.listdir returns names in arbitrary order; sorting keeps the order of
# `lines` (and anything sliced from it later) the same on every run.
for filename in sorted(os.listdir(dirname)):
    path = os.path.join(dirname, filename)
    # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints): open() cannot read them.
    if not os.path.isfile(path):
        continue
    with open(path) as corpus_file:
        for line in corpus_file:
            # remove leading and trailing whitespace
            pure_line = line.strip()
            # keep only lines that still have content
            if pure_line:
                lines.append(pure_line.lower())

print(f"Number of lines: {len(lines)}")
print(lines[100])

Number of lines: 125097
hath beaten down young hotspur and his troops,


## make training and eval from data

In [5]:
# Hold out the final 1000 lines for evaluation and keep the rest for training.
# `lines` is rebound in place, so running this cell again trims another 1000 lines.
eval_lines, lines = lines[-1000:], lines[:-1000]

print(f"Number of lines for training: {len(lines)}")
print(f"Number of lines for validation: {len(eval_lines)}")

Number of lines for training: 124097
Number of lines for validation: 1000


## make a tensor from each line (convert each character to its Unicode code point with `ord`)

In [6]:
def line_to_tensor(line, EOS_int=1):
    """Encode a line as a list of integer character codes ending in `EOS_int`.

    Args:
        line: string to encode; each character is converted with `ord`.
        EOS_int: integer appended after the last character to mark the end
            of the sequence.

    Returns:
        A list of ints, one per character of `line`, followed by `EOS_int`.
    """
    char_ids = list(map(ord, line))
    return char_ids + [EOS_int]

In [7]:
# Quick check: one integer per character (via ord), followed by the EOS id 1.
line_to_tensor('I like deeplearning!')

[73,
 32,
 108,
 105,
 107,
 101,
 32,
 100,
 101,
 101,
 112,
 108,
 101,
 97,
 114,
 110,
 105,
 110,
 103,
 33,
 1]

## batch data generator

In [8]:
def batch_data_generator(batch_size, max_length, data_lines, line_to_tensor=line_to_tensor):
    """Yield an endless stream of zero-padded batches built from `data_lines`.

    Only lines with fewer than `max_length` characters are used. Eligible
    lines are visited in shuffled passes: a line is not repeated until the
    current pass can no longer fill a whole batch, at which point the pass is
    reshuffled (the few leftover lines of that pass are dropped). The set of
    eligible lines is computed once, when the first batch is requested.

    Args:
        batch_size: number of lines per batch.
        max_length: padded length of every row.
        data_lines: sequence of strings to sample from.
        line_to_tensor: callable turning one line into a list of ints.

    Yields:
        A tuple (inputs, targets, mask) of three arrays with one row per
        sampled line. `inputs` and `targets` are the same array. `mask` is 1
        where the id is non-zero and 0 where it is 0 (the padding value).

    Raises:
        ValueError: if no line in `data_lines` is shorter than `max_length`.
    """
    # Filter once. The original rebuilt this list on every batch, which also
    # discarded its own "remove used index" bookkeeping each time.
    eligible = numpy.array(
        [i for i, line in enumerate(data_lines) if len(line) < max_length],
        dtype=int,
    )
    if eligible.size == 0:
        raise ValueError("no line in data_lines is shorter than max_length")

    pool = eligible[:0]  # indices not yet used in the current pass
    while True:
        if eligible.size < batch_size:
            # Too few lines to fill a batch without repeats: sample with
            # replacement, as the original implementation did.
            batch_index = numpy.random.choice(eligible, batch_size)
        else:
            if pool.size < batch_size:
                pool = numpy.random.permutation(eligible)
            batch_index, pool = pool[:batch_size], pool[batch_size:]

        batch_ = []
        mask = []
        for i in batch_index:
            tensor = line_to_tensor(data_lines[i])
            # Right-pad with zeros up to max_length.
            tensor_pad = tensor + [0] * (max_length - len(tensor))
            batch_.append(tensor_pad)
            mask.append([0 if t == 0 else 1 for t in tensor_pad])
        batch_np_arr = np.array(batch_)
        mask_np_arr = np.array(mask)
        yield batch_np_arr, batch_np_arr, mask_np_arr

In [9]:
# Try out batch data generator
# Try out the batch data generator on a tiny hand-made corpus.
tmp_lines = ['12345678901', # length 11
             '123456789', # length 9
             '2345690', # length 7
             '345678901'] # length 9

# Get a batch size of 2, max length 15 (every line above is short enough to be eligible)
tmp_data_gen = batch_data_generator(batch_size=2, 
                              max_length=15, 
                              data_lines=tmp_lines,
                            )

# get one batch: a tuple (inputs, targets, mask)
tmp_batch = next(tmp_data_gen)

# view the batch
tmp_batch



(DeviceArray([[49, 50, 51, 52, 53, 54, 55, 56, 57,  1,  0,  0,  0,  0,  0],
              [50, 51, 52, 53, 54, 57, 48,  1,  0,  0,  0,  0,  0,  0,  0]],            dtype=int32),
 DeviceArray([[49, 50, 51, 52, 53, 54, 55, 56, 57,  1,  0,  0,  0,  0,  0],
              [50, 51, 52, 53, 54, 57, 48,  1,  0,  0,  0,  0,  0,  0,  0]],            dtype=int32),
 DeviceArray([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
              [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]], dtype=int32))

## GRU model

In [10]:
def GRULM(vocab_size=256, d_model=512, n_layers=2, mode='train'):
    """Assemble the GRU language model as a trax `Serial` layer.

    Args:
        vocab_size: size of the embedding table and of the final Dense layer.
        d_model: embedding width and number of units in every GRU layer.
        n_layers: how many GRU layers to stack.
        mode: forwarded to the ShiftRight layer.

    Returns:
        tl.Serial of ShiftRight, Embedding, `n_layers` GRU layers, Dense and
        LogSoftmax, in that order.
    """
    shift = tl.ShiftRight(mode=mode)
    embed = tl.Embedding(vocab_size=vocab_size, d_feature=d_model)
    # The GRU layers are passed to Serial as one nested list.
    recurrent = [tl.GRU(n_units=d_model) for _ in range(n_layers)]
    project = tl.Dense(n_units=vocab_size)
    to_log_probs = tl.LogSoftmax()
    return tl.Serial(shift, embed, recurrent, project, to_log_probs)


In [11]:
# Build the model with its default hyperparameters and print the layer stack.
model = GRULM()
print(model)

Serial[
  Serial[
    ShiftRight(1)
  ]
  Embedding_256_512
  GRU_512
  GRU_512
  Dense_256
  LogSoftmax
]


## train model

In [18]:
from trax.supervised import training


def train_model(model, data_generator, batch_size=32, max_length=64, lines=lines, eval_lines=eval_lines, n_steps=1, output_dir='model/'):
    """Train `model` with a trax training Loop and return that Loop.

    Args:
        model: trax layer to train (e.g. the result of GRULM()).
        data_generator: callable invoked as
            data_generator(batch_size, max_length, data_lines=...) that
            returns an iterator of batches.
        batch_size: passed through to `data_generator`.
        max_length: passed through to `data_generator`.
        lines: training lines. The default is the global `lines` as it was
            when this function was defined.
        eval_lines: evaluation lines. The default is the global `eval_lines`
            as it was when this function was defined.
        n_steps: number of steps passed to Loop.run.
        output_dir: passed to training.Loop as its output_dir.

    Returns:
        The training.Loop after run(n_steps) has returned.

    Raises:
        ValueError: if `data_generator` yields no batches at all.
    """

    def _endless_batches(data_lines):
        # Stream batches forever without retaining them. itertools.cycle keeps
        # a copy of every item it has yielded, so wrapping an endless generator
        # in it grows memory for the whole run. If the generator is finite it
        # is simply created again when it runs out.
        while True:
            produced = False
            for batch in data_generator(batch_size, max_length, data_lines=data_lines):
                produced = True
                yield batch
            if not produced:
                raise ValueError("data_generator produced no batches")

    train_task = training.TrainTask(
        labeled_data=_endless_batches(lines),
        loss_layer=tl.CrossEntropyLoss(),
        optimizer=trax.optimizers.Adam(0.0005)
    )

    eval_task = training.EvalTask(
        labeled_data=_endless_batches(eval_lines),
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
        n_eval_batches=3
    )

    training_loop = training.Loop(model,
                                  tasks=train_task,
                                  eval_tasks=eval_task,
                                  output_dir=output_dir)

    training_loop.run(n_steps=n_steps)

    return training_loop


In [20]:
# Train a fresh GRULM for 1000 steps with the default batch size, max length, data and output_dir.
# NOTE(review): the log recorded below starts at step 5700, so this run presumably
# resumed from a checkpoint already present in 'model/' — confirm; point output_dir
# at an empty directory to train from scratch.
training_loop = train_model(GRULM(),batch_data_generator,n_steps=1000)


Step   5700: Ran 100 train steps in 46.66 secs
Step   5700: train CrossEntropyLoss |  1.44672728
Step   5700: eval  CrossEntropyLoss |  1.41918472
Step   5700: eval          Accuracy |  0.55000877

Step   5800: Ran 100 train steps in 43.61 secs
Step   5800: train CrossEntropyLoss |  1.38289189
Step   5800: eval  CrossEntropyLoss |  1.40104055
Step   5800: eval          Accuracy |  0.56479341

Step   5900: Ran 100 train steps in 43.71 secs
Step   5900: train CrossEntropyLoss |  1.35343218
Step   5900: eval  CrossEntropyLoss |  1.38423947
Step   5900: eval          Accuracy |  0.56091684

Step   6000: Ran 100 train steps in 43.90 secs
Step   6000: train CrossEntropyLoss |  1.34320128
Step   6000: eval  CrossEntropyLoss |  1.33690719
Step   6000: eval          Accuracy |  0.57423945

Step   6100: Ran 100 train steps in 44.13 secs
Step   6100: train CrossEntropyLoss |  1.32550573
Step   6100: eval  CrossEntropyLoss |  1.36520954
Step   6100: eval          Accuracy |  0.56659261

Step   62


$$P(W) = \sqrt[N]{\prod_{i=1}^{N} \frac{1}{P(w_i| w_1,...,w_{i-1})}}$$

As an implementation trick, you usually take the log of that formula. This lets us use the log probabilities that our `RNN` outputs directly, and it turns exponents into products and products into sums, which makes the computation simpler and more efficient. You should also take care of the padding: padded positions must be excluded when calculating the perplexity, because including them would make the perplexity look artificially good.


$$\log P(W) = {\log\big(\sqrt[N]{\prod_{i=1}^{N} \frac{1}{P(w_i| w_1,...,w_{i-1})}}\big)}$$

$$ = {\log\big({\prod_{i=1}^{N} \frac{1}{P(w_i| w_1,...,w_{i-1})}}\big)^{\frac{1}{N}}}$$ 

$$ = {\log\big({\prod_{i=1}^{N}{P(w_i| w_1,...,w_{i-1})}}\big)^{-\frac{1}{N}}} $$
$$ = -\frac{1}{N}{\log\big({\prod_{i=1}^{N}{P(w_i| w_1,...,w_{i-1})}}\big)} $$
$$ = -\frac{1}{N}{\big({\sum_{i=1}^{N}{\log P(w_i| w_1,...,w_{i-1})}}\big)} $$



In [21]:
def test_model(preds, target):
    """Return the log perplexity of `preds` against `target`, ignoring padding.

    Args:
        preds: model outputs; the last axis is indexed by token id and is
            treated as log-probabilities.
        target: integer token ids; positions equal to 0 are treated as
            padding and excluded from the average.

    Returns:
        The negative mean, over non-padding positions, of the value `preds`
        assigns to the target token.
    """
    vocab_size = preds.shape[-1]
    # Pick out the prediction for the target token at every position.
    target_one_hot = tl.one_hot(target, vocab_size)
    token_log_probs = np.sum(preds * target_one_hot, axis=-1)

    # 1.0 for real tokens, 0.0 for padding (id 0).
    keep = 1.0 - np.equal(target, 0)
    kept_log_probs = token_log_probs * keep

    mean_log_prob = np.sum(kept_log_probs) / np.sum(keep)
    return -mean_log_prob

In [22]:
# Load the saved weights and report log perplexity / perplexity on three random batches.
model = GRULM()
batch_size =32
max_length=64
model.init_from_file('model/model.pkl.gz')
# Build the generator once. Creating a new one for every batch repeats its
# set-up work (scanning every line) without changing what is sampled.
ppx_batches = batch_data_generator(batch_size, max_length, lines)
# NOTE(review): the batches come from the training `lines`; pass `eval_lines`
# instead to report held-out perplexity.
for _ in range(3):
    batch = next(ppx_batches)
    preds = model(batch[0])
    log_ppx = test_model(preds, batch[1])
    print('The log perplexity and perplexity of your model are respectively', log_ppx, np.exp(log_ppx))

The log perplexity and perplexity of your model are respectively 1.3317634 3.7877166
The log perplexity and perplexity of your model are respectively 1.2793124 3.5941675
The log perplexity and perplexity of your model are respectively 1.3148134 3.724056


In [23]:
# Inspect the raw model output for one batch next to its targets.
# batch[0] is the input array and batch[1] the target array yielded by the generator.
batch = next(batch_data_generator(batch_size, max_length, lines))
preds = model(batch[0])
print(preds)
print(batch[1])

[[[-17.353485  -13.051613  -17.484028  ... -16.761162  -16.356499
   -17.164349 ]
  [-19.33699   -13.383178  -18.502068  ... -18.001366  -19.357914
   -18.92427  ]
  [-14.89674   -13.089214  -14.404482  ... -14.211534  -14.934781
   -12.932111 ]
  ...
  [-19.580896   -7.1406326 -20.074245  ... -17.992218  -18.432888
   -19.617348 ]
  [-19.65902    -7.682177  -20.06006   ... -17.952805  -18.853691
   -19.984495 ]
  [-20.172676   -7.265751  -20.601837  ... -18.207088  -18.812744
   -20.027431 ]]

 [[-17.353485  -13.051613  -17.484028  ... -16.761162  -16.356499
   -17.164349 ]
  [-18.151478   -8.387393  -17.373232  ... -17.808964  -17.810823
   -18.81025  ]
  [-17.926193  -13.913287  -18.117655  ... -16.872896  -16.792788
   -17.548904 ]
  ...
  [-19.338638  -11.687966  -19.283993  ... -17.402496  -18.97757
   -19.385492 ]
  [-19.442974  -11.361465  -19.691381  ... -17.903664  -18.74846
   -19.415577 ]
  [-19.249218  -11.773383  -19.554504  ... -17.796429  -19.018864
   -19.419933 ]]

 [

## Generating the language with your own model

In [24]:
def gumbel_sample(log_probs, temperature=1.0):
    """Draw an index along the last axis by perturbing scores with Gumbel noise.

    Args:
        log_probs: array of scores; sampling is over its last axis.
        temperature: multiplier on the noise. With 0 the result is the plain
            argmax of `log_probs`.

    Returns:
        The argmax over the last axis of `log_probs` plus scaled noise.
    """
    eps = 1e-6
    # Keep the uniform draws strictly inside (0, 1) so both logs stay finite.
    uniform_noise = numpy.random.uniform(low=eps, high=1.0 - eps, size=log_probs.shape)
    gumbel_noise = -np.log(-np.log(uniform_noise))
    perturbed = log_probs + gumbel_noise * temperature
    return np.argmax(perturbed, axis=-1)

def predict(num_chars, prefix):
    """Extend `prefix` by up to `num_chars` characters sampled from the global `model`.

    Args:
        num_chars: maximum number of characters to generate.
        prefix: starting text; returned unchanged at the front of the result.

    Returns:
        `prefix` followed by the generated characters. Generation stops early
        when the sampled id is 1 (EOS), which is not added to the result.
    """
    token_ids = [ord(ch) for ch in prefix]
    generated = list(prefix)
    total_len = len(prefix) + num_chars
    for _ in range(num_chars):
        n_known = len(token_ids)
        # Zero-pad the ids known so far to the fixed final length.
        padded = np.array(token_ids + [0] * (total_len - n_known))
        log_probs = model(padded[None, :])  # Add batch dim.
        # Sample from the model output at the first not-yet-filled position.
        sampled = int(gumbel_sample(log_probs[0, n_known]))
        token_ids.append(sampled)

        if sampled == 1:
            break  # EOS
        generated.append(chr(sampled))

    return "".join(generated)



In [25]:
# Generate up to 32 characters starting from an empty prefix.
print(predict(32, ""))

so upon each subscript, the king


In [28]:
# Generate up to 64 characters continuing the given prefix.
# NOTE(review): the corpus lines were lower-cased when loaded, so the capital 'I'
# is presumably a character the model never saw during training — confirm.
print(predict(64, "I am happy because"))

I am happy because need.
