In [3]:
import os
import trax
import trax.fastmath.numpy as np
import pickle
import itertools
import numpy
import random as rnd
from trax import fastmath
from trax import layers as tl

# Set random seeds. Batch sampling and Gumbel sampling in this notebook draw
# from numpy's global RNG, so it has to be seeded alongside Python's `random`.
rnd.seed(32)
numpy.random.seed(32)

In [4]:
dirname = 'data/'
lines = []
# os.listdir returns entries in arbitrary order; sort them so the order of
# `lines` (and therefore any split taken from its tail) is reproducible.
for filename in sorted(os.listdir(dirname)):
    path = os.path.join(dirname, filename)
    # skip sub-directories (e.g. .ipynb_checkpoints), which open() cannot read
    if not os.path.isfile(path):
        continue
    with open(path) as text_file:
        for line in text_file:
            # remove leading and trailing whitespace
            pure_line = line.strip()
            # keep only non-empty lines, lower-cased
            if pure_line:
                lines.append(pure_line.lower())

print(f"Number of lines: {len(lines)}")
print(lines[100])

Number of lines: 125097
hath beaten down young hotspur and his troops,


## make training and eval from data

In [5]:
# Hold out the last 1000 lines for evaluation; everything before them is
# used for training.
# NOTE: `lines` is rebound here, so re-running this cell without re-running
# the loading cell first removes a further 1000 lines each time.
eval_lines = lines[-1000:] 
lines = lines[:-1000] 

print(f"Number of lines for training: {len(lines)}")
print(f"Number of lines for validation: {len(eval_lines)}")

Number of lines for training: 124097
Number of lines for validation: 1000


## make a tensor from each line (convert each character to its integer code point)

In [6]:
def line_to_tensor(line, EOS_int=1):
    """Encode a string as a list of integer character codes.

    Args:
        line: the text to encode.
        EOS_int: integer appended after the last character to mark the end
            of the sequence.

    Returns:
        A list holding `ord(ch)` for every character of `line`, in order,
        followed by `EOS_int`.
    """
    codes = [ord(ch) for ch in line]
    return codes + [EOS_int]

In [7]:
line_to_tensor('I like deeplearning!')

[73,
 32,
 108,
 105,
 107,
 101,
 32,
 100,
 101,
 101,
 112,
 108,
 101,
 97,
 114,
 110,
 105,
 110,
 103,
 33,
 1]

## batch data generator

In [8]:
def batch_data_generator(batch_size, max_length, data_lines, line_to_tensor=line_to_tensor):
    """Yield an endless stream of zero-padded batches of encoded lines.

    Only lines with fewer than `max_length` characters are eligible. The
    eligible indices are computed once, shuffled at the start of every epoch
    and consumed `batch_size` at a time, so a line is not repeated until the
    epoch is over. Indices left at the end of an epoch that cannot fill a
    whole batch are dropped and a new shuffle begins. If there are fewer
    eligible lines than `batch_size`, indices are drawn with replacement
    instead.

    Args:
        batch_size: number of lines per batch.
        max_length: length every encoded line is zero-padded to.
        data_lines: sequence of strings to draw from.
        line_to_tensor: callable that turns one line into a list of ints.

    Yields:
        A tuple `(inputs, targets, mask)` of arrays built with `np.array`.
        `inputs` and `targets` are the same array of padded encodings;
        `mask` holds 1 where the encoding is non-zero and 0 elsewhere.

    Raises:
        ValueError: if no line in `data_lines` is shorter than `max_length`.
    """
    # Indices of the lines that are short enough; computed once, not per batch.
    eligible = numpy.array(
        [i for i, line in enumerate(data_lines) if len(line) < max_length],
        dtype=int,
    )
    if eligible.size == 0:
        raise ValueError(
            f"no line in data_lines is shorter than max_length={max_length}"
        )

    pool = eligible[:0]  # indices not yet used in the current epoch
    while True:
        if eligible.size < batch_size:
            # Too few eligible lines for a duplicate-free batch.
            batch_index = numpy.random.choice(eligible, batch_size)
        else:
            if pool.size < batch_size:
                # Start a new epoch with a fresh shuffle.
                pool = numpy.random.permutation(eligible)
            batch_index, pool = pool[:batch_size], pool[batch_size:]

        batch_ = []
        mask = []
        for i in batch_index:
            tensor = line_to_tensor(data_lines[i])
            tensor_pad = tensor + [0] * (max_length - len(tensor))
            batch_.append(tensor_pad)
            # padding positions (value 0) are masked out
            mask.append([0 if t == 0 else 1 for t in tensor_pad])

        batch_np_arr = np.array(batch_)
        mask_np_arr = np.array(mask)
        yield batch_np_arr, batch_np_arr, mask_np_arr

In [9]:
# Try out the batch data generator
tmp_lines = ['12345678901', # length 11
             '123456789', # length 9
             '2345690', # length 7
             '345678901'] # length 9

# Build a generator with batch size 2 and max length 15
# (all four lines are shorter than 15 characters, so all are eligible)
tmp_data_gen = batch_data_generator(batch_size=2, 
                              max_length=15, 
                              data_lines=tmp_lines,
                            )

# get one batch: a tuple of (inputs, targets, mask)
tmp_batch = next(tmp_data_gen)

# view the batch
tmp_batch



(DeviceArray([[49, 50, 51, 52, 53, 54, 55, 56, 57,  1,  0,  0,  0,  0,  0],
              [50, 51, 52, 53, 54, 57, 48,  1,  0,  0,  0,  0,  0,  0,  0]],            dtype=int32),
 DeviceArray([[49, 50, 51, 52, 53, 54, 55, 56, 57,  1,  0,  0,  0,  0,  0],
              [50, 51, 52, 53, 54, 57, 48,  1,  0,  0,  0,  0,  0,  0,  0]],            dtype=int32),
 DeviceArray([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
              [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]], dtype=int32))

## GRU model

In [10]:
def GRULM(vocab_size=256, d_model=512, n_layers=2, mode='train'):
    """Assemble the GRU language model as a `tl.Serial` stack.

    The stack is, in order: ShiftRight, Embedding, `n_layers` GRU layers,
    Dense and LogSoftmax.

    Args:
        vocab_size: size of the embedding vocabulary and of the final Dense
            layer.
        d_model: embedding feature size and number of units in each GRU.
        n_layers: how many GRU layers to stack.
        mode: passed through to `tl.ShiftRight`.

    Returns:
        The assembled `tl.Serial` model.
    """
    shift = tl.ShiftRight(mode=mode)
    embed = tl.Embedding(vocab_size=vocab_size, d_feature=d_model)
    recurrent_stack = [tl.GRU(n_units=d_model) for _ in range(n_layers)]
    project = tl.Dense(n_units=vocab_size)
    normalize = tl.LogSoftmax()
    return tl.Serial(shift, embed, recurrent_stack, project, normalize)


In [11]:
model = GRULM()
print(model)

Serial[
  Serial[
    ShiftRight(1)
  ]
  Embedding_256_512
  GRU_512
  GRU_512
  Dense_256
  LogSoftmax
]


## train model

In [18]:
from trax.supervised import training


def train_model(model, data_generator, batch_size=32, max_length=64, lines=lines, eval_lines=eval_lines, n_steps=1, output_dir='model/'):
    """Train `model` with a Trax training loop and return the loop.

    Args:
        model: the Trax model to train.
        data_generator: callable invoked as
            `data_generator(batch_size, max_length, data_lines=...)` that
            returns an iterator of labelled batches.
        batch_size: batch size forwarded to `data_generator`.
        max_length: maximum line length forwarded to `data_generator`.
        lines: lines used for the training stream.
        eval_lines: lines used for the evaluation stream.
        n_steps: number of steps passed to `training.Loop.run`.
        output_dir: directory handed to `training.Loop`.

    Returns:
        The `training.Loop` after `run(n_steps=n_steps)` has completed.
    """

    def _endless(data_lines):
        # Stream batches straight from the generator. Wrapping it in
        # itertools.cycle would store a copy of every batch ever yielded,
        # which grows without bound when the generator is already infinite.
        # A generator that does finish is simply re-created.
        while True:
            yielded = False
            for batch in data_generator(batch_size, max_length, data_lines=data_lines):
                yielded = True
                yield batch
            if not yielded:
                return  # an empty generator would otherwise spin forever

    infinite_train_generator = _endless(lines)
    infinite_eval_generator = _endless(eval_lines)

    train_task = training.TrainTask(
        labeled_data=infinite_train_generator,
        loss_layer=tl.CrossEntropyLoss(),
        optimizer=trax.optimizers.Adam(0.0005)
    )

    eval_task = training.EvalTask(
        labeled_data=infinite_eval_generator,
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
        n_eval_batches=3
    )

    training_loop = training.Loop(model,
                                  tasks = train_task,
                                  eval_tasks=eval_task,
                                  output_dir=output_dir)

    training_loop.run(n_steps=n_steps)

    return training_loop


In [None]:
# Train a freshly built GRULM for 10,000 steps; every other train_model
# argument (batch size, max length, data, output directory) keeps its default.
training_loop = train_model(GRULM(),batch_data_generator,n_steps=10000)


Step    100: Ran 99 train steps in 46.47 secs
Step    100: train CrossEntropyLoss |  2.76606679
Step    100: eval  CrossEntropyLoss |  2.26216833
Step    100: eval          Accuracy |  0.33201503

Step    200: Ran 100 train steps in 43.88 secs
Step    200: train CrossEntropyLoss |  2.13829851
Step    200: eval  CrossEntropyLoss |  1.99351176
Step    200: eval          Accuracy |  0.40818863

Step    300: Ran 100 train steps in 44.03 secs
Step    300: train CrossEntropyLoss |  1.97075748
Step    300: eval  CrossEntropyLoss |  1.88342321
Step    300: eval          Accuracy |  0.43857095

Step    400: Ran 100 train steps in 44.07 secs
Step    400: train CrossEntropyLoss |  1.86722732
Step    400: eval  CrossEntropyLoss |  1.82994707
Step    400: eval          Accuracy |  0.44792417

Step    500: Ran 100 train steps in 44.01 secs
Step    500: train CrossEntropyLoss |  1.79598188
Step    500: eval  CrossEntropyLoss |  1.70853364
Step    500: eval          Accuracy |  0.47246412

Step    60


$$P(W) = \sqrt[N]{\prod_{i=1}^{N} \frac{1}{P(w_i| w_1,...,w_{i-1})}}$$

As an implementation hack, you would usually take the log of that formula. This lets us use the log probabilities that our `RNN` outputs directly, and it turns exponents into products and products into sums, which makes the computation simpler and more efficient. You should also take care of the padding: padded positions must be excluded when calculating the perplexity, because including them would make the measure look artificially good.


$$\log P(W) = \log\left(\sqrt[N]{\prod_{i=1}^{N} \frac{1}{P(w_i| w_1,...,w_{i-1})}}\right)$$

$$ = \log\left(\left(\prod_{i=1}^{N} \frac{1}{P(w_i| w_1,...,w_{i-1})}\right)^{\frac{1}{N}}\right)$$

$$ = \log\left(\left(\prod_{i=1}^{N} P(w_i| w_1,...,w_{i-1})\right)^{-\frac{1}{N}}\right)$$
$$ = -\frac{1}{N}\log\left(\prod_{i=1}^{N} P(w_i| w_1,...,w_{i-1})\right)$$
$$ = -\frac{1}{N}\sum_{i=1}^{N} \log P(w_i| w_1,...,w_{i-1})$$



In [12]:
def test_model(preds, target):
    """Compute the log perplexity of `preds` against `target`, ignoring padding.

    Args:
        preds: model outputs whose last axis indexes the vocabulary; expected
            to be log-probabilities (GRULM ends in LogSoftmax).
        target: integer token ids; positions equal to 0 are treated as
            padding and excluded.

    Returns:
        The negated mean, over non-padding positions, of the prediction
        value at the target token.
    """
    vocab_size = preds.shape[-1]
    target_one_hot = tl.one_hot(target, vocab_size)

    # Pick out, for every position, the prediction assigned to the true token.
    token_log_probs = np.sum(preds * target_one_hot, axis=-1)

    # 1.0 for real tokens, 0.0 for padding.
    is_real_token = 1.0 - np.equal(target, 0)

    masked_total = np.sum(token_log_probs * is_real_token)
    return -(masked_total / np.sum(is_real_token))

In [16]:
# Rebuild the architecture and initialise it from the file saved under model/.
model = GRULM()
batch_size = 32
max_length = 64
model.init_from_file('model/model.pkl.gz')
# NOTE(review): the recorded output of this cell (log perplexity ~5.49, close
# to ln(256) ~ 5.55) is what a near-uniform model would produce -- confirm that
# model/model.pkl.gz is the fully trained checkpoint before trusting it.

# Create the generator once and draw three batches from it, instead of
# building a new generator (and re-scanning every line) for each batch.
ppx_batches = batch_data_generator(batch_size, max_length, lines)
for x in range(3):
    batch = next(ppx_batches)
    preds = model(batch[0])
    log_ppx = test_model(preds, batch[1])
    print('The log perplexity and perplexity of your model are respectively', log_ppx, np.exp(log_ppx))

The log perplexity and perplexity of your model are respectively 5.4902134 242.30891
The log perplexity and perplexity of your model are respectively 5.4907227 242.43234
The log perplexity and perplexity of your model are respectively 5.49048 242.3735


In [17]:
# Draw one batch and print the raw model output next to the targets
# (batch[0] is the model input, batch[1] the targets).
batch = next(batch_data_generator(batch_size, max_length, lines))
preds = model(batch[0])
print(preds)
print(batch[1])

[[[-5.5451574 -5.537999  -5.5434384 ... -5.5409994 -5.542995  -5.542787 ]
  [-5.540304  -5.5368576 -5.5455866 ... -5.5417385 -5.544574  -5.544505 ]
  [-5.555124  -5.5306234 -5.5492845 ... -5.5459957 -5.5411053 -5.546066 ]
  ...
  [-5.5460014 -5.5188766 -5.5379567 ... -5.5289564 -5.5380177 -5.537578 ]
  [-5.5460014 -5.518877  -5.5379567 ... -5.528958  -5.538017  -5.537578 ]
  [-5.5460005 -5.5188766 -5.537956  ... -5.528958  -5.538016  -5.537577 ]]

 [[-5.5451574 -5.537999  -5.5434384 ... -5.5409994 -5.542995  -5.542787 ]
  [-5.5538745 -5.5180736 -5.5448008 ... -5.5403438 -5.5445185 -5.5436845]
  [-5.5485077 -5.5182605 -5.5472755 ... -5.5424924 -5.546113  -5.5459046]
  ...
  [-5.5460005 -5.518877  -5.537956  ... -5.5289593 -5.538016  -5.537577 ]
  [-5.5460005 -5.518877  -5.537956  ... -5.5289593 -5.538016  -5.537577 ]
  [-5.5460005 -5.518877  -5.537956  ... -5.5289593 -5.538016  -5.537577 ]]

 [[-5.5451574 -5.537999  -5.5434384 ... -5.5409994 -5.542995  -5.542787 ]
  [-5.540304  -5.53685

## Generating the language with your own model

In [None]:
def gumbel_sample(log_probs, temperature=1.0):
    """Sample indices from a categorical distribution via Gumbel noise.

    Args:
        log_probs: array of log-probabilities; the last axis is the one
            sampled over.
        temperature: scale applied to the Gumbel noise before it is added;
            0 disables the noise and reduces this to a plain argmax.

    Returns:
        The argmax over the last axis of the noise-perturbed `log_probs`.
    """
    eps = 1e-6  # keeps the uniform draw away from 0 and 1 so both logs are finite
    uniform_noise = numpy.random.uniform(low=eps, high=1.0 - eps, size=log_probs.shape)
    gumbel_noise = -np.log(-np.log(uniform_noise))
    perturbed = log_probs + gumbel_noise * temperature
    return np.argmax(perturbed, axis=-1)

def predict(num_chars, prefix):
    """Extend `prefix` by sampling up to `num_chars` characters from `model`.

    Uses the module-level `model` and `gumbel_sample`. The input is always
    zero-padded to `len(prefix) + num_chars`, so every model call sees the
    same input shape.

    Args:
        num_chars: maximum number of characters to generate.
        prefix: starting text; its characters are encoded with `ord`.

    Returns:
        `prefix` followed by the sampled characters. Generation stops early
        when the sampled token is 1 (end of sequence); that token is not
        included in the result.
    """
    token_ids = list(map(ord, prefix))
    chars = list(prefix)
    total_len = len(prefix) + num_chars

    for step in range(num_chars):
        padding = [0] * (total_len - len(token_ids))
        padded = np.array(token_ids + padding)
        log_probs = model(padded[None, :])  # add the batch dimension
        # the output at position len(token_ids) is the one for the next token
        sampled = int(gumbel_sample(log_probs[0, len(token_ids)]))
        token_ids.append(sampled)

        if sampled == 1:
            break  # EOS
        chars.append(chr(sampled))

    return "".join(chars)



In [None]:
print(predict(32, ""))

In [None]:
print(predict(64, "I am happy because"))