# Deep N-grams

### Predecir el siguiente conjunto de caracteres usando los caractees previos



In [59]:
!pip install -q -U trax

In [4]:
import os
import trax
import trax.fastmath.numpy as np
import pickle
import numpy
import random as rnd
from trax import fastmath
from trax import layers as tl

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
dirname = '/content/drive/MyDrive/semester work/Deep n-grams/data'

In [7]:
#dirname = 'data/'
lines = [] # storing all the lines in a variable.
for filename in os.listdir(dirname):
    with open(os.path.join(dirname, filename)) as files:
        for line in files:
            pure_line = line.strip()

            if pure_line:
                lines.append(pure_line)

In [8]:
n_lines = len(lines)
print(f"Number of lines: {n_lines}")
print(f"Sample line at position 0 {lines[0]}")
print(f"Sample line at position 999 {lines[999]}")

Number of lines: 125097
Sample line at position 0 1 KING HENRY IV
Sample line at position 999 HOTSPUR	Why, my horse, my love, my horse.


In [9]:
print("loaded data")

loaded data


In [10]:
for i, line in enumerate(lines):
    lines[i] = line.lower()

In [11]:
eval_lines = lines[-1000:] # Create a holdout validation set
lines = lines[:-1000] # Leave the rest for training


print(f"Number of lines for training: {len(lines)}")
print(f"Number of lines for validation: {len(eval_lines)}")

Number of lines for training: 124097
Number of lines for validation: 1000


In [12]:
def line_to_tensor(line, EOS_int=1):
    tensor = []
    for c in line:
        c_int = ord(c)
        tensor.append(c_int)
    tensor.append(EOS_int)
    return tensor


In [13]:
line_to_tensor('abc xyz')

[97, 98, 99, 32, 120, 121, 122, 1]

## Batch genetaror


In [14]:
def data_generator(batch_size, max_length, data_lines, line_to_tensor=line_to_tensor, shuffle=True):
    index = 0
    cur_batch = []
    num_lines = len(data_lines)
    lines_index = [*range(num_lines)]
    if shuffle:
        rnd.shuffle(lines_index)
    while True:
        if index>=num_lines:
            index = 0
            if shuffle:
                rnd.shuffle(lines_index)
        line = data_lines[index]

        if len(line)<max_length:
            cur_batch.append(line)
        index += 1

        if len(cur_batch)==batch_size:
            batch = []
            mask = []
            for li in cur_batch:
                tensor = line_to_tensor(li)
                pad = [0] * (max_length-len(tensor))
                tensor_pad = tensor+pad
                batch.append(tensor_pad)
                example_mask = [0 if i==0 else 1  for i in tensor_pad]
                mask.append(example_mask)
            batch_np_arr = np.array(batch)
            mask_np_arr = np.array(mask)

            yield batch_np_arr, batch_np_arr, mask_np_arr

            cur_batch = []

In [15]:
tmp_lines = ['12345678901', #11
             '123456789', # 9
             '234567890', # 9
             '345678901'] # 9

# Get a batch size of 2, max length 10
tmp_data_gen = data_generator(batch_size=2,
                              max_length=10,
                              data_lines=tmp_lines,
                              shuffle=False)

# get one batch
tmp_batch = next(tmp_data_gen)

# view the batch
tmp_batch

(DeviceArray([[49, 50, 51, 52, 53, 54, 55, 56, 57,  1],
              [50, 51, 52, 53, 54, 55, 56, 57, 48,  1]], dtype=int32),
 DeviceArray([[49, 50, 51, 52, 53, 54, 55, 56, 57,  1],
              [50, 51, 52, 53, 54, 55, 56, 57, 48,  1]], dtype=int32),
 DeviceArray([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
              [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32))

In [16]:
import torch, jax; print(torch.cuda.is_available()); print(jax.devices())

True
[GpuDevice(id=0, process_index=0)]


In [17]:
import itertools

infinite_data_generator = itertools.cycle(
    data_generator(batch_size=2, max_length=10, data_lines=tmp_lines))

In [18]:
ten_lines = [next(infinite_data_generator) for _ in range(10)]
print(len(ten_lines))

10


## Model with GRU


In [19]:
def GRULM(vocab_size=256, d_model=512, n_layers=2, mode='train'):

    model = tl.Serial(
      tl.ShiftRight(mode=mode), # Stack the ShiftRight layer
      tl.Embedding(vocab_size = vocab_size,d_feature=d_model), # Stack the embedding layer
      [tl.GRU(n_units=d_model) for _ in range(n_layers)], # Stack GRU layers of d_model units keeping n_layer parameter in mind (use list comprehension syntax)
      tl.Dense(n_units=vocab_size), # Dense layer
      tl.LogSoftmax() # Log Softmax
    )
    return model

In [20]:
model = GRULM()
print(model)

Serial[
  Serial[
    ShiftRight(1)
  ]
  Embedding_256_512
  GRU_512
  GRU_512
  Dense_256
  LogSoftmax
]


In [21]:
batch_size = 32
max_length = 64

In [50]:
def n_used_lines(lines, max_length):


    n_lines = 0
    for l in lines:
        if len(l) <= max_length:
            n_lines += 1
    return n_lines

num_used_lines = n_used_lines(lines, 32)
print('Number of used lines from the dataset:', num_used_lines)
print('Batch size (a power of 2):', int(batch_size))
steps_per_epoch = int(num_used_lines/batch_size)
print('Number of steps to cover one epoch:', steps_per_epoch)

Number of used lines from the dataset: 25773
Batch size (a power of 2): 32
Number of steps to cover one epoch: 805


## Training model

In [62]:
dir = '/content/drive/MyDrive/semester work/Deep n-grams/model/'

In [63]:
from trax.supervised import training

def train_model(model, data_generator, batch_size=32, max_length=64, lines=lines, eval_lines=eval_lines, n_steps=1, output_dir=dir):
    print(output_dir)
    bare_train_generator = data_generator(batch_size=batch_size, max_length=max_length, data_lines=lines)
    infinite_train_generator =  itertools.cycle(bare_train_generator)

    bare_eval_generator = data_generator(batch_size=batch_size, max_length=max_length, data_lines=eval_lines)
    infinite_eval_generator = itertools.cycle(bare_eval_generator)
    
    train_task = training.TrainTask(
        labeled_data=infinite_train_generator,
        loss_layer= tl.CrossEntropyLoss(),
        optimizer=trax.optimizers.Adam(learning_rate=0.0005)
    )


    eval_task = training.EvalTask(
        labeled_data=infinite_eval_generator,
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
        n_eval_batches=3
    )
    training_loop = training.Loop(model,
                                  train_task,
                                  eval_tasks=[eval_task],
                                  output_dir=output_dir)
    print("Training")
    training_loop.run(n_steps=n_steps)

    return training_loop


In [64]:
training_loop = train_model(GRULM(), data_generator)

/content/drive/MyDrive/semester work/Deep n-grams/model/


  "jax.host_id has been renamed to jax.process_index. This alias "
  "jax.host_count has been renamed to jax.process_count. This alias "


Training


## Evaluación



In [54]:
def test_model(preds, target):

    total_log_ppx = np.sum(tl.one_hot(target,preds.shape[-1]) * preds, axis= -1) # HINT: tl.one_hot() should replace one of the Nones

    non_pad = 1.0 - np.equal(target, 0)
    ppx = total_log_ppx * non_pad

    log_ppx = np.sum(ppx) / np.sum(non_pad)

    return -log_ppx


In [56]:
model = GRULM()
model.init_from_file('/content/drive/MyDrive/semester work/Deep n-grams/model/model.pkl.gz')
batch = next(data_generator(batch_size, max_length, lines, shuffle=False))
preds = model(batch[0])
log_ppx = test_model(preds, batch[1])
print('The log perplexity and perplexity of your model are respectively', log_ppx, np.exp(log_ppx))

The log perplexity and perplexity of your model are respectively 5.541664 255.10217


In [57]:
def gumbel_sample(log_probs, temperature=1.0):
    """Gumbel sampling from a categorical distribution."""
    u = numpy.random.uniform(low=1e-6, high=1.0 - 1e-6, size=log_probs.shape)
    g = -np.log(-np.log(u))
    return np.argmax(log_probs + g * temperature, axis=-1)

def predict(num_chars, prefix):
    inp = [ord(c) for c in prefix]
    result = [c for c in prefix]
    max_len = len(prefix) + num_chars
    for _ in range(num_chars):
        cur_inp = np.array(inp + [0] * (max_len - len(inp)))
        outp = model(cur_inp[None, :])  # Add batch dim.
        next_char = gumbel_sample(outp[0, len(inp)])
        inp += [int(next_char)]

        if inp[-1] == 1:
            break  # EOS
        result.append(chr(int(next_char)))

    return "".join(result)

print(predict(32, ""))


E¿º¸7l¨ëÏ0I~þaÁzWNµy¦R_ûòD


In [58]:
print(predict(32, ""))
print(predict(32, ""))
print(predict(32, ""))


#ùd¥gq 8zN.;`>ÎÃ§8mRàËÆ
ø¯3=¹Tã1wï×ÑP¹­¹+#òÇ÷FØ 
'à
