In [9]:
import pandas as pd

In [10]:

# Load the first million lines of the opensubtitles 2018 dataset
# Reference: Citation J. Tiedemann, 2012, Parallel Data, Tools and Interfaces in OPUS. In Proceedings of the 8th International Conference on Language Resources and Evaluation (LREC 2012)

with open('data/tiny_opensubtitles.txt', 'r', encoding='utf-8') as f:
    text = f.read()


In [11]:
print(f'Dataset length: {len(text)}')

Dataset length: 1048194


In [12]:
# all unique characters in the dataset
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(f'Length of vocab: {vocab_size}')


 !"#$%'()*,-./0123456789:;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^_abcdefghijklmnopqrstuvwxyz~¦¬Îáâçèéíó♪置群
Length of vocab: 99


In [13]:
# create a mapping of characters to integers
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}

#encoder: take a string and return a list of integers mapped to that string
encode = lambda s: [stoi[ch] for ch in s] 

#decoder: take a list of integers and return a string
decode = lambda l: ''.join([itos[i] for i in l])

str = "Hello Transformer!"
print(encode(str))
print(decode(encode(str)))

[36, 63, 70, 70, 73, 1, 48, 76, 59, 72, 77, 64, 73, 76, 71, 63, 76, 2]
Hello Transformer!


In [17]:
# Encode the entire dataset and store it as a tensor in Torch
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)

torch.Size([1048194]) torch.int64


In [19]:
# Train and Validation data sets
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

print(f'Train data length: {len(train_data)}, Valid data length: {len(val_data)}')

Train data length: 943374, Valid data length: 104820


In [20]:
block_size = 8
train_data[:block_size + 1]

tensor([44, 76, 63, 77, 63, 72, 78, 63, 62])

In [26]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'when input in {context} ({decode(context.tolist())}), target is {target} ({decode([target.tolist()])})')

when input in tensor([44]) (P), target is 76 (r)
when input in tensor([44, 76]) (Pr), target is 63 (e)
when input in tensor([44, 76, 63]) (Pre), target is 77 (s)
when input in tensor([44, 76, 63, 77]) (Pres), target is 63 (e)
when input in tensor([44, 76, 63, 77, 63]) (Prese), target is 72 (n)
when input in tensor([44, 76, 63, 77, 63, 72]) (Presen), target is 78 (t)
when input in tensor([44, 76, 63, 77, 63, 72, 78]) (Present), target is 63 (e)
when input in tensor([44, 76, 63, 77, 63, 72, 78, 63]) (Presente), target is 62 (d)


In [38]:
torch.manual_seed(1337)

batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # whhat is the maximum content length for prediction?

def get_batch(split):
    #generate a random batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    # create a block_size worth of random indexes within the data - blocksize length
    ix = torch.randint(len(data) - block_size - 1, (block_size,))
    xb = torch.stack([data[i:i+block_size] for i in ix])
    yb = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return xb,yb

xb, yb = get_batch('train')
print(f'inputs: {xb.shape}, targets: {yb.shape}')
print('xb = ', xb)
print('yb = ', yb)
print('------')
for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f'when input in {context} ({decode(context.tolist())}), target is {target} ({decode([target.tolist()])})')

inputs: torch.Size([8, 8]), targets: torch.Size([8, 8])
xb =  tensor([[66, 59, 78, 78, 63, 76,  2,  0],
        [71, 60, 63, 76,  1, 83, 73, 79],
        [74, 27,  0, 51, 63,  1, 71, 79],
        [48, 59, 69, 63,  1, 73, 64, 64],
        [64, 64, 63, 61, 78,  0, 78, 66],
        [ 1, 70, 67, 69, 63,  1, 78, 73],
        [12, 37, 67, 64, 63,  1, 78, 66],
        [78, 66,  1, 78, 66, 63, 71,  1]])
yb =  tensor([[59, 78, 78, 63, 76,  2,  0, 36],
        [60, 63, 76,  1, 83, 73, 79,  1],
        [27,  0, 51, 63,  1, 71, 79, 77],
        [59, 69, 63,  1, 73, 64, 64,  1],
        [64, 63, 61, 78,  0, 78, 66, 59],
        [70, 67, 69, 63,  1, 78, 73,  1],
        [37, 67, 64, 63,  1, 78, 66, 67],
        [66,  1, 78, 66, 63, 71,  1, 74]])
------
when input in tensor([66]) (h), target is 59 (a)
when input in tensor([66, 59]) (ha), target is 78 (t)
when input in tensor([66, 59, 78]) (hat), target is 78 (t)
when input in tensor([66, 59, 78, 78]) (hatt), target is 63 (e)
when input in tensor([66,

In [39]:
# Input to the transformer
print(xb)

tensor([[66, 59, 78, 78, 63, 76,  2,  0],
        [71, 60, 63, 76,  1, 83, 73, 79],
        [74, 27,  0, 51, 63,  1, 71, 79],
        [48, 59, 69, 63,  1, 73, 64, 64],
        [64, 64, 63, 61, 78,  0, 78, 66],
        [ 1, 70, 67, 69, 63,  1, 78, 73],
        [12, 37, 67, 64, 63,  1, 78, 66],
        [78, 66,  1, 78, 66, 63, 71,  1]])
