In [57]:
import pandas as pd

In [138]:

# Load the first million lines of the opensubtitles 2018 dataset
# Reference: Citation J. Tiedemann, 2012, Parallel Data, Tools and Interfaces in OPUS. In Proceedings of the 8th International Conference on Language Resources and Evaluation (LREC 2012)

with open('data/tiny_opensubtitles.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Remove all non-alphanumeric characters
import re
text = re.sub('[^A-Za-z0-9.,!"\':;_ ]+', '', text)


In [139]:
print(f'Dataset length: {len(text)}')
text[:1000]

Dataset length: 10002825


"Presented by IM PicturesProduced by Shin CineIn association with MVP Venture Capital and Cinema ServiceJeon Jihyun Cha TaehyunMy Sassy GirlExactly two years ago today, she and I buried a time capsule here.We promised to meet here two years later, but she hasn't come yet.I'm going to wait.Here we go.Please, don't move.One, two...Wait a minute.HelloOh, auntie.Sorry, I'm on my way.I'm really sorry.Yes, I'm coming.I'm having my photo taken.Bye.Are you readyHere we go.One, two...My parents wanted a daughter, so they raised me like one.So I thought I was a girl until I was seven.I had to go to the women's public bath, too.The older I got,I thought my penis would get smaller and disappear.But it was the opposite.First HalfHe hasn't changed at all.No, I'm a real man now.Hey, asshole.Think clerical work in the army makes you a manYou irritate me!Give me a break, asshole.My job was tougher than you could imagine.Hey!I worked near the DMZ.Who are you kiddingHold it.Anyway, welcome back home.She'

In [140]:
# all unique characters in the dataset
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(f'Length of vocab: {vocab_size}')

 !"',.0123456789:;ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz
Length of vocab: 71


In [141]:
# create a mapping of characters to integers
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}

#encoder: take a string and return a list of integers mapped to that string
encode = lambda s: [stoi[ch] for ch in s] 

#decoder: take a list of integers and return a string
decode = lambda l: ''.join([itos[i] for i in l])

str = "Hello Transformer!"
print(encode(str))
print(decode(encode(str)))

[25, 49, 56, 56, 59, 0, 37, 62, 45, 58, 63, 50, 59, 62, 57, 49, 62, 1]
Hello Transformer!


In [142]:
# Encode the entire dataset and store it as a tensor in Torch
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)

torch.Size([10002825]) torch.int64


In [143]:
# Train and Validation data sets
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

print(f'Train data length: {len(train_data)}, Valid data length: {len(val_data)}')

Train data length: 9002542, Valid data length: 1000283


In [144]:
block_size = 8
train_data[:block_size + 1]

tensor([33, 62, 49, 63, 49, 58, 64, 49, 48])

In [145]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'when input in {context} ({decode(context.tolist())}), target is {target} ({decode([target.tolist()])})')

when input in tensor([33]) (P), target is 62 (r)
when input in tensor([33, 62]) (Pr), target is 49 (e)
when input in tensor([33, 62, 49]) (Pre), target is 63 (s)
when input in tensor([33, 62, 49, 63]) (Pres), target is 49 (e)
when input in tensor([33, 62, 49, 63, 49]) (Prese), target is 58 (n)
when input in tensor([33, 62, 49, 63, 49, 58]) (Presen), target is 64 (t)
when input in tensor([33, 62, 49, 63, 49, 58, 64]) (Present), target is 49 (e)
when input in tensor([33, 62, 49, 63, 49, 58, 64, 49]) (Presente), target is 48 (d)


In [146]:
torch.manual_seed(1337)

batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # whhat is the maximum content length for prediction?

def get_batch(split):
    #generate a random batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    # create a block_size worth of random indexes within the data - blocksize length
    ix = torch.randint(len(data) - block_size - 1, (block_size,))
    xb = torch.stack([data[i:i+block_size] for i in ix])
    yb = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return xb,yb

xb, yb = get_batch('train')
print(f'inputs: {xb.shape}, targets: {yb.shape}')
print('xb = ', xb)
print('yb = ', yb)
print('------')
for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f'when input in {context} ({decode(context.tolist())}), target is {target} ({decode([target.tolist()])})')

inputs: torch.Size([8, 8]), targets: torch.Size([8, 8])
xb =  tensor([[59, 58, 67, 59, 59, 16, 18, 58],
        [ 4,  0, 57, 69,  0, 56, 59, 62],
        [25, 45, 58, 55,  5, 26, 64,  3],
        [50, 56, 65, 49, 58, 64,  0, 57],
        [11,  0, 58, 53, 51, 52, 64, 63],
        [62,  0, 62, 45, 64, 52, 49, 62],
        [56,  4,  0, 26,  0, 52, 59, 60],
        [49,  4,  0, 64, 52, 53, 63,  0]])
yb =  tensor([[58, 67, 59, 59, 16, 18, 58, 48],
        [ 0, 57, 69,  0, 56, 59, 62, 48],
        [45, 58, 55,  5, 26, 64,  3, 63],
        [56, 65, 49, 58, 64,  0, 57, 49],
        [ 0, 58, 53, 51, 52, 64, 63,  0],
        [ 0, 62, 45, 64, 52, 49, 62,  4],
        [ 4,  0, 26,  0, 52, 59, 60, 49],
        [ 4,  0, 64, 52, 53, 63,  0, 67]])
------
when input in tensor([59]) (o), target is 58 (n)
when input in tensor([59, 58]) (on), target is 67 (w)
when input in tensor([59, 58, 67]) (onw), target is 59 (o)
when input in tensor([59, 58, 67, 59]) (onwo), target is 59 (o)
when input in tensor([59,

In [147]:
# Input to the transformer
print(xb)

tensor([[59, 58, 67, 59, 59, 16, 18, 58],
        [ 4,  0, 57, 69,  0, 56, 59, 62],
        [25, 45, 58, 55,  5, 26, 64,  3],
        [50, 56, 65, 49, 58, 64,  0, 57],
        [11,  0, 58, 53, 51, 52, 64, 63],
        [62,  0, 62, 45, 64, 52, 49, 62],
        [56,  4,  0, 26,  0, 52, 59, 60],
        [49,  4,  0, 64, 52, 53, 63,  0]])


In [148]:
# Implement a Bigram LM using Torch

import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        # A simple lookup table that stores embeddings of a fixed dictionary and size.
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        
    def forward(self, idx, targets=None):
        # ifx and targets are both (B, T) tensor of integers
        logits = self.token_embedding_table(idx) #(B, T, C)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self, idx, max_new_tokens=100):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get predictions
            logits, loss = self(idx)
            
            # focus only on the last time step
            logits = logits[:, -1, :] # shape is now (B, C)
            
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B,C)
            
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [149]:
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(f'Loss = {loss} - based on vocab_size {vocab_size}, expected loss is $ln(1/vocab_size)$ {torch.log(torch.tensor(1/vocab_size))}')

print('Generated Text:\n')
print(decode(m.generate(idx = torch.zeros((1,1), dtype=torch.long))[0].tolist()))

torch.Size([64, 71])
Loss = 4.904510498046875 - based on vocab_size 71, expected loss is $ln(1/vocab_size)$ -4.2626800537109375
Generated Text:

 N25idzV5oLiJhyc6uR71 ,V5;f5ht8w77Wl92yQIQIgqAmvBniN7BfIT dITaHuOLQ4!947bYYuzPDGae7RhJ8skbf:HX_if;.3u


In [150]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [151]:
batch_size = 32
for steps in range(10000):
    
    # sample a batch of data
    xb, yb = get_batch('train')
    
    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
print(loss.item())
    

4.8807783126831055


In [152]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))

 j:kTgqZr9gXLNej_ITvv;AAR1B jnHAhyjwNTe75aeT,.B1FTI9VYMOGFWbV,VtBe,LU P7RhrL.Qz d'bLIqEBM T;zPF,Bpp15S1j_kV rlPB9;tENCKkkO"z"dadRFX,znwMOe"i2dv3OhWXH,yVsFcN6 dPWGKk rMKuyIDGFM wMCDkLVX5vL!1Ztc'RhpQwI6'fN2dvjMQvG:k4bxm6ITI3E2poPjBpUeVxBM.v0fO91G9kRhRJAp'8DnoGaC wS'hMmPwVXLAKHHN"FZmG'JC"F,.6uZ;UU lj8,j9YVifE1nRdVU0wMOzgEJjF6eOJtfoP3IXCnCLqhj'1zVX5.HHge1C;fOIwVtca3I9UPDmG2GlPVUzc:2mhyj4aSOhc d6QQGMDxM_gRhrQ:L1mobLkXL8iWOU:6Hcj;tcjph4Adh9'w NT1BAc MK2bc:IDsLQnVJ.V13v_n8u;i4 Tc23e"9p".v_:oszGdZDcaiv.v
