In [117]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [118]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [119]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -O input_shakespeare.txt

--2023-03-25 16:18:53--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8002::154, 2606:50c0:8001::154, 2606:50c0:8003::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8002::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input_shakespeare.txt’


2023-03-25 16:18:54 (10.7 MB/s) - ‘input_shakespeare.txt’ saved [1115394/1115394]



In [120]:
with open('input_shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read() # should be simple plain text file

In [121]:
print('input length in chars:', len(text))

input length in chars: 1115394


In [122]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



# Build Vocab

In [123]:
vocab = sorted(list(set(text)))
vocab_size = len(vocab)
print('vocab length:', vocab_size)
print(''.join(vocab))

vocab length: 65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [124]:
stoi = {c: i for i, c in enumerate(vocab)}
itos = {i: c for i, c in enumerate(vocab)}
encode = lambda x: [stoi[c] for c in x]
decode = lambda x: ''.join([itos[c] for c in x])
print(encode('hello'))
print(decode(encode('hello')))

[46, 43, 50, 50, 53]
hello


In [125]:
data = encode(text)

n_split = int(0.9 * len(data))
train_data = data[:n_split]
val_data = data[n_split:]

In [126]:
print(train_data[:100])

[18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63, 1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56, 1, 51, 43, 1, 57, 54, 43, 39, 49, 8, 0, 0, 13, 50, 50, 10, 0, 31, 54, 43, 39, 49, 6, 1, 57, 54, 43, 39, 49, 8, 0, 0, 18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 37, 53, 59]


In [127]:
block_size = 8
print(train_data[:block_size+9])

[18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43]


# Example result
[context] --> output

In [128]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for i in range(block_size):
    print(x[:i+1], '-->', y[i])

[18] --> 47
[18, 47] --> 56
[18, 47, 56] --> 57
[18, 47, 56, 57] --> 58
[18, 47, 56, 57, 58] --> 1
[18, 47, 56, 57, 58, 1] --> 15
[18, 47, 56, 57, 58, 1, 15] --> 47
[18, 47, 56, 57, 58, 1, 15, 47] --> 58


# Example batch and results

In [129]:
# seed random number generator for torch
torch.manual_seed(1337)

batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == 'train' else val_data if split == 'val' else None
    
    if data is None:
        raise ValueError('split must be either train or val')
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.tensor([data[i:i+block_size] for i in ix]).to(device)
    y = torch.tensor([data[i+1:i+block_size+1] for i in ix]).to(device)
    return x, y
    
xb, yb = get_batch('train')
print(f'imputs shape: {xb.shape}')
print(f'targets shape: {yb.shape}')
print(f'inputs (first batch):', xb)
print(f'targets (first batch):', yb)
print('-----')
print('context --> target')
for b in range(batch_size):
    for i in range(block_size):
        print(xb[b, 0:i+1].tolist(), '-->', yb[b, i].item())

imputs shape: torch.Size([4, 8])
targets shape: torch.Size([4, 8])
inputs (first batch): tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]], device='cuda:0')
targets (first batch): tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]], device='cuda:0')
-----
context --> target
[24] --> 43
[24, 43] --> 58
[24, 43, 58] --> 5
[24, 43, 58, 5] --> 57
[24, 43, 58, 5, 57] --> 1
[24, 43, 58, 5, 57, 1] --> 46
[24, 43, 58, 5, 57, 1, 46] --> 43
[24, 43, 58, 5, 57, 1, 46, 43] --> 39
[44] --> 53
[44, 53] --> 56
[44, 53, 56] --> 1
[44, 53, 56, 1] --> 58
[44, 53, 56, 1, 58] --> 46
[44, 53, 56, 1, 58, 46] --> 39
[44, 53, 56, 1, 58, 46, 39] --> 58
[44, 53, 56, 1, 58, 46, 39, 58] --> 1
[52] --> 58
[52, 58] --> 1
[52, 58, 1] --> 58
[52, 58, 1, 58] --> 46
[52, 58, 1, 58, 46] -->

# Build model

In [130]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        
    def forward(self, x, targets=None):
        x = self.embed(x)
        logits = x

        loss = None
        if targets != None:
            B,T,C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        if targets is None:
            return logits
        else:
            return logits, loss
    
    def generate(self, x, n):
        # input is B x T
        for i in range(n):
            logits = self(x) # B x T x C
            # only use last prediction
            logits = logits[:, -1, :] # B x C
            # sample from distribution
            probs = F.softmax(logits, dim=-1) # B x C
            # sample next token
            idx_next = torch.multinomial(probs, num_samples=1) # B x 1
            x = torch.cat([x, idx_next], dim=1) # B x (T+1)

        return x
    
n_embed = vocab_size
model = BigramLanguageModel(vocab_size, n_embed).to(device)
print(model)
pred, loss = model(xb, yb)
print(pred.shape)
print(pred)

# expect loss to be around log(vocab_size) on random predictions
print(f'Expected Loss: log(vocab_size) -> {torch.log(torch.tensor(vocab_size))}')
print(f'{loss.shape=}')
print(f'{loss=}')

BigramLanguageModel(
  (embed): Embedding(65, 65)
)
torch.Size([32, 65])
tensor([[ 1.6347, -0.0518,  0.4996,  ...,  0.2432,  1.1519,  0.9950],
        [ 0.3418, -0.9276,  1.2381,  ...,  1.5018, -0.5266,  0.2354],
        [ 0.1479, -0.4333,  0.5203,  ...,  0.3302,  1.5454,  1.3778],
        ...,
        [-0.5693, -0.0735,  0.7743,  ..., -0.0815, -1.1445, -0.0623],
        [ 0.4658, -0.2573, -1.0673,  ...,  1.2439,  1.3471,  1.6910],
        [-0.4553,  0.0139,  0.9309,  ...,  0.0290, -0.7568,  0.8701]],
       device='cuda:0', grad_fn=<ViewBackward0>)
Expected Loss: log(vocab_size) -> 4.174387454986572
loss.shape=torch.Size([])
loss=tensor(5.0364, device='cuda:0', grad_fn=<NllLossBackward0>)


# Test init samples

In [131]:
pred.size(-1)

65

In [132]:
gen = model.generate(xb, 10)
print(gen.shape)
print(gen)
[decode(gen[i].tolist()) for i in range(batch_size)]

torch.Size([4, 18])
tensor([[24, 43, 58,  5, 57,  1, 46, 43, 57, 55,  3, 11, 58, 44, 14, 44, 30, 27],
        [44, 53, 56,  1, 58, 46, 39, 58,  2, 36, 41, 31, 19, 60, 30, 44, 15, 36],
        [52, 58,  1, 58, 46, 39, 58,  1, 62, 51, 27,  4, 55, 21, 16, 50, 11,  2],
        [25, 17, 27, 10,  0, 21,  1, 54, 11, 58, 22, 29, 46, 34,  0, 63, 34,  0]],
       device='cuda:0')


["Let's hesq$;tfBfRO",
 'for that!XcSGvRfCX',
 'nt that xmO&qIDl;!',
 'MEO:\nI p;tJQhV\nyV\n']

In [134]:
idx = torch.zeros(batch_size, 1, dtype=torch.long).to(device)
print(idx)
[print(p) for p in [decode(pred) for pred in model.generate(idx, 100).tolist()]]

tensor([[0],
        [0],
        [0],
        [0]], device='cuda:0')

kNdcuwdZZTkOMl;,ertK
w:!PLCkMBbeA$3:XaSGJO-3p&M-c?KL3auhpFYVXJFhNNNuhq$OMxv.tbVFYdXlrFZaAeNuw:cPPyRE

lc-T nA,e!ngm MWtJferEFQ 
yQfQwsZENdpkS:WRfL-kZbMtviGvRmt'vK&$DjCerSm bns
yCb,-cKknvTHMvyu&l;tMu'Rfg

O
nG?RPhBOUjuhpd
CTYui3pYCGPimnqj.aajGK,eM
Eeoql-RoY.WvsZEQ:B;'vDUqheFREN?zkyX'It;C n:;Gr.ypkdoPl?Kl

ug;tkN'a3ePBPUfpkl;zUZuAFGbrFSiXs
lMOH-aZpdei&$ydu'wDSR;BaV
!xQKkZWQ$Bjtzkl;aaniq!3fzg-$n-U3QH&I&$RN


[None, None, None, None]

# construct optimizer

In [135]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [136]:
batch_size = 32
step_count = 10000
for step in range(step_count):
    xb, yb = get_batch('train')
    _, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
    if step % 10 == 0:
        print(f'{step=}, {loss=}')

step=0, loss=tensor(4.6816, device='cuda:0', grad_fn=<NllLossBackward0>)
step=10, loss=tensor(4.6154, device='cuda:0', grad_fn=<NllLossBackward0>)
step=20, loss=tensor(4.7026, device='cuda:0', grad_fn=<NllLossBackward0>)
step=30, loss=tensor(4.5495, device='cuda:0', grad_fn=<NllLossBackward0>)
step=40, loss=tensor(4.6024, device='cuda:0', grad_fn=<NllLossBackward0>)
step=50, loss=tensor(4.6507, device='cuda:0', grad_fn=<NllLossBackward0>)
step=60, loss=tensor(4.5948, device='cuda:0', grad_fn=<NllLossBackward0>)
step=70, loss=tensor(4.5310, device='cuda:0', grad_fn=<NllLossBackward0>)
step=80, loss=tensor(4.5669, device='cuda:0', grad_fn=<NllLossBackward0>)
step=90, loss=tensor(4.6026, device='cuda:0', grad_fn=<NllLossBackward0>)
step=100, loss=tensor(4.4530, device='cuda:0', grad_fn=<NllLossBackward0>)
step=110, loss=tensor(4.5275, device='cuda:0', grad_fn=<NllLossBackward0>)
step=120, loss=tensor(4.4396, device='cuda:0', grad_fn=<NllLossBackward0>)
step=130, loss=tensor(4.5409, device

In [138]:
print(decode(model.generate(torch.zeros((1,1), device=device, dtype=torch.long), 200)[0].tolist()))


Thanstarom oroup
Yowhthetof isth ble mil; dill, ath iree sengmin lat Heriliovets, and Win nghire yombousel lind me l.
HAshe ce hiry ptupr aisspllw y.
Hurindu n Boopetelaves
MPORDis, d mothakleo Windo 
