In [19]:
# Load the entire training corpus into memory as one string
with open('dataset/input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [20]:
# Total number of characters in the corpus
print(len(text))

1115394


In [21]:
# Preview the first 1000 characters of the corpus
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [22]:
# Character-level vocabulary: every distinct character in the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print(f'number of unique characters: {len(chars)}')
print(f'Characters: {"".join(chars)}')

number of unique characters: 65
Characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [23]:
# Lookup tables between characters and their integer ids
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Take a string and return its list of integer token ids."""
    return [stoi[c] for c in s]


def decode(l):
    """Take a list of integer token ids and return the decoded string."""
    return ''.join(itos[i] for i in l)


print(encode("hello there!"))
print(decode([25,33,52,24,23,1,43,61]))

[46, 43, 50, 50, 53, 1, 58, 46, 43, 56, 43, 2]
MUnLK ew


In [24]:
import torch 
# Encode the whole corpus as a 1-D tensor of integer token ids
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])  # first 100 token ids

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [25]:
# Train on the first 90% of the token stream; hold out the last 10% for validation
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]




In [26]:
# Block size / context length: how many tokens make up one training sequence
block_size = 8
# A chunk of block_size+1 tokens provides block_size inputs plus the final target
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [27]:
# How context works with example:
# one chunk of block_size+1 tokens packs block_size training examples,
# one per prefix of x, each paired with the token that follows that prefix
x = train_data[:block_size]
y = train_data[1:block_size+1]  # x shifted by one position

for t in range(block_size):
    context = x[:t+1]  # the first t+1 tokens
    target = y[t]      # the token that comes right after that context
    print(f"when input is {context} the target is {target}")


when input is tensor([18]) the target is 47
when input is tensor([18, 47]) the target is 56
when input is tensor([18, 47, 56]) the target is 57
when input is tensor([18, 47, 56, 57]) the target is 58
when input is tensor([18, 47, 56, 57, 58]) the target is 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


In [28]:
torch.manual_seed(1337)  # seed torch's global RNG so the sampled batches are reproducible
batch_size = 4  # number of sequences sampled per batch
block_size = 8  # number of tokens in each sequence (context length)

def get_batch(split):
    """Sample a random batch of input sequences and their next-token targets.

    Reads the module-level `train_data`, `val_data`, `block_size` and
    `batch_size`. `split == 'train'` selects the training data; any other
    value selects the validation data.

    Returns:
        (x, y): two (batch_size, block_size) tensors, where y holds the same
        windows as x shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # One random start offset per sequence; the upper bound leaves room for
    # the extra token the shifted target window needs.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show its inputs and targets
xb, yb = get_batch('train')

print('inputs: ')
print(xb.shape)
print(xb)

print('targets: ')
print(yb.shape)
print(yb)

print('_______________')



# Spell out every (context, target) pair contained in the batch
for b in range (batch_size):    # batch dim
    for t in range(block_size): # time dim
        context = xb[b, :t+1]
        target  = yb[b,t]
        print(f'when context is {context.tolist()} the target is {target}')


inputs: 
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets: 
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
_______________
when context is [24] the target is 43
when context is [24, 43] the target is 58
when context is [24, 43, 58] the target is 5
when context is [24, 43, 58, 5] the target is 57
when context is [24, 43, 58, 5, 57] the target is 1
when context is [24, 43, 58, 5, 57, 1] the target is 46
when context is [24, 43, 58, 5, 57, 1, 46] the target is 43
when context is [24, 43, 58, 5, 57, 1, 46, 43] the target is 39
when context is [44] the target is 53
when context is [44, 53] the target is 56
when context is [44, 53, 56] the target is 1
when context is [44, 53, 56, 1] the target is 58
when cont

In [29]:
# Spacer cell: prints a single blank line and has no other effect
print()




In [30]:
import torch
import torch.nn as nn 
from torch.nn import functional as F  

class BigramLanguageModel(nn.Module):
    """Bigram character-level language model.

    Each token id indexes a row of a (vocab_size, vocab_size) embedding table,
    and that row is used directly as the logits for the next token, so every
    prediction depends only on the current token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Row i holds the next-token logits given that the current token is i
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, C) and loss is
            None. With targets, logits is flattened to (B*T, C) and loss is
            the cross-entropy between those logits and the flattened targets.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C) with C = vocab size
        if targets is None:
            return logits, None

        B, T, C = logits.shape
        # F.cross_entropy wants (N, C) logits and (N,) class indices,
        # so fold the batch and time dimensions together
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates; skip building autograd graphs
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in idx by max_new_tokens sampled tokens.

        Args:
            idx: (B, T) integer tensor of token ids forming the current context.
            max_new_tokens: number of tokens to sample and append.

        Returns:
            (B, T + max_new_tokens) integer tensor: the input context followed
            by the sampled continuation.
        """
        for _ in range(max_new_tokens):
            # get predictions for every position in the running sequence
            logits, _loss = self(idx)
            # only the last time step predicts the next token
            logits = logits[:, -1, :]  # (B, C)
            # turn logits into a probability distribution over the vocabulary
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample one token id per sequence
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append the sampled id to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx
    
# Instantiate the model and run one forward pass on the sample batch as a sanity check
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)  # flattened to (B*T, vocab_size) because targets were supplied
print(loss)

torch.Size([32, 65])
tensor(5.0364, grad_fn=<NllLossBackward0>)


In [31]:
# Check how the model outputs garbage before training

# Start from a single token with id 0 (a batch of one sequence) and sample 100 more tokens
idx0 = torch.zeros((1,1),dtype=torch.long)
print(decode(m.generate(idx0, max_new_tokens=100)[0].tolist()))


lfJeukRuaRJKXAYtXzfJ:HEPiu--sDioi;ILCo3pHNTmDwJsfheKRxZCFs
lZJ XQc?:s:HEzEnXalEPklcPU cL'DpdLCafBheH


In [32]:
# Adam optimizer over all of the model's parameters, with learning rate 1e-3
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)

In [34]:
# Train the model with mini-batch gradient steps.
# get_batch reads the module-level batch_size, so rebinding it here makes
# every training batch hold 32 sequences.
batch_size = 32
learning_steps = 10000
for step in range(learning_steps):
    # sample a fresh batch of training data
    xb, yb = get_batch('train')

    # forward pass, then backpropagate and update the parameters
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # Compare by value with `==`: `is` tests object identity, which raised a
    # SyntaxWarning here and only worked because CPython caches small ints.
    if step % 100 == 0:
        print("Step %d\tLoss %.4f" % (step, loss.item()))

print('----done----')

  if step%100 is 0:


Step 0	Loss 3.6683
Step 100	Loss 3.6054
Step 200	Loss 3.5995
Step 300	Loss 3.5802
Step 400	Loss 3.4669
Step 500	Loss 3.3458
Step 600	Loss 3.2836
Step 700	Loss 3.1671
Step 800	Loss 3.1332
Step 900	Loss 3.1885
Step 1000	Loss 3.3209
Step 1100	Loss 3.0852
Step 1200	Loss 3.0418
Step 1300	Loss 3.0337
Step 1400	Loss 2.9010
Step 1500	Loss 2.9626
Step 1600	Loss 2.7994
Step 1700	Loss 2.9445
Step 1800	Loss 2.9192
Step 1900	Loss 2.7770
Step 2000	Loss 2.8779
Step 2100	Loss 2.7572
Step 2200	Loss 2.6434
Step 2300	Loss 2.6713
Step 2400	Loss 2.7567
Step 2500	Loss 2.6116
Step 2600	Loss 2.7023
Step 2700	Loss 2.6181
Step 2800	Loss 2.6877
Step 2900	Loss 2.5604
Step 3000	Loss 2.6919
Step 3100	Loss 2.7078
Step 3200	Loss 2.6816
Step 3300	Loss 2.6944
Step 3400	Loss 2.5496
Step 3500	Loss 2.6288
Step 3600	Loss 2.5659
Step 3700	Loss 2.6009
Step 3800	Loss 2.5666
Step 3900	Loss 2.6167
Step 4000	Loss 2.4803
Step 4100	Loss 2.4859
Step 4200	Loss 2.6072
Step 4300	Loss 2.5963
Step 4400	Loss 2.5240
Step 4500	Loss 2.5048


In [36]:
# Sample from the model after training to see how the generated text has changed

# Start from a single token with id 0 (a batch of one sequence) and sample 1000 more tokens
idx0 = torch.zeros((1,1),dtype=torch.long)
print(decode(m.generate(idx0, max_new_tokens=1000)[0].tolist()))


My ng try he anged t of ars ro-
Beeteroureritth, ICouellounresoravein stie

Thea y thint bllofok:
Walorerllisty ou PARYon RI'so at ay t baby soula be
Is t, t thallive mu,
bererthaverorer pathery I ite todigr
I is as h
We herd w ur:
Theanchal my le led
Whelliof seaurerintho;

QUCAnd is
GLe th,
I y hand,

I lly arr u eestaveenthe yorarar thafe:
Ithowh ie akior wis
S:

Sit s f bowhens
You;
A ttho y, ino ay chald me! ome ywhar s buror ou alaurnt t icopoblle, fis IOFillold kirs f?
Hg, In be
IAtheeshe:
St cop.
TE:
IS:
A:
Goomy:

msthe,
CHANCEY st
ARINI londesines rk ilis hisurit ll mee,
IV:
No drckindervel f.
Fislicot s condes d mau ir:

HAR: cthy Thitod moodek; pek
Tofth bokl oyo n ghe hay, fonds,fecrene f sicll brer chy whas.
Wimfom ofurs th.
Bis ro alis aki&.
Th. h'ltondond a tot h ghy edisachoungoth maraw Ro, mo t hulind s eemustin out athabecras s woue nd bra wr sere owoutr
N ne qu g ars; ishiou st sh ather msitin:

Hur tors thue ys
O:
IOfik!
ADintwin
Pu bavee iforven t lousirtikest.
P