In [50]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-03-16 14:16:02--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2023-03-16 14:16:03 (18.0 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [51]:
with open('input.txt','r',encoding='utf-8') as f:
  data = f.read()

In [52]:
print("Length of dataset is", len(data))

Length of dataset is 1115394


In [53]:
print(data[:150])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

A


In [54]:
chars = sorted(list(set(data)))
vocab_size = len(chars)
print(''.join(chars))
print(len(chars))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


## DATA PRE-PROCESSING

In [55]:
#Converting string ids
char2ids = {ch:i for i,ch in enumerate(chars)} # This takes the characters and map characters to ids
ids2char = {i:ch for i,ch in enumerate(chars)} # This takes IDs to characters
encode = lambda s: [char2ids[c] for c in s] #Encodes a string of text
decode = lambda s: [ids2char[id] for id in s] # Decodes a string of intergers

#Checking it on some random string
''.join(decode(encode("Hi my name is Randall")))

'Hi my name is Randall'

In [285]:
''.join(decode(encode("hi there")))

'hi there'

In [56]:
import torch
data = torch.tensor(encode(data),dtype=torch.long)
print(data.shape,data.type)
print(data[:150])

torch.Size([1115394]) <built-in method type of Tensor object at 0x7fa6938e8e50>
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13])


In [57]:
n = int(0.9*len(data))
train = data[:n]
val = data[n:]

In [58]:
block_size= 8
train[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [59]:
#How we will input our case:
x = train[:block_size]
y = train[1:block_size+1]
for i in range(block_size):
  context = x[:i+1]
  target = y[i]
  print(f"when the input is {context} target is {target}")

when the input is tensor([18]) target is 47
when the input is tensor([18, 47]) target is 56
when the input is tensor([18, 47, 56]) target is 57
when the input is tensor([18, 47, 56, 57]) target is 58
when the input is tensor([18, 47, 56, 57, 58]) target is 1
when the input is tensor([18, 47, 56, 57, 58,  1]) target is 15
when the input is tensor([18, 47, 56, 57, 58,  1, 15]) target is 47
when the input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) target is 58


In [60]:
#This samples batches for training. Using batch size of 8
batch_size = 4

def get_batch(split):
  data = train if split == 'train' else val
  idx = torch.randint(len(data)-block_size,(batch_size,))
  print(idx)
  x = torch.stack([data[i:i+block_size] for i in idx])
  y = torch.stack([data[i+1:i+block_size+1] for i in idx])
  return x,y

xb,yb = get_batch('train')
print("xb shape is", xb.shape)
print("yb shape is", yb.shape)


tensor([584732, 118300, 677704, 449216])
xb shape is torch.Size([4, 8])
yb shape is torch.Size([4, 8])


##MODEL

In [61]:
import torch.nn as nn
from torch.nn import functional as F

In [201]:
n_embed = 32
#head_size = 8
n_heads = 4

###SELF ATTENTION PART

In [221]:
#IN OUR IMPLEMENTATION , WE JUST USED DECODER BLOCK INSTEAD OF ENCODER

#THIS IS ESSENTIALLY QUERIES @ KEYS @ VALUES
class Head(nn.Module):
  def __init__(self,head_size):
    super().__init__()
    #self.head_size = head_size
    self.key = nn.Linear(n_embed,head_size,bias=False)
    self.query = nn.Linear(n_embed,head_size,bias=False)
    self.value = nn.Linear(n_embed,head_size,bias=False)
    self.register_buffer('tril',torch.tril(torch.ones((block_size,block_size))))

  def forward(self,x):
    B,T,C = x.shape
    #KEYS AND QUERIES ACT AS LINEAR TRANFORMATIONS BEFORE COMPARING OUR TOKENS TO EACH OTHER
    k = self.key(x) #dim=(B,T,16)
    q = self.query(x) #dim=(B,T,16)
    #print("key", k)
    #print("query", q)
    weights =  q @ k.transpose(-2,-1) * C**-0.5# (B,T,16) @ (B,16,T){after transposing} = (B,T,T) #THIS MEASURES THE 'ATTENTION FOR EACH TOKEN TO ANOTHER'
    
    weights = weights.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
    #print("weights tensor", weights)
    #weights = weights.masked_fill(tril==0,float('inf'))
    weights = F.softmax(weights,dim=-1)
    #print("weights tensor", weights)
    v = self.value(x)
    output = weights @ v #(B,T,T) @ (B,T,32) = 
    return output

In [222]:
#LETS DO MULTI HEAD ATTENTION
class MultiHead(nn.Module):

  def __init__(self,num_heads,head_size):
    super().__init__()
    self.multihead = nn.ModuleList([Head(head_size) for i in range(num_heads)])
    self.proj = nn.Linear(n_embed,n_embed)

  def forward(self,x):
    out =  torch.cat([h(x) for h in self.multihead],dim=-1)
    return self.proj(out)

In [233]:
class Feedforward(nn.Module):

  def __init__(self,n_embed):
    super().__init__()
    self.ffn = nn.Sequential(nn.Linear(n_embed,4*n_embed),nn.ReLU(),nn.Linear(4*n_embed,n_embed))

  def forward(self,x):
    return self.ffn(x)


In [294]:
class Block(nn.Module):

  def __init__(self,n_embed,n_heads):
    super().__init__()
    head_size = n_embed // n_heads
    self.multihead = MultiHead(n_heads,head_size)
    self.ffn = Feedforward(n_embed)
    self.ln1 = nn.LayerNorm(n_embed)
    self.ln2 = nn.LayerNorm(n_embed)

  def forward(self,x):
    x = x + self.multihead(self.ln1(x))
    return x + self.ffn(self.ln2(x))

In [295]:
## Using the Attention Model For taking a word at time step t, and it should take the elements from 1 all the way to time step t
class Model(nn.Module):

  def __init__(self):
    super().__init__()
    self.embed = nn.Embedding(vocab_size,n_embed) #dim=(B,T,n_embed)
    self.lin = nn.Linear(n_embed,vocab_size) #dim=(B,T,Vocab_size)
    self.pos_embed = nn.Embedding(block_size,n_embed)
    self.blocks = nn.Sequential(
        Block(n_embed,n_heads=4),
        Block(n_embed,n_heads=4),
        Block(n_embed,n_heads=4)
    )
  def forward(self,x,target=None):
    B , T = x.shape
    #print(T)
    x = self.embed(x)
    #print("Dim of tokens embedding is ",x.shape)
    pos_embed = self.pos_embed(torch.arange(T))
    #print("dim of position embedding is ", pos_embed.shape)
    x = x + pos_embed
    #print("after embedding", x)
    x = self.blocks(x)
    #print("After head", x)
    #print("final dim of x is ",x.shape)
    x = self.lin(x)

    if target == None:
      loss = None
    else:
      B,T,C = x.shape
      x = x.view(B*T,C)
      target = target.view(B*T)
      loss = F.cross_entropy(x,target)

    return x, loss

  @torch.no_grad()
  def predict(self,idx,max_new_tokens):
      #idx is (batch,context/T)
    for _ in range(max_new_tokens):
      idxc = idx.clone()
      idxc = idx[:,-block_size:]
      logits , loss = self(idxc)
      logits = logits[:,-1,:]
      probs = F.softmax(logits,dim=-1)

      #draw samples using multinomial
      idx_next = torch.multinomial(probs,num_samples=1)
      #print(idx)
      idx = torch.cat((idx,idx_next),dim=1)
    return idx

In [296]:
m = Model()
x , loss = m(xb,yb)
print(loss)

tensor(4.6013, grad_fn=<NllLossBackward0>)


In [282]:
## TRYING OUT 
''.join(decode(m.predict(torch.zeros((1,1),dtype=torch.long),500)[0].tolist()))

"\n z p E w l . ? Z q d 3 . ' ; u & y z 3   G t ? I - K $ v x j j ; G D p t i ! G N C X D n a g N t Y X d ' 3 g G w D ? D z C g T   j h 3 T 3 x 3 z e z G w W O b s 3 d K & x K W R k a G U ? P g G l R E W V a n g T 3 3 a N B l m U x R I 3 T m u ? r x ! T z v G : , y ; G d d G   $ m x R ? G i x o 3 p Y Z G \n g u u ? $ ; & N L s $ m R L v U I U Z d R I x   X T T D a s O , x g N K C v G T W k y M x G x Z E z Y r D z q w G C m - U j \n T D W L u K Y J M s j W a P Q 3 V D w M G r E 3 O e R ? R k P T D y L x z G U Q v T T F r G z v G 3 ! 3 V j z c Z e T F W R R W N ! C ; D I r r B T : 3   G ? D j y j j & b C : J Y T G j j $ Y R G c r $ G Z $ ? 3 r x M y R 3 C t . - C z G $ i G i R n ; z j d B m , t G z l - $ . ' i & z $ ; G r M a a , q Y - v ! r E X i G w : m z e i \n \n 3 Q T n J c a H l m w - $ ; u E z r ; O E z g R w 3 w r x Q X g x k g 3 w M B G   ; C   j E z v x j B D G z G O 3 - M ; G b \n x k E G & I B g G ; v D N r C x T R I x U a Y z G x , d f s j b R i ; a ? w q J , G I w T a - T G

In [297]:
#create optimizer
optimizer = torch.optim.AdamW(m.parameters(),lr=1e-3)

In [298]:
batch_size = 32

for steps in range(10000):

  xb,yb = get_batch('train')
  logits, loss = m(xb,yb)
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

print(loss.item())

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        430641, 233587, 404110, 423222, 986479, 185067, 133072, 499701, 276668,
        455949, 294032, 732682, 131075, 395314, 130890, 420347, 386044, 918357,
        209835, 910816,  15403, 816887, 873718])
tensor([687229, 253084, 399165, 748240, 956329, 152594, 651147, 273213,  94292,
        452965, 350687, 476434,  16138, 459092, 965675, 773293, 594109, 227457,
        255751, 375935, 107689, 765594, 366747, 487310, 345647, 674398, 107073,
        112665, 506366, 700502, 620503, 143892])
tensor([407489, 644725, 480445, 691012,   8738,  78327, 176698, 701303, 644149,
        911249, 545727, 766160, 163702,  51680, 714368, 503171, 368435, 476775,
        708013,  86116, 324384,  47284, 678630, 345221, 457160, 270448, 653553,
        781254, 359168,   1428, 127286, 651093])
tensor([ 285767,  624794,  341402,  347443,  966060,  129832,  297343,  829534,
         713572,   85028,  966883,  550914,  498268,  456527,  36707

In [299]:
''.join(decode(m.predict(torch.zeros((1,1),dtype=torch.long),2000)[0].tolist()))

"\n\nRAANABHARTISIO:\nMy look your have, Lount Ve'ting to then moune way fathing murt, sir,\nHe, I cantient is With where say that sto.\n\nMARIALUCE IIII:\nAnd my the\npreaciamst and have on:\nUnt eurseeforn: Ay not. Qaulseven would on that we distrect what breme overs.\n\nKIRGARBET:\nAnd Endly come naispy una.\n\nQUEEN Lord:\nFerist of thy tspalking that be stiors no myselfluy on art that he cark knawher staule has\nThat my were upon\nHow\nAs mirke lufent the body, one your undringnot thou seet olience oughtign'd you that in is's that the\nis you well forthysh then eath dothreepliye ut bolderew of a shear meriouslizen:\nAy\nPladio, they thank tigh of with the-brop in dowars:\nO' hat he me day?\nThe stratenmbring my this my abluch would your his Reike could lork adot bis I what I'LEjoest of their emplest it would wer.\n\nQUEEN NHARY CHENA:\nUncoinance.:\nMencey know why charm;\nOa, joy to death poward my verear.\n\nVETHUSBUEENO:\nI keet mish proveraing ore\nI\nHISA:\nGod's Briked him o