In [2]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-03-03 13:26:31--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-03-03 13:26:31 (19.9 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [3]:
with open("input.txt", 'r', encoding='utf-8') as F:
  book = F.read()

In [4]:
print(book[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [5]:
book[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [6]:
len(book)

1115394

In [7]:
r = sorted(set(book))
chars =''.join(r)
print(chars)
charsize = len(chars)
print(charsize)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [8]:
## I used rfind() function to create my encoder and deccoder instead of using dictionary which was used in the Original code
encoder = lambda c: [chars.rfind(c[i]) for i in range(len(c))]
decoder = lambda c: "".join([chars[i] for i in c])

In [9]:
print(encoder('Hello There'))
print(decoder(encoder('Hello There')))

[20, 43, 50, 50, 53, 1, 32, 46, 43, 56, 43]
Hello There


In [10]:
book_digits_List=encoder(book)

In [11]:
print('First 15 characters in the book:  ',book[:15])
print('First 15 codes in the encoded book:  ',book_digits_List[:15])
print('\nLength of the characters in Book:     ',len(book))
print('Length of the codes in Encoded Book:  ',len(book_digits_List))

First 15 characters in the book:   First Citizen:

First 15 codes in the encoded book:   [18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0]

Length of the characters in Book:      1115394
Length of the codes in Encoded Book:   1115394


In [12]:
# Convert Encoded Book from python List to PyTorch Tensor
import torch
book_digits = torch.tensor(book_digits_List)
book_digits[:15]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0])

In [13]:
n=len(book_digits)*9//10
Train_data = book_digits[:n]
Val_data = book_digits[n:]

In [14]:
batch_size = 4
context = 3

def get_batch(x):
  Batch_start = torch.randint(0, len(x)-context,(batch_size,))
  xb = torch.stack([x[Batch_start[i]:Batch_start[i]+context] for i in range(batch_size)])
  yb = torch.stack([x[Batch_start[i]+1:Batch_start[i]+context+1] for i in range(batch_size)])
  return xb, yb

xb, yb = get_batch(book_digits)
print(xb)
print(yb)

tensor([[ 0,  0, 17],
        [63,  1, 51],
        [56,  1, 56],
        [61, 53, 56]])
tensor([[ 0, 17, 16],
        [ 1, 51, 53],
        [ 1, 56, 43],
        [53, 56, 58]])


In [15]:
# Let's define a Bigram Language model
import torch.nn as nn
import torch.nn.functional as F

# nn.module inheritance is not added
# super().__init() is not included yet

class NgramLanguageModel:

  def __init__(self):
    self.channels = nn.Embedding(charsize,charsize)
    self.parameters = [self.channels.weight]

  def forward(self,random_training_batch):
    # looks up in the Embedding table created in the constructor to assign weights to each character coming in
    logits = self.channels(random_training_batch) # Batch (B) X context (T) X Embedding/Channels (C)
    return logits

  def LossFunction(self,logits,random_training_batch_nextChar):
    logits = logits.view(-1,charsize) # we are doing this since Pytorch functinal.cross_entropy function needs Channels to be assigned to the second dimension
    Target = random_training_batch_nextChar.view(-1)
    Loss = F.cross_entropy(logits,Target)
    return Loss

  def generate(self, initiator_token, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits = self.forward(initiator_token)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            initiator_token_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            initiator_token = torch.cat((initiator_token, initiator_token_next), dim=1) # (B, T+1)
        return initiator_token

In [16]:
# NgramLM = NgramLanguageModel()
# print(NgramLM.channels.weight.grad)
#iter([NgramLM.channels.weight])

In [17]:
# Create a Ngram Language Model Object
NgramLM = NgramLanguageModel()

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(NgramLM.parameters, lr=1e-3)

In [18]:
### TRAINING LOOP ###

# define number of iterations to train the data
Iterations = 5000
# number of batches for each iteration
batch_size = 16
# How many characters are used to predict the next? Context = 1 --> Bigram model, Context > 1 --> Ngram model
context = 32

for i in range(Iterations):

  # get a random batch of data
  xb, yb = get_batch(Train_data)

  # forward pass
  logits = NgramLM.forward(xb)

  # Calculate loss
  Loss = NgramLM.LossFunction(logits,yb)

  #print(NgramLM.channels.weight.grad)
  #Zero all parameter gradients
  NgramLM.channels.weight.grad = None
  #
  #print(NgramLM.channels.weight.grad)

  # Backward Path to Calculate new grads
  Loss.backward()

  # Update the weights in embedding
  optimizer.step()


print(Loss.item())

2.4891881942749023


In [20]:
## Generate some text with trained model
print(decoder(NgramLM.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))



ThWhabe at !
TE:

Gntl, fry plly $3pttethansYouged is an unolandrimas waYOheswatzGRUSenm Ifuerofe stowinthe
Andgadshera tefeato; merso berVI wone, k'n
Iffand minje hie sththecest azPO:
LI Condo ies r$kOUke.-thotrerond al tr owowiloul s.z--Fin, INshecthedomon byooirplllld:
RO, chen faroubrveI ash ougenele? sl
du serethYoyorinoORThreyo jeveit u, hes fiin my!
anereay th $Unear wondsthaurod fontay mesue wallitR:
ONur fetheak ldo,
OLEd!QGhif. i&x m? in'steas latV:

Or X;
Duor t d!


M:
Yome jotMEBI:


###__Using *Mean Value* of previous tokens to create communication between time dimension__

In [29]:
### Let's define a Bigram Language model
### with Mean of previous tokens

import torch.nn as nn
import torch.nn.functional as F

# nn.module inheritance is not added
# super().__init() is not included yet

class MeanNgramLanguageModel:

  def __init__(self):
    self.channels = nn.Embedding(charsize,charsize)
    self.parameters = [self.channels.weight]

  def forward(self,random_training_batch):
    # looks up in the Embedding table created in the constructor to assign weights to each character coming in
    logits = self.channels(random_training_batch) # Batch (B) X context (T) X Embedding/Channels (C)
    ## Masked Meaning feature of the previous tokens
    B,T,C = logits.shape
    tril = torch.tril(torch.ones(T, T))
    wei = torch.zeros((T,T))
    wei = wei.masked_fill(tril == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1) # Context (T) X (T)
    logits = wei @ logits
    return logits

  def LossFunction(self,logits,random_training_batch_nextChar):
    logits = logits.view(-1,charsize) # we are doing this since Pytorch functinal.cross_entropy function needs Channels to be assigned to the second dimension
    Target = random_training_batch_nextChar.view(-1)
    Loss = F.cross_entropy(logits,Target)
    return Loss

  def generate(self, initiator_token, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits = self.forward(initiator_token)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            initiator_token_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            initiator_token = torch.cat((initiator_token, initiator_token_next), dim=1) # (B, T+1)
        return initiator_token

In [30]:
# Create a Mean Ngram Language Model Object
MeanNgramLM = MeanNgramLanguageModel()

# create a PyTorch optimizer
optimizerMean = torch.optim.AdamW(MeanNgramLM.parameters, lr=1e-3)

In [31]:
### TRAINING LOOP ###

# define number of iterations to train the data
Iterations = 5000
# number of batches for each iteration
batch_size = 16
# How many characters are used to predict the next? Context = 1 --> Bigram model, Context > 1 --> Ngram model
context = 32


for i in range(Iterations):

  # get a random batch of data
  xb, yb = get_batch(Train_data)

  # forward pass
  logits = MeanNgramLM.forward(xb)

  # Calculate loss
  Loss = MeanNgramLM.LossFunction(logits,yb)

  #print(NgramLM.channels.weight.grad)
  #Zero all parameter gradients
  MeanNgramLM.channels.weight.grad = None
  #
  #print(NgramLM.channels.weight.grad)

  # Backward Path to Calculate new grads
  Loss.backward()

  # Update the weights in embedding
  optimizerMean.step()


print(Loss.item())

3.210155725479126


In [33]:
## Generate some text with trained model
print(decoder(MeanNgramLM.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


s iLtt  dil Iew;t.irueue noya
Rnbc
 o oIb nh
crAya yrecgy,uolgci&'rIy
otsm,
o  ei e,eCd   rd er d.htiysRise
 retIe nFeU 
 ot eore  l,sn ecdneeh'seaedtii lcsi bshMrrF
,t ,lKtl
i,thvs.ltswCwiCt
t Tw iltret
iy'niyn u den hh tiiyoN e
htw.nydaoE
ooFttaaosenOsfento oare heeTaeleo
sree u  efUPhaP egtaee fo
 il
ioR.ogstH d dmhbCun dgti;ao Iils'm e omd diEZ' e h e
hf r
BsUaw  e vSdnotadgy ott fniwe
nAlow   ,ei
,Bltehi t
i:aQdubden mmoel w ohur
 fnBl'wtailhTrr ,s t bs tsRota,J?bNeleWre'df,bn o htno
ai
w h


## __Self Attention__

In [16]:
torch.tril(torch.ones(3,3))

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [23]:
w = torch.tril(torch.ones(3,3))
d = w/torch.sum(w,1,keepdim=True)

In [32]:
torch.transpose(w,0)

TypeError: transpose() received an invalid combination of arguments - got (Tensor, int), but expected one of:
 * (Tensor input, int dim0, int dim1)
 * (Tensor input, name dim0, name dim1)


In [46]:
FF = torch.tensor([[1,2,4],[2,4,6]],dtype=torch.float)

In [59]:
F.softmax(FF,0)

tensor([[0.2689, 0.1192, 0.1192],
        [0.7311, 0.8808, 0.8808]])

In [53]:
import math
math.exp(1)/(math.exp(1)+math.exp(2)+math.exp(4))

0.04201006613406605

In [60]:
math.exp(1)/(math.exp(1)+math.exp(2))

0.2689414213699951