In [1]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-03-02 19:29:32--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2024-03-02 19:29:32 (19.6 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [2]:
# Read the whole corpus into memory as a single string.
# The file handle is deliberately not named `F`: a later cell binds `F` to
# torch.nn.functional, and re-running this cell afterwards would replace that
# binding with a closed file object and break the loss computation.
with open("input.txt", 'r', encoding='utf-8') as corpus_file:
  book = corpus_file.read()

In [3]:
# Preview the first 100 characters of the corpus as rendered text.
print(book[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [4]:
# Same slice shown as a string repr, so the newline characters are visible as \n.
book[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [5]:
# Total number of characters in the corpus.
len(book)

1115394

In [6]:
# Vocabulary: every distinct character of the corpus, in sorted order.
# A character's position in `chars` is later used as its integer code.
r = sorted(set(book))
chars = ''.join(r)
charsize = len(chars)

print(chars)
print(charsize)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [7]:
## Character-level codec: a character's code is its position in `chars`.
## The lookup uses str.rfind() on `chars` instead of the dictionary used in the original code.
## rfind() returns -1 for a character that is not in `chars`; that case is rejected
## explicitly, because -1 is a valid Python index and would otherwise decode
## to the last vocabulary character.
def encoder(c):
  """Return the list of integer codes for the characters of `c`.

  Args:
    c: string (or other iterable of single characters) to encode.

  Returns:
    A list with one index into `chars` per character of `c`.

  Raises:
    ValueError: if `c` contains a character that does not occur in `chars`.
  """
  codes = []
  for ch in c:
    code = chars.rfind(ch)
    if code < 0:
      raise ValueError(f"character {ch!r} is not in the vocabulary")
    codes.append(code)
  return codes

def decoder(c):
  """Return the string spelled by the iterable of integer codes `c`."""
  return "".join([chars[code] for code in c])

In [8]:
# Round-trip check: encoding then decoding must give back the original text.
sample_codes = encoder('Hello There')
print(sample_codes)
print(decoder(sample_codes))

[20, 43, 50, 50, 53, 1, 32, 46, 43, 56, 43]
Hello There


In [9]:
# Encode the entire corpus into a Python list of integer codes, one per character.
book_digits_List=encoder(book)

In [10]:
# Sanity check: show the first characters next to their codes and confirm that
# the encoded list has exactly one code per character of the book.
print('First 15 characters in the book:  ',book[:15])
print('First 15 codes in the encoded book:  ',book_digits_List[:15])
print('\nLength of the characters in Book:     ',len(book))
print('Length of the codes in Encoded Book:  ',len(book_digits_List))

First 15 characters in the book:   First Citizen:

First 15 codes in the encoded book:   [18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0]

Length of the characters in Book:      1115394
Length of the codes in Encoded Book:   1115394


In [11]:
# Convert Encoded Book from python List to PyTorch Tensor
import torch
book_digits = torch.tensor(book_digits_List)
# Peek at the first 15 codes of the tensor.
book_digits[:15]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0])

In [12]:
# Hold out the final 10% of the encoded corpus for validation;
# the first 90% is used for training.
n = (9 * len(book_digits)) // 10
Train_data, Val_data = book_digits[:n], book_digits[n:]

In [13]:
# Module-level sampling settings; get_batch looks these names up when it is called.
batch_size = 4  # number of windows drawn per batch
context = 3  # number of tokens in each window

def get_batch(x, batch_size=None, context=None):
  """Sample a random batch of (input, target) windows from the 1-D tensor `x`.

  Args:
    x: 1-D tensor of token codes to sample from.
    batch_size: number of windows to draw. When omitted, the notebook-level
      `batch_size` variable is read at call time.
    context: number of tokens per window. When omitted, the notebook-level
      `context` variable is read at call time.

  Returns:
    (xb, yb): two tensors of shape (batch_size, context). yb is xb shifted one
    position ahead in `x`, so yb[b, t] is the token that follows xb[b, t].

  Raises:
    ValueError: if `x` has fewer than context + 1 tokens, i.e. no window with a
      following target token fits.
  """
  # Fall back to the notebook globals so existing get_batch(data) calls keep working.
  if batch_size is None:
    batch_size = globals()["batch_size"]
  if context is None:
    context = globals()["context"]

  if len(x) <= context:
    raise ValueError(
        f"need at least {context + 1} tokens to sample a window of length {context}, got {len(x)}")

  # The exclusive upper bound len(x) - context keeps start + context inside x,
  # which is the last index touched by the shifted target window.
  Batch_start = torch.randint(0, len(x) - context, (batch_size,))
  # (batch_size, 1) + (context,) broadcasts to one row of consecutive indices per window.
  window_index = Batch_start.unsqueeze(1) + torch.arange(context)
  xb = x[window_index]
  yb = x[window_index + 1]
  return xb, yb

# Draw one demo batch from the full encoded book and show inputs and targets;
# each row of yb is the matching row of xb shifted one token ahead.
xb, yb = get_batch(book_digits)
print(xb)
print(yb)

tensor([[59, 57, 11],
        [ 2,  1, 34],
        [ 1, 58, 53],
        [ 0, 27,  1]])
tensor([[57, 11,  1],
        [ 1, 34, 39],
        [58, 53, 59],
        [27,  1, 51]])


In [14]:
# Let's define a Bigram Language model
import torch.nn as nn
import torch.nn.functional as F

# The model subclasses nn.Module and calls super().__init__(), so PyTorch
# registers the embedding table as a trainable parameter.

class NgramLanguageModel(nn.Module):
  """Character-level model that maps each input token directly to next-token logits.

  The only parameter is a (vocab_size x vocab_size) embedding table: row i holds
  the logits over the next token given current token i. Every position is
  scored from its own token alone.
  """

  def __init__(self, vocab_size=None):
    """Create the embedding table.

    Args:
      vocab_size: number of distinct tokens. When omitted, the notebook-level
        `charsize` variable is used.
    """
    super().__init__()
    if vocab_size is None:
      vocab_size = charsize
    self.channels = nn.Embedding(vocab_size, vocab_size)

  def forward(self, random_training_batch):
    """Look up the next-token logits for every token in the batch.

    Args:
      random_training_batch: integer tensor of token codes, Batch (B) X context (T).

    Returns:
      Logits tensor, Batch (B) X context (T) X Embedding/Channels (C).
    """
    logits = self.channels(random_training_batch)
    return logits

  def LossFunction(self, logits, random_training_batch_nextChar):
    """Return the mean cross-entropy between `logits` and the next-token targets.

    Args:
      logits: tensor of shape (..., C), as returned by forward().
      random_training_batch_nextChar: integer targets whose shape matches the
        leading dimensions of `logits`.

    Returns:
      Scalar loss tensor.
    """
    # F.cross_entropy expects the class scores on the second dimension, so
    # flatten (B, T, C) -> (B*T, C) and the targets (B, T) -> (B*T,).
    # The class count is taken from `logits` itself rather than a global, and
    # reshape (unlike view) also accepts non-contiguous tensors.
    flat_logits = logits.reshape(-1, logits.size(-1))
    Target = random_training_batch_nextChar.reshape(-1)
    Loss = F.cross_entropy(flat_logits, Target)
    return Loss

In [15]:
# NgramLM = NgramLanguageModel()
# print(NgramLM.channels.weight.grad)
#iter([NgramLM.channels.weight])

In [16]:
# Seed PyTorch's global RNG before anything random happens, so the weight
# initialisation below and the random batches drawn afterwards are repeatable
# when the cells from here on are run in order.
SEED = 1337
torch.manual_seed(SEED)

# Create a Ngram Language Model Object
NgramLM = NgramLanguageModel()

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(NgramLM.parameters(), lr=1e-3)

In [17]:
### TRAINING LOOP ###

# number of optimisation steps
Iterations = 20000
# number of windows sampled per batch (get_batch reads this global when called)
batch_size = 4
# Number of tokens in each sampled window (get_batch reads this global when called).
# NgramLanguageModel scores every position from its own token only, so this sets
# how many (input, target) pairs a batch contains, not how much history the model sees.
context = 2

for i in range(Iterations):

  # get a random batch of data
  xb, yb = get_batch(Train_data)

  # Forward pass. Call the module itself rather than .forward() so that
  # nn.Module's call machinery (e.g. registered hooks) is not bypassed.
  logits = NgramLM(xb)

  # Calculate loss
  Loss = NgramLM.LossFunction(logits,yb)

  # Drop the gradients left over from the previous step
  optimizer.zero_grad(set_to_none=True)

  # Backward pass: compute fresh gradients for this batch
  Loss.backward()

  # Update the weights in embedding
  optimizer.step()


# Loss of the final mini-batch only
print(Loss.item())

2.254117727279663
