<a href="https://colab.research.google.com/github/asrjy/mahaGPT/blob/main/mahagpt_dev.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Seed PyTorch's global RNG so the random draws below (batch sampling, generation) are repeatable from a fresh kernel.
torch.manual_seed(420)

<torch._C.Generator at 0x7d33c4571350>

In [3]:
# Read the whole corpus into memory as a single string.
with open('mahabharata.txt', mode='r', encoding='utf-8') as corpus_file:
  text = corpus_file.read()

In [4]:
# Corpus size in characters.
len(text)

14929983

In [5]:
# Peek at the first 1000 characters of the corpus.
print(text[:1000])

ADI PARVA

SECTION I

Om! Having bowed down to Narayana and Nara, the most exalted male being,
and also to the goddess Saraswati, must the word Jaya be uttered.

Ugrasrava, the son of Lomaharshana, surnamed Sauti, well-versed in the
Puranas, bending with humility, one day approached the great sages of
rigid vows, sitting at their ease, who had attended the twelve years'
sacrifice of Saunaka, surnamed Kulapati, in the forest of Naimisha. Those
ascetics, wishing to hear his wonderful narrations, presently began to
address him who had thus arrived at that recluse abode of the inhabitants
of the forest of Naimisha. Having been entertained with due respect by
those holy men, he saluted those Munis (sages) with joined palms, even
all of them, and inquired about the progress of their asceticism. Then
all the ascetics being again seated, the son of Lomaharshana humbly
occupied the seat that was assigned to him. Seeing that he was
comfortably seated, and recovered from fatigue, one of the Rishi

In [6]:
# Character-level vocabulary: every distinct character in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)



 !"&'(),-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz
79


In [7]:
# Lookup tables between characters and integer ids (an id is the character's position in `chars`).
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
  """Turn a string into a list of integer token ids, one per character."""
  return [stoi[c] for c in s]


def decode(l):
  """Turn a sequence of integer token ids back into the string they spell."""
  return ''.join(itos[i] for i in l)

In [8]:
# Round-trip check: decoding an encoded string should give back the original string.
print(encode("yoo wassup"))
print(decode(encode("yoo wassup")))

[77, 67, 67, 1, 75, 53, 71, 71, 73, 68]
yoo wassup


In [9]:

# Encode the whole corpus once into a 1-D tensor of int64 token ids.
data = torch.tensor(encode(text), dtype = torch.long)
print(data.shape, data.dtype)
print(data[:1000])  # the first 1000 characters of the corpus, now as ids

torch.Size([14929983]) torch.int64
tensor([24, 27, 32,  1, 39, 24, 41, 45, 24,  0,  0, 42, 28, 26, 43, 32, 38, 37,
         1, 32,  0,  0, 38, 65,  2,  1, 31, 53, 74, 61, 66, 59,  1, 54, 67, 75,
        57, 56,  1, 56, 67, 75, 66,  1, 72, 67,  1, 37, 53, 70, 53, 77, 53, 66,
        53,  1, 53, 66, 56,  1, 37, 53, 70, 53,  8,  1, 72, 60, 57,  1, 65, 67,
        71, 72,  1, 57, 76, 53, 64, 72, 57, 56,  1, 65, 53, 64, 57,  1, 54, 57,
        61, 66, 59,  8,  0, 53, 66, 56,  1, 53, 64, 71, 67,  1, 72, 67,  1, 72,
        60, 57,  1, 59, 67, 56, 56, 57, 71, 71,  1, 42, 53, 70, 53, 71, 75, 53,
        72, 61,  8,  1, 65, 73, 71, 72,  1, 72, 60, 57,  1, 75, 67, 70, 56,  1,
        33, 53, 77, 53,  1, 54, 57,  1, 73, 72, 72, 57, 70, 57, 56, 10,  0,  0,
        44, 59, 70, 53, 71, 70, 53, 74, 53,  8,  1, 72, 60, 57,  1, 71, 67, 66,
         1, 67, 58,  1, 35, 67, 65, 53, 60, 53, 70, 71, 60, 53, 66, 53,  8,  1,
        71, 73, 70, 66, 53, 65, 57, 56,  1, 42, 53, 73, 72, 61,  8,  1, 75, 57,
     

In [10]:
# Train on the first 90% of the token stream and hold out the final 10% for validation.
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [11]:
# block_size is the context length. A chunk of block_size + 1 tokens is shown because
# the extra token is needed as the target of the last position.
block_size = 8
train_data[:block_size+1]

tensor([24, 27, 32,  1, 39, 24, 41, 45, 24])

In [12]:
# y is x shifted left by one token, so y[t] is the token that follows the prefix x[:t+1].
x, y = train_data[:block_size], train_data[1:block_size+1]
# One chunk therefore holds block_size training examples, with context lengths 1..block_size.
for t in range(block_size):
  context, target = x[:t+1], y[t]
  print(f"when context is {context}, target is {target}")

when context is tensor([24]), target is 27
when context is tensor([24, 27]), target is 32
when context is tensor([24, 27, 32]), target is 1
when context is tensor([24, 27, 32,  1]), target is 39
when context is tensor([24, 27, 32,  1, 39]), target is 24
when context is tensor([24, 27, 32,  1, 39, 24]), target is 41
when context is tensor([24, 27, 32,  1, 39, 24, 41]), target is 45
when context is tensor([24, 27, 32,  1, 39, 24, 41, 45]), target is 24


In [13]:
# Number of tokens in the training split.
len(train_data)

13436984

In [14]:
batch_size = 4  # number of sequences sampled per batch (read as a global by get_batch)
block_size = 8  # number of tokens in each sequence, i.e. the context length (read as a global by get_batch)

def get_batch(split):
  """Sample a random batch of input/target sequences from one data split.

  Args:
    split: "train" selects train_data; any other value selects val_data.

  Returns:
    (x, y): two (batch_size, block_size) tensors of token ids, where y is x
    shifted one position to the right in the underlying data.
  """
  source = train_data if split == "train" else val_data
  # randint's upper bound is exclusive, so start + block_size + 1 never runs past the end.
  starts = torch.randint(len(source) - block_size, (batch_size,))
  inputs = torch.stack([source[s:s + block_size] for s in starts])
  targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
  return inputs, targets

# Draw one demo batch; xb and yb are reused by the cells that follow.
xb, yb = get_batch('train')

print('inputs:', xb.shape, xb, sep='\n')
print('targets:', yb.shape, yb, sep='\n')
print('--------')

# Spell out every (context, target) training example packed into the batch.
for b in range(batch_size):
  for t in range(block_size):
    context, target = xb[b, :t+1], yb[b, t]
    print(f"When input is {context.tolist()} the target is {target.tolist()}")

inputs:
torch.Size([4, 8])
tensor([[61, 67, 66, 71,  5,  1, 61, 71],
        [10,  1, 32, 72,  1, 75, 53, 71],
        [ 0, 65, 53, 77,  1, 54, 57,  1],
        [57, 56,  1, 71, 53, 59, 57,  8]])
targets:
torch.Size([4, 8])
tensor([[67, 66, 71,  5,  1, 61, 71,  1],
        [ 1, 32, 72,  1, 75, 53, 71,  1],
        [65, 53, 77,  1, 54, 57,  1, 68],
        [56,  1, 71, 53, 59, 57,  8,  1]])
--------
When input is [61] the target is 67
When input is [61, 67] the target is 66
When input is [61, 67, 66] the target is 71
When input is [61, 67, 66, 71] the target is 5
When input is [61, 67, 66, 71, 5] the target is 1
When input is [61, 67, 66, 71, 5, 1] the target is 61
When input is [61, 67, 66, 71, 5, 1, 61] the target is 71
When input is [61, 67, 66, 71, 5, 1, 61, 71] the target is 1
When input is [10] the target is 1
When input is [10, 1] the target is 32
When input is [10, 1, 32] the target is 72
When input is [10, 1, 32, 72] the target is 1
When input is [10, 1, 32, 72, 1] the target i

In [15]:
# The (batch_size, block_size) batch of token ids that is fed to the model as input.
xb

tensor([[61, 67, 66, 71,  5,  1, 61, 71],
        [10,  1, 32, 72,  1, 75, 53, 71],
        [ 0, 65, 53, 77,  1, 54, 57,  1],
        [57, 56,  1, 71, 53, 59, 57,  8]])

In [26]:
class BigramModel(nn.Module):
  """Character-level bigram language model.

  The only parameters are a (vocab_size, vocab_size) embedding table: row i holds
  the logits for the token that follows token i, so every prediction depends on
  the current token alone.
  """

  def __init__(self, vocab_size):
    """Create the lookup table.

    Args:
      vocab_size: number of distinct tokens; used both as the number of rows
        and as the width of each row of the table.
    """
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    """Compute next-token logits and, when targets are given, the loss.

    Args:
      idx: (B, T) integer tensor of token ids.
      targets: optional (B, T) integer tensor of next-token ids.

    Returns:
      (logits, loss). Without targets, logits is (B, T, C) and loss is None.
      With targets, logits is flattened to (B*T, C) and loss is the scalar
      cross-entropy between those logits and the flattened targets.
    """
    logits = self.token_embedding_table(idx)  # (B, T, C) with C == vocab_size

    if targets is None:
      loss = None
    else:
      # F.cross_entropy takes (N, C) logits and (N,) class ids, so merge batch and time.
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  @torch.no_grad()  # sampling never backpropagates, so skip building an autograd graph each step
  def generate(self, idx, max_new_tokens):
    """Extend each sequence in idx by max_new_tokens sampled tokens.

    Args:
      idx: (B, T) integer tensor of token ids used as the starting context.
      max_new_tokens: number of tokens to append to every row.

    Returns:
      (B, T + max_new_tokens) integer tensor: the input followed by the samples.
    """
    for _ in range(max_new_tokens):
      logits, _ = self(idx)
      # Only the last time step is needed to predict the next token.
      logits = logits[:, -1, :]  # (B, C)
      # Turn the logits into a probability distribution over the vocabulary.
      probs = F.softmax(logits, dim = -1)
      # Sample one token per row from that distribution.
      idx_next = torch.multinomial(probs, num_samples = 1)  # (B, 1)
      idx = torch.cat((idx, idx_next), dim = 1)
    return idx


# Smoke-test the untrained model on the demo batch.
m = BigramModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)  # (batch_size * block_size, vocab_size): forward() flattens logits when targets are passed
print(loss)

# Sample 100 new tokens starting from a single token 0 and decode them to text.
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens = 100)[0].tolist()))

torch.Size([32, 79])
tensor(5.1451, grad_fn=<NllLossBackward0>)

au65vai0mb9RX:.NO)kMAr5U4sfQ6r7s&Z8)V!Gw yfQhd8PAPwR'r]p4?Jrt5tL(5WV4Hz-6)xRXcix:0]t;ff(dQ
VR'oEYs,[


In [27]:
# AdamW optimizer over all of the model's parameters, with learning rate 1e-3.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [34]:
# get_batch reads this global, so every batch drawn from here on holds 32 sequences.
batch_size = 32
for steps in range(10000):
  # Clear gradients left over from the previous step before computing new ones.
  optimizer.zero_grad(set_to_none=True)
  xb, yb = get_batch('train')
  logits, loss = m(xb, yb)
  loss.backward()
  optimizer.step()

# Loss of the final training batch only (a single-batch value, not an average).
print(loss.item())

2.3525025844573975


In [37]:
# Sample 400 new tokens from the trained model, starting from a single token 0, and decode them to text.
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens = 400)[0].tolist()))


e ushis, wio sd. indun hidavenormoonef f cth alamming t) the re mbervy Hing thidiller on brereconthersay Alofours acke isuccredes d celacie tinde archtasuthedonthorof
"
me, edo toudemoreanmere (grsh ies Whoie wnd id, they O w tha ithyto hikisaf ce, ding atonarinthe
wofom ftif
th tivi on s ise's honwn fioud y ive s cas, thechuref 'Thodee. cllinchof ly thendeshes Hachtupacte
f toned llowins. puls on
