In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd

In [8]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from torch.utils.data import Dataset, DataLoader
from typing import Iterable, List
import torchtext
from tqdm import tqdm
# # We need to modify the URLs for the dataset since the links to the original dataset are broken
# # Refer to https://github.com/pytorch/text/issues/1756#issuecomment-1163664163 for more info
# multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
# multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"


# Place-holders
token_transform = {}
vocab_transform = {}

In [9]:
batch_size = 64
block_size = 256
learning_rate = 1e-2
max_iters = 5000
eval_interval = 500
eval_iters = 200
device = 'cuda' if torch.cuda.is_available() else 'cpu'
n_embd = 384
dropout = 0.2
no_of_heads = 6
n_layer = 6
device
SRC_LANGUAGE = 'Fr'
TGT_LANGUAGE = 'En'

In [None]:
# # Installing dependencies
# !pip install -U torchdata
# !pip install -U spacy
# !pip install 'portalocker>=2.0.0'
# # !python -m spacy download en_core_web_sm
# !python -m spacy download fr_core_news_sm

In [34]:
with open("/content/input.txt", "r") as f:
     text = f.read()
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [7]:
torch.cuda.is_available()

False

In [None]:
# Mapping between integers and characters
stoi = {}
itos = {}
for i, ch in enumerate(chars):
    stoi[ch] = i
    itos[i] = ch

# Take a string and output a list of integers
def encode(s):
    out = []
    for ch in s:
        out.append(stoi[ch])
    return out

# Take a list of integers and output a string
def decode(ints):
    out = ""
    for i in ints:
        out += itos[i]
    return out

print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [None]:
# Encoding the entire dataset and saving it in a tensor
data = torch.tensor(encode(text), dtype = torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [11]:
print(data.shape)

torch.Size([288721])


In [None]:
# Splitting into train and validation
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [None]:
# Creating a block size (Maximum context)
data[:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57,  1, 47, 57,  1, 41, 

In [None]:
x = data[:block_size]
y = data[1:block_size + 1]
for t in range(block_size):
    context = x[:t + 1]
    target = y[t]
    print(f"When context is {context}, target is {target}")

In [None]:
gen = torch.manual_seed(1337)
# Creating a batch size (Number of independent sequences processed in parallel)
# batch_size = 4
# block_size = 8
def get_batch(split):
    data = train_data if split == "train" else val_data
    # ix is 4 (batch_size) numbers that are randomly generated between len(data) and block_size
    ix = torch.randint(len(data) - block_size, (batch_size, 1), generator = gen)
    x = torch.stack([data[i: i + block_size] for i in ix])
    y = torch.stack([data[i + 1: i + block_size + 1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x,y
xb, yb = get_batch(train_data)
print("inputs:")
print(xb.shape)
print(xb)
print("targets:")
print(yb.shape)
print(yb)

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t + 1]
        target = yb[b, t]
        print(f"When context is {context.tolist()}, target is {target}")

In [None]:
print(xb[0])
print(yb[0])

tensor([43, 51, 11,  1, 46, 39, 60, 47, 52, 45,  1, 40, 53, 58, 46,  1, 58, 46,
        43,  1, 49, 43, 63,  0, 27, 44,  1, 53, 44, 44, 47, 41, 43, 56,  1, 39,
        52, 42,  1, 53, 44, 44, 47, 41, 43,  6,  1, 57, 43, 58,  1, 39, 50, 50,
         1, 46, 43, 39, 56, 58, 57,  1, 47,  5,  1, 58, 46, 43,  1, 57, 58, 39,
        58, 43,  0, 32, 53,  1, 61, 46, 39, 58,  1, 58, 59, 52, 43,  1, 54, 50,
        43, 39, 57, 43, 42,  1, 46, 47, 57,  1, 43, 39, 56, 11,  1, 58, 46, 39,
        58,  1, 52, 53, 61,  1, 46, 43,  1, 61, 39, 57,  0, 32, 46, 43,  1, 47,
        60, 63,  1, 61, 46, 47, 41, 46,  1, 46, 39, 42,  1, 46, 47, 42,  1, 51,
        63,  1, 54, 56, 47, 52, 41, 43, 50, 63,  1, 58, 56, 59, 52, 49,  6,  0,
        13, 52, 42,  1, 57, 59, 41, 49,  5, 42,  1, 51, 63,  1, 60, 43, 56, 42,
        59, 56, 43,  1, 53, 59, 58,  1, 53, 52,  5, 58,  8,  1, 32, 46, 53, 59,
         1, 39, 58, 58, 43, 52, 42,  5, 57, 58,  1, 52, 53, 58,  8,  0,  0, 25,
        21, 30, 13, 26, 16, 13, 10,  0, 

In [None]:
# One head of self attention
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        # Query, key, and value are all linear layers.
        self.key = nn.Linear(n_embd, head_size, bias = False)
        self.query = nn.Linear(n_embd, head_size, bias = False)
        self.value = nn.Linear(n_embd, head_size, bias = False)
        # create a tril matrix of ones
        # PyTorch naming convention because the tril is not a parameter
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)
        v = self.value(x) # (B, T, head_size)

        # Dot product the key and the query to get the weights
        w = k @ q.transpose(-2, -1)  # (B,T,H) @ (B,H,T) = (B, T, T)

        # Dividing by sqrt(head_size) for stability and making sure the variance stays close to zero
        w = w * (C ** -0.5)

        w = w.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        w = F.softmax(w, dim = -1)
        w = self.dropout(w)

        out = w @ v # (B, T, T) @ (B, T, C) = (B, T, C) cuz B stays the same so essentially its a (T, T) @ (T, C)
        return out

class MultiHeadAttention(nn.Module):
    def __init__(self, no_of_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(no_of_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([head(x) for head in self.heads], dim = -1)
        out = self.proj(out)
        out = self.dropout(out)
        return out

class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout)
        )
    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embd, no_of_heads):
        super().__init__()
        head_size = n_embd // no_of_heads
        self.sa = MultiHeadAttention(no_of_heads, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x



class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, no_of_heads=no_of_heads) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets = None):
        B,T = idx.shape
        tok_emb = self.token_embedding_table(idx) #(B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device = device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        if targets is None:
            loss = None
        else:
            # idx and targets are of shape (B,T)
            B,T,C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        #idx is (B,T)
        for _ in range(max_new_tokens):
            # Cropping the idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :] # Becomes (B, C)
            probs = F.softmax(logits, dim = -1)

            # Sampling from distribution
            idx_next = torch.multinomial(probs, num_samples = 1)
            idx = torch.cat((idx, idx_next), dim = 1)
        return idx

model = BigramLanguageModel(vocab_size)
m = model.to(device)
logits, loss = m(xb, yb)
print(loss)

idx = torch.zeros((1,1), dtype = torch.long, device = device) # stands for the new line token \n
print(decode(m.generate(idx = idx, max_new_tokens = 100)[0].tolist()))


tensor(4.3272, device='cuda:0', grad_fn=<NllLossBackward0>)

rp-DRCUZ maQJr?F,E.HfmNDzd,HEvGNsA3Wj pE,CVEt
nO&D-pq AQj!
,Hj-fs?',ycT
SRat:PdGdNSwPkmVGMyF;LOulLVg


In [None]:
@torch.no_grad()
def estimate_loss():
    out = {}
    m.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = m(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    m.train()
    return out

In [None]:
optimizer = torch.optim.AdamW(m.parameters(), lr = 1e-3)

In [None]:
for iter in range(max_iters):
    if iter % eval_interval == 0:
      losses = estimate_loss()
      print(f"Step {iter}: Train loss = {losses['train']}, Val loss = {losses['val']}")

    xb, yb = get_batch("train")
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none = True)
    loss.backward()
    optimizer.step()

Step 0: Train loss = 4.335792064666748, Val loss = 4.3321614265441895
Step 500: Train loss = 1.6884046792984009, Val loss = 1.8422389030456543
Step 1000: Train loss = 1.3595319986343384, Val loss = 1.5879207849502563
Step 1500: Train loss = 1.238218903541565, Val loss = 1.5090975761413574
Step 2000: Train loss = 1.1531693935394287, Val loss = 1.4877129793167114
Step 2500: Train loss = 1.0890713930130005, Val loss = 1.4992848634719849
Step 3000: Train loss = 1.0264713764190674, Val loss = 1.5077345371246338
Step 3500: Train loss = 0.9688115119934082, Val loss = 1.5320357084274292
Step 4000: Train loss = 0.9069042801856995, Val loss = 1.558619737625122
Step 4500: Train loss = 0.8507474660873413, Val loss = 1.5785332918167114


In [None]:
print(decode(m.generate(idx = idx, max_new_tokens = 10000)[0].tolist()))



RIVERSS:
I cannot nor tetding those sit you shalt do
To well o'er the top offenders, for my honour.
We have no fger than groan the steeds and well, and
chelt his head one. Let me be spoke for another;
he was another being formed, but banish him;
he, for a pippe. I see, thought leave thee saw
In those that I condemn to unwish! If thou be
no great opparent to be in sudden formity,
who lately wear theirs past that revenge this outwo
compass.

ESCALUS:
Why do you this?

LUCIO:
Doth ship forfeitly that consul, Signio?

ANGELO:
Well; bring of all.

ESCALUS:
It is for your wisdom be the nopent. The wish, the
apperonce is a guilty will tread yourself and
profit, your sea, that the wretchedness in our sleep,
the fier eventon you as the sword or waking
and her constempore by what short is no year.

First Senator:
Weeping we well ender, were the case you--
He as sure.

AUTOLYCUS:
Gentle Pardon, my friend Potist ballads;--here is in
performity; how art like to visit the worst along,
that Aufidiu

In [None]:
B,T,C = 4,8,32

print(w.shape)
out.shape

In [None]:
print("Our model: \n\n", model, '\n')
print("The state dict keys: \n\n", model.state_dict().keys())

Our model: 

 BigramLanguageModel(
  (token_embedding_table): Embedding(65, 384)
  (position_embedding_table): Embedding(256, 384)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-5): 6 x Head(
            (key): Linear(in_features=384, out_features=64, bias=False)
            (query): Linear(in_features=384, out_features=64, bias=False)
            (value): Linear(in_features=384, out_features=64, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ffwd): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1536, out_features=384, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((384,), eps=1e-05, e

In [None]:
from google.colab import files

torch.save(m.state_dict(), 'trainedGenerator(1.57).pth')
# download checkpoint file
files.download('trainedGenerator(1.57).pth')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Number of parameters in the model
sum(p.numel() for p in m.parameters() if p.requires_grad)

10788929