# EVA8 Session 11 Assignment - Part 2
## GPT Custom Retraining

## Goals:
1. Implement sparse attention in the GPT Code
2. Train on custom data collected for training BERT in the first part of the assignment
3. Share training logs and 10 examples of output

## Import Dataset, Model & Device

In [1]:
import torch
from model import Transformer
from transformers import AutoTokenizer  # pip install transformers
from utils import (
    BATCH_SIZE,
    BLOCK_SIZE,
    DEVICE,
    DROPOUT,
    LEARNING_RATE,
    NUM_EMBED,
    NUM_HEAD,
    NUM_LAYER,
    MAX_ITER,
    EVAL_INTER,
    encode,
    decode,
    get_batch,
    save_model_to_chekpoint,
    estimate_loss,
)

# load model from checkpoint
# m = load_model_from_checkpoint(Transformer,vocab_size=vocab_size)

# example to decode sequence
# enc_sec = m.generate(idx=torch.zeros((1,1), dtype=torch.long),
# max_new_tokens=20)[0].tolist()
# print(decode(vocab=vocab, enc_sec=enc_sec))

# raw data
# NOTE: this is a machine-specific absolute path; point it at your local copy
# of the dataset before running (the stock sample was "data/english.txt").
path_do_data = "/Users/abhinavpujahari/Documents/EVA8/Session11/BERT/BERT_AssignmentDataset.txt"
# read the corpus inside a context manager so the file handle is closed as
# soon as the text is in memory instead of being left open
with open(path_do_data, encoding="utf-8") as corpus_file:
    data_raw = corpus_file.read()
# we use pretrained BERT tokenizer for performance improvements
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
vocab_size = tokenizer.vocab_size
# optional: slice data_raw here (e.g. data_raw[4000000:]) to train on a
# shorter dataset

# train/val split
data = encode(text_seq=data_raw, tokenizer=tokenizer)
n = int(0.9 * len(data))  # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# train a new model; every hyperparameter comes from utils
model = Transformer(
    vocab_size=vocab_size,
    num_embed=NUM_EMBED,
    block_size=BLOCK_SIZE,
    num_heads=NUM_HEAD,
    num_layers=NUM_LAYER,
    dropout=DROPOUT,
)
# move the model to DEVICE (as configured in utils)
m = model.to(DEVICE)
# print the number of parameters in the model, in millions
num_params = sum(p.numel() for p in m.parameters())
print("Model with {:.2f}M parameters".format(num_params / 1e6))


  from .autonotebook import tqdm as notebook_tqdm
Token indices sequence length is longer than the specified maximum sequence length for this model (21470730 > 512). Running this sequence through the model will result in indexing errors


Model with 89.48M parameters


## Define Optimizer and Train

In [2]:
# optimizer takes the model's parameters and the learning rate as input,
# and updates the parameters during the training process in order to
# minimize the loss function.
optimizer = torch.optim.AdamW(m.parameters(), lr=LEARNING_RATE)
# these two assignments override the values imported from utils so that this
# notebook run stays short
MAX_ITER = 500
EVAL_INTER = 50
for step in range(MAX_ITER):

    # every EVAL_INTER steps, and on the very last step, evaluate the loss
    # on the train and val sets
    if step % EVAL_INTER == 0 or step == MAX_ITER - 1:
        loss_train = estimate_loss(
            data=train_data, model=m, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
        )
        loss_val = estimate_loss(
            data=val_data, model=m, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
        )
        print("step {:10} | train loss {:6.4f} | val loss {:6.4f}".format(step, loss_train, loss_val))

    # sample a batch of data
    xb, yb = get_batch(data=train_data, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE)
    # call the module itself instead of .forward(): nn.Module.__call__ runs
    # any registered hooks, which a direct .forward() call silently skips
    # (assumes Transformer is a torch.nn.Module, as .to()/.parameters() suggest)
    logits, loss = m(xb, yb)
    # zero_grad() clears the gradients of all parameters held by the optimizer;
    # set_to_none=True resets them to None rather than to zero tensors
    optimizer.zero_grad(set_to_none=True)
    # backward() on the loss computes the gradients of the loss
    # with respect to the model's parameters.
    loss.backward()
    # step() on the optimizer updates the model's parameters
    # using the calculated gradients, in order to minimize the loss.
    optimizer.step()



step          0 | train loss 10.8555 | val loss 10.8366
step         50 | train loss 3.8926 | val loss 3.8270
step        100 | train loss 2.7408 | val loss 2.7125
step        150 | train loss 2.6448 | val loss 2.6919
step        200 | train loss 2.6032 | val loss 2.5895
step        250 | train loss 2.5616 | val loss 2.5476
step        300 | train loss 2.5775 | val loss 2.5472
step        350 | train loss 2.5341 | val loss 2.4919
step        400 | train loss 2.4986 | val loss 2.5273
step        450 | train loss 2.5135 | val loss 2.5204
step        499 | train loss 2.4591 | val loss 2.4642


## Save Model Checkpoint

In [3]:
# write the trained weights to the "checkpoint" directory; `step` still holds
# the last index reached by the training loop and is recorded as the epoch
save_model_to_chekpoint(
    model=m,
    path_to_checkpoint="checkpoint",
    epoch=step,
)

Successfully saved the model to checkpoint/checkpoint_epoch-499_10.03.2023_20:39:42.pt


## Generate Output Samples

In [4]:
# generate some output based on the context
# the context is a single token with index 0, used as the start of every sample
context = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)

# sample in inference mode: eval() switches off training-only behaviour
# (the model is built with dropout=DROPOUT) and no_grad() avoids building an
# autograd graph that generation never uses
was_training = m.training
m.eval()
try:
    with torch.no_grad():
        for i in range(10):
            print(f"-------------Example {i}----------------")
            generated = m.generate(
                idx=context, max_new_tokens=100, block_size=BLOCK_SIZE
            )
            # [0] picks the only sequence in the batch of size 1
            print(decode(enc_sec=generated[0], tokenizer=tokenizer))
finally:
    # leave the model in whatever mode it was in before sampling
    m.train(was_training)

-------------Example 0----------------
[PAD] long period to 50 % of ritual, tamil language ; there were than 4, the top 12 billion agreement with a look east india, becoming common in 2010, a gift from eastern india company began a sovereign, greater than one in recent economic challenges india subsequently signed a thousand marriages are hemmed but it is the dhamamamamamotototototamotolaototamamototolaotamolaotamotolaamotamamamotolaotolaam
-------------Example 1----------------
[PAD] 1785 much for eating usually served the wearing of pakistan in the indus valley civilisation was supplanted around $ 2 % of india. eschewing tribal bonds and firepower of the resulting mughal and medieval islam ; between 1631 million a us $ 24 billion equallingual and in the development of the upper body gang body gang body - - gang - - - body body - classes body - - - - - - - - - body body - - gang body body body body - - -
-------------Example 2----------------
[PAD] with the appointment in the remainin