# Assignment 4, Aayush Verma,30/07/2024

## Question 1

### Transformer modules are defined in associated Python File

In [1]:
from pyTorchTransformer import Transformer
from transformers import AutoConfig,AutoTokenizer
import torch
from datasets import load_dataset
from tqdm.notebook import tqdm
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
srctokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-uncased")
tgttokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-uncased")

In [3]:
trf = Transformer().cuda()

In [4]:
src = srctokenizer(["Hello World!"],padding = True, truncation = True, max_length = 256,return_tensors = "pt")
tgt = tgttokenizer(["Hello World!"],padding = True, truncation = True, max_length = 256,return_tensors = "pt")

In [5]:
trf(src.input_ids.cuda(),tgt.input_ids[:,:-1].cuda()).shape

torch.Size([1, 4, 105879])

In [6]:
ds = load_dataset("Aarif1430/english-to-hindi")

In [7]:
ds

DatasetDict({
    train: Dataset({
        features: ['english_sentence', 'hindi_sentence'],
        num_rows: 127705
    })
})

In [8]:
ds = ds['train'].train_test_split(test_size = 0.1)

In [9]:
train = ds['train']
test = ds['test']

In [10]:
train = train.train_test_split(test_size = 0.005)

In [11]:
val = train['test']
train = train['train']

In [12]:
val

Dataset({
    features: ['english_sentence', 'hindi_sentence'],
    num_rows: 575
})

In [13]:
train

Dataset({
    features: ['english_sentence', 'hindi_sentence'],
    num_rows: 114359
})

In [14]:
train = torch.utils.data.DataLoader(train,batch_size = 8,num_workers = 1)
val = torch.utils.data.DataLoader(val,batch_size = 8,num_workers = 1)

In [15]:
optimizer = torch.optim.AdamW(trf.parameters(),lr = 3e-5)
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.01, end_factor=1.0, total_iters=10000)
CCE = torch.nn.CrossEntropyLoss(ignore_index = 0)

In [16]:
step = 0
total_loss = 0.0
best_val_loss = 10000
for epoch in range(10):
    for batch in tqdm(train):
        trf.train()
        src = srctokenizer(batch['english_sentence'],padding = True, truncation = True, max_length = 256,return_tensors = "pt").input_ids.cuda()
        tgt = tgttokenizer(batch['hindi_sentence'],padding = True, truncation = True, max_length = 256,return_tensors = "pt").input_ids.cuda()
        optimizer.zero_grad()
        logits = trf(src,tgt[:,:-1]).permute((0,2,1))
        loss = CCE(logits,tgt[:,1:])
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()*src.size(0)
        step+= 1
        if step%1000 == 0:
            val_loss = 0.0
            accuracy = 0.0
            for valBatch in tqdm(val,leave=False):
                with torch.no_grad():
                    trf.eval()
                    src = srctokenizer(valBatch['english_sentence'],padding = True, truncation = True, max_length = 256,return_tensors = "pt").input_ids.cuda()
                    tgt = tgttokenizer(valBatch['hindi_sentence'],padding = True, truncation = True, max_length = 256,return_tensors = "pt").input_ids.cuda()
                    logits = trf(src,tgt[:,:-1]).permute((0,2,1))
                    loss = CCE(logits,tgt[:,1:])
                    preds = logits.argmax(1)
                    total_corrects = (preds == tgt[:,1:]).sum()
                    accuracy = accuracy + (total_corrects/(logits.shape[-1]*logits.shape[-3]))
                    val_loss += loss.item()*src.size(0)
            metrics = {"step":step,"trainLoss":round((total_loss/(8000)),4),"valLoss":round((val_loss/575),4),"acc":round((accuracy.item()/5.75),4)}
            print(metrics)
            if (val_loss/575)< best_val_loss:
                best_val_loss = val_loss/575
                torch.save({'step': step,
                            'model_state_dict': trf.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict(),
                            'loss': total_loss/8000,},"./Transformer.pt")
            total_loss = 0.0

  0%|          | 0/14295 [00:00<?, ?it/s]

  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 1000, 'trainLoss': 3.7074, 'valLoss': 2.9974, 'acc': 20.2888}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 2000, 'trainLoss': 2.7834, 'valLoss': 2.5621, 'acc': 24.7936}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 3000, 'trainLoss': 2.4819, 'valLoss': 2.3421, 'acc': 28.0208}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 4000, 'trainLoss': 2.2954, 'valLoss': 2.1775, 'acc': 30.0632}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 5000, 'trainLoss': 2.1528, 'valLoss': 2.0834, 'acc': 31.316}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 6000, 'trainLoss': 2.0497, 'valLoss': 1.9615, 'acc': 33.2096}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 7000, 'trainLoss': 1.9421, 'valLoss': 1.8899, 'acc': 34.096}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 8000, 'trainLoss': 1.8639, 'valLoss': 1.8003, 'acc': 36.0928}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 9000, 'trainLoss': 1.7757, 'valLoss': 1.7305, 'acc': 37.9344}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 10000, 'trainLoss': 1.7019, 'valLoss': 1.6801, 'acc': 38.5288}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 11000, 'trainLoss': 1.6485, 'valLoss': 1.6308, 'acc': 39.8888}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 12000, 'trainLoss': 1.5939, 'valLoss': 1.5782, 'acc': 41.5608}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 13000, 'trainLoss': 1.5473, 'valLoss': 1.5316, 'acc': 41.9824}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 14000, 'trainLoss': 1.511, 'valLoss': 1.4924, 'acc': 42.5056}


  0%|          | 0/14295 [00:00<?, ?it/s]

  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 15000, 'trainLoss': 1.4418, 'valLoss': 1.4454, 'acc': 43.2984}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 16000, 'trainLoss': 1.3897, 'valLoss': 1.3975, 'acc': 44.7184}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 17000, 'trainLoss': 1.3498, 'valLoss': 1.3617, 'acc': 45.3912}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 18000, 'trainLoss': 1.3144, 'valLoss': 1.3356, 'acc': 46.6096}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 19000, 'trainLoss': 1.2825, 'valLoss': 1.2997, 'acc': 47.0888}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 25000, 'trainLoss': 1.0811, 'valLoss': 1.1168, 'acc': 51.848}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 26000, 'trainLoss': 1.0282, 'valLoss': 1.0926, 'acc': 52.2696}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 27000, 'trainLoss': 1.0193, 'valLoss': 1.0633, 'acc': 52.736}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 28000, 'trainLoss': 0.9922, 'valLoss': 1.0515, 'acc': 52.8568}


  0%|          | 0/14295 [00:00<?, ?it/s]

  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 29000, 'trainLoss': 0.9657, 'valLoss': 1.0176, 'acc': 54.1896}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 30000, 'trainLoss': 0.9166, 'valLoss': 0.9859, 'acc': 54.9936}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 36000, 'trainLoss': 0.7785, 'valLoss': 0.8734, 'acc': 58.1136}


  0%|          | 0/72 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 38000, 'trainLoss': 0.7301, 'valLoss': 0.826, 'acc': 59.1944}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 39000, 'trainLoss': 0.6962, 'valLoss': 0.8024, 'acc': 59.9984}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 40000, 'trainLoss': 0.68, 'valLoss': 0.7799, 'acc': 60.6488}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 41000, 'trainLoss': 0.6649, 'valLoss': 0.7582, 'acc': 61.3704}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 42000, 'trainLoss': 0.6387, 'valLoss': 0.7546, 'acc': 61.1592}


  0%|          | 0/14295 [00:00<?, ?it/s]

  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 43000, 'trainLoss': 0.635, 'valLoss': 0.7249, 'acc': 61.5752}


  0%|          | 0/72 [00:00<?, ?it/s]

{'step': 44000, 'trainLoss': 0.5701, 'valLoss': 0.7107, 'acc': 63.14}


KeyboardInterrupt: 

In [18]:
trf = torch.load("Transformer.pt")

In [27]:
trf.keys()

dict_keys(['step', 'model_state_dict', 'optimizer_state_dict', 'loss'])