In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
import json
import torch
import time
import pandas as pd
from pathlib import Path
from transformers import AdamW
from torch.utils.data import Dataset, IterableDataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt

# from vl_Build_model_network import *
from transformers import OpenAIGPTTokenizer, OpenAIGPTModel,OpenAIGPTLMHeadModel, AutoModelForCausalLM,AutoTokenizer
from vl_model_args import *

In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", cache_dir = 'Model')
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", cache_dir = 'Model')
# pad_token =  '[PAD]'
# sep_token =  '[SEP]'
# tokenizer.add_special_tokens({'additional_special_tokens': [pad_token]+[sep_token]})
# pad_token_id = tokenizer.convert_tokens_to_ids('[PAD]')
# sep_token_id = tokenizer.convert_tokens_to_ids('[SEP]')
special_tokens = {'pad_token':'<|pad|>','sep_token':'<|sep|>'}
num_add_toks = tokenizer.add_special_tokens(special_tokens)
# tokenizer.add_special_tokens({'Sep_token': '[SEP]'})

In [5]:
tokenizer.eos_token

'<|endoftext|>'

In [6]:
tokenizer.pad_token_id

50257

In [7]:
tokenizer.eos_token

'<|endoftext|>'

In [8]:
tokenizer

GPT2TokenizerFast(name_or_path='openai-community/gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'sep_token': '<|sep|>', 'pad_token': '<|pad|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50257: AddedToken("<|pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	50258: AddedToken("<|sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [9]:
tokenizer.sep_token, tokenizer(tokenizer.sep_token)

('<|sep|>', {'input_ids': [50258], 'attention_mask': [1]})

In [10]:
ignore_idx = tokenizer.pad_token_id


In [11]:
model.resize_token_embeddings(len(tokenizer))

Embedding(50259, 768)

In [12]:
token = tokenizer("Where did fortune cookies originate ?", return_tensors="pt")

In [13]:
outputs =  model.generate(input_ids = token['input_ids'], max_new_tokens = 50)
out = tokenizer.decode(outputs[0], skip_special_tokens = True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [14]:
print(out)

Where did fortune cookies originate?


In [15]:
input_ids = token['input_ids']
print(input_ids.size())
print(type(input_ids), input_ids.dtype)

torch.Size([1, 6])
<class 'torch.Tensor'> torch.int64


In [16]:
model(input_ids = token['input_ids'])

CausalLMOutputWithCrossAttentions(loss=None, logits=tensor([[[-3.2687e+01, -3.1960e+01, -3.5255e+01,  ..., -3.2099e+01,
           4.0421e-01,  4.1818e-01],
         [-8.7095e+01, -8.6464e+01, -8.9933e+01,  ..., -8.7396e+01,
           2.3679e+00, -1.8623e+00],
         [-9.2922e+01, -9.1611e+01, -9.6331e+01,  ..., -9.5655e+01,
           2.3855e+00,  7.8973e-01],
         [-1.0124e+02, -1.0095e+02, -1.0534e+02,  ..., -1.0455e+02,
           2.8520e+00, -1.4060e+00],
         [-6.6829e+01, -6.8585e+01, -7.1825e+01,  ..., -7.2043e+01,
           2.2912e-01, -1.1065e-02],
         [-1.1713e+02, -1.1552e+02, -1.1945e+02,  ..., -1.1490e+02,
           2.1203e+00,  2.4318e-01]]], grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[-1.1705,  1.9088,  0.5898,  ..., -1.4944, -0.6937,  1.4462],
          [-1.7728,  1.9535,  2.0315,  ..., -1.1755, -2.4116,  2.5686],
          [-1.8935,  1.2292,  1.5397,  ..., -2.6831, -2.0781,  0.5863],
          [-1.6944,  3.4057,  0.8116,  ..., -1.408

In [17]:
path_data = Path('Data/sqaud')

In [18]:
def Question_Answer(data_df):
    """
    Convert an ordered JSON object into a token sequence
    """
    Context = data_df['context']
    Question = data_df['question']
    Answer = data_df['answer']['text']
    Context_Que = f"{Context} {Question} {tokenizer.eos_token} {Answer} {tokenizer.eos_token}"
    Answer = f"{Context} {Question} {tokenizer.eos_token}"
    return Context_Que, Answer

In [19]:
def create_dataset(text_paths):
    sample = []
    for txt_file in os.listdir(text_paths):
        if txt_file.endswith('.json'):
            txt_path = text_paths.joinpath(txt_file)
            sample.append({'data_path':txt_path})
    return sample 

In [20]:
txt_path = create_dataset(path_data)

In [21]:
txt_path[0]

{'data_path': PosixPath('Data/sqaud/Squad_data_dev_12217.json')}

In [22]:
txt_path[0]['data_path']

PosixPath('Data/sqaud/Squad_data_dev_12217.json')

In [23]:
text_path = txt_path[0]['data_path']
with open(text_path, 'r') as f:
    json_text = json.load(f)       
text = Question_Answer(json_text)

In [24]:
# Index_length = []
# for idx in range(len(txt_path)):
#         ttxt_path = txt_path[idx]['data_path']
#         with open(ttxt_path, 'r') as f:
#             text = json.load(f)       
#         Context_Que, Ans = Question_Answer(text)
#         # print(Context_Que)
#         Ques_token = tokenizer(Context_Que, 
#                                       add_special_tokens=False,
#                                       max_length  = 512,
#                                       truncation=True,
#                                       padding='max_length',
#                                       return_tensors="pt")#
#         Ques_input_ids =   Ques_token['input_ids'].squeeze(0)   
#         Ans_token = tokenizer(Ans, 
#                                       add_special_tokens=False,
#                                       truncation=True,
#                                       return_tensors="pt")#
#         Ans_input_ids =   Ans_token['input_ids'].squeeze(0)  
#         labels = Ques_input_ids.clone()
#         labels[: torch.nonzero(labels == tokenizer.pad_token_id).sum()] = -100 
#         pad_tokenize = torch.nonzero(Ques_input_ids == tokenizer.eos_token)
#         if pad_tokenize.nelement() == 0:
#             continue
#         length = torch.nonzero(Ques_input_ids == tokenizer.eos_token).sum()
        
#         # print(length)
#         labels = Ques_input_ids[length:-1]
#         Index_length.append(len(Ques_input_ids))
#         # print(Ques_input_ids, labels)
#         # break

In [25]:
# max(Index_length)

In [26]:
# plt.hist(Index_length, bins=100)
# plt.show()

In [27]:
# for i in range(df.shape[0]):
#     dictionary = df.iloc[i].to_dict()
#     data_path = path_data.joinpath(f'Truth_Data{i}.json')
#     with open(data_path, 'w') as f:
#         json.dump(dictionary, f)
#     print(txt_path)

In [28]:
# with open(txt_path, 'r') as f:
#     text = json.load(f)       
# text = json2token(text)

In [29]:
len(txt_path)

107123

In [30]:
class FineTune_Dataset(Dataset):
    
    def __init__(self, sample, tokenizer, max_length=256):
        self.sample = sample
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.ignore_id = ModelArguments.pad_token
        self.num_list = []
        self.check()
    def check(self):
        for  idx in range(len(self.sample)):
            #print("index : ", idx)
            txt_path = self.sample[idx]['data_path']
            with open(txt_path, 'r') as f:
                text = json.load(f)       
            Context_Que_Ans, _ = Question_Answer(text)
            Context_Que_Ans_token = self.tokenizer(Context_Que_Ans, 
                                        add_special_tokens=False,
                                        max_length  = self.max_length,
                                        truncation=True,
                                        padding='max_length',
                                        return_tensors="pt")#
            Ques_input_ids =   Context_Que_Ans_token['input_ids'].squeeze(0)   
            #print("IDS : ",Ques_input_ids)                          
            eos_tokenize = torch.nonzero(Ques_input_ids == tokenizer.eos_token_id)
            #print(eos_tokenize, eos_tokenize.nelement())
            if eos_tokenize.nelement() == 2:
                self.num_list.append(txt_path)

        
    def __getitem__(self, idx):
        
        txt_path = self.sample[self.num_list[idx]]['data_path']
        with open(txt_path, 'r') as f:
            text = json.load(f)       
        Context_Que_Ans, Ans = Question_Answer(text)
        Context_Que_Ans_token = self.tokenizer(Context_Que_Ans, 
                                      add_special_tokens=False,
                                      max_length  = self.max_length,
                                      truncation=True,
                                      padding='max_length',
                                      return_tensors="pt")#
        Ques_input_ids =   Context_Que_Ans_token['input_ids'].squeeze(0)                  
        Ques_attention_ids = Context_Que_Ans_token['attention_mask'].squeeze(0)   
        eos_tokenize = torch.nonzero(Ques_input_ids == tokenizer.eos_token_id)

        labels = Ques_input_ids.clone()
        # length = eos_tokenize[0][-1]
        # length = torch.nonzero(Ques_input_ids == self.tokenizer.sep_token_id).sum()
        labels[labels == self.tokenizer.pad_token_id] = self.ignore_id
        
        data_batch = {"input_ids":Ques_input_ids.to(device),'attention_mask' : Ques_attention_ids.to(device), "labels":labels}

        return data_batch

    def __len__(self):
        return len(self.num_list)

In [31]:
# sample_train = txt_path[:1000]
# sample_test = txt_path[:100]
# train_dataset = FineTune_Dataset(sample_train,tokenizer, max_length = 64)

In [32]:
sample_train = txt_path[0:100000]
sample_test = txt_path[0:1000]
train_dataset = FineTune_Dataset(sample_train,tokenizer, max_length = 128)
val_dataset   = FineTune_Dataset(sample_test, tokenizer, max_length = 128)

In [33]:
my_list = train_dataset.num_list

In [34]:
filename = "my_list.txt"

# Open the file in write mode ('w')
with open(filename, 'w') as file:
    # Iterate through each element in the list
    for element in my_list:
        # Write each element to the file, followed by a newline character
        file.write(str(element) + '\n')

In [35]:
filename = "my_list.txt"

# Initialize an empty list to store the elements
my_list = []

# Open the file in read mode ('r')
with open(filename, 'r') as file:
    # Iterate through each line in the file
    for line in file:
        # Convert each line (which is a string) to an integer and append it to the list
        my_list.append(line.strip())

# print("List loaded from file:", my_list)

In [36]:
len(my_list)

18146

In [37]:
min()

TypeError: min expected at least 1 argument, got 0

In [None]:
filename = "my_list.txt"

# Open the file in write mode ('w')
with open(filename, 'w') as file:
    # Iterate through each element in the list
    for element in my_list:
        # Write each element to the file, followed by a newline character
        file.write(str(element) + '\n')

In [None]:
len(train_dataset), len(val_dataset)

In [None]:
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader   = DataLoader(val_dataset  , batch_size=1, shuffle=True)

In [None]:
for i,batch_data in enumerate(val_dataset):
    input_ids=batch_data['input_ids']
    print(batch_data)
    break

In [None]:
for i,batch_data in enumerate(val_loader):
    input_ids=batch_data['input_ids']
    print(batch_data)
    break
print(i)

In [None]:
# for  i,batch_data in enumerate(train_loader):
#     input_ids=batch_data['input_ids']
#     print(batch_data)
#     break
# print(i)

In [None]:
#  def accuracy_function(label, logits):
#         masked_active_acc_one = torch.ne(label.view(-1,),ModelArguments.pad_token)
#         masked_labels = torch.masked_select(label.view(-1), masked_active_acc_one)
#         masked_active_acc_one = torch.unsqueeze(masked_active_acc_one, 1)
#         masked_active_acc = masked_active_acc_one.repeat(1,logits.size(dim=2))
#         logits = logits.view(-1,logits.size(dim=2))
#         new_logits = logits[masked_active_acc]
#         new_logits = new_logits.view(-1, logits.size(-1))##len(tokenizer)) #len(tokenizer)
#         print(new_logits.size())
#         masked_pred = torch.argmax(new_logits, dim=-1)
#         print(masked_labels)
#         print(masked_labels.size())
#         result = (masked_pred == masked_labels).float().mean()
#         return result

In [None]:
# for  batch_data in val_loader:
#     input_ids=batch_data['input_ids']
#     print(input_ids.size())
#     print(type(input_ids), input_ids.dtype)
#     labels=batch_data['labels']
#     model = model.to(device)
#     lm_logits = model(input_ids=batch_data['input_ids'], attention_mask=batch_data['attention_mask'])['logits']
    
#     shift_logits = lm_logits[..., :-1, :].contiguous()
#     shift_labels = labels[..., 1:].contiguous()
#     print("shift_logits >> ",shift_logits.size())
#     print("shift_labels >> ",shift_labels.size())

#             # Flatten the tokens
#     loss_fct = torch.nn.CrossEntropyLoss()
#     loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
#     print(loss)
#     acc = accuracy_function(shift_labels, shift_logits)
#     print(acc)
#     break

In [None]:
class Build_model(torch.nn.Module):    
    def __init__(self, model):
            super(Build_model, self).__init__()
            self.model = model

    def loss_object(self, real, pred):

            loss = self.loss_fct(pred.view(-1, pred.size(-1)), real.view(-1))
            return loss 
    def accuracy_function(self,label, logits):
        masked_active_acc_one = torch.ne(label.view(-1,),ModelArguments.pad_token)
        masked_labels = torch.masked_select(label.view(-1), masked_active_acc_one)
        masked_active_acc_one = torch.unsqueeze(masked_active_acc_one, 1)
        masked_active_acc = masked_active_acc_one.repeat(1,logits.size(dim=2))
        logits = logits.view(-1,logits.size(dim=2))
        new_logits = logits[masked_active_acc]
        new_logits = new_logits.view(-1, logits.size(-1))##len(tokenizer)) #len(tokenizer)
        masked_pred = torch.argmax(new_logits, dim=-1)
        result = (masked_pred == masked_labels).float().mean()
        return result
        
 
    def compiler(self, optimizer, loss_fct):
        
            self.optimizer = optimizer
            self.loss_fct = loss_fct
        
    def calculate(self,
            input_ids: Optional[torch.LongTensor] = None,
            attention_mask: Optional[torch.LongTensor] = None,
            labels: Optional[torch.LongTensor] = None):
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = logits['logits']
        new_logits = logits[..., :-1, :].contiguous()
        new_labels = labels[..., 1:].contiguous()
        loss = self.loss_object(new_labels, new_logits)
        accuracy = self.accuracy_function(new_labels, new_logits)
        return loss, accuracy             
            

    def fit(self,train_loader,val_loader,epochs):
        history = {
        "epoch": [],
        "loss": [],
        "Accuracy" :[],
        "val_loss" :[],
        "val_Accuracy" :[]
        }
        print("Training Started ........ ")
        for epoch in range(epochs):
            start = time.time()
                    
            total_loss = 0
            val_loss = 0
            total_accuracy = 0
            val_accuracy = 0            
            print(f"Epoch : {epoch +1}\n")
            batch =0 
            start_iter_train = time.time()
            for batch_data in train_loader:
                    input_ids=batch_data['input_ids']
                    attention_mask = batch_data['attention_mask']
                    try:
                        labels=batch_data['labels']
                    except KeyError as ke:
                        labels = None
                    self.model.train()  # make sure we are in .train() mode    

                    loss, acc = self.calculate(input_ids,attention_mask, labels)
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                    if (batch+1) % 100 == 0:    
                        # self.save_weights(ModelArguments.fine_tuned_weights)
                        print(f"Train iter : {batch+1} ||  accuracy : {acc.detach().cpu().numpy():.4f} || loss : {loss.detach().cpu().numpy():.4f} || time : {time.time() - start_iter_train:.2f} secs")  
                    total_accuracy = total_accuracy+acc.detach().cpu().numpy() 
                    total_loss = total_loss + loss.detach().cpu().numpy()
                    batch=batch+1

            print(f'\nTotal Loss: {total_loss/(batch):.4f} | Accuracy : {total_accuracy/(batch):.4f}\n')
            with torch.no_grad():
                    batc =0 
                    start_iter_val = time.time()
                    for val_data in val_loader:
                            input_ids=val_data['input_ids']
                            attention_mask = val_data['attention_mask']
                            try:
                                labels = val_data['labels']
                            except KeyError as ke:
                                labels = None
                            self.model.eval()  # make sure we are in .eval() mode
                            loss, acc = self.calculate(input_ids,attention_mask, labels)
                            if (batc+1) % 100 == 0:    
                                print(f"Validation iter : {batc+1}  ||  accuracy : {acc.detach().cpu().numpy():.4f} || loss : {loss.detach().cpu().numpy():.4f} || time : {time.time() - start_iter_val:.2f} secs")  
                            val_accuracy = val_accuracy+acc.detach().cpu().numpy() 
                            val_loss = val_loss + loss.detach().cpu().numpy()
                            batc=batc+1


            print(f'\nTotal Validation Loss : {val_loss/(batc):.4f} | Validation Accuracy : {val_accuracy/(batc):.4f}') 
            print(f'\nTime taken for 1 epoch : {time.time() - start:.2f} secs\n')  
        return history 
    def save_weights(self, path):
        torch.save(self.model.state_dict(), path)
    def load_weights(self, path):
        self.model.load_state_dict(torch.load(path))
    def predict(self, inp, MAX_LEN=10):
        with torch.no_grad():
                    input_ids=inp['input_ids']
                    self.model.eval()
                    outputs =  model.generate(input_ids = input_ids, max_new_tokens = MAX_LEN)
                    out = tokenizer.decode(outputs[0], skip_special_tokens = True)
                    # output =  self.model(input_ids=input_ids)
        return out

In [None]:
loss_fct = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=ModelArguments.INIT_LR)      
        
model = model.to(device)
GPTmodel = Build_model(model)
GPTmodel.compiler(optimizer, loss_fct)
# GPTmodel.load_weights(ModelArguments.fine_tuned_weights)

In [None]:
print("##### Model Start training .....")
history = GPTmodel.fit(train_loader,val_loader, epochs=ModelArguments.epochs)

In [None]:
token = tokenizer("Where did fortune cookies originate?", return_tensors="pt")
token = {key:value.to(device) for key, value in token.items()}

out = GPTmodel.predict(token, MAX_LEN=50)
print(out)

In [None]:
# GPTmodel.save_weights(ModelArguments.fine_tuned_weights)

In [None]:
min()

In [None]:
GPTmodel_Test = Build_model(model)
GPTmodel_Test.load_weights(ModelArguments.fine_tuned_weights)

In [None]:
token = tokenizer("What is insurance", return_tensors="pt")
out = GPTmodel_Test.predict(token, MAX_LEN=50)
print(out)

In [None]:
min()

In [None]:
tokenizer.pad_token_id

In [None]:
input = torch.randn(10)
input

In [None]:
padding = (0,4)

In [None]:
torch.nn.functional.pad(input, padding, mode = "constant", value = 0.0)

In [None]:
m(input)

In [None]:
# setup GPU/CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# move model over to detected device
# model.to(device)
# # activate training mode of model
# model.train()
# initialize adam optimizer with weight decay (reduces chance of overfitting)
optim = AdamW(model.parameters(), lr=1e-5)
#optim = torch.optim.Adam(model.parameters(), lr=1e-5)

# initialize data loader for training data
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [None]:
# for batch in train_loader:
#     print(batch)

In [None]:
class Build_model(torch.nn.Module):    
    def __init__(self, model):
            super(Build_model, self).__init__()
            self.model = model

    def loss_object(self, real, pred):

            loss = self.loss_fct(pred.view(-1, pred.size(-1)), real.view(-1))
            return loss 
    def accuracy_function(self,label, logits):
        masked_active_acc_one = torch.ne(label.view(-1,),ModelArguments.pad_token)
        masked_labels = torch.masked_select(label.view(-1), masked_active_acc_one)
        masked_active_acc_one = torch.unsqueeze(masked_active_acc_one, 1)
        masked_active_acc = masked_active_acc_one.repeat(1,logits.size(dim=2))
        logits = logits.view(-1,logits.size(dim=2))
        new_logits = logits[masked_active_acc]
        new_logits = new_logits.view(-1, logits.size(-1))##len(tokenizer)) #len(tokenizer)
        masked_pred = torch.argmax(new_logits, dim=-1)
        result = (masked_pred == masked_labels).float().mean()
        return result
        
 
    def compiler(self, optimizer, loss_fct):
        
            self.optimizer = optimizer
            self.loss_fct = loss_fct
        
    def calculate(self,
            input_ids: Optional[torch.LongTensor] = None,
            labels: Optional[torch.LongTensor] = None):

        logits = self.model(input_ids=input_ids)['logits']
        if labels == None:
                labels = input_ids_x.clone()
                labels[labels == tokenizer.pad_token_id] = self.ignore_id
                new_logits = logits[..., :-1, :].contiguous()
                new_labels = label[..., 1:].contiguous()
                loss = self.loss_object(new_labels, new_logits)
                accuracy = self.accuracy_function(new_labels, new_logits)
        else :
            
                loss = self.loss_object(labels, logits)
                accuracy = self.accuracy_function(labels, logits)
        return loss, accuracy             
            

    def fit(self,train_dataset,val_dataset,epochs):
        history = {
        "epoch": [],
        "loss": [],
        "Accuracy" :[],
        "val_loss" :[],
        "val_Accuracy" :[]
        }
        print("Training Started ........ ")
        for epoch in range(epochs):
            start = time.time()
                    
            total_loss = 0
            val_loss = 0
            total_accuracy = 0
            val_accuracy = 0            
            print(f"Epoch : {epoch +1}\n")
            batch =0 
            start_iter_train = time.time()
            for batch_data in train_dataset:
                    input_ids=batch_data['input_ids']
                    labels=batch_data['labels']
                    self.model.train()  # make sure we are in .train() mode    

                    loss, acc = self.calculate(input_ids, labels)
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                    if (batch+1) % 1 == 0:    
                        # self.save_weights(ModelArguments.fine_tuned_weights)
                        print(f"Train iter : {batch+1} ||  accuracy : {acc.detach().cpu().numpy():.4f} || loss : {loss.detach().cpu().numpy():.4f} || time : {time.time() - start_iter_train:.2f} secs")  
                    total_accuracy = total_accuracy+acc.detach().cpu().numpy() 
                    total_loss = total_loss + loss.detach().cpu().numpy()
                    batch=batch+1

            print(f'\nTotal Loss: {total_loss/(batch):.4f} | Accuracy : {total_accuracy/(batch):.4f}\n')
            with torch.no_grad():
                    batc =0 
                    start_iter_val = time.time()
                    for val_data in val_dataset:
                            input_ids=val_data['input_ids']
                            labels = val_data['labels']
                            self.model.eval()  # make sure we are in .eval() mode
                            loss, acc = self.calculate(input_ids, labels)
                            if (batc+1) % 1 == 0:    
                                print(f"Validation iter : {batc+1}  ||  accuracy : {acc.detach().cpu().numpy():.4f} || loss : {loss.detach().cpu().numpy():.4f} || time : {time.time() - start_iter_val:.2f} secs")  
                            val_accuracy = val_accuracy+acc.detach().cpu().numpy() 
                            val_loss = val_loss + loss.detach().cpu().numpy()
                            batc=batc+1


            print(f'\nTotal Validation Loss : {val_loss/(batc):.4f} | Validation Accuracy : {val_accuracy/(batc):.4f}') 
            print(f'\nTime taken for 1 epoch : {time.time() - start:.2f} secs\n')  
        return history 
    def save_weights(self, path):
        torch.save(self.model.state_dict(), path)
    def load_weights(self, path):
        self.model.load_state_dict(torch.load(path))
    def predict(self, inp, MAX_LEN=10):
        with torch.no_grad():
                    input_ids=inp['input_ids']
                    self.model.eval()
                    outputs =  model.generate(input_ids = input_ids, max_new_tokens = MAX_LEN)
                    out = tokenizer.decode(outputs[0], skip_special_tokens = True)
                    # output =  self.model(input_ids=input_ids)
        return out

In [None]:
class Build_model(torch.nn.Module):    
    def __init__(self, model):
            super(Build_model, self).__init__()
            self.model = model

    def loss_object(self, real, pred):

            loss = self.loss_fct(pred.view(-1, pred.size(-1)), real.view(-1))
            return loss 
    def accuracy_function(self,label, logits):
        masked_active_acc_one = torch.ne(label.view(-1,),ModelArguments.pad_token)
        masked_labels = torch.masked_select(label.view(-1), masked_active_acc_one)
        masked_active_acc_one = torch.unsqueeze(masked_active_acc_one, 1)
        masked_active_acc = masked_active_acc_one.repeat(1,logits.size(dim=2))
        logits = logits.view(-1,logits.size(dim=2))
        new_logits = logits[masked_active_acc]
        new_logits = new_logits.view(-1, logits.size(-1))##len(tokenizer)) #len(tokenizer)
        masked_pred = torch.argmax(new_logits, dim=-1)
        result = (masked_pred == masked_labels).float().mean()
        return result
        
 
    def compiler(self, optimizer, loss_fct):
        
            self.optimizer = optimizer
            self.loss_fct = loss_fct

    def fit(self,train_dataset,val_dataset,epochs):
        history = {
        "epoch": [],
        "loss": [],
        "Accuracy" :[],
        "val_loss" :[],
        "val_Accuracy" :[]
        }
        print("Training Started ........ ")
        for epoch in range(epochs):
            start = time.time()
                    
            total_loss = 0
            val_loss = 0
            total_accuracy = 0
            val_accuracy = 0            
            print(f"Epoch : {epoch +1}\n")
            batch =0 
            start_iter_train = time.time()
            for batch_data in train_dataset:
                    input_ids=batch_data['input_ids']
                    label=batch_data['labels']
                    self.model.train()  # make sure we are in .train() mode                
                    logits = self.model(input_ids=input_ids)['logits']
                    new_logits = logits[..., :-1, :].contiguous()
                    new_labels = label[..., 1:].contiguous()
                    loss = self.loss_object(new_labels, new_logits)

                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                    acc = self.accuracy_function(new_labels, new_logits)
                    if (batch+1) % 1 == 0:    
                        print(f"Train iter : {batch+1} ||  accuracy : {acc.detach().cpu().numpy():.4f} || loss : {loss.detach().cpu().numpy():.4f} || time : {time.time() - start_iter_train:.2f} secs")  
                    total_accuracy = total_accuracy+acc.detach().cpu().numpy() 
                    total_loss = total_loss + loss.detach().cpu().numpy()
                    batch=batch+1

            print(f'\nTotal Loss: {total_loss/(batch):.4f} | Accuracy : {total_accuracy/(batch):.4f}\n')
            with torch.no_grad():
                    batc =0 
                    start_iter_val = time.time()
                    for val_data in val_dataset:
                            input_ids=val_data['input_ids']
                            label = val_data['labels']
                            self.model.eval()  # make sure we are in .eval() mode
                            logits = self.model(input_ids=input_ids)['logits']
                            shift_logits = logits[..., :-1, :].contiguous()
                            shift_labels = label[..., 1:].contiguous()
                        
                            loss = self.loss_object(shift_labels, shift_logits)
                            acc = self.accuracy_function(shift_labels, shift_logits)
                            if (batc+1) % 1 == 0:    
                                print(f"Validation iter : {batc+1}  ||  accuracy : {acc.detach().cpu().numpy():.4f} || loss : {loss.detach().cpu().numpy():.4f} || time : {time.time() - start_iter_val:.2f} secs")  
                            val_accuracy = val_accuracy+acc.detach().cpu().numpy() 
                            val_loss = val_loss + loss.detach().cpu().numpy()
                            batc=batc+1


            print(f'\nTotal Validation Loss : {val_loss/(batc):.4f} | Validation Accuracy : {val_accuracy/(batc):.4f}') 
            print(f'\nTime taken for 1 epoch : {time.time() - start:.2f} secs\n')  
        return history 
    def save_weights(self, path):
        torch.save(self.model.state_dict(), path)
    def load_weights(self, path):
        self.model.load_state_dict(torch.load(path))
    def predict(self, inp, MAX_LEN=10):
        with torch.no_grad():
                    input_ids=inp['input_ids']
                    self.model.eval()
                    outputs =  model.generate(input_ids = input_ids, max_new_tokens = MAX_LEN)
                    out = tokenizer.decode(outputs[0], skip_special_tokens = True)
                    # output =  self.model(input_ids=input_ids)
        return out