In [3]:
import torch
import torch.nn as nn
import math

In [4]:
# improting pandas for dataframe difference operation
import pandas as pd
# importing torch.nn.functional module for call method of softmax
import torch.nn.functional as F
# import pad_sequence for make same length of all sentence tokens during batching
from torch.nn.utils.rnn import pad_sequence
# import autotokenizer for download pretrained tokenizer, and automodel for download pretrained embed model
from transformers import AutoTokenizer, AutoModel
# import dataset and dataloader for making custom dataset with addition operations , loader for shuffle , batching
from torch.utils.data import DataLoader, Dataset 
# import torch for tensor operations or used for different module
import torch
# import nn for different deeplearning model like lstm ,linear etc
import torch.nn as nn
# import tqdm for progress bar
from tqdm import tqdm
# importing json for load and dump json file
import json
# importing csv file for creating file for logging report summary
import csv

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# check cuda is available or not 
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [6]:
class SelfAttention(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        self.query_w = nn.Linear(embed_dim, embed_dim)
        self.key_w = nn.Linear(embed_dim, embed_dim)
        self.value_w = nn.Linear(embed_dim, embed_dim)
        self.softmax = nn.Softmax(dim=-1)  
    def forward(self, embed):
        query = self.query_w(embed)
        key = self.key_w(embed)
        value = self.value_w(embed)
        scores = torch.matmul(query, key.transpose(-2, -1)) / (key.shape[-1] ** 0.5)
        attn_weights = self.softmax(scores)
        attended = torch.matmul(attn_weights, value)
        return attended

In [7]:
class MultiheadAttention(nn.Module):
    def __init__(self, num_heads, embed_dim):
        super().__init__()
        self.num_heads = num_heads
        self.embed_dim = embed_dim

        self.multi_head_attn = nn.ModuleList([
            SelfAttention(embed_dim) for _ in range(num_heads)
        ])
        self.W = nn.Linear(num_heads * embed_dim, embed_dim)

    def forward(self, embed):
        heads = [head(embed) for head in self.multi_head_attn]
        heads_cat = torch.cat(heads, dim=-1)
        output = self.W(heads_cat)
        return output


        

In [8]:
class LayerNormalization(nn.Module):
    def __init__(self, embed_dim, eps=1e-5):
        super(LayerNormalization, self).__init__()
        self.alpha = nn.Parameter(torch.ones(embed_dim))  
        self.beta = nn.Parameter(torch.zeros(embed_dim))  
        self.eps = eps

    def forward(self, embed):
        mean = embed.mean(dim=-1, keepdim=True)
        var = embed.var(dim=-1, keepdim=True, unbiased=False)
        normalized = (embed - mean) / torch.sqrt(var + self.eps)

        return self.alpha * normalized + self.beta


In [9]:
class FeedForward(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        self.W1 = nn.Linear(embed_dim, embed_dim)
        self.W2 = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(0.2)

    def forward(self, embed):
        x = self.W1(embed)
        x = self.dropout(x)
        x = self.W2(x)  
        return x


In [11]:
class Encoder(nn.Module):
    def __init__(self, num_heads, embed_dim):
        super().__init__()  
        self.multiheadattention = MultiheadAttention(num_heads, embed_dim)
        self.layernorm1 = nn.LayerNorm(embed_dim)  
        self.feedforward = FeedForward(embed_dim)
        self.layernorm2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        atten_x = self.multiheadattention(x)
        x = self.layernorm1(atten_x + x)
        ff_out = self.feedforward(x)
        x = self.layernorm2(ff_out + x)
        return x

In [12]:
class StackEncoder(nn.Module):
    def __init__(self, num_heads, embed_dim):
        super().__init__()
        self.encoders = nn.Sequential(
            *[Encoder(num_heads, embed_dim) for _ in range(6)]
        )
    
    def forward(self, x):
        return self.encoders(x)

In [14]:
def mask_mat(len_seq):
    mask_mat = torch.zeros(len_seq, len_seq)
    for i in range(len_seq):
        for j in range(len_seq):
            if i < j:
                mask_mat[i][j] = float("-inf")
    return mask_mat

In [15]:
class MaskAttention(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        self.q_w = nn.Linear(embed_dim, embed_dim)
        self.k_w = nn.Linear(embed_dim, embed_dim)
        self.v_w = nn.Linear(embed_dim, embed_dim) 
        self.softmax = nn.Softmax(dim=-1)  

    def forward(self, embed, mask_mat=None):
        query = self.q_w(embed)
        key = self.k_w(embed)
        value = self.v_w(embed)
        atten_score = torch.matmul(query, key.transpose(-2, -1)) / (key.shape[-1] ** 0.5)
         
        if mask_mat is not None:
            atten_score = atten_score + mask_mat  
            
        mask_atten_weight = self.softmax(atten_score)
        attended = torch.matmul(mask_atten_weight, value)
        return attended


In [16]:
class MaskMultiheadAttention(nn.Module):
    def __init__(self, num_heads, embed_dim):
        super().__init__()
        self.num_heads = num_heads
        self.embed_dim = embed_dim

        self.multi_head_attn = nn.ModuleList([
            MaskAttention(embed_dim) for _ in range(num_heads)
        ])
        self.W = nn.Linear(num_heads * embed_dim, embed_dim)

    def forward(self, embed, mask_mat):
        heads = [head(embed, mask_mat) for head in self.multi_head_attn]
        heads_cat = torch.cat(heads, dim=-1)
        output = self.W(heads_cat)
        return output


In [None]:
class CrossAttention(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        self.q_w = nn.Linear(embed_dim, embed_dim)
        self.k_w = nn.Linear(embed_dim, embed_dim)
        self.v_w = nn.Linear(embed_dim, embed_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, y):

        query = self.q_w(y)
        key = self.k_w(x)
        value = self.v_w(x)
        atten_score = torch.matmul(query , key.transpose(-2,-1)) / (key.shape[-1] ** 0.5)
        atten_weight = self.softmax(atten_score)
        attention = torch.matmul(atten_weight, value)
        return attention

In [18]:
class MultiheadCrossAttention(nn.Module):
    def __init__(self,embed_dim, num_head):
        super().__init__()
        self.atten_list = nn.ModuleList([CrossAttention(embed_dim) for _ in range(num_head)])
        self.W = nn.Linear(embed_dim * num_head, embed_dim)

    def forward(self, x, y):
        heads = [ head(x,y) for head in self.atten_list]
        heads_cat = torch.cat(heads, dim=-1)
        out = self.W(heads_cat)
        return out

In [204]:
class Decoder(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.mask_attention = MaskMultiheadAttention(num_heads, embed_dim)
        self.layer_norm_1 = LayerNormalization(embed_dim)
        self.cross_attention = MultiheadCrossAttention(embed_dim, num_heads)
        self.layer_norm_2 = LayerNormalization(embed_dim)
        self.feed_forward = FeedForward(embed_dim)
        self.layer_norm_3 = LayerNormalization(embed_dim)

    def forward(self, x, y, mask_mat):
        mask_atten = self.mask_attention(y, mask_mat)
        y_norm = self.layer_norm_1(mask_atten + y)
        cross_atten = self.cross_attention(x, y_norm)
        cross_norm = self.layer_norm_2(cross_atten + y)
        ff_out = self.feed_forward(cross_norm)
        out_norm = self.layer_norm_3(ff_out + cross_norm)
        return out_norm


In [205]:
class DecoderStack(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.decoders = nn.ModuleList([
            Decoder(embed_dim, num_heads) for _ in range(6)
        ])

    def forward(self, x, y, mask_mat):
        for decoder in self.decoders:
            y = decoder(x, y, mask_mat)
        return y


In [201]:
def even_position(p, i, dim):
    return math.sin(p / (10000 ** ((2 * i) / dim)))

def odd_position(p, i, dim):
    return math.cos(p / (10000 ** ((2 * i) / dim)))

def positional_encoding(tokens_len, embed_dim):
    positional_encodings = []
    for p in range(tokens_len):
        token_position = []
        for i in range(embed_dim):
            if i % 2 == 0:
                token_position.append(even_position(p, i, embed_dim))
            else:
                token_position.append(odd_position(p, i, embed_dim))
        positional_encodings.append(torch.tensor(token_position))
    return torch.stack(positional_encodings)


In [206]:
class TransformerBlock(nn.Module):
    def __init__(self, num_heads, embed_dim, vocab_size):
        super().__init__()
        self.encoder = StackEncoder(num_heads, embed_dim)
        self.decoders = DecoderStack(embed_dim, num_heads)
        self.linear = nn.Linear(embed_dim, vocab_size)
        
    def forward(self, x, y, mask_mat, postional_encoding_x, pos_encod_y):
        x = x + postional_encoding_x
        y = y + pos_encod_y
        x = self.encoder(x)
        y = self.decoders(x,y, mask_mat)
        out_logits = self.linear(y)
        return out_logits, y

In [None]:
class TransformerKBAttention(nn.Module):
    def __init__(self, num_heads, embed_dim, vocab_size):
        super().__init__()
        self.transformers = TransformerBlock(num_heads, embed_dim, vocab_size)
        self.vocab_size = vocab_size
        self.W_kb1 = nn.Linear(embed_dim, embed_dim)

    def kb_attention(self, decoder_hidden, kb_keys, new2old):
        """
        decoder_hidden: [B, T, D]
        kb_keys: [B, K, D]
        new2old: list of length K, mapping from kb_key idx → vocab idx
        """
        B, T, D = decoder_hidden.shape
        K = kb_keys.shape[1]

        # Project decoder hidden states
        x = torch.tanh(self.W_kb1(decoder_hidden))         # [B, T, D]

        # Normalize for cosine similarity
        x_norm = F.normalize(x, p=2, dim=-1)               # [B, T, D]
        kb_key_norm = F.normalize(kb_keys, p=2, dim=-1)    # [B, K, D]

        # Compute cosine similarity between each decoder token and each KB key
        # [B, T, K]
        cosine_sim = torch.matmul(x_norm, kb_key_norm.transpose(1, 2))  


        # Initialize attention tensor: [B, T, vocab_size]
        kb_attention = torch.zeros(B, T, self.vocab_size, device=decoder_hidden.device)

        # Loop over each KB key index
        for k_idx, vocab_idx in new2old.items():
            kb_attention[:, :, vocab_idx] = cosine_sim[:, :, k_idx]
            
        return kb_attention

    
    def forward(self,  x, y, mask_mat, positional_encod_x, pos_encod_y, new2old, kb_keys):
        vocab_logit, dec_out = self.transformers(x,y, mask_mat, positional_encod_x, pos_encod_y)
        kb_atten = self.kb_attention(dec_out, kb_keys=kb_keys, new2old=new2old)
        logits = vocab_logit + kb_atten
        return logits



In [25]:
# text preprocess
def preprocess_data(text):
    # lowering the text
    text = text.lower()
    # replace "?" with ''
    text = text.replace('?','')
    # replace "'" with ''
    text = text.replace("'","")
    # replace "," with ''
    text = text.replace(","," ")
    # replace "1)" with ''
    text = text.replace("1)"," ")
    # replace "2)" with ''
    text = text.replace("2)"," ")
    # replace "3)" with ''
    text = text.replace("3)"," ")
    # replace "4)" with ''
    text = text.replace("4)"," ")
    # replace "." with ''
    text = text.replace("."," ")
    # strip replace white space from forward and backward
    text = text.strip()
    # return preprcess text
    return text

In [26]:
# download pretrained tokenizer qwen model form hugging face
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-Embedding-0.6B', padding_side='left')
# download pretrained embedding qwen model form hugging face
embed_model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B')

In [27]:
class CustomDataset(Dataset):
    # initilze data and maxlength of tokens
    def __init__(self, data):
        self.data = data
        self.max_length = 8192

    def __len__(self):
        # here retun len of data
        return len(self.data)
    
    def __getitem__(self, index):
        # get one by one 'patent query' from data stored in questions
        question = self.data.iloc[index]['Patient query']
        # get one by 'doctor response' from data stored in questions
        answer = self.data.iloc[index]['Doctor response']
        
        # get token id from questin text , also set return tensor 'pt'
        question_ids = tokenizer(
            question,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )['input_ids'][0] 
        
        # get token id from anwer  text , also set return tensor 'pt'
        answer_ids = tokenizer(
            answer,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )['input_ids'][0]
    
        return question_ids, answer_ids


In [28]:
# open appointmenst_data in read mode
with open('../data/appointments_data.json', 'r') as f:
    # load that json data into key value data
    key_value_data = json.load(f)

In [29]:
# open qa pair.json in read mode
with open('../data/qa_pairs.json' , 'r') as f:
    # load json data into db_data
    db_data = json.load(f)

In [30]:
# preprocess db data q/a and stored in patient and assistant
patient , assistant = [ preprocess_data(i['question']) for i in db_data['Q&A'] ], [ preprocess_data(i['answer']) for i in db_data['Q&A'] ]

In [31]:
# initialize new key value data for stored preprocessed key value data
new_key_value_data = {}
# iterating kv data through items functions to get key value 
for key, value in key_value_data.items():
    # preprocess key 
    new_key = preprocess_data(key)
    # preprocess value
    new_value = preprocess_data(value)
    # add data into new key value data
    new_key_value_data[new_key] = new_value


In [32]:
# initialize mapping functions to map values with key for canonical tokens
def replace_values_with_keys(texts, mapping):
    # enumeratiing text list to get index and values
    for i, text in enumerate(texts):
        # from mapping data to get key an vlues
        for key, val in mapping.items():
            # split key by '_' and take only first text
            name = key.split('_')[0]
            # check name is present in text or not and also val present in text or not if both condition true then only enter inside if condition
            if name in text and val in text:
                # condition true , replace val with key
                texts[i] = text.replace(val, key)
                # break 
                break
    return texts


In [33]:
# call replace values with keys with assistan and newkeyvalue data parameters
assistant = replace_values_with_keys(assistant, new_key_value_data)

In [34]:
# use list comprehension to get all keys from dict
all_keys = [ i for i in new_key_value_data.keys()]

In [35]:
# add all keys to tokens
tokenizer.add_tokens(all_keys)

492

In [36]:
# resize token token embedding after increase len of tokenizer
embed_model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(152161, 1024)

In [37]:
# initialize set for storing used token ids 
used_token_ids = set()

# iterating all keys
for target_text in all_keys:
    # encode token to tokens id with add special tokens false
    tokens = tokenizer.encode(target_text, add_special_tokens=False)
    # update the used token ids 
    used_token_ids.update(tokens)

# sort the used token ids by its token ids
token_id_list = sorted(list(used_token_ids))

# used dict comprehession , apply enumerate to get idx and values 
# here assign value as key and idx as value for old to new 
old2new = {old: new for new, old in enumerate(token_id_list)}
# here also use dict comprehession for exchange key to value and value to key.
new2old = {v: k for k, v in old2new.items()}

In [38]:
# make dictionery db_data_dict having key "Patient query" and "Doctor response" with list of vlaues patient and doctor
db_data_dict = {'Patient query': patient, 'Doctor response': assistant}
print(len(patient))
# create data frame db_data_dict
db_df = pd.DataFrame(db_data_dict)
# call head method of df, to get five data rows
db_df.head()

4100


Unnamed: 0,Patient query,Doctor response
0,what time is benjamins appointment,the appointment time of benjamin is benjamin_a...
1,who is michaels doctor,the doctor for michael is michael_doctor
2,what is the contact number for kenneth,the contact number for kenneth is kenneth_contact
3,what time is blakes appointment,the appointment time of blake is blake_appoint...
4,where is brenda having their appointment,brenda is having their appointment at brenda_h...


In [39]:
# apply prepocess text on patient query and doctor response
db_df['Patient query'] = db_df['Patient query'].apply(preprocess_data)
db_df['Doctor response'] = db_df['Doctor response'].apply(preprocess_data)

In [292]:
dataset_db = CustomDataset(db_df)

In [3]:
# split data into train, val sets
train_set, val_set = torch.utils.data.random_split(dataset_db, [3600, 496])

NameError: name 'torch' is not defined

In [2]:
8 * 62

496

In [294]:
def collate_fn(batch):
    questions, answers = zip(*batch) 

    padded_questions = pad_sequence(questions, batch_first=True, padding_value=tokenizer.pad_token_id)
    padded_answers = pad_sequence(answers, batch_first=True, padding_value=tokenizer.pad_token_id)

    return padded_questions, padded_answers

In [295]:
train_loader_db = DataLoader(train_set , batch_size=8, shuffle=True, collate_fn=collate_fn)
val_loader_db = DataLoader(val_set , batch_size=8, shuffle=True, collate_fn=collate_fn)

In [296]:
max = 0
for x, y in train_loader_db:
    x = x
    y = y
    break


In [297]:
# load embed model into device , it may be cuda or cpu
embed_model = embed_model.to(device)

In [298]:
# initialize key embed list
keys_embed = []
# iterate all keys
for key in all_keys:
    # tokenize each key to get token id
    key = tokenizer(key, return_tensors='pt')['input_ids'][:,:-1]
    # embed tokenize key id
    key_embed = embed_model(key.to(device), )['last_hidden_state'][0]
    # append key embed to keys_embed list
    keys_embed.append(key_embed)

# stack keys_embed list on dim=0 , apply squeeze to decrease dimension on axis=2, 
# detach for autograd false
keys_embed = torch.stack(keys_embed, dim=0).squeeze(1).detach()

In [299]:
keys_embed.shape

torch.Size([492, 1024])

In [300]:
keys_embed.shape

torch.Size([492, 1024])

In [301]:
# tokenize start tokens
start_token = tokenizer('<|im_start|>', return_tensors='pt')['input_ids'][:,0].unsqueeze(0).to(device)

In [302]:
pos_encoding = positional_encoding(15, 1024).to(device)

In [303]:
mask_matrix = mask_mat(15).to(device)

In [189]:
for x, y in train_loader_db:
    B, T = x.shape
    B_y, T_y = y.shape
    pos_encod_x = pos_encoding[:T].expand(B, -1, -1)
    pos_encod_y = pos_encoding[:T_y+1].expand(B_y, -1, -1)
    mask_y = mask_matrix[:T_y+1, :T_y+1].expand(B_y, -1, -1)
    y = y.to(device)
    start_token_expanded = start_token.expand(y.size(0), 1)
    # concat sos token in target tokens
    target_with_sos = torch.cat([start_token_expanded, y], dim=1)
    with torch.no_grad():
            input_embeds = embed_model(x.to(device))['last_hidden_state'].detach()
            target_embeds = [embed_model(i.unsqueeze(0).unsqueeze(0))['last_hidden_state'].detach()[0][0]  for target_tokens in target_with_sos for i in target_tokens]
            target_embeds = torch.stack(target_embeds)
            print(target_embeds.shape)
            target_embeds = target_embeds.reshape(B_y, T_y+1, 1024)
            
    break


torch.Size([88, 1024])


In [162]:
target_embeds.shape

torch.Size([8, 11, 1024])

In [304]:
vocab_size = len(tokenizer)
model = TransformerKBAttention(num_heads=8, embed_dim=1024, vocab_size=vocab_size).to(device)

In [305]:
# set learning rate 
learning_rate = 0.001
# set epochs
epochs = 25
# call cross entory loss
criterion = nn.CrossEntropyLoss()
# initialize optimizer adam with learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [291]:
3600/4

900.0

In [306]:
# initilize train loss
train_losses = []
# initialize train acc
train_accuracies = []
# initialize val loss
val_losses = []
# initialize val acc
val_accuracies = []

# initialize file name to save and monitor loss , acc in each epochs
monitor_csv = '../data/kb_monitor.csv'
# intialize best val loss inf
best_val_loss = float('inf')  # for saving best model

# open file in write mode if available else create
with open(monitor_csv, mode='w', newline='') as f:
    # initialize writer
    writer = csv.writer(f)
    # write first rows
    writer.writerow(['epoch', 'train_loss', 'train_accuracy', 'val_loss', 'val_accuracy'])

# iterate each epochs
for epoch in range(epochs):
    # set model in train mode
    model.train()
    total_train_loss = 0.0
    total_train_correct = 0
    total_train_tokens = 0
 
    # iterate train_loader in every batch, tqdm used for progress bar
    for input_texts, target_tokens in tqdm(train_loader_db, desc=f"Training Epoch {epoch + 1}"):
        # load input text to device, like 'cuda'
        input_texts = input_texts.to(device)
        # load target text to device, like 'cuda'
        target_tokens = target_tokens.to(device)
        B, T = input_texts.shape
        B_y, T_y = target_tokens.shape
        pos_encod_x = pos_encoding[:T].expand(B, -1, -1)
        pos_encod_y = pos_encoding[:T_y+1].expand(B_y, -1, -1)
        mask_y = mask_matrix[:T_y+1, :T_y+1].expand(B_y, -1, -1)
        keys_embed = keys_embed.expand(B_y, -1,-1)
        start_token_expanded = start_token.expand(target_tokens.size(0), 1)
          
        # concat sos token in target tokens
        target_with_sos = torch.cat([start_token_expanded, target_tokens], dim=1)
        
        # make optimizer zero grad
        optimizer.zero_grad()
        
        # make auto grad false
        with torch.no_grad():
            # get input embed
            input_embeds = embed_model(input_texts)['last_hidden_state'].detach()
            # here we get embed of target token one by one, not using contextual embed , here is problem i face from few day now it solve
            target_embeds = [embed_model(i.unsqueeze(0).unsqueeze(0))['last_hidden_state'].detach()[0][0]  for target_tokens in target_with_sos for i in target_tokens]
            target_embeds = torch.stack(target_embeds)
            target_embeds = target_embeds.reshape(B_y, T_y+1, 1024)


        logits = model(x=input_embeds, y=target_embeds, mask_mat=mask_y, positional_encod_x=pos_encod_x, pos_encod_y=pos_encod_y, new2old=new2old, kb_keys=keys_embed)
        logits = logits[:, :-1, : ]
        
        # find loss between final logits and target token without sos tokens 
        loss = criterion(logits.reshape(-1, logits.size(-1)), target_tokens.view(-1))
        # backward calculates the gradients of the loss with respect to all the parameters (weights and biases) in the model that have requires_grad=True
        loss.backward()
        # update model parameters
        optimizer.step()
        
        # loss.item() to get only value not tensor
        total_train_loss += loss.item()
        # it gives max logits idx that is also token ids
        preds = torch.argmax(logits, dim=-1)
        # compare pred equals to target token or not
        total_train_correct += (preds == target_tokens).float().sum().item()
        # numel() help to calculate total number of elements
        total_train_tokens += target_tokens.numel()
    
    # it gives avg train loss
    avg_train_loss = total_train_loss / len(train_loader_db)
    # it gives train acc
    train_accuracy = total_train_correct / total_train_tokens
    
    # append avg loss , train acc into avg_train_loss, train_accuracy
    train_losses.append(avg_train_loss)
    train_accuracies.append(train_accuracy)

    # set model in eval mode
    model.eval()
    total_val_loss = 0.0
    total_val_correct = 0
    total_val_tokens = 0

    # set required grad false
    with torch.no_grad():
        # iterate val_loader in every batch, tqdm used for progress bar
        # all function works same as train without grad
        for val_input_texts, val_target_tokens in tqdm(val_loader_db, desc="Validation: "):

            val_input_texts = val_input_texts.to(device)
            val_target_tokens = val_target_tokens.to(device)

            B, T = val_input_texts.shape
            B_y, T_y = val_target_tokens.shape
            keys_embed = keys_embed.expand(B_y, -1,-1)
            pos_encod_x = pos_encoding[:T].expand(B, -1, -1)
            pos_encod_y = pos_encoding[:T_y+1].expand(B_y, -1, -1)
            mask_y = mask_matrix[:T_y+1, :T_y+1].expand(B_y, -1, -1)
           
            val_start_token_expanded = start_token.expand(val_target_tokens.size(0), 1)
            val_target_with_sos = torch.cat([val_start_token_expanded, val_target_tokens], dim=1)

            val_input_embeds = embed_model(val_input_texts)['last_hidden_state'].detach()
            val_target_embeds = [embed_model(i.unsqueeze(0).unsqueeze(0))['last_hidden_state'].detach()[0][0]  for val_target_tokens in val_target_with_sos for i in val_target_tokens]
            val_target_embeds = torch.stack(val_target_embeds)
            val_target_embeds = val_target_embeds.reshape(B_y, T_y+1, 1024)

            val_logits = model(x=val_input_embeds, y=val_target_embeds, mask_mat=mask_y, positional_encod_x=pos_encod_x, pos_encod_y=pos_encod_y, new2old=new2old, kb_keys=keys_embed)
            val_logits = val_logits[:, :-1, : ]
            val_loss = criterion(val_logits.reshape(-1, val_logits.size(-1)), val_target_tokens.view(-1))

            total_val_loss += val_loss.item()
            val_preds = torch.argmax(val_logits, dim=-1)
            total_val_correct += (val_preds == val_target_tokens).float().sum().item()
            total_val_tokens += val_target_tokens.numel()

    avg_val_loss = total_val_loss / len(val_loader_db)
    val_accuracy = total_val_correct / total_val_tokens

    val_losses.append(avg_val_loss)
    val_accuracies.append(val_accuracy)

    # save best model
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), f"../models/best_model_weight_kb.pth")

    # save current model checkpoint
    torch.save(model.state_dict(), f"../models/model_weight_kb_epoch_{epoch+1}.pth")

    # log all metrics
    with open(monitor_csv, mode='a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([epoch + 1, avg_train_loss, train_accuracy, avg_val_loss, val_accuracy])
    # print every epoch loss and acc
    print(f"Epoch {epoch + 1}: "
          f"Train Loss = {avg_train_loss:.4f}, Train Acc = {train_accuracy:.4f} | "
          f"Val Loss = {avg_val_loss:.4f}, Val Acc = {val_accuracy:.4f}")


Training Epoch 1: 100%|██████████| 450/450 [37:20<00:00,  4.98s/it]
Validation:  98%|█████████▊| 62/63 [02:32<00:02,  2.45s/it]


RuntimeError: The expanded size of the tensor (4) must match the existing size (8) at non-singleton dimension 0.  Target sizes: [4, -1, -1].  Tensor sizes: [8, 492, 1024]

In [262]:
logits.shape

torch.Size([8, 12, 152161])

In [263]:
target_tokens.shape

torch.Size([8, 11])

In [265]:
logits.view(-1, logits.size(-1)).shape

torch.Size([96, 152161])

In [280]:
logits_0 = logits[:, :-1, : ]

In [284]:
logits_0.reshape(88, logits_0.size(-1)).shape

torch.Size([88, 152161])

In [281]:
logits_0.view(-1, logits.size(-1))

RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.

In [285]:
loss = criterion(logits_0.reshape(88, logits_0.size(-1)), target_tokens.view(-1))

In [None]:
# get vocabsize
vocab_size = len(tokenizer.vocab)
# intialize num of head
num_head = 8
# initialize embed_dim
embed_dim = 1024
# initialize model with (embed_model, vocab-size, hidden dim , embedding )
model = TransformerBlock(num_heads=num_head, embed_dim=embed_dim, vocab_size=vocab_size)

TypeError: MaskAttention.__init__() takes 2 positional arguments but 3 were given

In [75]:
TransformerBlock(num_heads=num_head, embed_dim=embed_dim, vocab_size=vocab_size)

TypeError: MaskAttention.__init__() takes 2 positional arguments but 3 were given