In [1]:
import torch
import torch.nn as nn
import math

In [2]:
# importing pandas for DataFrame operations
import pandas as pd
# importing torch.nn.functional module for call method of softmax
import torch.nn.functional as F
# import pad_sequence for make same length of all sentence tokens during batching
from torch.nn.utils.rnn import pad_sequence
# import autotokenizer for download pretrained tokenizer, and automodel for download pretrained embed model
from transformers import AutoTokenizer, AutoModel
# import dataset and dataloader for making custom dataset with addition operations , loader for shuffle , batching
from torch.utils.data import DataLoader, Dataset 
# import torch for tensor operations or used for different module
import torch
# import nn for different deeplearning model like lstm ,linear etc
import torch.nn as nn
# import tqdm for progress bar
from tqdm import tqdm
# importing json for load and dump json file
import json
# importing csv file for creating file for logging report summary
import csv

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
# check cuda is available or not 
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [176]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Queries, keys and values are three separate linear projections of the
    same input, each mapping ``embed_dim`` features to ``embed_dim`` features.
    """

    def __init__(self, embed_dim):
        super().__init__()
        # projections for query / key / value (names double as state_dict keys)
        self.query_w = nn.Linear(embed_dim, embed_dim)
        self.key_w = nn.Linear(embed_dim, embed_dim)
        self.value_w = nn.Linear(embed_dim, embed_dim)
        # attention weights are normalised over the key axis (last axis)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, embed):
        """Return the attention-weighted values for ``embed``.

        ``embed`` must have ``embed_dim`` features on its last axis; the
        second-to-last axis is the one attended over. The output has the
        same shape as the input.
        """
        q = self.query_w(embed)
        k = self.key_w(embed)
        v = self.value_w(embed)
        # scale by sqrt of the key feature size before the softmax
        scale = k.size(-1) ** 0.5
        weights = self.softmax((q @ k.transpose(-1, -2)) / scale)
        return weights @ v

In [178]:
self_atten = SelfAttention(1024).to(device)

In [180]:
atten = self_atten(input_embeds)

In [181]:
atten.shape

torch.Size([8, 9, 1024])

In [182]:
class MultiheadAttention(nn.Module):
    """Run ``num_heads`` independent SelfAttention heads and fuse their outputs.

    Each head works on the full ``embed_dim`` (the embedding is not split
    between heads). Head outputs are concatenated on the last axis and
    projected from ``num_heads * embed_dim`` back to ``embed_dim`` by ``W``.
    """

    def __init__(self, num_heads, embed_dim):
        super().__init__()
        self.num_heads = num_heads
        self.embed_dim = embed_dim

        # one full-width attention head per requested head
        head_modules = []
        for _ in range(num_heads):
            head_modules.append(SelfAttention(embed_dim))
        self.multi_head_attn = nn.ModuleList(head_modules)
        # output projection applied to the concatenated heads
        self.W = nn.Linear(num_heads * embed_dim, embed_dim)

    def forward(self, embed):
        """Apply every head to ``embed``, concatenate, and project."""
        per_head = tuple(attn(embed) for attn in self.multi_head_attn)
        return self.W(torch.cat(per_head, dim=-1))


        

In [185]:
mha = MultiheadAttention(8, 1024).to(device)

In [186]:
multiattent = mha(input_embeds)

In [187]:
multiattent.shape

torch.Size([8, 9, 1024])

In [29]:
class LayerNormalization(nn.Module):
    """Hand-written layer normalisation over the last axis.

    The input is standardised with the biased (population) variance of its
    last axis, then scaled by the learnable ``alpha`` and shifted by the
    learnable ``beta`` (initialised to ones and zeros respectively).
    """

    def __init__(self, embed_dim, eps=1e-5):
        super().__init__()
        self.alpha = nn.Parameter(torch.ones(embed_dim))   # per-feature gain
        self.beta = nn.Parameter(torch.zeros(embed_dim))   # per-feature bias
        self.eps = eps                                     # keeps the sqrt away from zero

    def forward(self, embed):
        """Normalise ``embed`` along its last axis; shape is preserved."""
        centered = embed - embed.mean(dim=-1, keepdim=True)
        variance = embed.var(dim=-1, keepdim=True, unbiased=False)
        standardised = centered / torch.sqrt(variance + self.eps)
        return standardised * self.alpha + self.beta


In [195]:
norm = LayerNormalization(1024).to(device)

In [198]:
lnorm = norm(multiattent + input_embeds)

In [30]:
class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Both linear layers map ``embed_dim`` to ``embed_dim``, so the output has
    the same shape as the input.
    """

    def __init__(self, embed_dim):
        super().__init__()
        self.W1 = nn.Linear(embed_dim, embed_dim)
        self.W2 = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(0.2)

    def forward(self, embed):
        """Apply the two-layer network independently to every position."""
        x = self.W1(embed)
        # Without a non-linearity here, W2(W1(x)) is just one affine map and
        # the second layer adds no modelling capacity.
        x = torch.relu(x)
        x = self.dropout(x)
        x = self.W2(x)
        return x


In [200]:
Ff = FeedForward(embed_dim).to(device)


In [201]:
ff = Ff(lnorm)

In [202]:
ff.shape

torch.Size([8, 9, 1024])

In [203]:
class Encoder(nn.Module):
    """One encoder layer: multi-head self-attention followed by a
    feed-forward block, each wrapped in a residual connection and
    ``nn.LayerNorm`` (post-norm ordering)."""

    def __init__(self, num_heads, embed_dim):
        super().__init__()
        self.multiheadattention = MultiheadAttention(num_heads, embed_dim)
        self.layernorm1 = nn.LayerNorm(embed_dim)
        self.feedforward = FeedForward(embed_dim)
        self.layernorm2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        """Return the encoded representation of ``x`` (same shape as ``x``)."""
        # attention sub-layer: residual add, then normalise
        attended = self.layernorm1(self.multiheadattention(x) + x)
        # feed-forward sub-layer: residual add, then normalise
        return self.layernorm2(self.feedforward(attended) + attended)

In [204]:
enc = Encoder(8, 1024).to(device)
enc_out = enc(input_embeds)

In [206]:
enc_out.shape

torch.Size([8, 9, 1024])

In [207]:
class StackEncoder(nn.Module):
    """A stack of ``Encoder`` layers applied one after another.

    Args:
        num_heads: number of attention heads in every layer.
        embed_dim: feature size of the input and of every layer.
        num_layers: how many ``Encoder`` layers to stack (defaults to 6,
            the depth that used to be hard-coded).
    """

    def __init__(self, num_heads, embed_dim, num_layers=6):
        super().__init__()
        self.encoders = nn.Sequential(
            *[Encoder(num_heads, embed_dim) for _ in range(num_layers)]
        )

    def forward(self, x):
        """Feed ``x`` through every encoder layer in order."""
        return self.encoders(x)

In [235]:
enc = StackEncoder(8, 1024).to(device)
enc_out = enc(input_embeds)

In [33]:
def mask_mat(len_seq):
    """Build an additive causal mask of shape ``(len_seq, len_seq)``.

    Entry ``[i][j]`` is ``-inf`` when ``j > i`` (a position may not look at
    later positions) and ``0`` otherwise, so the mask can be added to raw
    attention scores before the softmax.
    """
    # Fill everything with -inf, then keep only the strict upper triangle;
    # torch.triu writes zeros everywhere else.
    blocked = torch.full((len_seq, len_seq), float("-inf"))
    return torch.triu(blocked, diagonal=1)

In [213]:
mask_matrix = mask_mat(input_embeds.shape[1]).expand(8, -1,-1).to(device)

In [34]:
class MaskAttention(nn.Module):
    """Single-head scaled dot-product self-attention with an optional
    additive mask applied to the scores before the softmax."""

    def __init__(self, embed_dim):
        super().__init__()
        self.q_w = nn.Linear(embed_dim, embed_dim)
        self.k_w = nn.Linear(embed_dim, embed_dim)
        self.v_w = nn.Linear(embed_dim, embed_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, embed, mask_mat=None):
        """Attend over ``embed``.

        ``mask_mat``, when given, is added to the raw scores, so it must
        broadcast against them; ``-inf`` entries receive zero weight.
        """
        q = self.q_w(embed)
        k = self.k_w(embed)
        v = self.v_w(embed)
        scores = (q @ k.transpose(-1, -2)) / (k.size(-1) ** 0.5)

        # additive masking is skipped entirely when no mask is supplied
        if mask_mat is not None:
            scores = scores + mask_mat

        return self.softmax(scores) @ v


In [219]:
MA = MaskAttention(1024).to(device)
ma = MA(input_embeds, mask_matrix)
ma.shape

torch.Size([8, 9, 1024])

In [221]:
class MaskMultiheadAttention(nn.Module):
    """Multi-head variant of ``MaskAttention``.

    Each of the ``num_heads`` heads sees the full ``embed_dim`` and receives
    the same mask; head outputs are concatenated on the last axis and
    projected back to ``embed_dim`` by ``W``.
    """

    def __init__(self, num_heads, embed_dim):
        super().__init__()
        self.num_heads = num_heads
        self.embed_dim = embed_dim

        head_modules = []
        for _ in range(num_heads):
            head_modules.append(MaskAttention(embed_dim))
        self.multi_head_attn = nn.ModuleList(head_modules)
        self.W = nn.Linear(num_heads * embed_dim, embed_dim)

    def forward(self, embed, mask_mat):
        """Run every masked head on ``embed`` and fuse the results."""
        per_head = tuple(attn(embed, mask_mat) for attn in self.multi_head_attn)
        return self.W(torch.cat(per_head, dim=-1))


In [223]:
MMA = MaskMultiheadAttention(8, 1024).to(device)
ma = MMA(input_embeds, mask_matrix)
ma.shape

torch.Size([8, 9, 1024])

In [239]:
class CrossAttention(nn.Module):
    """Single-head scaled dot-product cross-attention.

    Queries are projected from ``y``; keys and values are projected from
    ``x``. The output therefore has one row per row of ``y``.
    """

    def __init__(self, embed_dim):
        super().__init__()
        self.q_w = nn.Linear(embed_dim, embed_dim)
        self.k_w = nn.Linear(embed_dim, embed_dim)
        self.v_w = nn.Linear(embed_dim, embed_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, y):
        """Let every position of ``y`` attend over the positions of ``x``."""
        q = self.q_w(y)
        k = self.k_w(x)
        v = self.v_w(x)

        scale = k.size(-1) ** 0.5
        weights = self.softmax((q @ k.transpose(-1, -2)) / scale)
        return weights @ v

In [240]:
CA = CrossAttention(1024).to(device)

ca_out=CA(enc_out, ma)
ca_out.shape

torch.Size([8, 9, 1024])

In [241]:
class MultiheadCrossAttention(nn.Module):
    """Multi-head variant of ``CrossAttention``.

    Note the argument order is ``(embed_dim, num_head)``. Every head uses
    the full ``embed_dim``; head outputs are concatenated on the last axis
    and projected back to ``embed_dim`` by ``W``.
    """

    def __init__(self, embed_dim, num_head):
        super().__init__()
        head_modules = []
        for _ in range(num_head):
            head_modules.append(CrossAttention(embed_dim))
        self.atten_list = nn.ModuleList(head_modules)
        self.W = nn.Linear(embed_dim * num_head, embed_dim)

    def forward(self, x, y):
        """Queries come from ``y``; keys and values come from ``x``."""
        per_head = tuple(attn(x, y) for attn in self.atten_list)
        return self.W(torch.cat(per_head, dim=-1))

In [243]:
MCA = MultiheadCrossAttention(1024, 8).to(device)
mca_out = MCA(enc_out, ma)

In [246]:
mca_out.shape

torch.Size([8, 9, 1024])

In [244]:
class Decoder(nn.Module):
    """One decoder layer.

    Three sub-layers, each followed by a residual connection and
    ``LayerNormalization``:
      1. masked multi-head self-attention over the decoder input ``y``;
      2. multi-head cross-attention (queries from the decoder stream,
         keys/values from the encoder output ``x``);
      3. a position-wise feed-forward block.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.mask_attention = MaskMultiheadAttention(num_heads, embed_dim)
        self.layer_norm_1 = LayerNormalization(embed_dim)
        self.cross_attention = MultiheadCrossAttention(embed_dim, num_heads)
        self.layer_norm_2 = LayerNormalization(embed_dim)
        self.feed_forward = FeedForward(embed_dim)
        self.layer_norm_3 = LayerNormalization(embed_dim)

    def forward(self, x, y, mask_mat):
        """Decode ``y`` while attending to the encoder output ``x``.

        Args:
            x: encoder output (source of keys and values for cross-attention).
            y: decoder input (source of queries).
            mask_mat: additive mask passed to the masked self-attention.

        Returns:
            Tensor with the same shape as ``y``.
        """
        mask_atten = self.mask_attention(y, mask_mat)
        y_norm = self.layer_norm_1(mask_atten + y)
        cross_atten = self.cross_attention(x, y_norm)
        # The residual must come from the decoder stream: cross_atten has one
        # row per decoder position, so adding the encoder output `x` mixes in
        # the wrong tensor and breaks when source/target lengths differ.
        cross_norm = self.layer_norm_2(cross_atten + y_norm)
        ff_out = self.feed_forward(cross_norm)
        out_norm = self.layer_norm_3(ff_out + cross_norm)
        return out_norm


In [248]:
decoder = Decoder(1024, 8).to(device)
decoder(enc_out, input_embeds, mask_matrix).shape

torch.Size([8, 9, 1024])

In [245]:
class DecoderStack(nn.Module):
    """A stack of ``Decoder`` layers sharing the same encoder output and mask.

    Args:
        embed_dim: feature size of the inputs and of every layer.
        num_heads: number of attention heads in every layer.
        num_layers: how many ``Decoder`` layers to stack (defaults to 6,
            the depth that used to be hard-coded).
    """

    def __init__(self, embed_dim, num_heads, num_layers=6):
        super().__init__()
        self.decoders = nn.ModuleList([
            Decoder(embed_dim, num_heads) for _ in range(num_layers)
        ])

    def forward(self, x, y, mask_mat):
        """Refine ``y`` layer by layer; every layer attends to the same ``x``."""
        for decoder in self.decoders:
            y = decoder(x, y, mask_mat)
        return y


In [250]:
decoder = DecoderStack(1024, 8).to(device)
decoder(enc_out, input_embeds, mask_matrix).shape

torch.Size([8, 9, 1024])

In [40]:
def even_position(p, i, dim):
    """Sine component for position ``p`` and sin/cos pair index ``i``.

    ``i`` counts frequency pairs (0, 1, 2, ...), not raw embedding columns:
    columns ``2*i`` and ``2*i + 1`` share the same frequency.
    """
    return math.sin(p / (10000 ** ((2 * i) / dim)))


def odd_position(p, i, dim):
    """Cosine component for position ``p`` and sin/cos pair index ``i``."""
    return math.cos(p / (10000 ** ((2 * i) / dim)))


def positional_encoding(tokens_len, embed_dim):
    """Sinusoidal positional encodings of shape ``(tokens_len, embed_dim)``.

    Column ``c`` of row ``p`` is ``sin(p / 10000**(2*(c//2)/embed_dim))`` for
    even ``c`` and the cosine of the same angle for odd ``c``, so each
    sin/cos pair shares one frequency. The previous implementation fed the
    raw column index into the exponent, which doubled the exponent and gave
    the two members of a pair different frequencies.

    Returns a float32 tensor; ``tokens_len == 0`` yields an empty
    ``(0, embed_dim)`` tensor instead of raising.
    """
    positions = torch.arange(tokens_len, dtype=torch.float32).unsqueeze(1)  # [tokens_len, 1]
    columns = torch.arange(embed_dim)                                       # [embed_dim]
    # pair index c // 2 -> exponent 2 * (c // 2) / embed_dim
    exponent = (2 * (columns // 2)).to(torch.float32) / embed_dim
    angles = positions / torch.pow(torch.tensor(10000.0), exponent)         # [tokens_len, embed_dim]
    # even columns take the sine, odd columns the cosine of the shared angle
    return torch.where(columns % 2 == 0, torch.sin(angles), torch.cos(angles))


In [41]:
positional_encoding(10, 1024).shape

torch.Size([10, 1024])

In [268]:
class TransformerBlock(nn.Module):
    """Encoder-decoder transformer with a linear projection to vocabulary logits.

    Args:
        num_heads: attention heads per layer.
        embed_dim: feature size of the input embeddings.
        vocab_size: size of the output vocabulary.
    """

    def __init__(self, num_heads, embed_dim, vocab_size):
        super().__init__()
        self.encoder = StackEncoder(num_heads, embed_dim)
        self.decoders = DecoderStack(embed_dim, num_heads)
        self.linear = nn.Linear(embed_dim, vocab_size)

    @staticmethod
    def _fit_positions(encoding, sequence):
        """Trim ``encoding`` to the sequence length of ``sequence``."""
        # A 1-D encoding has no position axis to slice; let it broadcast as-is.
        if encoding.dim() < 2:
            return encoding
        return encoding[..., : sequence.shape[-2], :]

    def forward(self, x, y, mask_mat, postional_encoding):
        """Encode ``x``, decode ``y`` against it and project to logits.

        Args:
            x: source embeddings fed to the encoder.
            y: target embeddings fed to the decoder.
            mask_mat: additive mask for the decoder's masked self-attention.
            postional_encoding: positional encodings whose second-to-last
                axis indexes positions; it must cover at least the longer of
                the two sequences.

        Returns:
            ``(out_logits, y)``: vocabulary logits and the decoder hidden
            states they were projected from.
        """
        # Source and target may be padded to different lengths, so each gets
        # its own slice of the positional table instead of sharing one
        # fixed-length tensor.
        x = x + self._fit_positions(postional_encoding, x)
        y = y + self._fit_positions(postional_encoding, y)
        x = self.encoder(x)
        y = self.decoders(x, y, mask_mat)
        out_logits = self.linear(y)
        return out_logits, y

In [266]:
transformer = TransformerBlock(8, 1024, vocab_size).to(device)

In [269]:
out, y = transformer(input_embeds, input_embeds, mask_matrix, pos_encod)

In [270]:
y.shape

torch.Size([8, 9, 1024])

In [311]:
class TransformerKBAttention(nn.Module):
    """Transformer whose vocabulary logits are biased by knowledge-base keys.

    The decoder hidden states are compared (cosine similarity) with the KB
    key embeddings, and each similarity is added to the logit of the
    vocabulary entry that the KB key maps to.
    """

    def __init__(self, num_heads, embed_dim, vocab_size):
        super().__init__()
        self.transformers = TransformerBlock(num_heads, embed_dim, vocab_size)
        self.vocab_size = vocab_size
        self.W_kb1 = nn.Linear(embed_dim, embed_dim)

    def kb_attention(self, decoder_hidden, kb_keys, new2old):
        """
        decoder_hidden: [B, T, D]
        kb_keys: [B, K, D]
        new2old: mapping from kb_key idx → vocab idx. Either a sequence whose
            position is the kb_key idx, or a dict {kb_key idx: vocab idx}.

        Returns a [B, T, vocab_size] tensor that is zero everywhere except at
        the mapped vocab indices, which hold the cosine similarities (summed
        when several KB keys map to the same vocab index).
        """
        B, T, D = decoder_hidden.shape

        # Project decoder hidden states
        x = torch.tanh(self.W_kb1(decoder_hidden))         # [B, T, D]

        # Normalize for cosine similarity
        x_norm = F.normalize(x, p=2, dim=-1)               # [B, T, D]
        kb_key_norm = F.normalize(kb_keys, p=2, dim=-1)    # [B, K, D]

        # Cosine similarity between each decoder token and each KB key: [B, T, K]
        cosine_sim = torch.matmul(x_norm, kb_key_norm.transpose(1, 2))

        # Initialize attention tensor: [B, T, vocab_size]
        kb_attention = torch.zeros(B, T, self.vocab_size, device=decoder_hidden.device)

        # Iterating a dict yields its keys, so enumerate() over a
        # {kb idx: vocab idx} dict would silently use the kb indices as vocab
        # indices. Read keys and values explicitly instead.
        if isinstance(new2old, dict):
            kb_columns = list(new2old.keys())
            vocab_indices = list(new2old.values())
        else:
            vocab_indices = list(new2old)
            kb_columns = list(range(len(vocab_indices)))

        if vocab_indices:
            device = decoder_hidden.device
            column_index = torch.tensor(kb_columns, dtype=torch.long, device=device)
            vocab_index = torch.tensor(vocab_indices, dtype=torch.long, device=device)
            # One scatter-add replaces the per-key Python loop; duplicate vocab
            # indices accumulate exactly as repeated `+=` did.
            selected = cosine_sim.index_select(2, column_index).to(kb_attention.dtype)
            kb_attention.index_add_(2, vocab_index, selected)

        return kb_attention

    def forward(self, x, y, mask_mat, positional_encod, new2old, kb_keys):
        """Return vocabulary logits plus the KB-attention bias ([B, T, vocab_size])."""
        vocab_logit, dec_out = self.transformers(x, y, mask_mat, positional_encod)
        kb_atten = self.kb_attention(dec_out, kb_keys=kb_keys, new2old=new2old)
        logits = vocab_logit + kb_atten
        return logits



In [312]:
tkb = TransformerKBAttention(8, 1024, vocab_size).to(device)

In [289]:
keys_embed = keys_embed.expand(8, -1,-1)

In [291]:
keys_embed.shape

torch.Size([8, 492, 1024])

In [313]:
tkb(input_embeds, input_embeds, mask_matrix, pos_encod, new2old, keys_embed)

tensor([[[-0.2877,  0.2992, -0.0485,  ...,  0.0439, -0.2981, -0.0910],
         [-0.7821, -0.6512,  0.2742,  ..., -0.2105, -0.8132,  0.2286],
         [-0.7546, -0.1569,  0.1842,  ...,  0.0055, -0.2678,  0.5550],
         ...,
         [-1.1172, -0.0050, -0.0065,  ...,  0.4781, -0.6678,  0.7323],
         [-0.8320, -0.1472, -0.1423,  ..., -0.3938, -0.8935,  0.8311],
         [-0.6768, -0.0465,  0.1754,  ...,  0.0143, -1.2519,  1.0366]],

        [[ 0.5688,  0.3594,  0.0303,  ..., -0.0431,  0.1983, -0.4620],
         [-0.3962,  0.0097, -0.5491,  ..., -0.5772, -0.4538, -0.2825],
         [ 0.0283,  0.0408, -0.5234,  ...,  0.0177,  0.4672,  0.2835],
         ...,
         [-0.2635,  0.6658, -0.3592,  ...,  0.1436, -1.3863,  1.1012],
         [ 0.0203,  0.2050, -0.1696,  ..., -0.1309, -1.2625,  0.7859],
         [ 0.1673,  0.0696, -0.3251,  ..., -0.4303, -1.0181,  0.8010]],

        [[ 0.0854,  0.3682,  0.2540,  ...,  0.3005, -0.1219, -0.1078],
         [-0.7070,  0.4751,  0.7104,  ...,  0

In [3]:
# text preprocess
def preprocess_data(text):
    """Lower-case ``text``, strip selected punctuation and list markers,
    and trim surrounding whitespace.

    "?" and "'" are deleted; ",", ".", and the markers "1)" .. "4)" are each
    replaced by a single space. Runs of spaces are not collapsed.
    """
    # (old, new) pairs, applied strictly in this order
    substitutions = (
        ("?", ""),
        ("'", ""),
        (",", " "),
        ("1)", " "),
        ("2)", " "),
        ("3)", " "),
        ("4)", " "),
        (".", " "),
    )
    cleaned = text.lower()
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    # drop leading / trailing whitespace left behind by the replacements
    return cleaned.strip()

In [4]:
# download pretrained tokenizer qwen model form hugging face
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-Embedding-0.6B', padding_side='left')
# download pretrained embedding qwen model form hugging face
embed_model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B')

In [5]:
class CustomDataset(Dataset):
    """Dataset of (question token ids, answer token ids) pairs.

    ``data`` is indexed with ``.iloc`` and must expose the columns
    'Patient query' and 'Doctor response'. Texts are tokenised with the
    notebook-level ``tokenizer`` each time an item is requested.
    """

    def __init__(self, data):
        # keep a reference to the data and the token limit used for truncation
        self.data = data
        self.max_length = 8192

    def __len__(self):
        # number of rows in the data
        return len(self.data)

    def _to_ids(self, text):
        # tokenise one text, truncating at max_length, and return the token
        # ids of the single encoded sequence
        encoded = tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return encoded['input_ids'][0]

    def __getitem__(self, index):
        # fetch the row once, then encode the question and the answer
        row = self.data.iloc[index]
        question_ids = self._to_ids(row['Patient query'])
        answer_ids = self._to_ids(row['Doctor response'])
        return question_ids, answer_ids


In [6]:
# open appointmenst_data in read mode
with open('../data/appointments_data.json', 'r') as f:
    # load that json data into key value data
    key_value_data = json.load(f)

In [7]:
# open qa pair.json in read mode
with open('../data/qa_pairs.json' , 'r') as f:
    # load json data into db_data
    db_data = json.load(f)

In [8]:
# preprocess every Q&A entry of db_data: questions go to `patient`, answers to `assistant`
patient = [preprocess_data(entry['question']) for entry in db_data['Q&A']]
assistant = [preprocess_data(entry['answer']) for entry in db_data['Q&A']]

In [9]:
# build the preprocessed copy of the key/value data: both the key and the
# value of every entry go through preprocess_data
new_key_value_data = {
    preprocess_data(key): preprocess_data(value)
    for key, value in key_value_data.items()
}


In [10]:
# initialize mapping functions to map values with key for canonical tokens
def replace_values_with_keys(texts, mapping):
    """Replace concrete values in ``texts`` with their canonical keys.

    For each text, the first ``(key, val)`` pair of ``mapping`` is taken
    for which both the key's prefix (the part before the first '_') and
    ``val`` occur in the text; every occurrence of ``val`` is then replaced
    by ``key``. At most one pair is applied per text.

    ``texts`` is modified in place and also returned.
    """
    for position, sentence in enumerate(texts):
        # first mapping entry whose name prefix and value both occur in the sentence
        match = next(
            (
                (key, val)
                for key, val in mapping.items()
                if key.split('_')[0] in sentence and val in sentence
            ),
            None,
        )
        if match is not None:
            key, val = match
            texts[position] = sentence.replace(val, key)
    return texts


In [11]:
# call replace values with keys with assistan and newkeyvalue data parameters
assistant = replace_values_with_keys(assistant, new_key_value_data)

In [12]:
# collect every key of the preprocessed key/value data, in insertion order
all_keys = list(new_key_value_data)

In [13]:
# add all keys to tokens
tokenizer.add_tokens(all_keys)

492

In [14]:
# resize token token embedding after increase len of tokenizer
embed_model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(152161, 1024)

In [15]:
# collect the distinct token ids produced by the key strings
used_token_ids = set()

for target_text in all_keys:
    # encode without special tokens so only the key's own ids are recorded
    tokens = tokenizer.encode(target_text, add_special_tokens=False)
    used_token_ids |= set(tokens)

# ascending list of the ids in use; a token's position in this list is its "new" index
token_id_list = sorted(used_token_ids)

# original token id -> compact new index
old2new = {token_id: position for position, token_id in enumerate(token_id_list)}
# compact new index -> original token id (inverse of old2new)
new2old = dict(enumerate(token_id_list))

In [16]:
# make dictionery db_data_dict having key "Patient query" and "Doctor response" with list of vlaues patient and doctor
db_data_dict = {'Patient query': patient, 'Doctor response': assistant}
print(len(patient))
# create data frame db_data_dict
db_df = pd.DataFrame(db_data_dict)
# call head method of df, to get five data rows
db_df.head()

4100


Unnamed: 0,Patient query,Doctor response
0,what time is benjamins appointment,the appointment time of benjamin is benjamin_a...
1,who is michaels doctor,the doctor for michael is michael_doctor
2,what is the contact number for kenneth,the contact number for kenneth is kenneth_contact
3,what time is blakes appointment,the appointment time of blake is blake_appoint...
4,where is brenda having their appointment,brenda is having their appointment at brenda_h...


In [17]:
# apply prepocess text on patient query and doctor response
db_df['Patient query'] = db_df['Patient query'].apply(preprocess_data)
db_df['Doctor response'] = db_df['Doctor response'].apply(preprocess_data)

In [18]:
dataset_db = CustomDataset(db_df)

In [19]:
# split data into train, val sets: a fixed number of rows is held out for
# validation and the train size is derived from the dataset length, so the
# split keeps working when the number of Q&A rows changes
# (3500 / 600 for the current 4100 rows)
val_size = 600
train_size = len(dataset_db) - val_size
train_set, val_set = torch.utils.data.random_split(dataset_db, [train_size, val_size])

In [48]:
def collate_fn(batch):
    """Pad a list of (question_ids, answer_ids) pairs into two batch tensors.

    Questions and answers are padded independently, each to the longest
    sequence of its own group, using the tokenizer's pad token id.
    """
    question_seqs, answer_seqs = zip(*batch)
    pad_id = tokenizer.pad_token_id

    # NOTE(review): pad_sequence appends padding at the end of each sequence,
    # while the tokenizer is created with padding_side='left' -- confirm that
    # this is intended.
    def pad(sequences):
        return pad_sequence(sequences, batch_first=True, padding_value=pad_id)

    return pad(question_seqs), pad(answer_seqs)

In [49]:
train_loader_db = DataLoader(train_set , batch_size=8, shuffle=True, collate_fn=collate_fn)
val_loader_db = DataLoader(val_set , batch_size=8, shuffle=True, collate_fn=collate_fn)

In [60]:
# take one batch from the training loader for the shape checks in later cells
# (the old `max = 0` shadowed the builtin max() and `x = x` / `y = y` did nothing)
x, y = next(iter(train_loader_db))


In [52]:
# load embed model into device , it may be cuda or cpu
embed_model = embed_model.to(device)

In [25]:
# embed every key string with the pretrained embedding model
keys_embed = []
# inference only: no_grad avoids building an autograd graph for each forward pass
with torch.no_grad():
    for key in all_keys:
        # token ids of the key with the last token dropped
        # (presumably the end-of-text token the tokenizer appends -- TODO confirm)
        key_ids = tokenizer(key, return_tensors='pt')['input_ids'][:, :-1]
        # hidden states of the single sequence in the batch
        key_embed = embed_model(key_ids.to(device))['last_hidden_state'][0]
        keys_embed.append(key_embed)

# stack along a new leading axis, then squeeze axis 1 (removed only when it has size 1);
# detach so the result carries no autograd history
keys_embed = torch.stack(keys_embed, dim=0).squeeze(1).detach()

In [26]:
# tokenize start tokens
start_token = tokenizer('<|im_start|>', return_tensors='pt')['input_ids'][:,0].unsqueeze(0).to(device)

In [55]:
pos_encoding = positional_encoding(15, 1024).to(device)

In [59]:
pos_encoding[1].shape

torch.Size([1024])

In [71]:
pos_encoding[:x_len].shape

torch.Size([8, 1024])

In [72]:
x.shape

torch.Size([8, 9])

In [79]:
# x is (batch, seq_len): the sequence length is axis 1; axis 0 is the batch size
x_len = x.shape[1]

In [86]:
pos_encod.expand(8, -1,-1).shape

torch.Size([8, 8, 1024])

In [167]:
x.shape

torch.Size([8, 9])

In [None]:
# SelfAttention takes a single embed_dim argument; passing two raises TypeError
SelfAttention(1024)

In [None]:
# take one batch and build its positional encodings and input embeddings
x, y = next(iter(train_loader_db))
# one encoding row per token position, repeated for every sequence in the batch;
# the batch size is read from x instead of being hard-coded to 8
pos_encod = pos_encoding[:x.shape[-1]].expand(x.shape[0], -1, -1)
# the pretrained embedder is only used for inference here
with torch.no_grad():
    input_embeds = embed_model(x.to(device))['last_hidden_state'].detach()


torch.Size([8, 9, 1024])


In [None]:
# vocabulary size including the added key tokens; len(tokenizer) is the same
# value the embedding matrix was resized to
vocab_size = len(tokenizer)
# number of attention heads
num_head = 8
# embedding width
embed_dim = 1024
# build the encoder-decoder transformer with a vocab-sized output projection
model = TransformerBlock(num_heads=num_head, embed_dim=embed_dim, vocab_size=vocab_size)

TypeError: MaskAttention.__init__() takes 2 positional arguments but 3 were given

In [75]:
TransformerBlock(num_heads=num_head, embed_dim=embed_dim, vocab_size=vocab_size)

TypeError: MaskAttention.__init__() takes 2 positional arguments but 3 were given