[q] [SEP] [sent1 + sent2 + sent3 + ... + sentn] (here `+` denotes joining the sentences with the same separator token)

In [None]:
# Mount Google Drive inside the Colab runtime (always remounting) so that
# files stored on Drive can be reached through regular filesystem paths.
from google.colab import drive

mount_point = '/content/gdrive'
drive.mount(mount_point, force_remount=True)
root_dir = "/content/gdrive/My Drive"

Mounted at /content/gdrive


Install the dependencies

*Hardware: A100 GPU on Google Colab Pro*

In [None]:
# PyTorch, torchvision and torchaudio pinned to the CUDA 12.1 (+cu121) builds,
# fetched from the PyTorch wheel index rather than PyPI.
!pip install --quiet torch==2.2.2+cu121 torchvision==0.17.2+cu121 torchaudio==2.2.2+cu121 --index-url https://download.pytorch.org/whl/cu121
# Upgrades transformers / sentence-transformers to whatever is newest at run time.
# NOTE(review): these two are unpinned, so reruns may pick up different versions.
!pip install --quiet --upgrade transformers sentence-transformers
# NumPy pinned to 1.26.4 — presumably to stay on the 1.x series for the torch 2.2.2 wheels; TODO confirm.
!pip install numpy==1.26.4



In [None]:
# ninja and packaging are installed first — presumably as build prerequisites for flash-attn; TODO confirm.
!pip install ninja packaging
# MAX_JOBS=8 is set in the environment of the pip process (presumably capping parallel
# compile jobs — verify against the flash-attn install notes); --no-build-isolation makes
# pip build against the packages already installed in this environment.
!MAX_JOBS=8 pip install flash-attn --no-build-isolation



In [None]:
import torch

# Sanity check: show which PyTorch build the runtime actually imports.
print(f"Torch version: {torch.__version__}")

Torch version: 2.2.2+cu121


Load the augmented dataset

In [None]:
import pandas as pd

# Load the augmented dataset.
# The path must be a bare string: any literal quote characters inside it become
# part of the file name and pandas then fails to find the file.
df = pd.read_excel("/content/gdrive/MyDrive/dataset_excel_bionlp/final_dataset/qa_train_dataset_structured_05-05.xlsx")

In [None]:
# Display the loaded frame; the notebook renders a truncated view (leading and trailing rows).
df

Unnamed: 0,case_id,note_excerpt,question_generated,sentence_id,ref_excerpt,relevance,source
0,1,\n\nHISTORY OF PRESENT ILLNESS :\nThe patient ...,Is her recurrent left upper quadrant pain due ...,1,HISTORY OF PRESENT ILLNESS :\nThe patient is a...,not-relevant,i2b2
1,1,\n\nHISTORY OF PRESENT ILLNESS :\nThe patient ...,Is her recurrent left upper quadrant pain due ...,2,She presented with left upper quadrant pain as...,essential,i2b2
2,1,\n\nHISTORY OF PRESENT ILLNESS :\nThe patient ...,Is her recurrent left upper quadrant pain due ...,3,She was diagnosed in 1991 during the birth of ...,not-relevant,i2b2
3,1,\n\nHISTORY OF PRESENT ILLNESS :\nThe patient ...,Is her recurrent left upper quadrant pain due ...,4,She claims she does not know why she is HIV po...,not-relevant,i2b2
4,1,\n\nHISTORY OF PRESENT ILLNESS :\nThe patient ...,Is her recurrent left upper quadrant pain due ...,5,"She is from Maryland , apparently had no blood...",not-relevant,i2b2
...,...,...,...,...,...,...,...
24073,748,history of present illness: is the former 34 a...,Will her episodes of apnea and bradycardia of ...,36,2.,not-relevant,mimic-iii
24074,748,history of present illness: is the former 34 a...,Will her episodes of apnea and bradycardia of ...,37,visiting nurse to come to home as mother is an...,essential,mimic-iii
24075,748,history of present illness: is the former 34 a...,Will her episodes of apnea and bradycardia of ...,38,visit will be within one to two days of post d...,essential,mimic-iii
24076,748,history of present illness: is the former 34 a...,Will her episodes of apnea and bradycardia of ...,39,", m.d.",not-relevant,mimic-iii


Fine-tuning

Training on the full training set (no validation split)

In [1]:
import os
os.environ["WANDB_DISABLED"] = "true"
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report
import numpy as np

from tqdm import tqdm

# -----------------------
# CONFIG
# -----------------------

MODEL_NAME = "jinaai/jina-embeddings-v3"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 1
EPOCHS = 6

# -----------------------
# DATA LOADING
# -----------------------
# df = pd.read_excel("/content/gdrive/MyDrive/dataset_excel_bionlp/final_dataset/qa_results_structured_03-05.xlsx")
# Binary target: 1 when the annotation is "essential" (case/whitespace-insensitive), 0 otherwise.
df["binary_relevance"] = df["relevance"].apply(lambda x: 1 if x.strip().lower() == "essential" else 0)

# Group by case: one record per case_id holding the question, its sentences and
# the per-sentence labels (all three lists stay in the row order of the frame).
grouped_data = []
for case_id, group in df.groupby("case_id"):
    # The question is taken from the first row of the group.
    question = group["question_generated"].iloc[0]
    sentences = group["ref_excerpt"].tolist()
    labels = group["binary_relevance"].tolist()
    grouped_data.append({
        "question": question,
        # str(...) keeps this from crashing on cells that were parsed as numbers or NaN
        # instead of text; the inference cell coerces the same column with astype(str).
        "sentences": [str(s).strip() for s in sentences],
        "labels": labels
    })

# -----------------------
# DATASET
# -----------------------
class SentenceClassificationDataset(Dataset):
    """Packs one case (question + all of its sentences) into a single encoded sequence.

    The text fed to the tokenizer is ``question </s> sent_1 </s> sent_2 ... </s> sent_n``.
    Each item also carries the positions of every ``</s>`` id found in the encoded
    sequence, so a model can read one hidden state per separator.

    Args:
        data: sequence of dicts with the keys ``"question"`` (str), ``"sentences"``
            (list of str) and ``"labels"`` (list of int, parallel to ``"sentences"``).
        tokenizer: tokenizer object; it is called on the text and must also provide
            ``tokenize`` and ``convert_tokens_to_ids``.
        max_length: sequence length every item is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of cases."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode case ``idx``.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (1-D tensors of length
            ``max_length``), ``"sep_positions"`` (1-D tensor with the indices of the
            ``</s>`` ids) and ``"labels"`` (1-D tensor, cut to at most
            ``len(sep_positions)`` entries).
        """
        item = self.data[idx]
        question = item["question"]
        sentences = item["sentences"]
        labels = item["labels"]

        separator = "</s>"
        text = question + f" {separator} " + f" {separator} ".join(sentences)

        # Report cases that will lose content to truncation.
        tokens_total = len(self.tokenizer.tokenize(text))
        if tokens_total > self.max_length:
            print(f"[!] Truncated from {tokens_total} → {self.max_length} tokens")

        encoding = self.tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=self.max_length)

        input_ids = encoding["input_ids"][0]
        # Use the tokenizer this dataset was constructed with, not a module-level
        # global: the global may not exist or may be a different tokenizer.
        sent_token_id = self.tokenizer.convert_tokens_to_ids("</s>")
        # NOTE(review): this matches every </s> id in the sequence, which presumably
        # includes an end-of-sequence token appended by the tokenizer — confirm.
        # Consumers trim predictions and labels to the shorter of the two.
        sep_positions = (input_ids == sent_token_id).nonzero(as_tuple=True)[0]

        # If no separator survived, fall back to a dummy position and dummy label
        # so the item still has a non-empty target.
        if len(sep_positions) == 0:
            sep_positions = torch.tensor([1])
            labels = [0]

        # Never return more labels than separator positions.
        labels = labels[:len(sep_positions)]

        return {
            "input_ids": input_ids,
            "attention_mask": encoding["attention_mask"][0],
            "sep_positions": sep_positions,
            "labels": torch.tensor(labels)
        }


# -----------------------
# MODEL WRAPPER
# -----------------------
class MultiSentenceClassifier(nn.Module):
    """SentenceTransformer encoder plus a linear head that emits one logit per separator position.

    Args:
        base_model_name: model id / path handed to ``SentenceTransformer``.
    """

    def __init__(self, base_model_name):
        super().__init__()
        self.encoder = SentenceTransformer(
            base_model_name,
            trust_remote_code=True,
            model_kwargs={"default_task": "classification","lora_main_params_trainable": True})
        self.encoder[0].default_task = "classification"
        #self.encoder = self.encoder.float()
        hidden_size = self.encoder.get_sentence_embedding_dimension()
        # One scalar logit per selected token embedding.
        self.classifier = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask, sep_positions):
        """Score the token embeddings found at the given separator positions.

        Args:
            input_ids: batch of token ids; the first dimension is the batch.
            attention_mask: mask forwarded unchanged to the underlying model.
            sep_positions: one index tensor per sample selecting the tokens to score.

        Returns:
            A list with exactly one logits tensor per sample, in batch order. A sample
            with no separator positions yields an empty tensor (instead of being
            dropped) so that ``zip(outputs, labels)`` in callers stays aligned.
        """
        # The wrapped transformer is called directly, bypassing SentenceTransformer pooling.
        output = self.encoder[0].auto_model(input_ids=input_ids, attention_mask=attention_mask)
        token_embeddings = output.last_hidden_state

        preds = []
        for i in range(input_ids.shape[0]):
            sep_pos = sep_positions[i]
            if len(sep_pos) == 0:
                # Keep a placeholder; callers already skip tensors with numel() == 0.
                preds.append(token_embeddings.new_zeros(0).float())
                continue
            # Cast to float32 before the head regardless of the encoder's output dtype.
            sentence_embs = token_embeddings[i, sep_pos, :].float()
            logits = self.classifier(sentence_embs).squeeze(-1)
            preds.append(logits)

        return preds


class FocalLoss(nn.Module):
    """Focal-style weighting of binary cross-entropy computed on raw logits.

    Every element's BCE term is scaled by a class weight (``alpha`` where the target
    is 1, ``1 - alpha`` where it is 0) and by the predicted probability of the wrong
    class raised to ``gamma``.

    Args:
        alpha: weight applied to positive targets.
        gamma: exponent of the modulating factor.
        reduction: ``'mean'``, ``'sum'``, or anything else to get the unreduced tensor.
    """

    def __init__(self, alpha=0.75, gamma=2.0, reduction='mean'):
        super().__init__()
        self.alpha, self.gamma, self.reduction = alpha, gamma, reduction

    def forward(self, inputs, targets):
        """Return the loss for logits ``inputs`` against 0/1 float ``targets``."""
        is_pos = targets
        is_neg = 1 - targets

        prob_pos = inputs.sigmoid()
        elementwise_bce = nn.functional.binary_cross_entropy_with_logits(
            inputs, targets, reduction='none'
        )

        class_weight = is_pos * self.alpha + is_neg * (1 - self.alpha)
        # Probability mass assigned to the wrong class: large for hard examples.
        modulator = (is_pos * (1 - prob_pos) + is_neg * prob_pos) ** self.gamma
        weighted = class_weight * modulator * elementwise_bce

        if self.reduction == 'sum':
            return weighted.sum()
        if self.reduction == 'mean':
            return weighted.mean()
        return weighted




# -----------------------
# TRAINING
# -----------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

model = MultiSentenceClassifier(MODEL_NAME).to(DEVICE)

dataset = SentenceClassificationDataset(grouped_data, tokenizer, max_length=4096)


def collate_cases(samples):
    """Batch dataset items without stacking the variable-length fields.

    ``input_ids`` / ``attention_mask`` are padded to a fixed length by the dataset and
    can be stacked; ``sep_positions`` / ``labels`` differ in length from case to case,
    so they are kept as per-sample lists. The default collate would fail on them as
    soon as BATCH_SIZE > 1.
    """
    return {
        "input_ids": torch.stack([s["input_ids"] for s in samples]),
        "attention_mask": torch.stack([s["attention_mask"] for s in samples]),
        "sep_positions": [s["sep_positions"] for s in samples],
        "labels": [s["labels"] for s in samples],
    }


def align_pair(pred, target):
    """Trim a (logits, labels) pair to their common length along the first dimension."""
    common = min(pred.shape[0], target.shape[0])
    return pred[:common], target[:common]


dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_cases)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Weight positives by the negative/positive ratio to counter class imbalance.
class_counts = df["binary_relevance"].value_counts()
pos = class_counts[1]
neg = class_counts[0]
pos_weight_value = (neg / pos)

# Explicit float32: the ratio is a NumPy scalar and would otherwise produce a float64 tensor.
pos_weight = torch.tensor([pos_weight_value], dtype=torch.float32, device=DEVICE)

print("Using pos weight value:", pos_weight_value)

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# criterion = FocalLoss(alpha=0.65, gamma=2)

# Make sure the checkpoint directory exists before training starts, so a missing
# folder is detected now rather than when saving after the last epoch.
SAVE_PATH = "/content/gdrive/MyDrive/qlora_outputs/fine_tuned_jina_v3_multisent.pt"
os.makedirs(os.path.dirname(SAVE_PATH), exist_ok=True)

model.train()
for epoch in range(EPOCHS):
    total_loss = 0
    for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        sep_positions = [x.to(DEVICE) for x in batch["sep_positions"]]
        labels = [x.to(DEVICE).float() for x in batch["labels"]]

        optimizer.zero_grad()

        with torch.cuda.amp.autocast(dtype=torch.bfloat16):
            outputs = model(input_ids, attention_mask, sep_positions)

            sample_losses = []

            for pred, target in zip(outputs, labels):
                if pred.numel() == 0 or target.numel() == 0:
                    continue
                # Predictions and labels can differ in length; score the common prefix.
                pred, target = align_pair(pred, target)
                sample_losses.append(criterion(pred, target))

            if sample_losses:
                loss = torch.stack(sample_losses).mean()
            else:
                continue  # skip if nothing valid

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # total_loss is the sum of the per-batch losses over the epoch.
    print(f"Epoch {epoch + 1}/{EPOCHS} - Loss: {total_loss:.4f}")

    model.eval()
    all_preds = []
    all_targets = []

    with torch.no_grad():
        # NOTE: this iterates the *training* loader, so the report below measures fit,
        # not generalization.
        for batch in dataloader:  # replace with val_dataloader when ready
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            sep_positions = [x.to(DEVICE) for x in batch["sep_positions"]]
            labels = [x.to(DEVICE).float() for x in batch["labels"]]

            outputs = model(input_ids, attention_mask, sep_positions)

            for pred, target in zip(outputs, labels):
                if pred.numel() == 0 or target.numel() == 0:
                    continue
                pred, target = align_pair(pred, target)

                probs = torch.sigmoid(pred).cpu().numpy()
                binarized = (probs > 0.5).astype(int)

                all_preds.extend(binarized.tolist())
                all_targets.extend(target.cpu().numpy().tolist())

    print("\nEvaluation Metrics:")
    print(classification_report(all_targets, all_preds, digits=3))
    model.train()


# -----------------------
# SAVE
# -----------------------
torch.save(model.state_dict(), SAVE_PATH)


Using a train/validation split (the cell below is currently fully commented out)


In [None]:
# import os
# os.environ["WANDB_DISABLED"] = "true"

# import pandas as pd
# import torch
# import torch.nn as nn
# from torch.utils.data import Dataset, DataLoader
# from transformers import AutoTokenizer
# from sentence_transformers import SentenceTransformer
# from sklearn.metrics import classification_report
# from sklearn.model_selection import train_test_split
# import numpy as np
# from tqdm import tqdm

# # -----------------------
# # CONFIG
# # -----------------------
# MODEL_NAME = "jinaai/jina-embeddings-v3"
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# BATCH_SIZE = 1
# EPOCHS = 6
# PATIENCE = 2  # for early stopping

# # -----------------------
# # DATA LOADING
# # -----------------------


# df["binary_relevance"] = df["relevance"].apply(lambda x: 1 if x.strip().lower() == "essential" else 0)

# grouped_data = []
# for case_id, group in df.groupby("case_id"):
#     question = group["question_generated"].iloc[0]
#     sentences = group["ref_excerpt"].tolist()
#     labels = group["binary_relevance"].tolist()
#     grouped_data.append({
#         "question": question,
#         "sentences": [s.strip() for s in sentences],
#         "labels": labels
#     })

# # -----------------------
# # DATASET CLASS
# # -----------------------
# class SentenceClassificationDataset(Dataset):
#     def __init__(self, data, tokenizer, max_length=512):
#         self.data = data
#         self.tokenizer = tokenizer
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         item = self.data[idx]
#         question = item["question"]
#         sentences = item["sentences"]
#         labels = item["labels"]
#         separator = "</s>"
#         text = question + f" {separator} " + f" {separator} ".join(sentences)

#         encoding = self.tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=self.max_length)
#         input_ids = encoding["input_ids"][0]
#         sep_token_id = tokenizer.convert_tokens_to_ids("</s>")
#         sep_positions = (input_ids == sep_token_id).nonzero(as_tuple=True)[0]

#         if len(sep_positions) == 0:
#             sep_positions = torch.tensor([1])
#             labels = [0]

#         labels = labels[:len(sep_positions)]

#         return {
#             "input_ids": input_ids,
#             "attention_mask": encoding["attention_mask"][0],
#             "sep_positions": sep_positions,
#             "labels": torch.tensor(labels)
#         }

# # -----------------------
# # MODEL
# # -----------------------
# class MultiSentenceClassifier(nn.Module):
#     def __init__(self, base_model_name):
#         super().__init__()
#         self.encoder = SentenceTransformer(
#             base_model_name,
#             trust_remote_code=True,
#             model_kwargs={"default_task": "classification", "lora_main_params_trainable": True})
#         self.encoder[0].default_task = "classification"
#         hidden_size = self.encoder.get_sentence_embedding_dimension()
#         self.classifier = nn.Linear(hidden_size, 1)

#     def forward(self, input_ids, attention_mask, sep_positions):
#         output = self.encoder[0].auto_model(input_ids=input_ids, attention_mask=attention_mask)
#         token_embeddings = output.last_hidden_state
#         preds = []
#         for i in range(input_ids.shape[0]):
#             sep_pos = sep_positions[i]
#             if len(sep_pos) == 0:
#                 continue
#             sentence_embs = token_embeddings[i, sep_pos, :].float()
#             logits = self.classifier(sentence_embs).squeeze(-1)
#             preds.append(logits)
#         return preds


# class FocalLoss(nn.Module):
#     def __init__(self, alpha=0.75, gamma=2.0, reduction='mean'):
#         super(FocalLoss, self).__init__()
#         self.alpha = alpha
#         self.gamma = gamma
#         self.reduction = reduction

#     def forward(self, inputs, targets):
#         BCE_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
#         probs = torch.sigmoid(inputs)
#         alpha_factor = targets * self.alpha + (1 - targets) * (1 - self.alpha)
#         focal_weight = (targets * (1 - probs) + (1 - targets) * probs) ** self.gamma
#         loss = alpha_factor * focal_weight * BCE_loss

#         if self.reduction == 'mean':
#             return loss.mean()
#         elif self.reduction == 'sum':
#             return loss.sum()
#         else:
#             return loss

# # -----------------------
# # TRAINING SETUP
# # -----------------------
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# model = MultiSentenceClassifier(MODEL_NAME).to(DEVICE)

# train_data, val_data = train_test_split(grouped_data, test_size=0.2, random_state=42)

# train_dataset = SentenceClassificationDataset(train_data, tokenizer, max_length=4096)
# val_dataset = SentenceClassificationDataset(val_data, tokenizer, max_length=4096)

# train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# pos = df["binary_relevance"].value_counts()[1]
# neg = df["binary_relevance"].value_counts()[0]
# # pos_weight_value = (neg / pos)
# # pos_weight = torch.tensor([pos_weight_value], device=DEVICE)

# # print("Using pos weight value:", pos_weight_value)
# # criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# criterion = FocalLoss(alpha=0.75, gamma=2)

# # -----------------------
# # TRAINING LOOP W/ EARLY STOPPING
# # -----------------------
# best_f1 = 0.0
# no_improve_epochs = 0

# for epoch in range(EPOCHS):
#     model.train()
#     total_loss = 0
#     for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
#         input_ids = batch["input_ids"].to(DEVICE)
#         attention_mask = batch["attention_mask"].to(DEVICE)
#         sep_positions = [x.to(DEVICE) for x in batch["sep_positions"]]
#         labels = [x.to(DEVICE).float() for x in batch["labels"]]

#         optimizer.zero_grad()
#         with torch.cuda.amp.autocast(dtype=torch.bfloat16):
#             outputs = model(input_ids, attention_mask, sep_positions)
#             sample_losses = []
#             for pred, target in zip(outputs, labels):
#                 if pred.numel() == 0 or target.numel() == 0:
#                     continue
#                 min_len = min(pred.shape[0], target.shape[0])
#                 sample_losses.append(criterion(pred[:min_len], target[:min_len]))
#             if sample_losses:
#                 loss = torch.stack(sample_losses).mean()
#             else:
#                 continue
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()

#     print(f"Epoch {epoch + 1}/{EPOCHS} - Loss: {total_loss:.4f}")

#     # -----------------------
#     # VALIDATION
#     # -----------------------
#     model.eval()
#     all_preds, all_targets = [], []

#     with torch.no_grad():
#         for batch in val_loader:
#             input_ids = batch["input_ids"].to(DEVICE)
#             attention_mask = batch["attention_mask"].to(DEVICE)
#             sep_positions = [x.to(DEVICE) for x in batch["sep_positions"]]
#             labels = [x.to(DEVICE).float() for x in batch["labels"]]

#             outputs = model(input_ids, attention_mask, sep_positions)

#             for pred, target in zip(outputs, labels):
#                 if pred.numel() == 0 or target.numel() == 0:
#                     continue
#                 min_len = min(pred.shape[0], target.shape[0])
#                 probs = torch.sigmoid(pred[:min_len]).cpu().numpy()
#                 binarized = (probs > 0.5).astype(int)
#                 all_preds.extend(binarized.tolist())
#                 all_targets.extend(target[:min_len].cpu().numpy().tolist())

#     print("\nValidation Metrics:")
#     report = classification_report(all_targets, all_preds, digits=3, output_dict=True)
#     print(classification_report(all_targets, all_preds, digits=3))

#     f1 = report.get(1.0, report.get('1.0', {})).get('f1-score', 0.0)
#     if f1 > best_f1:
#         best_f1 = f1
#         no_improve_epochs = 0
#         torch.save(model.state_dict(), "/content/gdrive/MyDrive/qlora_outputs/best_model.pt")
#         print("New best model saved.")
#     else:
#         no_improve_epochs += 1
#         print(f"No improvement for {no_improve_epochs} epoch(s)")

#     if no_improve_epochs >= PATIENCE:
#         print("Early stopping triggered.")
#         break

Inference

In [None]:
# Mount Google Drive inside the Colab runtime (always remounting) so that
# files stored on Drive can be reached through regular filesystem paths.
from google.colab import drive

mount_point = '/content/gdrive'
drive.mount(mount_point, force_remount=True)
root_dir = "/content/gdrive/My Drive"

Mounted at /content/gdrive


Evaluate the fine-tuned model

Using the patient narrative as the question

In [None]:
import torch
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.metrics import classification_report
import pandas as pd
from tqdm import tqdm

# -----------------------
# CONFIG
# -----------------------
# Checkpoint to evaluate; an alternative path is kept commented out for quick switching.
MODEL_PATH = "/content/gdrive/MyDrive/qlora_outputs/05-05_BCEWithLogitsLoss_5_epochs/fine_tuned_jina_v3_multisent.pt"
# MODEL_PATH = "/content/gdrive/MyDrive/qlora_outputs/fine_tuned_jina_v3_multisent.pt"
MODEL_NAME = "jinaai/jina-embeddings-v3"
MAX_LENGTH = 4096
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# -----------------------
# LOAD TEST DATA
# -----------------------
# Column that plays the role of the question; it is renamed to 'question_generated'.
# question_column = 'clinician_question'
question_column = 'patient_narrative'

df_test = (
    pd.read_excel("/content/gdrive/MyDrive/dataset_excel_bionlp/merged_notes_cases.xlsx")
    .rename(columns={question_column: 'question_generated'})
)

# Force the sentence column to text so .strip() below cannot hit a non-string cell.
df_test["ref_excerpt"] = df_test["ref_excerpt"].astype(str)
df_test["binary_relevance"] = [
    1 if value.strip().lower() == "essential" else 0
    for value in df_test["relevance"]
]

# One record per case: the question (taken from the group's first row), the
# stripped sentences and their binary labels.
grouped_data = [
    {
        "question": case_rows["question_generated"].iloc[0],
        "sentences": [text.strip() for text in case_rows["ref_excerpt"].tolist()],
        "labels": case_rows["binary_relevance"].tolist(),
    }
    for _, case_rows in df_test.groupby("case_id")
]

# -----------------------
# TOKENIZER AND MODEL
# -----------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

class MultiSentenceClassifier(torch.nn.Module):
    """SentenceTransformer encoder plus a linear head that emits one logit per separator position.

    Args:
        base_model_name: model id / path handed to ``SentenceTransformer``.
    """

    def __init__(self, base_model_name):
        super().__init__()
        self.encoder = SentenceTransformer(
            base_model_name,
            trust_remote_code=True,
            model_kwargs={"default_task": "classification", "use_flash_attn": True}
        )
        hidden_size = self.encoder.get_sentence_embedding_dimension()
        # One scalar logit per selected token embedding.
        self.classifier = torch.nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask, sep_positions):
        """Score the token embeddings found at the given separator positions.

        Args:
            input_ids: batch of token ids; the first dimension is the batch.
            attention_mask: mask forwarded unchanged to the underlying model.
            sep_positions: one index tensor per sample selecting the tokens to score.

        Returns:
            A list with exactly one logits tensor per sample, in batch order. A sample
            with no separator positions yields an empty tensor (instead of being
            dropped) so the list stays aligned with the batch.
        """
        # The wrapped transformer is called directly, bypassing SentenceTransformer pooling.
        output = self.encoder[0].auto_model(input_ids=input_ids, attention_mask=attention_mask)
        token_embeddings = output.last_hidden_state
        preds = []
        for i in range(input_ids.shape[0]):
            sep_pos = sep_positions[i]
            if len(sep_pos) == 0:
                # Placeholder keeps index i of the result tied to sample i.
                preds.append(token_embeddings.new_zeros(0).float())
                continue
            sentence_embs = token_embeddings[i, sep_pos, :].float()  # ensure float32
            logits = self.classifier(sentence_embs).squeeze(-1)
            preds.append(logits)
        return preds

model = MultiSentenceClassifier(MODEL_NAME).to(DEVICE)
# weights_only=True restricts unpickling to tensors and plain containers, which is
# all a state_dict contains.
model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE, weights_only=True))
model.eval()

# -----------------------
# INFERENCE
# -----------------------

all_preds = []
all_targets = []
# Sentences that received no prediction and are therefore missing from the metrics.
unscored_sentences = 0

# Loop-invariant: id of the separator token.
sent_token_id = tokenizer.convert_tokens_to_ids("</s>")

with torch.no_grad():
    for item in tqdm(grouped_data):
        question = item["question"]
        sentences = item["sentences"]
        true_labels = item["labels"]

        # Same packing as in training: question </s> sent_1 </s> ... </s> sent_n
        text = question + " </s> " + " </s> ".join(sentences)
        encoding = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=MAX_LENGTH)
        input_ids = encoding["input_ids"].to(DEVICE)
        attention_mask = encoding["attention_mask"].to(DEVICE)

        sep_positions = (input_ids[0] == sent_token_id).nonzero(as_tuple=True)[0]
        if len(sep_positions) == 0:
            # Dummy position so the model still returns one prediction.
            sep_positions = torch.tensor([1]).to(DEVICE)

        # One 1-D index tensor per sample (batch of one here).
        outputs = model(input_ids, attention_mask, [sep_positions])
        if outputs:
            probs = torch.sigmoid(outputs[0]).cpu().numpy().reshape(-1)
            preds = (probs > 0.5).astype(int)
            # truncate to shortest length (safety check)
            min_len = min(len(probs), len(true_labels))
            probs = probs[:min_len]
            preds = preds[:min_len]
            labels = true_labels[:min_len]
            unscored_sentences += len(true_labels) - min_len

            # accumulate
            all_preds.extend(preds.tolist())
            all_targets.extend(labels)

            # Optional: print per sentence (the question is shown once per case
            # instead of being repeated for every sentence).
            print(f"Q: {question}")
            for i, (sent, prob, pred, label) in enumerate(zip(sentences[:min_len], probs, preds, labels)):
                print(f"SENT {i+1}: {sent}")
                #print(f" → Predicted: {pred}, Prob: {prob:.3f}, True: {label}\n")
                print(f" → Predicted: {pred}, True: {label}\n")

if unscored_sentences:
    # Happens when a case yields fewer separator tokens than sentences,
    # e.g. after truncation to MAX_LENGTH.
    print(f"[!] {unscored_sentences} sentence(s) received no prediction and are excluded from the metrics")

# -----------------------
# GLOBAL METRICS
# -----------------------
print("\nOverall Evaluation on Test Set:")
print(classification_report(all_targets, all_preds, digits=3))

# Raw metrics
precision, recall, f1, support = precision_recall_fscore_support(all_targets, all_preds, average='binary', pos_label=1)

print("\nBinary-Averaged Metrics (positive class = 1):")
print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1-score:  {f1:.3f}")
# print(f"Support:   {support}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/378 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/734k [00:00<?, ?B/s]

custom_st.py:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-embeddings-v3:
- custom_st.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

configuration_xlm_roberta.py:   0%|          | 0.00/6.54k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- configuration_xlm_roberta.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_lora.py:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

modeling_xlm_roberta.py:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

rotary.py:   0%|          | 0.00/24.5k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- rotary.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


xlm_padding.py:   0%|          | 0.00/10.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- xlm_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


embedding.py:   0%|          | 0.00/3.88k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


mlp.py:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mlp.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


block.py:   0%|          | 0.00/17.8k [00:00<?, ?B/s]

mha.py:   0%|          | 0.00/34.4k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mha.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


stochastic_depth.py:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- stochastic_depth.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- block.py
- mha.py
- stochastic_depth.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- modeling_xlm_roberta.py
- rotary.py
- xlm_padding.py
- embedding.py
- mlp.py
- block.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following fi

model.safetensors:   0%|          | 0.00/1.14G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/192 [00:00<?, ?B/s]

 20%|██        | 4/20 [00:01<00:05,  2.75it/s]

Q: I had severe abdomen pain and was hospitalised for 15 days in ICU, diagnoised with CBD sludge thereafter on udiliv. Doctor advised for ERCP. My question is if the sludge was there does not the medication help in flushing it out? Whether ERCP was the only cure?
SENT 1: Brief Hospital Course:
 → Predicted: 1, True: 0

Q: I had severe abdomen pain and was hospitalised for 15 days in ICU, diagnoised with CBD sludge thereafter on udiliv. Doctor advised for ERCP. My question is if the sludge was there does not the medication help in flushing it out? Whether ERCP was the only cure?
SENT 2: During the ERCP a pancreatic stent was required to facilitate
access to the biliary system (removed at the end of the
procedure), and a common bile duct stent was placed to allow
drainage of the biliary obstruction caused by stones and sludge.
 → Predicted: 1, True: 1

Q: I had severe abdomen pain and was hospitalised for 15 days in ICU, diagnoised with CBD sludge thereafter on udiliv. Doctor advised for

 45%|████▌     | 9/20 [00:02<00:01,  7.05it/s]

Q: I overdosed October 4th on trihexyphenidyl, thorazine, and cocaine. Ended up in icu with prolonged qt for 8 days. I have had chest pain in my left upper quadrant ever since. My doctor said it s related to muscle and bone. It s a dull to deep pain. Any ideas?
SENT 1: Brief Hospital Course:
 → Predicted: 0, True: 0

Q: I overdosed October 4th on trihexyphenidyl, thorazine, and cocaine. Ended up in icu with prolonged qt for 8 days. I have had chest pain in my left upper quadrant ever since. My doctor said it s related to muscle and bone. It s a dull to deep pain. Any ideas?
SENT 2: # Bipolar d/o, PTSD, schizophrenia:  Psychiatry consult
recommended that all psych medications be held until they could
be re-prescribed by pt's outpatient psychiatrist.
 → Predicted: 1, True: 0

Q: I overdosed October 4th on trihexyphenidyl, thorazine, and cocaine. Ended up in icu with prolonged qt for 8 days. I have had chest pain in my left upper quadrant ever since. My doctor said it s related to muscle 

 65%|██████▌   | 13/20 [00:02<00:00, 11.21it/s]

Q: A friend went to the emergency room this past Monday morning with severe flu-like symptoms. On Thursday, she was diagnosed with meningitis. On Friday afternoon, it was determined to be viral, and she was rereleased with instructions to stay home and rest for 1 week. Yesterday she was beginning to feel better, but is not feeling as well today as yesterday. Is this fall back today a normal part of the recovery process, or Is it a sign of a possible relapse? Should she perhaps be gotten back to the hospitals? Thanks for your time.
SENT 1: Brief Hospital Course:
___ with h/o hypothyroidism secondary to Hashimoto's, Chiari
malformation s/p posterior fossa decompression surgery ___,
h/o aseptic meningitis ___ p/w meningitis
 → Predicted: 1, True: 0

Q: A friend went to the emergency room this past Monday morning with severe flu-like symptoms. On Thursday, she was diagnosed with meningitis. On Friday afternoon, it was determined to be viral, and she was rereleased with instructions to stay

100%|██████████| 20/20 [00:02<00:00,  7.45it/s]

Q: I spent yesterday in the ER with thumping heart beats i.e. palpitations. I had all blood work, a full panel, enzymes etc., EKG, chest x-ray, and TSH because I’ve had a total thyroidectomy. In October of last year, I had a walking stress test and all the same tests then, all of which are normal, showing no sign of any cardiac issues. My palpitations are benign, I’m told. Fine, how can I slow them or stop them without some antiarrhythmic meds? Of course, even though I’ve been told I’m fine, I feel them and sometimes worry which triggers the stress bug, then my chest gets tight, I have to take deep breaths to get out from under the stress. Is there anything I can do to relieve them? They really started when I started taking levothyroxine.
SENT 1: Major Surgical or Invasive Procedure:
Thyroidectomy to remove multinodular goiter, performed by ___ on
___
 → Predicted: 1, True: 1

Q: I spent yesterday in the ER with thumping heart beats i.e. palpitations. I had all blood work, a full panel




In [None]:
# -----------------------
# CREATE EXPORTABLE RESULTS
# -----------------------
# For every test case, re-run the fine-tuned classifier and record the 0-based
# positions (within the case) of the sentences predicted as cited and of the
# sentences labelled essential, then write one row per case to an Excel file.

# Take the case ids from the same groupby iteration that the grouped cases come
# from. groupby yields groups in sorted key order, whereas Series.unique()
# returns ids in order of first appearance, so zipping unique() with
# grouped_data can attach a case_id to another case's predictions whenever the
# sheet is not already sorted by case_id.
# NOTE(review): assumes grouped_data was built with df_test.groupby("case_id")
# and default sorting, as in the evaluation cells of this notebook.
case_ids = [cid for cid, _ in df_test.groupby("case_id")]

# Loop-invariant: vocabulary id of the "</s>" marker placed before each sentence.
sent_token_id = tokenizer.convert_tokens_to_ids("</s>")

rows = []

for case_id, item in tqdm(zip(case_ids, grouped_data), total=len(grouped_data), desc="Generating report"):
    question = item["question"]
    labels = item["labels"]
    sentences = item["sentences"]

    # Rerun encoding for the item: question, then every sentence, each
    # introduced by a "</s>" marker.
    text = question + " </s> " + " </s> ".join(sentences)
    encoding = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=MAX_LENGTH)
    input_ids = encoding["input_ids"].to(DEVICE)
    attention_mask = encoding["attention_mask"].to(DEVICE)

    # Token positions of every "</s>" in the encoded sequence; fall back to
    # position 1 when none is found so the model still receives one position.
    sep_positions = (input_ids[0] == sent_token_id).nonzero(as_tuple=True)[0]
    if len(sep_positions) == 0:
        sep_positions = torch.tensor([1]).to(DEVICE)
    sep_positions = sep_positions.unsqueeze(0)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask, [sep_positions])
        probs = torch.sigmoid(outputs[0]).cpu().numpy().reshape(-1)
        preds = (probs > 0.5).astype(int)

    # The number of scores can differ from the number of labels, so only the
    # overlapping prefix is compared.
    min_len = min(len(preds), len(labels))
    cited_ids = [str(i) for i, p in enumerate(preds[:min_len]) if p == 1]
    gold_ids = [str(i) for i, l in enumerate(labels[:min_len]) if l == 1]

    rows.append({
        "case_id": case_id,
        "question": question,
        "cited_sentence_ids": ", ".join(cited_ids),
        "gold_essential_sentence_ids": ", ".join(gold_ids)
    })

# Create DataFrame
df_out = pd.DataFrame(rows)

# Save to Excel
output_path = "/content/gdrive/MyDrive/qlora_outputs/05-05_BCEWithLogitsLoss_5_epochs/prediction_outputs/jina_citation_results_fine_tuned_patient_narrative.xlsx"
df_out.to_excel(output_path, index=False)

print(f"\nSaved output to: {output_path}")

Generating report: 100%|██████████| 20/20 [00:00<00:00, 30.15it/s]


Saved output to: /content/gdrive/MyDrive/qlora_outputs/05-05_BCEWithLogitsLoss_5_epochs/prediction_outputs/citation_results_fine_tuned_patient_narrative.xlsx





In [None]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

# -----------------------
# CONFIG
# -----------------------
# Spreadsheet holding, per case, the cited and gold sentence ids as
# comma-separated text.
PREDICTION_FILE = "/content/gdrive/MyDrive/qlora_outputs/05-05_BCEWithLogitsLoss_5_epochs/prediction_outputs/jina_citation_results_fine_tuned_patient_narrative.xlsx"

# -----------------------
# LOAD
# -----------------------
df = pd.read_excel(PREDICTION_FILE)

y_true, y_pred = [], []

for _, row in df.iterrows():
    gold_cell = row["gold_essential_sentence_ids"]
    pred_cell = row["cited_sentence_ids"]

    # A cell that pandas reports as missing stands for "no ids".
    gold_ids = set() if pd.isna(gold_cell) else {int(tok) for tok in str(gold_cell).split(",")}
    pred_ids = set() if pd.isna(pred_cell) else {int(tok) for tok in str(pred_cell).split(",")}

    # Expand both id sets into aligned 0/1 vectors over indices 0..max_id.
    seen_ids = gold_ids | pred_ids
    max_id = max(seen_ids) if seen_ids else -1

    for i in range(max_id + 1):
        y_true.append(int(i in gold_ids))
        y_pred.append(int(i in pred_ids))

# -----------------------
# METRICS
# -----------------------
# Scores for the positive class (1 = cited / essential), pooled over all cases.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true,
    y_pred,
    average="binary",
    pos_label=1,
)

print("Evaluation Results of fine tuned model on dev with patient narrative:")
print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1-score:  {f1:.3f}")

Evaluation Results of fine tuned model on dev with patient narrative:
Precision: 0.394
Recall:    0.688
F1-score:  0.501


Using Clinician question

In [None]:
import torch
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.metrics import classification_report
import pandas as pd
from tqdm import tqdm

# -----------------------
# CONFIG
# -----------------------
# Fine-tuned checkpoint, base model id, target device and tokenizer length cap.
MODEL_PATH = "/content/gdrive/MyDrive/qlora_outputs/05-05_BCEWithLogitsLoss_5_epochs/fine_tuned_jina_v3_multisent.pt"
MODEL_NAME = "jinaai/jina-embeddings-v3"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MAX_LENGTH = 4096

# -----------------------
# LOAD TEST DATA
# -----------------------
df_test = pd.read_excel("/content/gdrive/MyDrive/dataset_excel_bionlp/merged_notes_cases.xlsx")

# Pick which column plays the role of the question; it is renamed so the rest
# of the cell can always read 'question_generated'.
question_column = 'clinician_question'
# question_column = 'patient_narrative'
df_test = df_test.rename(columns={question_column: 'question_generated'})

df_test["ref_excerpt"] = df_test["ref_excerpt"].astype(str)
# Binary target: 1 only for rows whose relevance is "essential"
# (ignoring case and surrounding whitespace), 0 for everything else.
df_test["binary_relevance"] = [
    1 if relevance.strip().lower() == "essential" else 0
    for relevance in df_test["relevance"]
]

# One record per case: the question (taken from the case's first row), the
# stripped sentences and their binary labels, in row order.
grouped_data = [
    {
        "question": group["question_generated"].iloc[0],
        "sentences": [s.strip() for s in group["ref_excerpt"].tolist()],
        "labels": group["binary_relevance"].tolist(),
    }
    for case_id, group in df_test.groupby("case_id")
]

# -----------------------
# TOKENIZER AND MODEL
# -----------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

class MultiSentenceClassifier(torch.nn.Module):
    """Scores selected token positions of an encoded sequence with a linear head.

    The encoder is a SentenceTransformer whose first module's `auto_model`
    produces per-token hidden states; a single linear layer maps the hidden
    state at each requested position to one logit.
    """

    def __init__(self, base_model_name):
        """Build the encoder from `base_model_name` and a 1-output linear head."""
        super().__init__()
        encoder_kwargs = {"default_task": "classification", "use_flash_attn": True}
        self.encoder = SentenceTransformer(
            base_model_name,
            trust_remote_code=True,
            model_kwargs=encoder_kwargs,
        )
        # The head's input width follows the encoder's reported embedding size.
        self.classifier = torch.nn.Linear(self.encoder.get_sentence_embedding_dimension(), 1)

    def forward(self, input_ids, attention_mask, sep_positions):
        """Return a list of logit tensors, one per batch item that has positions.

        Args:
            input_ids: token ids; the first dimension is the batch.
            attention_mask: mask passed to the encoder alongside `input_ids`.
            sep_positions: indexable by batch index; each entry holds the token
                positions to score for that item.

        Returns:
            List of logit tensors. Items whose position entry has length 0 are
            skipped, so the list can be shorter than the batch.
        """
        backbone = self.encoder[0].auto_model
        hidden_states = backbone(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

        per_item_logits = []
        batch_size = input_ids.shape[0]
        for row in range(batch_size):
            positions = sep_positions[row]
            if len(positions) > 0:
                # Cast to float32 before the linear head.
                marker_states = hidden_states[row, positions, :].float()
                per_item_logits.append(self.classifier(marker_states).squeeze(-1))
        return per_item_logits

# Instantiate the classifier, restore the fine-tuned weights and switch to eval mode.
model = MultiSentenceClassifier(MODEL_NAME).to(DEVICE)
model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
model.eval()

# -----------------------
# INFERENCE
# -----------------------
# Flat, sentence-level predictions and targets pooled over every case.
all_preds, all_targets = [], []

with torch.no_grad():
    for item in tqdm(grouped_data):
        question = item["question"]
        sentences = item["sentences"]
        true_labels = item["labels"]

        # Single sequence: question, then each sentence introduced by "</s>".
        text = question + " </s> " + " </s> ".join(sentences)
        encoding = tokenizer(
            text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=MAX_LENGTH,
        )
        input_ids = encoding["input_ids"].to(DEVICE)
        attention_mask = encoding["attention_mask"].to(DEVICE)

        # Positions of every "</s>" token; default to position 1 if none exists.
        sent_token_id = tokenizer.convert_tokens_to_ids("</s>")
        is_marker = input_ids[0] == sent_token_id
        sep_positions = is_marker.nonzero(as_tuple=True)[0]
        if len(sep_positions) == 0:
            sep_positions = torch.tensor([1]).to(DEVICE)
        sep_positions = sep_positions.unsqueeze(0)

        outputs = model(input_ids, attention_mask, [sep_positions])
        if not outputs:
            continue

        probs = torch.sigmoid(outputs[0]).cpu().numpy().reshape(-1)

        # Safety check: keep only the prefix covered by both scores and labels.
        min_len = min(len(probs), len(true_labels))
        probs = probs[:min_len]
        preds = (probs > 0.5).astype(int)
        labels = true_labels[:min_len]

        all_preds.extend(preds.tolist())
        all_targets.extend(labels)

        # Per-sentence trace of prediction versus gold label.
        for i, (sent, pred, label) in enumerate(zip(sentences[:min_len], preds, labels)):
            print(f"Q: {question}")
            print(f"SENT {i+1}: {sent}")
            print(f" → Predicted: {pred}, True: {label}\n")

# -----------------------
# GLOBAL METRICS
# -----------------------
print("\nOverall Evaluation on Test Set:")
print(classification_report(all_targets, all_preds, digits=3))

# Positive-class ("essential") scores on their own.
precision, recall, f1, support = precision_recall_fscore_support(
    all_targets,
    all_preds,
    average='binary',
    pos_label=1,
)

print("\nBinary-Averaged Metrics")
print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1-score:  {f1:.3f}")
print(f"Support:   {support}")

 30%|███       | 6/20 [00:00<00:00, 29.03it/s]

Q: Why was ERCP recommended to him over continuing a medication-based treatment?
SENT 1: Brief Hospital Course:
 → Predicted: 1, True: 0

Q: Why was ERCP recommended to him over continuing a medication-based treatment?
SENT 2: During the ERCP a pancreatic stent was required to facilitate
access to the biliary system (removed at the end of the
procedure), and a common bile duct stent was placed to allow
drainage of the biliary obstruction caused by stones and sludge.
 → Predicted: 1, True: 1

Q: Why was ERCP recommended to him over continuing a medication-based treatment?
SENT 3: However, due to the patient's elevated INR, no sphincterotomy or
stone removal was performed.
 → Predicted: 1, True: 0

Q: Why was ERCP recommended to him over continuing a medication-based treatment?
SENT 4: Frank pus was noted to be draining
from the common bile duct, and post-ERCP it was recommended that
the patient remain on IV Zosyn for at least a week.
 → Predicted: 1, True: 0

Q: Why was ERCP recommended

 65%|██████▌   | 13/20 [00:00<00:00, 30.12it/s]

Q: Are there specific instructions about blood thinners due to her subarachnoid brain hemorrhage?
SENT 1: Brief Hospital Course:
 → Predicted: 1, True: 0

Q: Are there specific instructions about blood thinners due to her subarachnoid brain hemorrhage?
SENT 2: Neuro: Patient was followed by neurosurgery through out her
stay.
 → Predicted: 1, True: 0

Q: Are there specific instructions about blood thinners due to her subarachnoid brain hemorrhage?
SENT 3: Due to the extent of hemorrhage, a CTA was obtained to
determine if an aneurysm was present, but none were visualized.
 → Predicted: 1, True: 1

Q: Are there specific instructions about blood thinners due to her subarachnoid brain hemorrhage?
SENT 4: Her neurologic status gradually improved and she was weaned from
sedation.
 → Predicted: 1, True: 0

Q: Are there specific instructions about blood thinners due to her subarachnoid brain hemorrhage?
SENT 5: Her GCS was 15, though intermittently agitated.
 → Predicted: 0, True: 0

Q: Are th

100%|██████████| 20/20 [00:00<00:00, 26.18it/s]

Q: Was there any evidence for stomach cancer?
SENT 1: Discharge Instructions:
You were admitted to the hospital with a partial small bowel
obstruction.
 → Predicted: 1, True: 1

Q: Was there any evidence for stomach cancer?
SENT 2: With time, your obstruction dramatically improved
and you were able to eat regular food.
 → Predicted: 1, True: 1

Q: Was there any evidence for stomach cancer?
SENT 3: Constipation will worsen your symptoms.
 → Predicted: 1, True: 0

Q: Was there any evidence for stomach cancer?
SENT 4: Unfortunately, you have
trouble tolerating stool softeners and you get diarrhea very
easily.
 → Predicted: 1, True: 0

Q: Was there any evidence for stomach cancer?
SENT 5: Fiber is the most gentle treatment for constipation.
 → Predicted: 1, True: 0

Q: Was there any evidence for stomach cancer?
SENT 6: Please take fiber supplements (Metamucil) twice daily.
 → Predicted: 1, True: 0

Q: Was there any evidence for stomach cancer?
SENT 7: Consider
also taking ___ each day if y




In [None]:
# -----------------------
# CREATE EXPORTABLE RESULTS
# -----------------------
# For every test case, re-run the fine-tuned classifier and record the 0-based
# positions (within the case) of the sentences predicted as cited and of the
# sentences labelled essential, then write one row per case to an Excel file.

# grouped_data is built by iterating df_test.groupby("case_id"), which yields
# groups in sorted key order. Series.unique() returns ids in order of first
# appearance instead, so zipping unique() with grouped_data can attach a
# case_id to another case's predictions whenever the sheet is not already
# sorted by case_id. Read the ids from the same groupby iteration.
case_ids = [cid for cid, _ in df_test.groupby("case_id")]

# Loop-invariant: vocabulary id of the "</s>" marker placed before each sentence.
sent_token_id = tokenizer.convert_tokens_to_ids("</s>")

rows = []

for case_id, item in tqdm(zip(case_ids, grouped_data), total=len(grouped_data), desc="Generating report"):
    question = item["question"]
    labels = item["labels"]
    sentences = item["sentences"]

    # Rerun encoding for the item: question, then every sentence, each
    # introduced by a "</s>" marker.
    text = question + " </s> " + " </s> ".join(sentences)
    encoding = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=MAX_LENGTH)
    input_ids = encoding["input_ids"].to(DEVICE)
    attention_mask = encoding["attention_mask"].to(DEVICE)

    # Token positions of every "</s>" in the encoded sequence; fall back to
    # position 1 when none is found so the model still receives one position.
    sep_positions = (input_ids[0] == sent_token_id).nonzero(as_tuple=True)[0]
    if len(sep_positions) == 0:
        sep_positions = torch.tensor([1]).to(DEVICE)
    sep_positions = sep_positions.unsqueeze(0)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask, [sep_positions])
        probs = torch.sigmoid(outputs[0]).cpu().numpy().reshape(-1)
        preds = (probs > 0.5).astype(int)

    # The number of scores can differ from the number of labels, so only the
    # overlapping prefix is compared.
    min_len = min(len(preds), len(labels))
    cited_ids = [str(i) for i, p in enumerate(preds[:min_len]) if p == 1]
    gold_ids = [str(i) for i, l in enumerate(labels[:min_len]) if l == 1]

    rows.append({
        "case_id": case_id,
        "question": question,
        "cited_sentence_ids": ", ".join(cited_ids),
        "gold_essential_sentence_ids": ", ".join(gold_ids)
    })

# Create DataFrame
df_out = pd.DataFrame(rows)

# Save to Excel
output_path = "/content/gdrive/MyDrive/qlora_outputs/05-05_BCEWithLogitsLoss_5_epochs/prediction_outputs/jina_citation_results_fine_tuned_clinician_question.xlsx"
df_out.to_excel(output_path, index=False)

print(f"\nSaved output to: {output_path}")

Generating report: 100%|██████████| 20/20 [00:00<00:00, 33.31it/s]


Saved output to: /content/gdrive/MyDrive/qlora_outputs/05-05_BCEWithLogitsLoss_5_epochs/prediction_outputs/jina_citation_results_fine_tuned_clinician_question.xlsx





In [None]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

# -----------------------
# CONFIG
# -----------------------
# Spreadsheet holding, per case, the cited and gold sentence ids as
# comma-separated text.
PREDICTION_FILE = "/content/gdrive/MyDrive/qlora_outputs/05-05_BCEWithLogitsLoss_5_epochs/prediction_outputs/jina_citation_results_fine_tuned_clinician_question.xlsx"

# -----------------------
# LOAD
# -----------------------
df = pd.read_excel(PREDICTION_FILE)

y_true, y_pred = [], []

for _, row in df.iterrows():
    gold_cell = row["gold_essential_sentence_ids"]
    pred_cell = row["cited_sentence_ids"]

    # A cell that pandas reports as missing stands for "no ids".
    gold_ids = set() if pd.isna(gold_cell) else {int(tok) for tok in str(gold_cell).split(",")}
    pred_ids = set() if pd.isna(pred_cell) else {int(tok) for tok in str(pred_cell).split(",")}

    # Expand both id sets into aligned 0/1 vectors over indices 0..max_id.
    seen_ids = gold_ids | pred_ids
    max_id = max(seen_ids) if seen_ids else -1

    for i in range(max_id + 1):
        y_true.append(int(i in gold_ids))
        y_pred.append(int(i in pred_ids))

# -----------------------
# METRICS
# -----------------------
# Scores for the positive class (1 = cited / essential), pooled over all cases.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true,
    y_pred,
    average="binary",
    pos_label=1,
)

print("Evaluation Results of fine tuned model on dev with clinician question:")
print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1-score:  {f1:.3f}")

Evaluation Results of fine tuned model on dev with clinician question:
Precision: 0.360
Recall:    0.710
F1-score:  0.478


Evaluate the Untuned Model

Using Patient narrative

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch
from sklearn.metrics import classification_report
import pandas as pd
from tqdm import tqdm

# Load the pretrained (not fine-tuned) embedding model
MODEL_NAME = "jinaai/jina-embeddings-v3"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(MODEL_NAME, trust_remote_code=True).to(DEVICE)

# Load test data; the chosen question column is renamed to 'question_generated'
df_test = pd.read_excel("/content/gdrive/MyDrive/dataset_excel_bionlp/merged_notes_cases.xlsx")
# question_column = 'clinician_question'
question_column = 'patient_narrative'
df_test = df_test.rename(columns={question_column: 'question_generated'})
df_test["ref_excerpt"] = df_test["ref_excerpt"].astype(str)
# 1 for rows whose relevance is "essential" (case/whitespace-insensitive), else 0
df_test["binary_relevance"] = [
    1 if relevance.strip().lower() == "essential" else 0
    for relevance in df_test["relevance"]
]

# One record per case: question from the first row, stripped sentences, labels
grouped_data = [
    {
        "question": group["question_generated"].iloc[0],
        "sentences": [s.strip() for s in group["ref_excerpt"].tolist()],
        "labels": group["binary_relevance"].tolist(),
    }
    for case_id, group in df_test.groupby("case_id")
]

# Inference: a sentence is predicted relevant when its cosine similarity to
# the question exceeds the threshold (untuned; consider tuning on a val set).
threshold = 0.5
all_preds, all_targets = [], []

for item in tqdm(grouped_data):
    question = item["question"]
    sentences = item["sentences"]
    labels = item["labels"]

    # Embeddings are produced without passing a task argument to encode().
    q_emb = model.encode(question, convert_to_tensor=True)
    sent_embs = model.encode(sentences, convert_to_tensor=True)

    # Row 0 of the similarity matrix: one score per sentence
    sims = util.cos_sim(q_emb, sent_embs)[0]
    preds = (sims > threshold).int().tolist()

    all_preds.extend(preds)
    all_targets.extend(labels)

    # Per-sentence trace of prediction versus gold label
    for i, (sent, pred, label) in enumerate(zip(sentences, preds, labels)):
        print(f"Q: {question}")
        print(f"SENT {i+1}: {sent}")
        print(f" → Predicted: {pred}, True: {label}\n")

# Evaluation
print("\nBaseline (untuned) model evaluation:")
print(classification_report(all_targets, all_preds, digits=3))

 10%|█         | 2/20 [00:00<00:01, 13.93it/s]

Q: I had severe abdomen pain and was hospitalised for 15 days in ICU, diagnoised with CBD sludge thereafter on udiliv. Doctor advised for ERCP. My question is if the sludge was there does not the medication help in flushing it out? Whether ERCP was the only cure?
SENT 1: Brief Hospital Course:
 → Predicted: 0, True: 0

Q: I had severe abdomen pain and was hospitalised for 15 days in ICU, diagnoised with CBD sludge thereafter on udiliv. Doctor advised for ERCP. My question is if the sludge was there does not the medication help in flushing it out? Whether ERCP was the only cure?
SENT 2: During the ERCP a pancreatic stent was required to facilitate
access to the biliary system (removed at the end of the
procedure), and a common bile duct stent was placed to allow
drainage of the biliary obstruction caused by stones and sludge.
 → Predicted: 1, True: 1

Q: I had severe abdomen pain and was hospitalised for 15 days in ICU, diagnoised with CBD sludge thereafter on udiliv. Doctor advised for

 30%|███       | 6/20 [00:00<00:01, 13.85it/s]

Q: I am 48 years old. On February 20, I passed out, was taken to the hospital, and had two other episodes. I have chronic kidney disease with creatine around 1.5. I had anemia and hemoglobin was 10.3. I was in ICU 8 days and discharged in stable condition. My doctor performed a cardiac catherization. I had no increase in cardiac enzymes and an ECHO in the hospital showed 25% LVEF. Was this invasive, risky procedure necessary.
SENT 1: History of Present Illness:
 → Predicted: 0, True: 0

Q: I am 48 years old. On February 20, I passed out, was taken to the hospital, and had two other episodes. I have chronic kidney disease with creatine around 1.5. I had anemia and hemoglobin was 10.3. I was in ICU 8 days and discharged in stable condition. My doctor performed a cardiac catherization. I had no increase in cardiac enzymes and an ECHO in the hospital showed 25% LVEF. Was this invasive, risky procedure necessary.
SENT 2: On the cardiology service his abdominal pain, nausea, vomitting
was fe

 40%|████      | 8/20 [00:00<00:00, 13.18it/s]

Q: My mother n law had a surgery about 10 years ago, replacing all veins from top of leg to foot. Although that surgery has done well the doctors have had her on blood thinner ever since. Just recently she has gone to hospital due to a fall and they found a subarachnoid hemorrhage in the brain. They took her off the blood thinner, kept in icu for 3 weeks. We asked what would happen now to the original vein surgery (since stopping Coumadin). Are there any specific instructions about stopping Coumadin?
SENT 1: Brief Hospital Course:
 → Predicted: 0, True: 0

Q: My mother n law had a surgery about 10 years ago, replacing all veins from top of leg to foot. Although that surgery has done well the doctors have had her on blood thinner ever since. Just recently she has gone to hospital due to a fall and they found a subarachnoid hemorrhage in the brain. They took her off the blood thinner, kept in icu for 3 weeks. We asked what would happen now to the original vein surgery (since stopping Cou

 60%|██████    | 12/20 [00:00<00:00, 13.97it/s]

Q: Hi There, my best friend who is 36 yrs old have a severe asthma attack, quick led to a heart attack and stopped breathing for 13 min before being resuscitated. After being in ICU for 6 days, doctors removed his breathing apparatus and he has been able to breathe independently since. I realize he most likely sustained some brain damage , but do you think he mte be able to continue breathing for long and survive?
SENT 1: Brief Hospital Course:
 → Predicted: 0, True: 0

Q: Hi There, my best friend who is 36 yrs old have a severe asthma attack, quick led to a heart attack and stopped breathing for 13 min before being resuscitated. After being in ICU for 6 days, doctors removed his breathing apparatus and he has been able to breathe independently since. I realize he most likely sustained some brain damage , but do you think he mte be able to continue breathing for long and survive?
SENT 2: # PEA arrest and subsequent anoxic brain injury.:
 → Predicted: 1, True: 1

Q: Hi There, my best fr

 80%|████████  | 16/20 [00:01<00:00, 14.34it/s]

Q: I have a tumor in my back and I have 3 disc bulge. I also have peripheral neuropathy. I went to the er and that's what they found. Now my legs can't move and I'm in severe pain. They told me to see my doctor in a week but he's on vacation. What can I do to help myself?
SENT 1: Brief Hospital Course:
Patient admitted to the general neurology service.
 → Predicted: 0, True: 0

Q: I have a tumor in my back and I have 3 disc bulge. I also have peripheral neuropathy. I went to the er and that's what they found. Now my legs can't move and I'm in severe pain. They told me to see my doctor in a week but he's on vacation. What can I do to help myself?
SENT 2: MRI of the T
and L spine were obtained which showed stable degenerative disc
disease without spinal cord compromise.
 → Predicted: 0, True: 0

Q: I have a tumor in my back and I have 3 disc bulge. I also have peripheral neuropathy. I went to the er and that's what they found. Now my legs can't move and I'm in severe pain. They told me t

100%|██████████| 20/20 [00:01<00:00, 13.84it/s]

Q: I spent yesterday in the ER with thumping heart beats i.e. palpitations. I had all blood work, a full panel, enzymes etc., EKG, chest x-ray, and TSH because I’ve had a total thyroidectomy. In October of last year, I had a walking stress test and all the same tests then, all of which are normal, showing no sign of any cardiac issues. My palpitations are benign, I’m told. Fine, how can I slow them or stop them without some antiarrhythmic meds? Of course, even though I’ve been told I’m fine, I feel them and sometimes worry which triggers the stress bug, then my chest gets tight, I have to take deep breaths to get out from under the stress. Is there anything I can do to relieve them? They really started when I started taking levothyroxine.
SENT 1: Major Surgical or Invasive Procedure:
Thyroidectomy to remove multinodular goiter, performed by ___ on
___
 → Predicted: 0, True: 1

Q: I spent yesterday in the ER with thumping heart beats i.e. palpitations. I had all blood work, a full panel




In [None]:
# -----------------------
# CREATE EXPORTABLE RESULTS
# -----------------------
# For every test case, score each sentence against the question with the
# untuned embedding model and record the 0-based positions of the sentences
# predicted as cited and of the sentences labelled essential.

# grouped_data is built by iterating df_test.groupby("case_id"), which yields
# groups in sorted key order. Series.unique() returns ids in order of first
# appearance instead, so zipping unique() with grouped_data can attach a
# case_id to another case's predictions whenever the sheet is not already
# sorted by case_id. Read the ids from the same groupby iteration.
case_ids = [cid for cid, _ in df_test.groupby("case_id")]

# Cosine-similarity cut-off above which a sentence counts as cited.
threshold = 0.5

results = []

# tqdm wraps the whole zip: when it wrapped only the second iterable, zip
# stopped as soon as the first one was exhausted and the bar never received
# its final step (it ended at 19/20).
for case_id, item in tqdm(zip(case_ids, grouped_data), desc="Running inference", total=len(grouped_data)):
    question = item["question"]
    sentences = item["sentences"]
    labels = item["labels"]

    # Encode question and sentences
    q_emb = model.encode(question, convert_to_tensor=True)
    sent_embs = model.encode(sentences, convert_to_tensor=True)

    # Cosine similarity of the question to every sentence of the case
    sims = util.cos_sim(q_emb, sent_embs)[0]
    preds = (sims > threshold).int().tolist()

    # Record sentence IDs (indices) that were predicted or truly essential
    cited_ids = [str(i) for i, p in enumerate(preds) if p == 1]
    gold_ids = [str(i) for i, g in enumerate(labels) if g == 1]

    results.append({
        "case_id": case_id,
        "question": question,
        "cited_sentence_ids": ",".join(cited_ids),
        "gold_essential_sentence_ids": ",".join(gold_ids),
    })

# Save to Excel
df_results = pd.DataFrame(results)
df_results.to_excel("/content/gdrive/MyDrive/qlora_outputs/05-05_BCEWithLogitsLoss_5_epochs/prediction_outputs/jina_citation_results_untuned_base_patient_narrative.xlsx", index=False)

Running inference:  95%|█████████▌| 19/20 [00:01<00:00, 13.41it/s]


In [None]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

# -----------------------
# CONFIG
# -----------------------
# Predictions of the untuned base model (patient narrative as the question):
# one row per case, ids stored as comma-separated text.
PREDICTION_FILE = "/content/gdrive/MyDrive/qlora_outputs/05-05_BCEWithLogitsLoss_5_epochs/prediction_outputs/jina_citation_results_untuned_base_patient_narrative.xlsx"

# -----------------------
# LOAD
# -----------------------
df = pd.read_excel(PREDICTION_FILE)

y_true = []
y_pred = []

for _, row in df.iterrows():
    # A cell that pandas reports as missing stands for "no ids".
    gold_ids = set(map(int, str(row["gold_essential_sentence_ids"]).split(","))) if pd.notna(row["gold_essential_sentence_ids"]) else set()
    pred_ids = set(map(int, str(row["cited_sentence_ids"]).split(","))) if pd.notna(row["cited_sentence_ids"]) else set()

    # Expand both id sets into aligned 0/1 vectors over indices 0..max_id.
    max_id = max(gold_ids.union(pred_ids)) if gold_ids or pred_ids else -1

    for i in range(max_id + 1):
        y_true.append(1 if i in gold_ids else 0)
        y_pred.append(1 if i in pred_ids else 0)

# -----------------------
# METRICS
# -----------------------
# Scores for the positive class (1 = cited / essential), pooled over all cases.
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", pos_label=1)

# The file evaluated here holds the untuned base model's predictions; the
# header previously said "fine tuned", mislabelling the reported numbers.
print("Evaluation Results of untuned base model on dev with patient narrative question:")
print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1-score:  {f1:.3f}")

Evaluation Results of fine tuned model on dev with patient narrative question:
Precision: 0.494
Recall:    0.558
F1-score:  0.524


Using Clinician question

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch
from sklearn.metrics import classification_report
import pandas as pd
from tqdm import tqdm

# Load the pretrained (not fine-tuned) embedding model
MODEL_NAME = "jinaai/jina-embeddings-v3"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(MODEL_NAME, trust_remote_code=True).to(DEVICE)

# Load test data; the chosen question column is renamed to 'question_generated'
df_test = pd.read_excel("/content/gdrive/MyDrive/dataset_excel_bionlp/merged_notes_cases.xlsx")
question_column = 'clinician_question'
# question_column = 'patient_narrative'
df_test = df_test.rename(columns={question_column: 'question_generated'})
df_test["ref_excerpt"] = df_test["ref_excerpt"].astype(str)
# 1 for rows whose relevance is "essential" (case/whitespace-insensitive), else 0
df_test["binary_relevance"] = [
    1 if relevance.strip().lower() == "essential" else 0
    for relevance in df_test["relevance"]
]

# One record per case: question from the first row, stripped sentences, labels
grouped_data = [
    {
        "question": group["question_generated"].iloc[0],
        "sentences": [s.strip() for s in group["ref_excerpt"].tolist()],
        "labels": group["binary_relevance"].tolist(),
    }
    for case_id, group in df_test.groupby("case_id")
]

# Inference: a sentence is predicted relevant when its cosine similarity to
# the question exceeds the threshold (untuned; consider tuning on a val set).
threshold = 0.5
all_preds, all_targets = [], []

for item in tqdm(grouped_data):
    question = item["question"]
    sentences = item["sentences"]
    labels = item["labels"]

    # Encode question and sentences
    q_emb = model.encode(question, convert_to_tensor=True)
    sent_embs = model.encode(sentences, convert_to_tensor=True)

    # Row 0 of the similarity matrix: one score per sentence
    sims = util.cos_sim(q_emb, sent_embs)[0]
    preds = (sims > threshold).int().tolist()

    all_preds.extend(preds)
    all_targets.extend(labels)

    # Per-sentence trace of prediction versus gold label
    for i, (sent, pred, label) in enumerate(zip(sentences, preds, labels)):
        print(f"Q: {question}")
        print(f"SENT {i+1}: {sent}")
        print(f" → Predicted: {pred}, True: {label}\n")

# Evaluation
print("\nBaseline (untuned) model evaluation:")
print(classification_report(all_targets, all_preds, digits=3))

 10%|█         | 2/20 [00:00<00:01, 13.41it/s]

Q: Why was ERCP recommended to him over continuing a medication-based treatment?
SENT 1: Brief Hospital Course:
 → Predicted: 0, True: 0

Q: Why was ERCP recommended to him over continuing a medication-based treatment?
SENT 2: During the ERCP a pancreatic stent was required to facilitate
access to the biliary system (removed at the end of the
procedure), and a common bile duct stent was placed to allow
drainage of the biliary obstruction caused by stones and sludge.
 → Predicted: 1, True: 1

Q: Why was ERCP recommended to him over continuing a medication-based treatment?
SENT 3: However, due to the patient's elevated INR, no sphincterotomy or
stone removal was performed.
 → Predicted: 1, True: 0

Q: Why was ERCP recommended to him over continuing a medication-based treatment?
SENT 4: Frank pus was noted to be draining
from the common bile duct, and post-ERCP it was recommended that
the patient remain on IV Zosyn for at least a week.
 → Predicted: 1, True: 0

Q: Why was ERCP recommended

 30%|███       | 6/20 [00:00<00:01, 13.66it/s]

Q: Why was cardiac catheterization recommended to the patient?
SENT 1: History of Present Illness:
 → Predicted: 0, True: 0

Q: Why was cardiac catheterization recommended to the patient?
SENT 2: On the cardiology service his abdominal pain, nausea, vomitting
was felt to be secondary to congestive hepatopathy, his cough
due to CHF vs asthma, and his syncope was felt to be secondary
to a coughing spell leading to increased intra-abdominal
pressures and therefore reduced preload in the setting of
low-output state.
 → Predicted: 1, True: 0

Q: Why was cardiac catheterization recommended to the patient?
SENT 3: His ICD interrogation was negative for any
events.
 → Predicted: 1, True: 0

Q: Why was cardiac catheterization recommended to the patient?
SENT 4: He was aggressively diuresed with a net 10 liters
negative since admission.
 → Predicted: 1, True: 0

Q: Why was cardiac catheterization recommended to the patient?
SENT 5: He underwent RHC for milrinone trial,
which proved to be success

 40%|████      | 8/20 [00:00<00:00, 12.88it/s]

Q: Are there specific instructions about blood thinners due to her subarachnoid brain hemorrhage?
SENT 1: Brief Hospital Course:
 → Predicted: 0, True: 0

Q: Are there specific instructions about blood thinners due to her subarachnoid brain hemorrhage?
SENT 2: Neuro: Patient was followed by neurosurgery through out her
stay.
 → Predicted: 0, True: 0

Q: Are there specific instructions about blood thinners due to her subarachnoid brain hemorrhage?
SENT 3: Due to the extent of hemorrhage, a CTA was obtained to
determine if an aneurysm was present, but none were visualized.
 → Predicted: 1, True: 1

Q: Are there specific instructions about blood thinners due to her subarachnoid brain hemorrhage?
SENT 4: Her neurologic status gradually improved and she was weaned from
sedation.
 → Predicted: 1, True: 0

Q: Are there specific instructions about blood thinners due to her subarachnoid brain hemorrhage?
SENT 5: Her GCS was 15, though intermittently agitated.
 → Predicted: 1, True: 0

Q: Are th

 60%|██████    | 12/20 [00:00<00:00, 13.87it/s]

Q: Did she sustain any brain damage from the heart attack?
SENT 1: Brief Hospital Course:
 → Predicted: 0, True: 0

Q: Did she sustain any brain damage from the heart attack?
SENT 2: # PEA arrest and subsequent anoxic brain injury.:
 → Predicted: 1, True: 1

Q: Did she sustain any brain damage from the heart attack?
SENT 3: Suspect that
original OSH PEA arrest due to hypoxemia or acidosis, with [**Hospital1 18**]
ED PEA arrest due to acidosis with admission pH 7.16 on arrival.
 → Predicted: 0, True: 1

Q: Did she sustain any brain damage from the heart attack?
SENT 4: TTE with evidence of RV failure to suggest PE. LVEF 30% with
known dilated cardiomyopathy.
 → Predicted: 0, True: 0

Q: Did she sustain any brain damage from the heart attack?
SENT 5: He was cooled per protocol.
 → Predicted: 0, True: 0

Q: Did she sustain any brain damage from the heart attack?
SENT 6: Initially, his EEG was concerning without evident brain
activity.
 → Predicted: 1, True: 0

Q: Did she sustain any brain

 80%|████████  | 16/20 [00:01<00:00, 14.22it/s]

Q: Was there any evidence for stomach cancer?
SENT 1: Discharge Instructions:
You were admitted to the hospital with a partial small bowel
obstruction.
 → Predicted: 0, True: 1

Q: Was there any evidence for stomach cancer?
SENT 2: With time, your obstruction dramatically improved
and you were able to eat regular food.
 → Predicted: 1, True: 1

Q: Was there any evidence for stomach cancer?
SENT 3: Constipation will worsen your symptoms.
 → Predicted: 0, True: 0

Q: Was there any evidence for stomach cancer?
SENT 4: Unfortunately, you have
trouble tolerating stool softeners and you get diarrhea very
easily.
 → Predicted: 0, True: 0

Q: Was there any evidence for stomach cancer?
SENT 5: Fiber is the most gentle treatment for constipation.
 → Predicted: 0, True: 0

Q: Was there any evidence for stomach cancer?
SENT 6: Please take fiber supplements (Metamucil) twice daily.
 → Predicted: 0, True: 0

Q: Was there any evidence for stomach cancer?
SENT 7: Consider
also taking ___ each day if y

100%|██████████| 20/20 [00:01<00:00, 13.73it/s]

Q: What should he do to relieve palpitations and anxiety?
SENT 1: Major Surgical or Invasive Procedure:
Thyroidectomy to remove multinodular goiter, performed by ___ on
___
 → Predicted: 0, True: 1

Q: What should he do to relieve palpitations and anxiety?
SENT 2: Brief Hospital Course:
 → Predicted: 0, True: 0

Q: What should he do to relieve palpitations and anxiety?
SENT 3: # Hypercarbic respiratory failure, multifactorial
# Multi-Nodular Goiter
# Obstructive sleep apnea, severe
 → Predicted: 0, True: 0

Q: What should he do to relieve palpitations and anxiety?
SENT 4: The patient has had severe obstructive sleep apnea diagnosed in
___ for which he has been on various forms of PAP.
 → Predicted: 1, True: 0

Q: What should he do to relieve palpitations and anxiety?
SENT 5: He has
a history of non-compliance with his CPAP.
 → Predicted: 1, True: 0

Q: What should he do to relieve palpitations and anxiety?
SENT 6: The patient lives in
a group home due to his schizophrenia, where he was


