In [40]:
import os
import pandas as pd
import numpy as np
import torch
import faiss
from typing import List, Dict
from collections import defaultdict
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import requests
import re

In [None]:
# UMLS API key. It is read from the UMLS_API_KEY environment variable so the
# secret does not have to be pasted into (and saved with) the notebook; when
# the variable is not set the key falls back to an empty string.
API_KEY = os.environ.get('UMLS_API_KEY', '')

# Module-level caches used by MedicalNotesRAG.keyword_search:
NO_SYN = []   # criteria for which the UMLS lookup returned no synonyms
SYN = {}      # criterion -> list of synonyms already fetched from UMLS
PARSED = []   # criteria whose synonyms are stored in SYN
class UMLSClient:
    """Client for interacting with UMLS REST API.

    Authentication is done per request: ``get_service_ticket`` posts the API
    key, extracts the ``action="..."`` URL from the response body and posts to
    that URL to obtain a service ticket, which ``search_term`` then sends as
    the ``ticket`` query parameter.
    """

    # Seconds to wait on any single HTTP call; without a timeout a stalled
    # connection would block the caller indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_uri = "https://uts-ws.nlm.nih.gov/rest"
        # Kept for backward compatibility; tickets are requested fresh for
        # every call and are not stored here.
        self.service_ticket = None

    def get_service_ticket(self):
        """Get a service ticket for authentication.

        Returns:
            The service ticket as a non-empty string, or ``None`` when any
            step fails (HTTP error, no ``action`` URL in the first response,
            empty ticket). Failures are printed, never raised.
        """
        tgt_url = "https://utslogin.nlm.nih.gov/cas/v1/api-key"
        params = {'apikey': self.api_key}

        try:
            r = requests.post(tgt_url, data=params, timeout=self.REQUEST_TIMEOUT)
            r.raise_for_status()

            # The ticket-granting URL is embedded as the form's action attribute.
            tgt_match = re.search(r'action="(.*?)"', r.text)
            if not tgt_match:
                return None

            st_params = {'service': 'http://umlsks.nlm.nih.gov'}
            st_response = requests.post(
                tgt_match.group(1), data=st_params, timeout=self.REQUEST_TIMEOUT
            )
            # Without this check the body of an error response would be
            # handed back to the caller as if it were a valid ticket.
            st_response.raise_for_status()
            return st_response.text.strip() or None
        except Exception as e:
            print(f"Error getting service ticket: {e}")
            return None

    def search_term(self, term: str, search_type: str = "exact", sabs: str = "MTH"):
        """
        Search UMLS for a term

        Args:
            term: The term to search for
            search_type: Type of search (exact, words, leftTruncation, rightTruncation, approximate)
            sabs: Source vocabularies (comma-separated)

        Returns:
            List of dictionaries containing CUI ('cui'), name ('name') and
            source vocabulary ('rootSource'). Empty list when authentication
            or the request fails, or when nothing matched.
        """
        # Get fresh service ticket
        ticket = self.get_service_ticket()
        if not ticket:
            return []

        search_url = f"{self.base_uri}/search/current"
        params = {
            'string': term,
            'searchType': search_type,
            'sabs': sabs,
            'ticket': ticket
        }

        try:
            response = requests.get(search_url, params=params, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()

            results = []
            data = response.json()

            if 'result' in data and 'results' in data['result']:
                for result in data['result']['results']:
                    cui = result.get('ui', '')
                    # The search endpoint can signal "no match" with a single
                    # placeholder entry whose ui is "NONE"; it is not a CUI.
                    if cui == 'NONE':
                        continue
                    results.append({
                        'cui': cui,
                        'name': result.get('name', ''),
                        'rootSource': result.get('rootSource', '')
                    })

            return results

        except Exception as e:
            print(f"Error searching for term '{term}': {e}")
            return []
def get_umls_synonyms(term: str, api_key: str, search_type: str = "exact", sabs: str = "MTH") -> dict:
    """
    Get all synonyms for a given term from UMLS

    The term is first searched to obtain CUIs; for every CUI the ``atoms``
    endpoint is then queried (first page only, up to 1000 atoms) and the atom
    names are collected as synonyms.

    Args:
        term: The term to search for
        api_key: UMLS API key
        search_type: Type of search (exact, words, leftTruncation, rightTruncation, approximate)
        sabs: Source vocabularies (comma-separated). Default "MTH" for Metathesaurus

    Returns:
        Dictionary containing:
            - 'original_term': The input term
            - 'cuis': List of CUIs found (unique, in search order)
            - 'synonyms': Set of all unique synonyms across all CUIs
            - 'details': List of dicts with CUI-specific synonym information
              (only for CUIs whose atoms request succeeded)

        Request failures are printed and skipped, never raised.
    """
    request_timeout = 30  # seconds; avoids hanging forever on a stalled call
    client = UMLSClient(api_key)

    # Search for the term to get CUIs (empty list when nothing was found).
    search_results = client.search_term(term, search_type, sabs)

    all_synonyms = set()
    cui_details = []
    cuis = []

    # For each CUI found, get all atoms (synonyms)
    for result in search_results:
        cui = result.get('cui', '')
        # Skip blank ids, the "NONE" no-result placeholder and repeats so no
        # request is wasted on something that is not a (new) concept.
        if not cui or cui == 'NONE' or cui in cuis:
            continue
        cuis.append(cui)

        # Get service ticket for this request
        ticket = client.get_service_ticket()
        if not ticket:
            continue

        # Get atoms (terms/synonyms) for this CUI
        atoms_url = f"{client.base_uri}/content/current/CUI/{cui}/atoms"
        params = {
            'ticket': ticket,
            'sabs': sabs,
            'pageSize': 1000  # Adjust if you need more results
        }

        try:
            response = requests.get(atoms_url, params=params, timeout=request_timeout)
            response.raise_for_status()
            data = response.json()

            cui_synonyms = set()
            for atom in data.get('result', []):
                synonym = atom.get('name', '')
                if synonym:
                    cui_synonyms.add(synonym)
            all_synonyms.update(cui_synonyms)

            cui_details.append({
                'cui': cui,
                'preferred_name': result.get('name', ''),
                'source': result.get('rootSource', ''),
                'synonyms': list(cui_synonyms)
            })

        except Exception as e:
            print(f"Error getting atoms for CUI {cui}: {e}")
            continue

    return {
        'original_term': term,
        'cuis': cuis,
        'synonyms': all_synonyms,
        'details': cui_details
    }

In [None]:
class MedicalNotesRAG:
    def __init__(self, df: pd.DataFrame, chunk_size: int = 400,auth_token = '',name_file = 'rag_file_prompt.csv'):
        """
        Initialize RAG system for medical notes.

        Args:
            df: DataFrame with columns [text, note_id, hadm_id, criterion, question_type, question, answer, not_specified]
            chunk_size: Target number of tokens per chunk
            top_p: Top-p threshold for retrieval (cumulative probability)
        """
        self.df = df
        self.chunk_size = chunk_size
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.file_name = name_file
        # Initialize models
        print("Loading Bio_ClinicalBERT for embeddings...")
        self.embedding_model_name = "emilyalsentzer/Bio_ClinicalBERT"
        self.embedding_tokenizer = AutoTokenizer.from_pretrained(self.embedding_model_name)
        self.embedding_model = AutoModel.from_pretrained(self.embedding_model_name).to(self.device)
        self.embedding_model.eval()
        # Storage
        self.notes_df = None
        self.chunks_df = None
        self.queries_df = None
        self.vector_index = None
        self.chunk_embeddings = None
        self.embedding_dim = 768  # BioBERT dimension

    #Extract unique medical notes
    def extract_notes(self):
        # Group by note_id to get unique notes
        notes_data = []
        for note_id, group in self.df.groupby('note_id'):
            hadm_id = group['hadm_id'].iloc[0]
            text = group['text'].iloc[0]
            notes_data.append({
                'note_id': note_id,
                'hadm_id': hadm_id,
                'text': text
            })

        self.notes_df = pd.DataFrame(notes_data)
        print(f"✓ Extracted {len(self.notes_df)} unique notes")
        print(f"✓ From {self.notes_df['hadm_id'].nunique()} unique patients")

    #Utility for splitting text into chunks
    def _chunk_text(self, text: str, note_id: str, hadm_id: int) -> List[Dict]:
        #Split text into overlapping chunks.
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip()]
    
        chunks = []
        current_chunk = []
        current_tokens = 0
        chunk_id = 0
    
        for sentence in sentences:
            tokens = len(self.embedding_tokenizer.encode(sentence))
    
            if current_tokens + tokens > self.chunk_size and current_chunk:
                chunk_text = '. '.join(current_chunk) + '.'
                chunks.append({
                    'chunk_id': f"{note_id}_chunk_{chunk_id}",
                    'note_id': note_id,
                    'hadm_id': hadm_id,
                    'chunk_text': chunk_text
                })
                chunk_id += 1
                
                # Overlap: keep last sentence
                current_chunk = [current_chunk[-1]] if current_chunk else []
                current_tokens = len(self.embedding_tokenizer.encode(current_chunk[0])) if current_chunk else 0
    
            current_chunk.append(sentence)
            current_tokens += tokens
    
        # Add remaining chunk
        if current_chunk:
            chunk_text = '. '.join(current_chunk) + '.'
            chunks.append({
                'chunk_id': f"{note_id}_chunk_{chunk_id}",
                'note_id': note_id,
                'hadm_id': hadm_id,
                'chunk_text': chunk_text
            })
    
        return chunks
 
    #Create chunks from medical notes
    def create_chunks(self):
        all_chunks = []
        for _, row in self.notes_df.iterrows():
            chunks = self._chunk_text(row['text'], row['note_id'], row['hadm_id'])
            all_chunks.extend(chunks)
        self.chunks_df = pd.DataFrame(all_chunks)
        print(f"✓ Created {len(self.chunks_df)} chunks from {len(self.notes_df)} notes")
        print(f"✓ Average chunks per note: {len(self.chunks_df) / len(self.notes_df):.2f}")

    #Utility to generate vector embeddings
    def _embed_text(self, text: str) -> np.ndarray:
        #Generate embeddings using BioBERT
        
        inputs = self.embedding_tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=512,
            padding=True
        ).to(self.device)

        with torch.no_grad():
            outputs = self.embedding_model(**inputs)
            # Mean pooling
            last_hidden = outputs.last_hidden_state
            mask = inputs['attention_mask'].unsqueeze(-1).float()
            pooled = (last_hidden * mask).sum(dim=1) / mask.sum(dim=1)

        return pooled.cpu().numpy()[0]
    
    #Generate embeddings for chunks.
    def create_embeddings(self):
        embeddings = []
        for idx, row in self.chunks_df.iterrows():
            emb = self._embed_text(row['chunk_text'])
            embeddings.append(emb)
            if (idx + 1) % 50 == 0:
                print(f"  Embedded {idx + 1}/{len(self.chunks_df)} chunks")
        self.chunk_embeddings = np.array(embeddings).astype('float32')
        print(f"✓ Generated {len(self.chunk_embeddings)} vector embeddings")

    #Utility to create dataframe containing queries, hadm_id, question_type, and answers.
    def create_queries_dataframe(self):
        queries_data = []
        for _, row in self.df.iterrows():
            queries_data.append({
                'hadm_id': row['hadm_id'],
                'note_id': row['note_id'],
                'question': row['question'],
                'question_type': row['question_type'],
                'answer': row['answer'],
                'criterion': row.get('criterion', ''),
                'not_specified': row.get('not_specified', False)
            })

        self.queries_df = pd.DataFrame(queries_data)
        print(f"✓ Created queries dataframe with {len(self.queries_df)} questions")
        print(f"✓ Question types: {self.queries_df['question_type'].value_counts().to_dict()}")
        print(f"✓ Questions per patient (avg): {len(self.queries_df) / self.queries_df['hadm_id'].nunique():.2f}")
        return self.queries_df

    # Retrieve the top-k chunks of one patient, ranked by cosine similarity to the query
    def retrieve_with_top_k(self, query, hadm_id, top_k) -> List[Dict]:
        # Get query embedding
        query_emb = self._embed_text(query).reshape(1, -1).astype('float32')

        # Filter chunks to only this patient
        patient_chunk_indices = self.chunks_df[self.chunks_df['hadm_id'] == hadm_id].index.tolist()

        if len(patient_chunk_indices) == 0:
            print(f"No chunks found for patient {hadm_id}")
            return []

        # Get embeddings for this patient's chunks
        patient_embeddings = self.chunk_embeddings[patient_chunk_indices].astype('float32')

        # ---- COSINE SIMILARITY SETUP ----
        faiss.normalize_L2(patient_embeddings)
        faiss.normalize_L2(query_emb)

        temp_index = faiss.IndexFlatIP(self.embedding_dim)
        temp_index.add(patient_embeddings)

        # distances are actually cosine similarities in [-1, 1]
        similarities, indices = temp_index.search(query_emb, len(patient_chunk_indices))

        sim = similarities[0]
        neighbor_local = indices[0]

        # Convert similarities to probabilities using softmax
        exp_similarities = np.exp(sim - np.max(sim))
        probabilities = exp_similarities / np.sum(exp_similarities)

        # Sort by probability (descending) in neighbor-rank space
        sorted_ranks = np.argsort(probabilities)[::-1]
        sorted_probs = probabilities[sorted_ranks]

        # ----- Selection logic -----
        # if self.top == 'p':
        #     cumulative_probs = np.cumsum(sorted_probs)
        #     top_p_mask = cumulative_probs <= self.top_p
        #     if not np.any(top_p_mask):
        #         top_p_mask[0] = True
        #     selected_sorted_positions = np.where(top_p_mask)[0]

        k = min(top_k, len(sorted_ranks))
        selected_sorted_positions = np.arange(k)

        selected_ranks = sorted_ranks[selected_sorted_positions]
        selected_probs = sorted_probs[selected_sorted_positions]

        results = []
        for rank, prob in zip(selected_ranks, selected_probs):
            local_idx = neighbor_local[rank]
            global_idx = patient_chunk_indices[local_idx]

            chunk_data = self.chunks_df.iloc[global_idx].to_dict()

            cosine_sim = float(sim[rank])
            cosine_dist = 1.0 - cosine_sim

            chunk_data['chunk_idx'] = int(global_idx)
            chunk_data['distance'] = cosine_dist
            chunk_data['similarity'] = cosine_sim
            chunk_data['probability'] = float(prob)
            results.append(chunk_data)

        return results

    def keyword_search(self, query,criterion, hadm_id, top_k) -> List[Dict]:
        # Filter to this patient's chunks
        patient_mask = self.chunks_df['hadm_id'] == hadm_id
        patient_chunks = self.chunks_df[patient_mask].copy()

        if patient_chunks.empty:
            print(f"No chunks found for patient {hadm_id}")
            return []

        # Tokenize query into terms
        # terms = re.findall(r"\w+", query.lower())
        # stopwords = {"a", "an", "the", "of", "in", "on", "at", "to", "and", "or"}
        # terms = [t for t in terms if t not in stopwords]
        criteria = criterion.replace('_', ' ').replace('-', ' ')
        terms = [criteria,criteria.lower()]
        if criteria not in NO_SYN and criteria not in PARSED:
            extra_syn = get_umls_synonyms(criteria, API_KEY)
        elif criteria in PARSED:
            extra_syn = {'synonyms': SYN[criteria]}
        else:
            extra_syn = {'synonyms': []}
        if len(extra_syn['synonyms'])>0:
            list_syn = []
            for syn in extra_syn['synonyms']:
                terms.append(syn)
                list_syn.append(syn)
            PARSED.append(criteria)
            SYN[criteria] = list_syn
        else:
            # print(f"No synonyms found for {criteria} from UMLS")
            NO_SYN.append(criteria)
        if not terms:
            # print("No meaningful terms in query; returning empty keyword results.")
            return []

        def score_text(text):
            if not isinstance(text, str):
                text = str(text)
            text_l = text.lower()
            return sum(term in text_l for term in terms)

        # score each chunk (uses 'chunk_text' column)
        patient_chunks['keyword_score'] = patient_chunks['chunk_text'].apply(score_text)

        # keep only chunks with at least one hit
        patient_chunks = patient_chunks[patient_chunks['keyword_score'] > 0]
        if patient_chunks.empty:
            # print(f"No keyword hits for query '{query}' in patient {hadm_id}")
            return []

        # sort by keyword score descending
        patient_chunks = patient_chunks.sort_values(
            by=['keyword_score'],
            ascending=[False]
        )

        # take top_k
        top_k = min(top_k, len(patient_chunks))
        patient_chunks = patient_chunks.head(top_k)

        results = []
        for idx, row in patient_chunks.iterrows():
            item = row.to_dict()
            item['chunk_idx'] = int(idx)
            item['keyword_score'] = int(row['keyword_score'])
            results.append(item)

        return results

    def retrieve_hybrid(self, query, criterion, hadm_id, kw_top_k, sem_top_k, alpha, final_top_k):
        """
        Hybrid retrieval = semantic (FAISS cosine) + keyword search.

        alpha = 1.0 -> pure semantic
        alpha = 0.0 -> pure keyword
        """

        # --- 1. Semantic retrieval (temporarily widen top_k) ---
        semantic_results = self.retrieve_with_top_k(query, hadm_id, sem_top_k)

        # --- 2. Keyword retrieval ---
        keyword_results = self.keyword_search(query,criterion, hadm_id, kw_top_k)

        # Fallbacks if one side is empty
        if not semantic_results and not keyword_results:
            return []
        if not semantic_results:
            return keyword_results[:final_top_k]
        if not keyword_results:
            return semantic_results[:final_top_k]

        # --- 3. Build a map by chunk_idx to fuse scores ---
        hybrid_map = {}  # chunk_idx -> {"sem_score": ..., "kw_score": ...}

        # semantic probability already ~[0,1]
        for r in semantic_results:
            cid = r['chunk_idx']
            entry = hybrid_map.get(cid, {"sem_score": 0.0, "kw_score": 0.0})
            entry["sem_score"] = max(entry["sem_score"], float(r.get("probability", 0.0)))
            hybrid_map[cid] = entry

        # keyword score is integer count, we'll normalize later
        for r in keyword_results:
            cid = r['chunk_idx']
            entry = hybrid_map.get(cid, {"sem_score": 0.0, "kw_score": 0.0})
            entry["kw_score"] = max(entry["kw_score"], float(r.get("keyword_score", 0.0)))
            hybrid_map[cid] = entry

        # --- 4. Normalize keyword scores and compute combined score ---
        max_kw = max((v["kw_score"] for v in hybrid_map.values()), default=0.0)
        if max_kw <= 0:
            max_kw = 1.0  # avoid division by zero

        results = []
        for cid, scores in hybrid_map.items():
            sem_score = scores["sem_score"]                     # [0,1]
            kw_norm = scores["kw_score"] / max_kw               # [0,1]

            combined = alpha * sem_score + (1.0 - alpha) * kw_norm

            row = self.chunks_df.iloc[cid].to_dict()
            row['chunk_idx'] = int(cid)
            row['semantic_score'] = float(sem_score)
            row['keyword_score'] = float(scores["kw_score"])
            row['combined_score'] = float(combined)

            results.append(row)

        # --- 5. Sort by combined_score and return top-N ---
        results.sort(key=lambda x: x['combined_score'], reverse=True)
        return results[:final_top_k]
        
    def create_prompt(self, question: str, question_type: str, context: str) -> str:
        """
        Create prompt based on question type with strict output format rules.

        """
        if question_type.lower() == 'yes':
            rule = "You must answer with ONLY 'yes' or 'no'. No other words or explanations."
            format_instruction = "Answer (yes/no):"
        else:  # numeric
            rule = "You must answer with ONLY a number (with decimals if needed). No units, no other words."
            format_instruction = "Answer (number only):"

        prompt = f"""Medical Context:
            {context}

            Question: {question}

            RULE: {rule}

            {format_instruction}"""

        return prompt

    def save_prompt(self, kw_top_k, sem_top_k, alpha, final_top_k) -> Dict:
        """""
        Saves a .csv file with query, true_answer and prompt created using the patients medical notes - for every patient and every query - 2300 rows
        """

        results = []
        prompts_data = []
        patient_results = defaultdict(lambda: {'correct': [], 'total': 0, 'numeric_errors': []})
        question_results = defaultdict(lambda: {'correct': [], 'total': 0})

        # LIMIT TO FIRST 10 QUERIES
        queries_to_process = self.queries_df.head(2300)
        print(f'Queries to process: {len(queries_to_process)}')
        # Use tqdm with the dataframe iteration
        with tqdm(total=len(queries_to_process), desc="Evaluating queries", unit="query") as pbar:
            for idx, row in queries_to_process.iterrows():
                question = row['question']
                true_answer = str(row['answer']).strip().lower()
                hadm_id = row['hadm_id']
                question_type = row['question_type']
                criterion = row['criterion']

                # Retrieve chunks with top-p filtering (patient-isolated)
                retrieved_chunks = self.retrieve_hybrid(
                    question,
                    criterion,
                    hadm_id,
                    kw_top_k,
                    sem_top_k,
                    alpha,
                    final_top_k)

                if len(retrieved_chunks) == 0:
                    predicted_answer = "no_data"
                    is_correct = False
                    percentage_error = None
                    prompt = "NO DATA - No chunks retrieved"
                else:
                    # Create context from retrieved chunks
                    context = "\n\n".join([chunk['chunk_text'] for chunk in retrieved_chunks])

                    # Create prompt based on question type
                    prompt = self.create_prompt(question, question_type, context)


                # Store prompt data
                prompts_data.append({
                    'query_index': idx,
                    'hadm_id': hadm_id,
                    'question': question,
                    'criterion': criterion,
                    'question_type': question_type,
                    'true_answer': true_answer,
                    'prompt': prompt,
                    'num_chunks_retrieved': len(retrieved_chunks)
                })



                # Update progress bar
                pbar.update(1)

        # Save prompts to CSV
        prompts_df = pd.DataFrame(prompts_data)
        prompts_df.to_csv(self.file_name, index=False)
        print(f"\n✓ Saved all prompts to {self.file_name}")


        return prompts_df

In [49]:
if __name__ == "__main__":
    # Driver: for every (chunk size, final top-k) pair, rebuild the whole RAG
    # pipeline on the annotated dataset and save one prompt file per pair.

    def _banner(title, leading_newline=True):
        # Print a step title framed by two 80-character rules.
        rule = "=" * 80
        print("\n" + rule if leading_newline else rule)
        print(title)
        print(rule)

    print("Loading dataset...")
    df = pd.read_csv('./annotated_apixaban_combined_fixed.csv')

    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print(f"Unique notes: {df['note_id'].nunique()}")
    print(f"Unique patients: {df['hadm_id'].nunique()}")

    # Each entry is [chunk size in tokens, number of chunks kept per prompt].
    iter_list = [[50,15],[100,8],[150,5],[200,4],[300,2],[400,2],[500,1]]
    # Retrieval settings shared by every configuration.
    kw_top_k = 15
    sem_top_k = 15
    alpha = 0.3

    for chunk,final_top_k in iter_list:
        # One output file and one freshly initialised RAG system per pair.
        f_name = f'fixed_RAG_prompt_save_c_{chunk}_k_{final_top_k}.csv'
        print(f_name)
        rag_system = MedicalNotesRAG(df, chunk_size=chunk, name_file=f_name)

        _banner("STEP 1: EXTRACT UNIQUE MEDICAL NOTES", leading_newline=False)
        notes = rag_system.extract_notes()

        _banner("STEP 2: CREATING CHUNKS FROM NOTES")
        chunks = rag_system.create_chunks()

        _banner("STEP 3: CREATING EMBEDDINGS FOR CHUNKS")
        vector_index = rag_system.create_embeddings()

        _banner("STEP 4: QUERYING PATIENT QUESTIONS AGAINST NOTE CHUNKS")
        queries_dataframe = rag_system.create_queries_dataframe()

        prompt = rag_system.save_prompt(
            kw_top_k=kw_top_k,
            sem_top_k=sem_top_k,
            alpha=alpha,
            final_top_k=final_top_k,
        )

Loading dataset...
Dataset shape: (2300, 8)
Columns: ['text', 'note_id', 'hadm_id', 'criterion', 'question_type', 'question', 'answer', 'not_specified']
Unique notes: 100
Unique patients: 100
fixed_RAG_prompt_save_c_50_k_15.csv
Loading Bio_ClinicalBERT for embeddings...
STEP 1: EXTRACT UNIQUE MEDICAL NOTES
✓ Extracted 100 unique notes
✓ From 100 unique patients

STEP 2: CREATING CHUNKS FROM NOTES
✓ Created 13829 chunks from 100 notes
✓ Average chunks per note: 138.29

STEP 3: CREATING EMBEDDINGS FOR CHUNKS
  Embedded 50/13829 chunks
  Embedded 100/13829 chunks
  Embedded 150/13829 chunks
  Embedded 200/13829 chunks
  Embedded 250/13829 chunks
  Embedded 300/13829 chunks
  Embedded 350/13829 chunks
  Embedded 400/13829 chunks
  Embedded 450/13829 chunks
  Embedded 500/13829 chunks
  Embedded 550/13829 chunks
  Embedded 600/13829 chunks
  Embedded 650/13829 chunks
  Embedded 700/13829 chunks
  Embedded 750/13829 chunks
  Embedded 800/13829 chunks
  Embedded 850/13829 chunks
  Embedded 90

Evaluating queries: 100%|██████████| 2300/2300 [01:45<00:00, 21.71query/s]



✓ Saved all prompts to fixed_RAG_prompt_save_c_50_k_15.csv
fixed_RAG_prompt_save_c_100_k_8.csv
Loading Bio_ClinicalBERT for embeddings...
STEP 1: EXTRACT UNIQUE MEDICAL NOTES
✓ Extracted 100 unique notes
✓ From 100 unique patients

STEP 2: CREATING CHUNKS FROM NOTES
✓ Created 6774 chunks from 100 notes
✓ Average chunks per note: 67.74

STEP 3: CREATING EMBEDDINGS FOR CHUNKS
  Embedded 50/6774 chunks
  Embedded 100/6774 chunks
  Embedded 150/6774 chunks
  Embedded 200/6774 chunks
  Embedded 250/6774 chunks
  Embedded 300/6774 chunks
  Embedded 350/6774 chunks
  Embedded 400/6774 chunks
  Embedded 450/6774 chunks
  Embedded 500/6774 chunks
  Embedded 550/6774 chunks
  Embedded 600/6774 chunks
  Embedded 650/6774 chunks
  Embedded 700/6774 chunks
  Embedded 750/6774 chunks
  Embedded 800/6774 chunks
  Embedded 850/6774 chunks
  Embedded 900/6774 chunks
  Embedded 950/6774 chunks
  Embedded 1000/6774 chunks
  Embedded 1050/6774 chunks
  Embedded 1100/6774 chunks
  Embedded 1150/6774 chunk

Evaluating queries: 100%|██████████| 2300/2300 [01:39<00:00, 23.23query/s]



✓ Saved all prompts to fixed_RAG_prompt_save_c_100_k_8.csv
fixed_RAG_prompt_save_c_150_k_5.csv
Loading Bio_ClinicalBERT for embeddings...
STEP 1: EXTRACT UNIQUE MEDICAL NOTES
✓ Extracted 100 unique notes
✓ From 100 unique patients

STEP 2: CREATING CHUNKS FROM NOTES
✓ Created 4226 chunks from 100 notes
✓ Average chunks per note: 42.26

STEP 3: CREATING EMBEDDINGS FOR CHUNKS
  Embedded 50/4226 chunks
  Embedded 100/4226 chunks
  Embedded 150/4226 chunks
  Embedded 200/4226 chunks
  Embedded 250/4226 chunks
  Embedded 300/4226 chunks
  Embedded 350/4226 chunks
  Embedded 400/4226 chunks
  Embedded 450/4226 chunks
  Embedded 500/4226 chunks
  Embedded 550/4226 chunks
  Embedded 600/4226 chunks
  Embedded 650/4226 chunks
  Embedded 700/4226 chunks
  Embedded 750/4226 chunks
  Embedded 800/4226 chunks
  Embedded 850/4226 chunks
  Embedded 900/4226 chunks
  Embedded 950/4226 chunks
  Embedded 1000/4226 chunks
  Embedded 1050/4226 chunks
  Embedded 1100/4226 chunks
  Embedded 1150/4226 chunk

Evaluating queries: 100%|██████████| 2300/2300 [01:52<00:00, 20.53query/s]



✓ Saved all prompts to fixed_RAG_prompt_save_c_150_k_5.csv
fixed_RAG_prompt_save_c_200_k_4.csv
Loading Bio_ClinicalBERT for embeddings...
STEP 1: EXTRACT UNIQUE MEDICAL NOTES
✓ Extracted 100 unique notes
✓ From 100 unique patients

STEP 2: CREATING CHUNKS FROM NOTES
✓ Created 2976 chunks from 100 notes
✓ Average chunks per note: 29.76

STEP 3: CREATING EMBEDDINGS FOR CHUNKS
  Embedded 50/2976 chunks
  Embedded 100/2976 chunks
  Embedded 150/2976 chunks
  Embedded 200/2976 chunks
  Embedded 250/2976 chunks
  Embedded 300/2976 chunks
  Embedded 350/2976 chunks
  Embedded 400/2976 chunks
  Embedded 450/2976 chunks
  Embedded 500/2976 chunks
  Embedded 550/2976 chunks
  Embedded 600/2976 chunks
  Embedded 650/2976 chunks
  Embedded 700/2976 chunks
  Embedded 750/2976 chunks
  Embedded 800/2976 chunks
  Embedded 850/2976 chunks
  Embedded 900/2976 chunks
  Embedded 950/2976 chunks
  Embedded 1000/2976 chunks
  Embedded 1050/2976 chunks
  Embedded 1100/2976 chunks
  Embedded 1150/2976 chunk

Evaluating queries: 100%|██████████| 2300/2300 [01:27<00:00, 26.31query/s]



✓ Saved all prompts to fixed_RAG_prompt_save_c_200_k_4.csv
fixed_RAG_prompt_save_c_300_k_2.csv
Loading Bio_ClinicalBERT for embeddings...
STEP 1: EXTRACT UNIQUE MEDICAL NOTES
✓ Extracted 100 unique notes
✓ From 100 unique patients

STEP 2: CREATING CHUNKS FROM NOTES
✓ Created 1850 chunks from 100 notes
✓ Average chunks per note: 18.50

STEP 3: CREATING EMBEDDINGS FOR CHUNKS
  Embedded 50/1850 chunks
  Embedded 100/1850 chunks
  Embedded 150/1850 chunks
  Embedded 200/1850 chunks
  Embedded 250/1850 chunks
  Embedded 300/1850 chunks
  Embedded 350/1850 chunks
  Embedded 400/1850 chunks
  Embedded 450/1850 chunks
  Embedded 500/1850 chunks
  Embedded 550/1850 chunks
  Embedded 600/1850 chunks
  Embedded 650/1850 chunks
  Embedded 700/1850 chunks
  Embedded 750/1850 chunks
  Embedded 800/1850 chunks
  Embedded 850/1850 chunks
  Embedded 900/1850 chunks
  Embedded 950/1850 chunks
  Embedded 1000/1850 chunks
  Embedded 1050/1850 chunks
  Embedded 1100/1850 chunks
  Embedded 1150/1850 chunk

Evaluating queries: 100%|██████████| 2300/2300 [01:23<00:00, 27.40query/s]



✓ Saved all prompts to fixed_RAG_prompt_save_c_300_k_2.csv
fixed_RAG_prompt_save_c_400_k_2.csv
Loading Bio_ClinicalBERT for embeddings...
STEP 1: EXTRACT UNIQUE MEDICAL NOTES
✓ Extracted 100 unique notes
✓ From 100 unique patients

STEP 2: CREATING CHUNKS FROM NOTES
✓ Created 1346 chunks from 100 notes
✓ Average chunks per note: 13.46

STEP 3: CREATING EMBEDDINGS FOR CHUNKS
  Embedded 50/1346 chunks
  Embedded 100/1346 chunks
  Embedded 150/1346 chunks
  Embedded 200/1346 chunks
  Embedded 250/1346 chunks
  Embedded 300/1346 chunks
  Embedded 350/1346 chunks
  Embedded 400/1346 chunks
  Embedded 450/1346 chunks
  Embedded 500/1346 chunks
  Embedded 550/1346 chunks
  Embedded 600/1346 chunks
  Embedded 650/1346 chunks
  Embedded 700/1346 chunks
  Embedded 750/1346 chunks
  Embedded 800/1346 chunks
  Embedded 850/1346 chunks
  Embedded 900/1346 chunks
  Embedded 950/1346 chunks
  Embedded 1000/1346 chunks
  Embedded 1050/1346 chunks
  Embedded 1100/1346 chunks
  Embedded 1150/1346 chunk

Evaluating queries: 100%|██████████| 2300/2300 [01:38<00:00, 23.25query/s]



✓ Saved all prompts to fixed_RAG_prompt_save_c_400_k_2.csv
fixed_RAG_prompt_save_c_500_k_1.csv
Loading Bio_ClinicalBERT for embeddings...
STEP 1: EXTRACT UNIQUE MEDICAL NOTES
✓ Extracted 100 unique notes
✓ From 100 unique patients

STEP 2: CREATING CHUNKS FROM NOTES
✓ Created 1048 chunks from 100 notes
✓ Average chunks per note: 10.48

STEP 3: CREATING EMBEDDINGS FOR CHUNKS
  Embedded 50/1048 chunks
  Embedded 100/1048 chunks
  Embedded 150/1048 chunks
  Embedded 200/1048 chunks
  Embedded 250/1048 chunks
  Embedded 300/1048 chunks
  Embedded 350/1048 chunks
  Embedded 400/1048 chunks
  Embedded 450/1048 chunks
  Embedded 500/1048 chunks
  Embedded 550/1048 chunks
  Embedded 600/1048 chunks
  Embedded 650/1048 chunks
  Embedded 700/1048 chunks
  Embedded 750/1048 chunks
  Embedded 800/1048 chunks
  Embedded 850/1048 chunks
  Embedded 900/1048 chunks
  Embedded 950/1048 chunks
  Embedded 1000/1048 chunks
✓ Generated 1048 vector embeddings

STEP 4: QUERYING PATIENT QUESTIONS AGAINST NOT

Evaluating queries: 100%|██████████| 2300/2300 [01:23<00:00, 27.66query/s]



✓ Saved all prompts to fixed_RAG_prompt_save_c_500_k_1.csv
