# Implementing Biomedical NER with Multi RAG system

This notebook enhances the core workflows presented in the paper **"LLMs in Biomedical: A Study on Named Entity Recognition"**. I will adapt the paper's methods for an open-source model, replacing the proprietary GPT-4 with **Gemini3**.

The goal is to perform **Named Entity Recognition (NER)** on biomedical text, identifying entities like diseases, treatments, and tests.


--- 
## 1. Setup and Dependencies


In [1]:
import torch
import numpy as np
import pandas as pd
from dask.multiprocessing import exceptions
from sklearn.externals.array_api_compat.dask.array import astype
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
from datasets import load_dataset
from sklearn.neighbors import NearestNeighbors
from transformers import pipeline
import os
import requests
import  ast
import re
import ast

# Check for GPU availability for faster processing
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


---
## 2. Loading Models and Data

Here, we load the TinyLlama chat model, the BioClinicalBERT model for embeddings.

In [2]:
os.getenv(".env")

In [3]:
# loading the tiny llama model 
pipe = pipeline('text-generation', model="meta-llama/Llama-3.2-3B-Instruct", dtype=torch.bfloat16, device_map="auto")

# loading the BioClinicalBert model for encodings
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

print("Models Loaded Successfully!!")



Loading checkpoint shards: 100%|██████████| 2/2 [00:11<00:00,  5.85s/it]
Device set to use cuda:0


Models Loaded Successfully!!


In [4]:
torch.cuda.is_available()

True

In [5]:
# Convert data to a dataframe
db = pd.read_csv("data-words/train.tsv", sep='\t')
db = db.dropna()
db

Unnamed: 0,Identification,O
0,of,O
1,APC2,O
2,",",O
3,a,O
4,homologue,O
...,...,...
135994,and,O
135995,increased,O
135996,survival,O
135997,.,O


## This function is a part of the RAG module for the DiRAG for Zero-Shot Entity identification.

In [6]:

def get_context(search_term,search_db):
    """This function extract the documents which are required for the context for Zero-Shot DiRAG module"""
    BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

    esearch_url = BASE_URL + "esearch.fcgi"

    esearch_params = {
        "db": search_db,
        "term": search_term,
        "retmax": 5,
        "retmode": "json",
        "usehistory": "y",
        "tool": "MyPubMedAPIScript",
        "email": os.getenv("email"),
        "api_key": os.getenv("ncbi_token")
    }

    response = requests.get(esearch_url,params=esearch_params)
    response.raise_for_status

    result_json = response.json()

    result = result_json.get("esearchresult",{})
    ids = result.get("idlist",[])
    count = result.get("count",0)
    webenv = result.get("webenv")

    final_result = {"search term": search_term,
                    "total results:": count,
                    "id list": ids,
                    "web environment": webenv}


    return final_result["id list"]


## Creating a workflow with a small sample size

# Zero-Shot Entity Identification

## 1. Identification of potential entities

In [7]:
# Creating a database of Punctuations and stopwords to be removed for consideration for predictions
import string
stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
punctuation_list = list(string.punctuation)
punctuation_list.append(stopwords)
print(punctuation_list)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'mor

In [8]:
# Creating a list of prompts

current_words = db["Identification"].iloc[0:500]
current_text = " ".join(current_words)
all_prompts = []
for i in current_words:
    if i not in punctuation_list:
        prompt_first_classification = [
          {
            "role": "system",
            "content": "You are an expert in medical domain. Given the following word, your task is to identify that it could potentially be a medical term for a disease or not, do not add any other text just return the output format specified. The output should be either 'o' if the word is not a potential medical disease and 'e' if the word is a potential entity specifying a disease, the format of the output should be a dictionary: for example {'dancing': 'o'}"
             },
            {
            "role": "user",
            "content": f"data: {i}"
            },
            ]
        prompt = pipe.tokenizer.apply_chat_template(prompt_first_classification, tokenize=False, add_generation_prompt=True)
        all_prompts.append(prompt)

In [9]:
# Passing the prompts to the LLM for initial identification of entities and adding the results to a list (test_entities_from_doc)
pipe.tokenizer.pad_token_id = pipe.tokenizer.eos_token_id
test_entities_from_doc = pipe(
        all_prompts,
        max_new_tokens=1000,
        temperature=0.1,
        batch_size=16,
        return_full_text=False)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

In [10]:
test_entities_from_doc

[[{'generated_text': "assistant\n\n{'of': 'o'}"}],
 [{'generated_text': "assistant\n\n{'APC2': 'e'}"}],
 [{'generated_text': 'assistant\n\n{o}'}],
 [{'generated_text': "assistant\n\n{'homologue': 'e'}"}],
 [{'generated_text': 'assistant\n\n{o}'}],
 [{'generated_text': 'assistant\n\n{o}'}],
 [{'generated_text': "{'adenomatous': 'e'}"}],
 [{'generated_text': "{'polyposis': 'e'}"}],
 [{'generated_text': "assistant\n\n{'coli': 'e'}"}],
 [{'generated_text': "assistant\n\n{'tumour': 'e'}"}],
 [{'generated_text': "assistant\n\n{'suppressor': 'e'}"}],
 [{'generated_text': 'assistant\n\n{o}'}],
 [{'generated_text': "{'adenomatous': 'e'}"}],
 [{'generated_text': "{'polyposis': 'e'}"}],
 [{'generated_text': "assistant\n\n{'coli': 'e'}"}],
 [{'generated_text': "assistant\n\n{'APC': 'e'}"}],
 [{'generated_text': "{'tumour': 'e'}"}],
 [{'generated_text': "{'suppressor': 'e'}"}],
 [{'generated_text': "assistant\n\n{'protein': 'e'}"}],
 [{'generated_text': "assistant\n\n{'controls': 'o'}"}],
 [{'gener

## Creating chunks of the column of size 1500 and adding them to the list of prompts and Calling the pipeline for the entire sample
(This is taking in the entire sample, I have taken a smaller sample size of 500 since working with the entire sample size takes a lot of time. Once the workflow is build, I will be using the entire sample)

In [11]:
# chunks = 1500
# overlap = 200
# all_potential_entities = set()
# all_prompts = []
#
# for i in range(0,len(db["Identification"]),chunks-overlap):
#
#         current_words = db["Identification"][i:i+chunks]
#         current_text = " ".join(current_words)
#
#         prompt_tanl = [
#             {
#               "role": "system",
#               "content": "You are an expert in medical domain. Given the following document, your task is to identify that it could potentially be a medical entity, do not add any other text just return the output format specified. The output should be a list of strings where the strings will be the potential medical entities nothing else, for example: the output format should be: ['entity 1', 'entity 2' ... and so on]"
#                         },
#                 {
#                  "role": "user",
#                  "content": f"data: {current_text}"
#                 },
#                 ]
#
#         prompt = pipe.tokenizer.apply_chat_template(prompt_tanl, tokenize=False, add_generation_prompt=True)
#         all_prompts.append(prompt)
#
# print("All prompts added")
# pipe.tokenizer.pad_token = pipe.tokenizer.eos_token


In [12]:
# entities_from_doc = pipe(all_prompts,
#         max_new_tokens=1024,
#         temperature=0.1,
#         return_full_text=False,
#         batch_size=8)

In [13]:
# This code block will format the results in a dictionary format

cleaned_results = []
json_pattern = re.compile(r"\{.*\}")

# We use zip() to iterate over the input words and the output results at the same time
for original_word, item in zip(current_words, test_entities_from_doc):
    generated_text = item[0]['generated_text']

    # 1. Handle the specific Broken Case: {o}
    # If the LLM just said {o}, we assume the tag is 'o' for the current word
    if "{o}" in generated_text:
        cleaned_results.append({'word': original_word, 'prediction': 'o'})
        continue # Skip to next item

    # 2. Handle the specific Broken Case: {e}
    if "{e}" in generated_text:
        cleaned_results.append({'word': original_word, 'prediction': 'e'})
        continue # Skip to next item

    # 3. Handle Normal Case: {'word': 'e'}
    match = json_pattern.search(generated_text)
    if match:
        json_string = match.group(0)
        try:
            parsed_dict = ast.literal_eval(json_string)
            predicted_label = list(parsed_dict.values())[0]
            cleaned_results.append({'word': original_word, 'prediction': predicted_label})
        except:
            # If it still fails, force the original word with an error tag
            cleaned_results.append({'word': original_word, 'prediction': 'parsing_error'})
    else:
        cleaned_results.append({'word': original_word, 'prediction': 'parsing_error'})

print(cleaned_results)

[{'word': 'of', 'prediction': 'o'}, {'word': 'APC2', 'prediction': 'e'}, {'word': ',', 'prediction': 'o'}, {'word': 'a', 'prediction': 'e'}, {'word': 'homologue', 'prediction': 'o'}, {'word': 'of', 'prediction': 'o'}, {'word': 'the', 'prediction': 'e'}, {'word': 'adenomatous', 'prediction': 'e'}, {'word': 'polyposis', 'prediction': 'e'}, {'word': 'coli', 'prediction': 'e'}, {'word': 'tumour', 'prediction': 'e'}, {'word': 'suppressor', 'prediction': 'o'}, {'word': '.', 'prediction': 'e'}, {'word': 'The', 'prediction': 'e'}, {'word': 'adenomatous', 'prediction': 'e'}, {'word': 'polyposis', 'prediction': 'e'}, {'word': 'coli', 'prediction': 'e'}, {'word': '(', 'prediction': 'e'}, {'word': 'APC', 'prediction': 'e'}, {'word': ')', 'prediction': 'o'}, {'word': 'tumour', 'prediction': 'o'}, {'word': '-', 'prediction': 'e'}, {'word': 'suppressor', 'prediction': 'e'}, {'word': 'protein', 'prediction': 'e'}, {'word': 'controls', 'prediction': 'o'}, {'word': 'the', 'prediction': 'o'}, {'word': 'W

In [14]:
# cleaned_results = np.array(cleaned_results)
db_temp = pd.DataFrame(cleaned_results)
words_for_rag = []
for i in range(0,len(db_temp)):
    if db_temp["prediction"].iloc[i]  == "e" and db_temp["prediction"].iloc[i] not in punctuation_list:
        words_for_rag.append(db_temp["word"].iloc[i])

words_for_rag


['APC2',
 'a',
 'the',
 'adenomatous',
 'polyposis',
 'coli',
 'tumour',
 '.',
 'The',
 'adenomatous',
 'polyposis',
 'coli',
 '(',
 'APC',
 '-',
 'suppressor',
 'protein',
 'by',
 'forming',
 'a',
 'complex',
 'with',
 'glycogen',
 'synthase',
 'kinase',
 '(',
 '3beta',
 ',',
 'axin',
 'conductin',
 '.',
 'Complex',
 'the',
 'In',
 'cells',
 'to',
 'accumulation',
 'betacatenin',
 'activates',
 'Tcf',
 '-',
 'transcription',
 'factor',
 '(',
 'reviewed',
 ']',
 ']',
 ')',
 'Here',
 ',',
 'closely',
 'APC',
 'in',
 'overall',
 'was',
 'functionally',
 'analyzed',
 'and',
 'to',
 'two',
 'SAMP',
 'domains',
 'of',
 'are',
 'required',
 'for',
 'binding',
 '.',
 'Like',
 ',',
 'regulates',
 'Tcf',
 ',',
 'using',
 'transient',
 '-',
 'colon',
 'Human',
 'APC2',
 'to',
 'chromosome',
 'APC',
 'therefore',
 'have',
 'comparable',
 'functions',
 'development',
 'common',
 'mutation',
 'in',
 'English',
 'and',
 'families',
 ',',
 'colorectal',
 'and',
 'expression',
 'of',
 'germline',
 'pr

## 2. Implementing the RAG based entity identification

## This is the final iteration of LLM trying to identify entities in Zero-Shot method.
### 1. This function will first look up the word in Pubmed database and find the Ids of the context of the word.
### 2. It will then validate the id's.
### 3. It will then find the xml document associated with that id and add it to the content to be passed in the final prompt.
### 4. It will then create the final prompt with the word and the context of the word extracted from Pubmed database.
### 5. Finally the prompt will be passed to the LLM and a final prediction will be obtained.


In [15]:
from Bio import Entrez
from urllib.error import HTTPError
Entrez.email = "atripathi2024@fau.edu"

In [23]:

def get_context_xml(test_result_preclassification, max_tokens=500):
    """This function will input the results from the preclassification and it will search for the ids of the words, It will then retreive the context in the form of xml which will be added to an array and an array with context will be returned"""
    context_array = []
    for i in test_result_preclassification:

       if i not in punctuation_list:
         search_term = i
         ids = get_context(i,"pubmed")
         valid_ids = [str(j) for j in ids if j]
         context_xml = ""
         if valid_ids:
             try:
                 list_of_ids = ",".join(valid_ids[:5])
                 handle = Entrez.efetch(db="pubmed", id=list_of_ids, retmode="xml")
                 context_xml = handle.read()
                 handle.close()
                 if isinstance(context_xml, bytes):
                     text = context_xml.decode('utf-8', errors='ignore')
                 else:
                     text = context_xml
                 clean_text = re.sub(r'<[^>]+>', ' ', text)           # strip tags
                 clean_text = re.sub(r'\s+', ' ', clean_text).strip() # normalize spaces
                 tokens = tokenizer.encode(clean_text)
                 if len(tokens) > max_tokens:
                    tokens = tokens[:max_tokens]
                 context_xml = tokenizer.decode(tokens, skip_special_tokens=True)

             except HTTPError as e:
                 print(f"HTTP Error fetching IDs for term '{search_term}': {e}")
                 pass
             except Exception as e:
                 print(f"An error occurred for term '{search_term}': {e}")
                 pass
       else:
          context_xml = "not a word"
       context_array.append(context_xml)



    return  context_array




In [17]:
def create_final_prompts(word_list,context):
    """This function will input the context array and the words array and will return a list of final prompts which will contain the prompt and the context for the final prediction in Zero-Shot prediction."""
    all_prompts = []
    for i in range(0,len(word_list)-1):
        prompt_final_classification = [
               {
                 "role": "system",
                  "content": "You are an expert in medical domain. Given the following word, and a context which has been taken from Pubmed clinical     database, your task is to identify that it could potentially be a medical disease word or not, consider those words which are even closely related, do not add any other text just return the output format specified. The prediction should be either 'o'- which     represents outside clinical term  (for non clinical terms), 'B-CLINICAL'- for the words which are beginning of a clinical disease     term and 'I-CLINICAL' - for the words which  can be a subset of a clinical disease term, and the output format should be  a dictionary: for example {'dancing': 'o'}, where the key is the searched word and value is the prediction"
               },
               {
                  "role": "user",
                  "content": f"word: {word_list[i]}, context: {context[i]}"
               },
               ]
        all_prompts.append(prompt_final_classification)

    return all_prompts

In [24]:
context = get_context_xml(words_for_rag) # Calling the function for context

In [20]:
prompts_final = create_final_prompts(words_for_rag,context) # Calling the function for final prompts

In [21]:
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Load model in 4-bit to fit in 8GB VRAM
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    load_in_4bit=True,  # <--- CRITICAL FIX
    torch_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
final_output = pipe(prompts_final,
                     max_new_tokens=1000,
                     temperature=0.1,
                     return_full_text=False,
                     batch_size=16
                     )
# final_result = output[0]["generated_text"]
# print(test_result)

In [25]:
pipe.tokenizer.pad_token_id = pipe.tokenizer.eos_token_id
final_output = pipe(
        prompts_final,
        max_new_tokens=1000,
        temperature=0.1,
        batch_size=16,
        return_full_text=False)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x0000020A4C4D5A90>>
Traceback (most recent call last):
  File "C:\Users\apoor\projects\NLP\.venv\Lib\site-packages\ipykernel\ipkernel.py", line 796, in _clean_thread_parent_frames
    active_threads = {thread.ident for thread in threading.enumerate()}
  File "C:\Users\apoor\AppData\Local\Programs\Python\Python313\Lib\threading.py", line 1479, in enumerate
    def enumerate():
KeyboardInterrupt: 


KeyboardInterrupt: 

In [26]:
# This code block will format the results in a dictionary format

cleaned_final_results = []
json_pattern = re.compile(r"\{.*\}")

# We use zip() to iterate over the input words and the output results at the same time
for original_word, item in zip(current_words, final_output):
    generated_text = item[0]['generated_text']

    # 1. Handle the specific Broken Case: {o}
    # If the LLM just said {o}, we assume the tag is 'o' for the current word
    if "{o}" in generated_text:
        cleaned_final_results.append({'word': original_word, 'prediction': 'o'})
        continue # Skip to next item

    # 2. Handle the specific Broken Case: {e}
    if "{e}" in generated_text:
        cleaned_final_results.append({'word': original_word, 'prediction': 'e'})
        continue # Skip to next item

    # 3. Handle Normal Case: {'word': 'e'}
    match = json_pattern.search(generated_text)
    if match:
        json_string = match.group(0)
        try:
            parsed_dict = ast.literal_eval(json_string)
            predicted_label = list(parsed_dict.values())[0]
            cleaned_final_results.append({'word': original_word, 'prediction': predicted_label})
        except:
            # If it still fails, force the original word with an error tag
            cleaned_final_results.append({'word': original_word, 'prediction': 'parsing_error'})
    else:
        cleaned_final_results.append({'word': original_word, 'prediction': 'parsing_error'})


NameError: name 'final_output' is not defined

In [59]:
print(cleaned_final_results)

[{nan: 'I-CLINICAL'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'I-CLINICAL'}, {nan: 'B'}, {nan: 'B'}, {nan: 'I-CLINICAL'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'I-CLINICAL'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'I-CLINICAL'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'I-CLINICAL'}, {nan: 'B'}, {nan: 'I-CLINICAL'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'I-CLINICAL'}, {nan: 'I-CLINICAL'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'I-CLINICAL'}, {nan: 'I-CLINICAL'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'B'}, {nan: 'I-CLINICAL'}, {nan: 'B'}, {nan: 'B'}, {nan: 'I-CLINIC

In [20]:
db_predicted =  pd.DataFrame(cleaned_final_results)

Unnamed: 0,Term,Classification
0,,I-CLINICAL
1,,B
2,,B
3,,B
4,,B


In [None]:
db_predicted