In [1]:
!pip install langchain PyMuPDF chromadb fuzzywuzzy transformers pandas torch langchain_community tiktoken langchain-openai langchainhub chromadb langchain

Collecting PyMuPDF
  Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.19-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.3.7-py3-none-any.whl.metadata (2.3 kB)
Collecting langchainhub
  Downloading langchainhub-0.1.21-py3-none-any.whl.metadata (659 bytes)
Collecting async-timeout<5.0.0,>=4.0.0 (from langchain)
  Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.11-py3-none-any.whl.met

In [None]:
#create .env and add OPENAI_API_KEY, LANGCHAIN_API_KEY

import os
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ["LANGCHAIN_API_KEY"] = ""
os.environ["OPENAI_API_KEY"] = ""

In [5]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import PyMuPDFLoader
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Load lab report PDFs
pdf_path = "/kaggle/input/labdata/137253305_bgaawavakhroaw0xoj05dgcq.pdf"  # Replace with your file path
loader = PyMuPDFLoader(pdf_path)
documents = loader.load()

# Create vector store with OpenAI embeddings
vectorstore = Chroma.from_documents(documents, embedding=OpenAIEmbeddings())

# Create retriever chain
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})  # Top 5 relevant chunks
qa = RetrievalQA.from_chain_type(llm=ChatOpenAI(), retriever=retriever)

# Extract lab tests from the report
query = "List all lab tests mentioned in the report."
extracted_lab_tests = qa.run(query)

# Convert extracted text into a list (assuming it's comma-separated)
lab_tests = [test.strip() for test in extracted_lab_tests.split(",")]

print("Extracted Lab Tests:", lab_tests)

  qa = RetrievalQA.from_chain_type(llm=ChatOpenAI(), retriever=retriever)
  extracted_lab_tests = qa.run(query)


Extracted Lab Tests: ['The lab tests mentioned in the report are:\n\n1. Lipid Profile', 'Basic', 'Serum\n2. HbA1c (Glycosylated Hemoglobin)', 'Blood']


In [9]:
import pandas as pd
from fuzzywuzzy import process

# Load ICD-11 data
icd_data = pd.read_csv("/kaggle/input/labdata/ICD_11_Codes.csv")  # Ensure this has 'Code' & 'Title'
icd_data['Title'] = icd_data['Title'].astype(str).str.lower().str.strip()
icd_titles = icd_data['Title'].tolist()

# Function to match lab tests to ICD-11
def find_icd_code(test_name):
    test_name = test_name.lower().strip()

    if not icd_titles:
        return "No ICD-11 match found"

    match = process.extractOne(test_name, icd_titles)

    if match and match[1] > 80:  # Only accept matches > 80% similarity
        best_match = match[0]
        icd_code = icd_data.loc[icd_data['Title'] == best_match, 'Code'].values
        return icd_code[0] if len(icd_code) > 0 else "No ICD-11 match found"
    
    return "No ICD-11 match found"

# Map extracted lab tests to ICD-11 codes
lab_test_to_icd = {test: find_icd_code(test) for test in lab_tests}

print("Lab Test to ICD-11 Mapping:", lab_test_to_icd)

Lab Test to ICD-11 Mapping: {'The lab tests mentioned in the report are:\n\n1. Lipid Profile': '1B10', 'Basic': 'VD03', 'Serum\n2. HbA1c (Glycosylated Hemoglobin)': '4A01.01', 'Blood': 'XA8EC5'}


In [10]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load BioBERT model
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

# Function to generate embeddings
def generate_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :]  # Use the [CLS] token embedding

# Generate embeddings for lab tests and ICD-11 codes
embeddings_dict = {}
for test, icd_code in lab_test_to_icd.items():
    if icd_code != "No ICD-11 match found":
        test_embedding = generate_embedding(test)
        icd_embedding = generate_embedding(icd_code)
        embeddings_dict[test] = {"lab_test_embedding": test_embedding, "icd_embedding": icd_embedding}

# Print results
for test, info in embeddings_dict.items():
    print(f"Lab Test: {test} | Embedding Shape: {info['lab_test_embedding'].shape}")

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Lab Test: The lab tests mentioned in the report are:

1. Lipid Profile | Embedding Shape: torch.Size([1, 768])
Lab Test: Basic | Embedding Shape: torch.Size([1, 768])
Lab Test: Serum
2. HbA1c (Glycosylated Hemoglobin) | Embedding Shape: torch.Size([1, 768])
Lab Test: Blood | Embedding Shape: torch.Size([1, 768])


In [11]:
import pickle
import torch

# Create dictionary to store embeddings before fusion
saved_embeddings = {}

for test, info in embeddings_dict.items():
    saved_embeddings[test] = {
        "lab_test_embedding": info["lab_test_embedding"].detach().cpu(),
        "icd_embedding": info["icd_embedding"].detach().cpu(),
        "icd_code": lab_test_to_icd[test]
    }

# Save embeddings as a .pt file (PyTorch format)
torch.save(saved_embeddings, "lab_icd_embeddings.pt")

# Save embeddings as a .pkl file (Pickle format)
with open("lab_icd_embeddings.pkl", "wb") as f:
    pickle.dump(saved_embeddings, f)

print("Embeddings saved successfully!")

Embeddings saved successfully!
