In [7]:
import torch
from transformers import AutoTokenizer, AutoModel


In [8]:
# Load the pre-trained BlueBERT tokenizer and encoder through the Hugging Face Auto classes.
# The first run fetches config.json, vocab.txt and pytorch_model.bin (see the progress bars below).
# `model_name`, `tokenizer` and `model` are notebook globals that later cells rely on.
model_name = "bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/441M [00:00<?, ?B/s]

In [9]:

# Sample conversation used as input for the keyword-based extraction below.
# Only the patient's turns are present (every line starts with "Patient:"); several answers
# are negations ("I don't have ..."), which matters for naive keyword matching.
conversation = """
Patient: Hi, I’ve been feeling unwell for the past few days.
Patient: I’ve had a fever for the past three days, along with a sore throat and body aches.
Patient: I do have a mild cough, and my throat feels dry. But I don’t have any nausea or difficulty breathing.
Patient: No, I haven’t traveled, but a few of my colleagues at work had a cold last week.
Patient: No, I don’t have any chronic illnesses, and I’m not allergic to anything.
Patient: I took a paracetamol yesterday for the fever, and I’ve been drinking warm water and resting.
"""


In [10]:
# Tokenize the conversation into PyTorch tensors.
# truncation=True only takes effect when a max_length is known; this tokenizer reports none
# (hence the "Default to no truncation" warning), so an over-long conversation would be fed
# to the encoder untruncated. Cap the sequence at the encoder's position-embedding limit,
# falling back to BERT's usual 512 if the config does not expose it.
inputs = tokenizer(
    conversation,
    return_tensors="pt",
    truncation=True,
    max_length=getattr(model.config, "max_position_embeddings", 512),
    padding=True,
)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [11]:
# Get model embeddings: run one forward pass over the tokenized conversation.
# torch.no_grad() disables gradient tracking, which is not needed for inference.
with torch.no_grad():
    outputs = model(**inputs)
    # Per-token hidden states of the final encoder layer.
    embeddings = outputs.last_hidden_state  # Shape: (batch_size, seq_length, hidden_size)

In [13]:
# Display the token-level embedding tensor produced by the forward pass.
embeddings

tensor([[[ 0.6700,  0.0529, -0.0106,  ...,  0.0439,  0.3718,  0.1110],
         [-0.8095,  0.0119, -0.5421,  ..., -0.1386,  0.1275, -0.2058],
         [-0.3707,  0.1895,  0.0814,  ..., -0.3273,  0.5411,  0.0334],
         ...,
         [ 0.5727,  0.7360,  0.2596,  ...,  0.3456,  0.1816, -0.1227],
         [-0.0589,  0.0079,  0.2925,  ...,  0.2673,  0.8035, -0.5467],
         [ 0.9751,  0.1525, -0.1767,  ..., -0.2151,  0.5104, -0.0517]]])

In [12]:
# Keyword lexicon: maps each report category to the lowercase phrases that are
# searched for in the conversation. Insertion order determines the report order.
keywords = dict(
    symptoms=[
        "fever",
        "sore throat",
        "body aches",
        "cough",
        "headache",
        "nausea",
        "difficulty breathing",
    ],
    exposure=[
        "contact",
        "sick",
        "colleagues",
        "traveled",
    ],
    medical_history=[
        "diabetes",
        "high blood pressure",
        "asthma",
        "allergies",
        "chronic illness",
    ],
    medications=[
        "paracetamol",
        "ibuprofen",
        "antibiotic",
    ],
)

In [14]:
# Collect, per category, every keyword that occurs in the lower-cased conversation.
# This is a plain substring test, so negated mentions (e.g. "I don't have any nausea")
# are still reported as hits — see the printed output below.
conversation_lower = conversation.lower()
extracted_info = {
    category: [term for term in terms if term in conversation_lower]
    for category, terms in keywords.items()
}

# Report the hits, one line per category.
print("Extracted Medical Information:")
for category, details in extracted_info.items():
    found = ", ".join(details) if details else "None found"
    print(f"{category.capitalize()}: {found}")


Extracted Medical Information:
Symptoms: fever, sore throat, body aches, cough, nausea, difficulty breathing
Exposure: colleagues, traveled
Medical_history: chronic illness
Medications: paracetamol


LangChain-Based Approach

In [1]:
# Create a LangChain project on LangSmith and generate an API key, then store
# LANGCHAIN_API_KEY, OPENAI_API_KEY and LANGCHAIN_PROJECT as Colab secrets.

import os

from google.colab import userdata

# Export each secret to the process environment. Calling userdata.get() on its own
# only returns the value; unless it is stored, nothing downstream can read it.
for secret_name in ("LANGCHAIN_API_KEY", "OPENAI_API_KEY", "LANGCHAIN_PROJECT"):
    os.environ[secret_name] = userdata.get(secret_name)

# Keep the project name as the cell's displayed value (API keys are never echoed).
os.environ["LANGCHAIN_PROJECT"]

'Tutorial1'

In [15]:
# Install the Hugging Face and PyTorch packages into the kernel's environment.
# NOTE(review): the LangChain cells below import `langchain` (ChatOpenAI, message schema),
# which this command does not install — confirm it is already available or add it here.
%pip install transformers torch


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [20]:
import torch
from transformers import AutoTokenizer, AutoModel
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
import os
import json


In [None]:
# Verify that the OpenAI API key is available in the environment (never hardcode it here).
# Re-assigning os.getenv(...) into os.environ does nothing when the key is set and raises
# an opaque TypeError when it is missing, so fail early with an actionable message instead.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment (or load it from "
        "your secret store) before running the cells below."
    )

In [22]:
# Load OpenAI chat model via LangChain; `llm` is the notebook-global client used by
# extract_medical_info below.
# NOTE(review): temperature=0 is presumably chosen to keep the extraction output as
# repeatable as possible — confirm. The client is expected to read OPENAI_API_KEY from
# the environment; verify against the installed LangChain version.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

BlueBERT Embeddings

In [23]:
# Load BlueBERT model and tokenizer for the embedding step of the LangChain pipeline.
# NOTE(review): this is the same checkpoint id loaded in the first section of the notebook;
# the reload rebinds `model_name`, `tokenizer` and `model` — confirm the repetition is
# intended (e.g. so this section can run on its own).
model_name = "bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [24]:
# Function to extract medical information using LangChain
def _extract_json_object(text):
    """Parse the outermost ``{...}`` span of ``text`` as JSON and return the resulting dict."""
    start, end = text.find("{"), text.rfind("}")
    if start == -1 or end < start:
        # Same exception type json.loads would raise, so existing handlers keep working.
        raise json.JSONDecodeError("No JSON object found in model response", text, 0)
    return json.loads(text[start:end + 1])


def extract_medical_info(conversation):
    """Ask the chat model to categorize the medical details mentioned in a conversation.

    Args:
        conversation: Raw conversation text; it is embedded verbatim in the prompt.

    Returns:
        dict: The JSON object returned by the model. The prompt requests the keys
        ``symptoms``, ``exposure``, ``medical_history`` and ``medications``, each
        holding a list; the model's answer is not validated beyond being JSON.

    Raises:
        json.JSONDecodeError: If the response contains no parseable JSON object.
    """
    prompt = f"""
    Extract relevant medical information from the following conversation.
    Categorize the extracted details under:
    - Symptoms
    - Exposure
    - Medical History
    - Medications

    Only return structured JSON format without extra explanations.

    Conversation:
    \"\"\"{conversation}\"\"\"

    Output:
    {{
        "symptoms": [],
        "exposure": [],
        "medical_history": [],
        "medications": []
    }}
    """

    messages = [
        SystemMessage(content="You are an AI assistant specialized in medical text processing."),
        HumanMessage(content=prompt)
    ]

    # Use the Runnable interface; calling the model object directly is deprecated.
    response = llm.invoke(messages)

    # Models sometimes wrap the JSON in Markdown code fences or add a short preamble
    # despite the instruction, so parse only the outermost {...} span.
    return _extract_json_object(response.content)

In [25]:
# Function to generate BlueBERT embeddings
def get_bluebert_embedding(text):
    """Return a mean-pooled BlueBERT embedding for ``text``.

    Args:
        text: String (or list of strings) to embed. Falsy input (``""``, ``None``,
            ``[]``) means there is nothing to embed.

    Returns:
        numpy.ndarray | None: The pooled final-layer hidden state with singleton
        dimensions squeezed out (a 1-D vector for a single string), or ``None``
        when ``text`` is falsy.
    """
    if not text:
        return None  # Nothing to embed for this category

    # truncation=True is ignored unless a max_length is known, and this tokenizer
    # reports none; cap at the encoder's position-embedding limit (512 fallback).
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=getattr(model.config, "max_position_embeddings", 512),
        padding=True,
    )

    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
        # Average over real tokens only: weight each position by the attention mask so
        # padding added for batched input does not dilute the sentence-level embedding.
        # For a single unpadded string this equals the plain mean over the sequence.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

    return pooled.squeeze().numpy()


In [26]:
# Sample conversation fed to the LangChain extraction step below.
# Only the patient's turns are present (every line starts with "Patient:").
conversation = """
Patient: Hi, I’ve been feeling unwell for the past few days.
Patient: I’ve had a fever for the past three days, along with a sore throat and body aches.
Patient: I do have a mild cough, and my throat feels dry. But I don’t have any nausea or difficulty breathing.
Patient: No, I haven’t traveled, but a few of my colleagues at work had a cold last week.
Patient: No, I don’t have any chronic illnesses, and I’m not allergic to anything.
Patient: I took a paracetamol yesterday for the fever, and I’ve been drinking warm water and resting.
"""

In [27]:
# Step 1: have the chat model pull out the categorized medical details
extracted_info = extract_medical_info(conversation)

# Step 2: embed each non-empty category; its terms are joined into one space-separated string
embeddings = {}
for category, terms in extracted_info.items():
    if terms:
        embeddings[category] = get_bluebert_embedding(" ".join(terms))

# Show the structured extraction result
print("\nExtracted Medical Information:")
print(json.dumps(extracted_info, indent=2))

# Show the shape of every embedding that was produced
print("\nGenerated BlueBERT Embeddings:")
for category, embedding in embeddings.items():
    shape_text = embedding.shape if embedding is not None else 'No Data'
    print(f"{category.capitalize()} Embedding Shape: {shape_text}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Extracted Medical Information:
{
  "symptoms": [
    "fever",
    "sore throat",
    "body aches",
    "mild cough",
    "dry throat"
  ],
  "exposure": [
    "colleagues had a cold last week"
  ],
  "medical_history": [
    "no chronic illnesses",
    "not allergic to anything"
  ],
  "medications": [
    "paracetamol"
  ]
}

Generated BlueBERT Embeddings:
Symptoms Embedding Shape: (768,)
Exposure Embedding Shape: (768,)
Medical_history Embedding Shape: (768,)
Medications Embedding Shape: (768,)


In [29]:
# Display the per-category embedding dict built in the previous cell
# (category name -> vector; the shapes printed above are (768,) each).
embeddings

{'symptoms': array([ 2.46953562e-01,  6.29930675e-01,  3.38680893e-01, -2.80606717e-01,
         2.80621648e-02, -1.73819169e-01, -1.83900774e-01,  5.11019051e-01,
        -3.53444785e-01,  8.51969644e-02, -6.56154379e-02,  1.27280161e-01,
         1.13819428e-01,  3.17537844e-01, -1.43370435e-01,  1.13759808e-01,
        -1.34371623e-01,  1.29763171e-01,  2.99416035e-02, -1.20873526e-02,
         6.31417111e-02, -6.81701675e-02, -1.73862457e-01,  8.59656557e-02,
        -1.24896429e-01, -2.70208389e-01,  1.85427114e-01, -2.29582027e-01,
        -2.17154101e-01, -5.69195569e-01,  1.81571186e-01,  1.52499422e-01,
        -3.08648467e-01, -1.32404551e-01, -2.37944424e-01, -3.57673094e-02,
         6.61505312e-02,  1.85382348e-02, -9.46563575e-03,  2.66772598e-01,
         9.20256749e-02, -5.28828323e-01, -4.22043763e-02, -3.99918705e-01,
        -1.99016228e-01, -1.53605416e-01, -3.06149483e-01,  5.92756212e-01,
        -3.77131313e-01, -2.91549653e-01,  1.58069283e-01,  2.11397827e-01,
