In [1]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load BioBERT Model and Tokenizer
bio_bert_model = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(bio_bert_model)
model = AutoModel.from_pretrained(bio_bert_model)

def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()  # Get embedding vector


In [4]:
import pandas as pd

# Load your dataset (update path as needed)
df = pd.read_excel("MilestoneW9Data.xlsx")

# Preprocess symptoms
df["Symptoms"] = df["Symptoms"].str.lower().str.replace(", ", ",")
df["Symptom_List"] = df["Symptoms"].apply(lambda x: x.split(","))
df["Symptom_String"] = df["Symptom_List"].apply(lambda x: " ".join(x))

# Generate BioBERT embeddings
df["Symptom_Embedding"] = df["Symptom_String"].apply(lambda x: get_bert_embedding(x).flatten())

# Save processed dataset
df.to_csv("BioBERT_Disease_Embeddings.csv", index=False)


In [5]:
import pandas as pd
import numpy as np
import torch
import re
from transformers import AutoTokenizer, AutoModel, AutoConfig
from sklearn.metrics.pairwise import cosine_similarity
import ast  # Securely evaluate stored embeddings

config = AutoConfig.from_pretrained("dmis-lab/biobert-base-cased-v1.1", timeout=1000)
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1", config=config)
model = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1", config=config)

# Load the processed dataset with BioBERT embeddings
df = pd.read_csv("BioBERT_Disease_Embeddings.csv")



def clean_embedding(embedding_str):
    """
    Cleans and converts the stored embedding string into a proper NumPy array.
    - Replaces multiple spaces with a single space.
    - Ensures the format is valid before conversion.
    """
    cleaned_str = re.sub(r'\s+', ',', embedding_str.strip())  # Replace spaces with commas
    cleaned_str = cleaned_str.replace("[,", "[").replace(",]", "]")  # Fix edge cases
    return np.array(ast.literal_eval(cleaned_str))  # Convert string to NumPy array

# Apply the cleaning function
df["Symptom_Embedding"] = df["Symptom_Embedding"].apply(lambda x: clean_embedding(x))

# Load BioBERT Model and Tokenizer
bio_bert_model = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(bio_bert_model)
model = AutoModel.from_pretrained(bio_bert_model)

def get_bert_embedding(text):
    """Generate BioBERT embeddings for user input symptoms."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()  # Get embedding vector

def predict_disease(user_symptoms):
    """
    Predicts the most probable disease(s) based on user symptoms.
    """
    # Generate BioBERT embedding for user input
    user_embedding = get_bert_embedding(user_symptoms).flatten().reshape(1, -1)

    # Compute cosine similarity with stored disease embeddings
    stored_embeddings = np.stack(df["Symptom_Embedding"].values)
    similarity_scores = cosine_similarity(user_embedding, stored_embeddings).flatten()

    # Get top 2 most similar diseases
    top_indices = similarity_scores.argsort()[-2:][::-1]
    predicted_diseases = df.iloc[top_indices][["Disease", "Note"]].reset_index(drop=True)

    return predicted_diseases

# Example: Predict disease for user symptoms
user_input = "fever, chills, muscle pain, headache"
predicted_result = predict_disease(user_input)

# Display Results
print("Predicted Diseases:")
print(predicted_result)


Predicted Diseases:
        Disease                                               Note
0       Malaria  Caused by Plasmodium parasites transmitted thr...
1  Yellow Fever    Yellow fever is a mosquito-borne viral disease.


In [12]:
pip install jupyterlab-git


Collecting jupyterlab-git
  Downloading jupyterlab_git-0.51.0-py3-none-any.whl.metadata (32 kB)
Collecting jupyter-server<3,>=2.0.1 (from jupyterlab-git)
  Downloading jupyter_server-2.14.2-py3-none-any.whl.metadata (8.4 kB)
Collecting nbdime~=4.0.1 (from jupyterlab-git)
  Downloading nbdime-4.0.2-py3-none-any.whl.metadata (9.5 kB)
Collecting pexpect (from jupyterlab-git)
  Downloading pexpect-4.9.0-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting jupyter-client>=7.4.4 (from jupyter-server<3,>=2.0.1->jupyterlab-git)
  Downloading jupyter_client-8.6.3-py3-none-any.whl.metadata (8.3 kB)
Collecting jupyter-core!=5.0.*,>=4.12 (from jupyter-server<3,>=2.0.1->jupyterlab-git)
  Downloading jupyter_core-5.7.2-py3-none-any.whl.metadata (3.4 kB)
Collecting jupyter-events>=0.9.0 (from jupyter-server<3,>=2.0.1->jupyterlab-git)
  Downloading jupyter_events-0.10.0-py3-none-any.whl.metadata (5.9 kB)
Collecting jupyter-server-terminals>=0.4.4 (from jupyter-server<3,>=2.0.1->jupyterlab-git)
  Download

  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jupyterlab 3.3.4 requires jupyter-server~=1.4, but you have jupyter-server 2.14.2 which is incompatible.
jupyterlab-server 2.13.0 requires jupyter-server<2,>=1.8, but you have jupyter-server 2.14.2 which is incompatible.
notebook-shim 0.1.0 requires jupyter-server~=1.8, but you have jupyter-server 2.14.2 which is incompatible.


In [3]:
git add origin https://github.com/arishbukhari1/Intelligent-Medical-Chatbot.git


SyntaxError: invalid syntax (555616609.py, line 1)