In [8]:
from transformers import AutoTokenizer, AutoModel, AutoConfig
import torch

# Load Fine-Tuned BioBERT Model
model_path = "./fine_tuned_biobert"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)

def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()  # Get embedding vector


In [9]:
import pandas as pd

# Load your dataset (update path as needed)
df = pd.read_excel("MilestoneW9Data.xlsx")

# Preprocess symptoms
df["Symptoms"] = df["Symptoms"].str.lower().str.replace(", ", ",")
df["Symptom_List"] = df["Symptoms"].apply(lambda x: x.split(","))
df["Symptom_String"] = df["Symptom_List"].apply(lambda x: " ".join(x))

# Generate BioBERT embeddings
df["Symptom_Embedding"] = df["Symptom_String"].apply(lambda x: get_bert_embedding(x).flatten())

# Save processed dataset
df.to_csv("BioBERT_Disease_Embeddings.csv", index=False)


In [10]:
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
import ast  # Securely evaluate stored embeddings

config = AutoConfig.from_pretrained("dmis-lab/biobert-base-cased-v1.1", timeout=1000)
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1", config=config)
model = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1", config=config)

# Load the processed dataset with BioBERT embeddings
df = pd.read_csv("BioBERT_Disease_Embeddings.csv")



def clean_embedding(embedding_str):
    """
    Cleans and converts the stored embedding string into a proper NumPy array.
    - Replaces multiple spaces with a single space.
    - Ensures the format is valid before conversion.
    """
    cleaned_str = re.sub(r'\s+', ',', embedding_str.strip())  # Replace spaces with commas
    cleaned_str = cleaned_str.replace("[,", "[").replace(",]", "]")  # Fix edge cases
    return np.array(ast.literal_eval(cleaned_str))  # Convert string to NumPy array

# Apply the cleaning function
df["Symptom_Embedding"] = df["Symptom_Embedding"].apply(lambda x: clean_embedding(x))

# Load BioBERT Model and Tokenizer
#bio_bert_model = "dmis-lab/biobert-base-cased-v1.1"
#tokenizer = AutoTokenizer.from_pretrained(bio_bert_model)
#model = AutoModel.from_pretrained(bio_bert_model)

def get_bert_embedding(text):
    """Generate BioBERT embeddings for user input symptoms."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()  # Get embedding vector

def predict_disease(user_symptoms):
    """
    Predicts the most probable disease(s) based on user symptoms.
    """
    # Generate BioBERT embedding for user input
    user_embedding = get_bert_embedding(user_symptoms).flatten().reshape(1, -1)

    # Compute cosine similarity with stored disease embeddings
    stored_embeddings = np.stack(df["Symptom_Embedding"].values)
    similarity_scores = cosine_similarity(user_embedding, stored_embeddings).flatten()

    # Get top 2 most similar diseases
    top_indices = similarity_scores.argsort()[-2:][::-1]
    predicted_diseases = df.iloc[top_indices][["Disease", "Note"]].reset_index(drop=True)

    return predicted_diseases

# Example: Predict disease for user symptoms
user_input = "fever, chills, muscle pain, headache"
predicted_result = predict_disease(user_input)

# Display Results
print("Predicted Diseases:")
print(predicted_result)

Predicted Diseases:
                Disease                                               Note
0           Hepatitis B  Caused by the hepatitis B virus (HBV); prevent...
1  Mononucleosis (Mono)            Caused by the Epstein-Barr virus (EBV).


In [None]:
import os
import json
import spacy  # NLP for symptom extraction
from IPython.display import display
import requests  # Import this to avoid "NameError"

# Load NLP Model
nlp = spacy.load("en_core_web_sm")

# File to store user data
USER_DATA_FILE = "user_data.json"

# Google API Key (Replace with your actual key)
GOOGLE_API_KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXX"

# Load dataset and BioBERT model
df = pd.read_csv("BioBERT_Disease_Embeddings.csv")

def clean_embedding(embedding_str):
    cleaned_str = re.sub(r'\s+', ',', embedding_str.strip())
    cleaned_str = cleaned_str.replace("[,", "[").replace(",]", "]")
    return np.array(ast.literal_eval(cleaned_str))

df["Symptom_Embedding"] = df["Symptom_Embedding"].apply(lambda x: clean_embedding(x))

# Extract all possible symptoms from dataset
all_symptoms = set()
for symptom_list in df["Symptoms"]:
    for symptom in symptom_list.split(","):
        all_symptoms.add(symptom.strip().lower())

# Load BioBERT
bio_bert_model = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(bio_bert_model)
model = AutoModel.from_pretrained(bio_bert_model)

def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

def extract_symptoms(user_input):
    doc = nlp(user_input)
    extracted_symptoms = [token.text.lower() for token in doc if token.pos_ in ["NOUN", "ADJ"]]
    filtered_symptoms = [symptom for symptom in extracted_symptoms if symptom in all_symptoms]
    return " ".join(filtered_symptoms) if filtered_symptoms else None

def predict_disease(symptoms):
    user_embedding = get_bert_embedding(symptoms).flatten().reshape(1, -1)
    stored_embeddings = np.stack(df["Symptom_Embedding"].values)
    similarity_scores = cosine_similarity(user_embedding, stored_embeddings).flatten()
    top_indices = similarity_scores.argsort()[-2:][::-1]
    predicted_diseases = df.iloc[top_indices][["Disease", "Note"]].reset_index(drop=True)

    response = f"\n🤖 Bot: Based on your symptoms, you **might** have:\n"
    for index, row in predicted_diseases.iterrows():
        response += f"  - **{row['Disease']}**: {row['Note']}\n"
    
    response += "\n⚠️ *Disclaimer: This is not a medical diagnosis. Please consult a doctor or visit the nearest clinic for proper medical advice.*"
    
    return response

def ask_additional_symptoms(initial_symptoms):
    while True:
        user_response = input("\n🤖 Bot: Do you have any other symptoms? (yes/no) ").strip().lower()
        
        if user_response in ["no", "exit"]:
            return initial_symptoms

        elif user_response == "yes":
            extra_input = input("\n👤 You: Please describe any additional symptoms: ").strip().lower()
            extra_symptoms = extract_symptoms(extra_input)

            if extra_symptoms:
                return f"{initial_symptoms} {extra_symptoms}"
            else:
                print("\n🤖 Bot: I couldn't detect any new symptoms. Please try again or say 'no' to proceed.")

        else:
            print("\n🤖 Bot: Please respond with 'yes' or 'no'.")

# Function to check if user exists and load data
def load_user_data():
    if os.path.exists(USER_DATA_FILE):
        with open(USER_DATA_FILE, "r") as file:
            return json.load(file)
    return {}

# Function to register a new user
def register_user():
    print("\n🤖 Bot: Let's get some details before we start.")
    first_name = input("👤 First Name: ").strip()
    last_name = input("👤 Last Name: ").strip()
    gender = input("👤 Gender (Male/Female/Other): ").strip()
    dob = input("📅 Date of Birth (YYYY-MM-DD): ").strip()
    phone = input("📞 Phone Number: ").strip()
    email = input("📧 Email Address: ").strip()
    city = input("🏙️ City: ").strip()
    address = input("🏠 Address: ").strip()

    user_data = {
        "first_name": first_name,
        "last_name": last_name,
        "gender": gender,
        "dob": dob,
        "phone": phone,
        "email": email,
        "city": city,
        "address": address
    }

    with open(USER_DATA_FILE, "w") as file:
        json.dump(user_data, file)

    print(f"\n🤖 Bot: Thank you, {first_name}! Your information has been saved.")
    return user_data

# Function to find nearby clinics using Google Places API
def find_nearest_clinic(city, address):
    query = "clinic near " + address + ", " + city
    url = f"https://maps.googleapis.com/maps/api/place/textsearch/json?query={query}&key={GOOGLE_API_KEY}"
    
    response = requests.get(url)
    data = response.json()

    if "results" in data and len(data["results"]) > 0:
        response_message = "\n🏥 **Nearest Clinics:**\n"
        for clinic in data["results"][:3]:  # Show top 3 clinics
            name = clinic.get("name", "Unknown Clinic")
            addr = clinic.get("formatted_address", "No address available")
            phone = "Phone: Not available"
            
            # Try to get phone number from details API (optional)
            place_id = clinic.get("place_id")
            details_url = f"https://maps.googleapis.com/maps/api/place/details/json?place_id={place_id}&fields=formatted_phone_number&key={GOOGLE_API_KEY}"
            details_response = requests.get(details_url).json()
            if "result" in details_response and "formatted_phone_number" in details_response["result"]:
                phone = f"📞 {details_response['result']['formatted_phone_number']}"
            
            response_message += f"  - **{name}**\n    📍 Address: {addr}\n    {phone}\n"

        return response_message
    
    return "\n🤖 Bot: Sorry, I couldn't find any clinics near your location."

# Chatbot Function
def symptom_checker_chatbot():
    user_data = load_user_data()

    if not user_data:
        user_data = register_user()
    else:
        print(f"\n🤖 Welcome back, {user_data['first_name']}! Let's check your symptoms.")

    while True:
        user_input = input("\n👤 You: Describe your symptoms (or type 'exit' to quit): ").strip().lower()

        if user_input == "exit":
            print("\n🤖 Bot: Thank you for using the Symptom Checker! Stay healthy! 🏥")
            break

        extracted_symptoms = extract_symptoms(user_input)

        if not extracted_symptoms:
            print("\n🤖 Bot: I couldn't detect any medical symptoms. Please try again.")
            continue

        complete_symptoms = ask_additional_symptoms(extracted_symptoms)

        response = predict_disease(complete_symptoms)
        response += find_nearest_clinic(user_data["city"], user_data["address"])
        
        print(response)

# Run the chatbot
if __name__ == "__main__":
    symptom_checker_chatbot()



🤖 Welcome back, Arish! Let's check your symptoms.



👤 You: Describe your symptoms (or type 'exit' to quit):  I am having abdominal pain



🤖 Bot: I couldn't detect any medical symptoms. Please try again.



👤 You: Describe your symptoms (or type 'exit' to quit):  i have fever and cough

🤖 Bot: Do you have any other symptoms? (yes/no)  no



🤖 Bot: Based on your symptoms, you **might** have:
  - **Yellow Fever**: Yellow fever is a mosquito-borne viral disease.
  - **Plague (Yersinia pestis)**: Rare but serious; transmitted by fleas or respiratory droplets.

⚠️ *Disclaimer: This is not a medical diagnosis. Please consult a doctor or visit the nearest clinic for proper medical advice.*
🏥 **Nearest Clinics:**
  - **DOCTOR_K**
    📍 Address: Regeringsgatan 85, 111 39 Stockholm, Sweden
    📞 08-88 92 41
  - **Medical**
    📍 Address: Kommendörsgatan 44, 114 58 Stockholm, Sweden
    📞 08-545 816 70
  - **Diagnostiskt Centrum Hud i Stockholm City**
    📍 Address: Apelbergsgatan 60, 111 37 Stockholm, Sweden
    📞 08-515 115 00

