In [None]:
import streamlit as st
import os
import xml.etree.ElementTree as ET
import pandas as pd
import nltk
from nltk import word_tokenize, pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# # Download NLTK data
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

# Function to parse all MedQuAD XML files in all folders
@st.cache_data
def load_medquad_all(base_path):
    """Load every MedQuAD question/answer pair found under ``base_path``.

    Each immediate sub-directory of ``base_path`` is scanned for files whose
    name ends with ``.xml``. Every ``QAPair`` element in those files yields one
    row, provided both its ``Question`` and ``Answer`` children contain
    non-blank text (surrounding whitespace is stripped).

    Args:
        base_path: Directory whose immediate sub-directories hold the XML files.

    Returns:
        pandas.DataFrame with the columns ``question`` and ``answer``. The two
        columns are always present, even when no pair was found, so callers
        can index ``df['question']`` without a KeyError on an empty corpus.

    Raises:
        OSError: If ``base_path`` (or a sub-directory) cannot be listed.
        xml.etree.ElementTree.ParseError: If an XML file is malformed.
    """
    records = []
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order (and therefore tie-breaking in downstream argmax) reproducible.
    for folder in sorted(os.listdir(base_path)):
        folder_path = os.path.join(base_path, folder)
        if not os.path.isdir(folder_path):
            continue
        for file_name in sorted(os.listdir(folder_path)):
            if not file_name.endswith('.xml'):
                continue
            root = ET.parse(os.path.join(folder_path, file_name)).getroot()
            for qa in root.findall('.//QAPair'):
                # findtext() gives None for a missing child and '' for a child
                # without text; both collapse to the empty string here.
                question = (qa.findtext('Question') or "").strip()
                answer = (qa.findtext('Answer') or "").strip()
                if question and answer:  # keep only complete Q&A pairs
                    records.append({"question": question, "answer": answer})
    # Pin the schema explicitly: DataFrame([]) would otherwise have no columns.
    return pd.DataFrame(records, columns=["question", "answer"])


# Naive entity extraction: treats every noun token (POS tag starting with "NN") as an entity
def named_entity_recognition(text):
    """Return the noun tokens of ``text``, in their original order.

    The text is split with ``word_tokenize`` and tagged with ``pos_tag``;
    a token is kept when its tag begins with ``NN``.

    Args:
        text: The string to analyse.

    Returns:
        list of the tokens whose part-of-speech tag starts with ``NN``.
    """
    nouns = []
    for token, pos in pos_tag(word_tokenize(text)):
        if pos.startswith('NN'):
            nouns.append(token)
    return nouns

# Retrieve most relevant answer using TF-IDF cosine similarity
def retrieve_answer(user_q, questions, answers, threshold=0.2):
    """Return the stored answer whose question is most similar to ``user_q``.

    The user question and all stored questions are embedded together with
    TF-IDF, and the stored question with the highest cosine similarity to
    ``user_q`` is selected.

    Args:
        user_q: The question typed by the user.
        questions: List of stored question strings.
        answers: List of answers, aligned index-for-index with ``questions``.
        threshold: Similarity the best match must exceed (strictly) for its
            answer to be returned. Defaults to 0.2.

    Returns:
        A ``(answer, similarity)`` tuple. When the best similarity does not
        exceed ``threshold`` the answer is the sentinel string
        ``"No relevant answer found."``. When ``questions`` is empty the
        sentinel is returned with a similarity of ``0.0``.
    """
    # Without any stored question there is nothing to rank; argmax on the
    # resulting empty similarity array would raise ValueError.
    if len(questions) == 0:
        return "No relevant answer found.", 0.0

    # Row 0 is the user query, rows 1.. are the stored questions.
    tfidf = TfidfVectorizer().fit_transform([user_q] + questions)
    scores = cosine_similarity(tfidf[0:1], tfidf[1:]).flatten()
    best_idx = scores.argmax()
    best_score = scores[best_idx]
    if best_score > threshold:
        return answers[best_idx], best_score
    return "No relevant answer found.", best_score

# Streamlit app UI
st.title("🩺 Medical Q&A Chatbot (MedQuAD)")

# Load dataset
# The corpus location can be overridden with the MEDQUAD_PATH environment
# variable; the original developer-machine path is kept as the default so
# existing setups behave exactly as before.
# st.write("Loading dataset...")
df = load_medquad_all(
    os.environ.get(
        "MEDQUAD_PATH",
        r"C:\Users\Arunava Chakraborty\Desktop\ChatBots\Medical Q&A Chatbot\data\MedQuAD",
    )
)
st.write(f"✅ Dataset loaded with {len(df)} question-answer pairs.")
if df.empty:
    # df.iloc[0] below would raise IndexError on an empty dataset; report the
    # problem in the UI and end this script run instead.
    st.error("❌ No question-answer pairs were found in the dataset folder.")
    st.stop()
st.write("Sample data preview:", df.iloc[0]['question'])

# User input
user_input = st.text_input("💬 Ask your medical question here:")

if user_input:
    # Entity recognition (result is currently not displayed; see the
    # commented-out lines below)
    entities = named_entity_recognition(user_input)
    # st.write("🔬 **Identified Entities:**")
    # st.json(entities)

    # Retrieve and display answer
    answer, similarity = retrieve_answer(user_input, df['question'].tolist(), df['answer'].tolist())

    st.write("### 📝 Answer:")
    # retrieve_answer signals "no match" in-band with a sentinel string.
    if answer.strip() and "No relevant answer found." not in answer:
        st.success(answer)
    else:
        st.warning("⚠️ Sorry, no suitable answer found for your query.")

    st.write(f"🔗 **Similarity Score:** {similarity:.2f}")


In [2]:
import spacy
import scispacy
from scispacy.linking import EntityLinker

# Load SciSpacy model and UMLS linker
nlp = spacy.load("en_core_sci_sm")
# Register the UMLS linker exactly once, through the pipeline, and pass
# resolve_abbreviations in the pipe config so it applies to the linker that
# actually runs. Building a separate EntityLinker(...) in addition to
# add_pipe() created a second linker instance that was not the one in the pipeline.
nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})
# Reuse the pipeline's own component for knowledge-base lookups.
linker = nlp.get_pipe("scispacy_linker")

def explain_medical_term_local(term):
    """Look up a definition for each entity the pipeline detects in ``term``.

    ``term`` is run through the global ``nlp`` pipeline. For every detected
    entity, its ``kb_ents`` candidates are visited in order; element ``[0]``
    of a candidate is used as the key into ``linker.kb.cui_to_entity`` and the
    first non-empty ``definition`` found is kept for that entity.

    Args:
        term: Text to analyse.

    Returns:
        list of ``(entity_text, definition)`` tuples; entities for which no
        candidate has a definition are omitted.
    """
    found = []
    for mention in nlp(term).ents:
        # Lazy generator: knowledge-base lookups stop at the first hit.
        candidate_defs = (
            linker.kb.cui_to_entity[candidate[0]].definition
            for candidate in mention._.kb_ents
        )
        first_def = next((text for text in candidate_defs if text), None)
        if first_def is not None:
            found.append((mention.text, first_def))
    return found

# Example usage
term = "hyperbilirubinemia"
result = explain_medical_term_local(term)
# Handle the empty case first, then list one line per (entity, definition) pair.
if not result:
    print("No definition found.")
else:
    for ent, definition in result:
        print(f"🔹 {ent}: {definition}")


OSError: [E050] Can't find model 'en_core_sci_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [3]:
import spacy
# Smoke test: confirm that the "en_core_sci_sm" model can be loaded in this kernel.
# NOTE(review): this rebinds the global `nlp` to a freshly loaded pipeline; any
# pipes an earlier cell added to the previous `nlp` object are not on this one —
# confirm that is intended before running cells that rely on them.
nlp = spacy.load("en_core_sci_sm")
print("✅ Model loaded successfully.")


OSError: [E050] Can't find model 'en_core_sci_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [4]:
from transformers import pipeline


  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


RuntimeError: Failed to import transformers.pipelines because of the following error (look up to see its traceback):
Traceback (most recent call last):
  File "c:\Users\Arunava Chakraborty\AppData\Local\Programs\Python\Python311\Lib\site-packages\tensorflow\python\pywrap_tensorflow.py", line 73, in <module>
    from tensorflow.python._pywrap_tensorflow_internal import *
ImportError: DLL load failed while importing _pywrap_tensorflow_internal: A dynamic link library (DLL) initialization routine failed.


Failed to load the native TensorFlow runtime.
See https://www.tensorflow.org/install/errors for some common causes and solutions.
If you need help, create an issue at https://github.com/tensorflow/tensorflow/issues and include the entire stack trace above this error message.

In [5]:
pip install tensorflow-cpu


Collecting tensorflow-cpu
  Downloading tensorflow_cpu-2.19.0-cp311-cp311-win_amd64.whl.metadata (4.1 kB)
Collecting tensorboard~=2.19.0 (from tensorflow-cpu)
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting ml-dtypes<1.0.0,>=0.5.1 (from tensorflow-cpu)
  Downloading ml_dtypes-0.5.1-cp311-cp311-win_amd64.whl.metadata (22 kB)
Downloading tensorflow_cpu-2.19.0-cp311-cp311-win_amd64.whl (375.9 MB)
   ---------------------------------------- 0.0/375.9 MB ? eta -:--:--
   ---------------------------------------- 3.9/375.9 MB 21.3 MB/s eta 0:00:18
   - -------------------------------------- 10.5/375.9 MB 27.3 MB/s eta 0:00:14
   - -------------------------------------- 11.3/375.9 MB 18.5 MB/s eta 0:00:20
   - -------------------------------------- 12.6/375.9 MB 19.3 MB/s eta 0:00:19
   - -------------------------------------- 12.6/375.9 MB 19.3 MB/s eta 0:00:19
   - -------------------------------------- 12.6/375.9 MB 19.3 MB/s eta 0:00:19
   - -----------------

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.17.0 requires ml-dtypes<0.5.0,>=0.3.1, but you have ml-dtypes 0.5.1 which is incompatible.
tensorflow-intel 2.17.0 requires tensorboard<2.18,>=2.17, but you have tensorboard 2.19.0 which is incompatible.


In [2]:
from transformers import pipeline

# Build a text-to-text generation pipeline backed by the google/flan-t5-large checkpoint.
qa_pipeline = pipeline(task="text2text-generation", model="google/flan-t5-large")

# Terse, comma-separated case description used as the demo input.
query = "46M, knee surgery, Pune, 3-month policy"

# Prefix the query with an instruction asking for a short, direct reply.
prompt = "Answer the following insurance question clearly and shortly:\n\n" + query

# Run generation with sampling disabled and max_length=64, then show the text
# of the first returned candidate.
result = qa_pipeline(prompt, max_length=64, do_sample=False)
print(result[0]["generated_text"])


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


46M, knee surgery, Pune, 3-month policy
