In [None]:
import pandas as pd
from datasets import load_dataset
import nltk
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages used below for tokenizing, stopword removal and lemmatizing
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource)

# Pull the MedQuad Q&A dataset from the Hugging Face hub and keep its 'train' split
ds = load_dataset("keivalya/MedQuad-MedicalQnADataset")
data = ds['train']

# Shared preprocessing helpers used by preprocess_text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize a piece of text for downstream analysis.

    The text is lowercased, split into word tokens, stripped of English
    stopwords, lemmatized, and re-joined with single spaces.

    Args:
        text: Value to clean. Anything that is not a non-blank string yields "".

    Returns:
        str: The cleaned, space-joined tokens, or "" for missing/blank input.
    """
    # Guard clause: non-strings and whitespace-only strings carry no content
    if not (isinstance(text, str) and text.strip()):
        return ""

    kept_tokens = (
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token not in stop_words
    )
    return ' '.join(kept_tokens)

# Process dataset (using correct column names)
def _clean_field(entry, key):
    """Return the stripped string stored under `key`, or "" when it is missing or not a string."""
    value = entry.get(key)
    # A key that is present but holds None would make `.get(key, "").strip()` raise
    return value.strip() if isinstance(value, str) else ""


processed_data = []
for entry in data:
    # Extract columns
    qtype = _clean_field(entry, 'qtype')  # Keep as is
    question_text = _clean_field(entry, 'Question')
    answer_text = _clean_field(entry, 'Answer')

    # Skip rows with neither a question nor an answer. The dict always gets a
    # 'qtype' key, so testing the dict's truthiness would never skip anything.
    if not (question_text or answer_text):
        continue

    processed_entry = {}

    # Preprocess question
    if question_text:
        cleaned_question = preprocess_text(question_text)
        processed_entry['Question'] = cleaned_question
        processed_entry['Question_Sentences'] = ' '.join(sent_tokenize(question_text))  # Keep original structure
        processed_entry['Question_Words'] = ' '.join(word_tokenize(cleaned_question))

    # Preprocess answer
    if answer_text:
        cleaned_answer = preprocess_text(answer_text)
        processed_entry['Answer'] = cleaned_answer
        processed_entry['Answer_Sentences'] = ' '.join(sent_tokenize(answer_text))
        processed_entry['Answer_Words'] = ' '.join(word_tokenize(cleaned_answer))

    processed_entry['qtype'] = qtype  # Keep original qtype
    processed_data.append(processed_entry)

# Convert to Pandas DataFrame (keys missing from a row become NaN)
df = pd.DataFrame(processed_data)

# Save DataFrame to CSV
df.to_csv("processed_medquad.csv", index=False)

print("CSV file 'processed_medquad.csv' saved successfully!")


In [None]:
import nltk
# Request the 'punkt_tab' NLTK resource (the first cell requests the same resource)
nltk.download('punkt_tab')

In [None]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from gensim.models import Word2Vec
from sklearn.manifold import TSNE

# Download necessary NLTK resources
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# Load the processed dataset from the same relative path the preprocessing cell writes to
file_path = "processed_medquad.csv"
df = pd.read_csv(file_path)

# Ensure no NaN values in text columns (reassign instead of mutating in place)
df = df.fillna("")

# Combine 'Question_Words' and 'Answer_Words' for analysis
all_words = df["Question_Words"].tolist() + df["Answer_Words"].tolist()

# Tokenize words for POS tagging
tokenized_words = [word_tokenize(sentence) for sentence in all_words if isinstance(sentence, str)]
flat_words = [word for sublist in tokenized_words for word in sublist]  # Flatten list

# Perform POS tagging
pos_tags = pos_tag(flat_words)

# Show a preview only: printing every (word, tag) pair floods the notebook output
POS_PREVIEW_SIZE = 50
print("The POS Tags are as follows")
print(pos_tags[:POS_PREVIEW_SIZE])
print(f"... {len(pos_tags)} tagged tokens in total")

# Count POS distribution
pos_counts = Counter(tag for word, tag in pos_tags)

# Convert POS counts to a DataFrame for visualization
pos_df = pd.DataFrame(pos_counts.items(), columns=["POS", "Count"])
pos_df = pos_df.sort_values(by="Count", ascending=False)

# Plot POS distribution
plt.figure(figsize=(12, 6))
sns.barplot(x="POS", y="Count", data=pos_df, palette="viridis")
plt.xlabel("Part of Speech (POS)")
plt.ylabel("Frequency")
plt.title("POS Distribution in Processed MedQuad Dataset")
plt.xticks(rotation=45)
plt.show()

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_words, vector_size=100, window=5, min_count=5, workers=4)

# Extract word vectors for visualization
words = list(word2vec_model.wv.index_to_key)
word_vectors = word2vec_model.wv[words]

# Reduce dimensionality using t-SNE.
# scikit-learn requires perplexity < number of samples, so cap it for small vocabularies.
tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(words) - 1))
word_vectors_2d = tsne.fit_transform(word_vectors)

# Convert to DataFrame for plotting
embedding_df = pd.DataFrame(word_vectors_2d, columns=["x", "y"])
embedding_df["word"] = words

# Plot t-SNE visualization
plt.figure(figsize=(12, 8))
sns.scatterplot(x="x", y="y", data=embedding_df, alpha=0.7)

# Annotate up to 30 random words (sample() raises if asked for more rows than exist)
n_labels = min(30, len(embedding_df))
for i, row in embedding_df.sample(n_labels, random_state=42).iterrows():
    plt.text(row["x"], row["y"], row["word"], fontsize=9)

plt.title("t-SNE Visualization of Word Embeddings (Word2Vec)")
plt.show()


In [None]:
from google import genai
from evaluate import load
from deepeval.test_run import TestRun
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from deepeval import evaluate
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.metrics import f1_score

# Initialize Gemini client.
# No key is written into the notebook: the client is expected to read the API key
# from the environment (GOOGLE_API_KEY / GEMINI_API_KEY) — set it before running this cell.
client = genai.Client()

# Tokenizer data needed by nltk.word_tokenize in the evaluation below
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)
def generate_medical_tot_prompt(question):
    """Generate a Tree-of-Thought (ToT) enhanced prompt for multi-path reasoning in medical queries.

    Args:
        question: The user's medical question; inserted verbatim after "**Question:**".

    Returns:
        str: The full prompt text, ending with a "**Final Answer:**" marker.
    """
    # Step 2 previously read "2  **Outline:**" (missing period), breaking the numbered list.
    prompt = f"""
You are an expert AI medical assistant trained on verified medical knowledge.
You have access to a **structured medical dataset (MedQuad)** containing information about **diseases, symptoms, treatments, medications, and medical conditions**.

**Question:** {question}

Use a Tree-of-Thought (ToT) approach to explore different reasoning paths before selecting the best response:

1. **Branching:** Consider multiple possible explanations or answers related to the question (e.g., differential diagnoses, alternative treatments, symptom variations).
2. **Outline:** Outline a tree of thought where you consider different possible approaches or branches
3. **Evaluation:** Assess the validity, reliability, and potential risks of each possible answer.
4. **Selection:** Choose the best explanation or recommendation based on evidence, medical guidelines, and logical reasoning.
5. **Refinement:** Verify the final answer by cross-referencing medical facts and ensuring clarity.
6. **Synthesize** the findings into a well-structured, medically accurate response.

**Final Answer:**
"""
    return prompt



def query_gemini(prompt, model="gemini-2.0-flash"):
    """Send `prompt` to a Gemini model and return the reply text.

    Args:
        prompt: Prompt text; it is sent as the single element of `contents`.
        model: Gemini model name passed to `generate_content`.

    Returns:
        str: The response text, or "" when the response carries no text, so
        callers can safely call string methods such as `.strip()` on it.
    """
    response = client.models.generate_content(
        model=model,
        contents=[prompt],
    )
    # `response.text` may be None when the reply has no text parts; normalize to "".
    return response.text or ""

# Example usage

print("Using TOT technique")
user_query = "what are the symptoms for Parasites - Cysticercosis ?"
# NOTE(review): the reference text below describes LCMV while the query asks about
# cysticercosis — confirm that this is the intended reference answer for this question.
# The reference is written as adjacent literals inside parentheses because a
# single-quoted literal cannot span two physical lines; the line break is kept as "\n".
expected_answer = (
    "LCMV is most commonly recognized as causing neurological disease, as its name implies, though infection without symptoms or mild febrile illnesses are more common clinical manifestations. For infected persons who do become ill, onset of symptoms usually occurs 8-13 days after exposure to the virus as part of a biphasic febrile illness. This initial phase, which may last as long as a week, typically begins with any or all of the following symptoms: fever, malaise, lack of appetite, muscle aches, headache, nausea, and vomiting. Other symptoms appearing less frequently include sore throat, cough, joint pain, chest pain, testicular pain, and parotid (salivary gland) pain. Following a few days of recovery, a second phase of illness may occur. Symptoms may consist of meningitis (fever, headache, stiff neck, etc. ), encephalitis (drowsiness, confusion, sensory disturbances, and/or motor abnormalities, such as paralysis), or meningoencephalitis (inflammation of both the brain and meninges). LCMV has also been known to cause acute hydrocephalus (increased fluid on the brain), which often requires surgical shunting to relieve increased intracranial pressure. In rare instances, infection results in myelitis (inflammation of the spinal cord) and presents with symptoms such as muscle weakness, paralysis, or changes in body sensation. An association between LCMV infection and myocarditis (inflammation of the heart muscles) has been suggested. Previous observations show that most patients who develop aseptic meningitis or encephalitis due to LCMV survive. No chronic infection has been described in humans, and after the acute phase of illness, the virus is cleared from the body. However, as in all infections of the central nervous system, particularly encephalitis, temporary or permanent neurological damage is possible. Nerve deafness and arthritis have been reported. Women who become infected with LCMV during pregnancy may pass the infection on to the fetus. \n"
    "Infections occurring during the first trimester may result in fetal death and pregnancy termination, while in the second and third trimesters, birth defects can develop. Infants infected In utero can have many serious and permanent birth defects, including vision problems, mental retardation, and hydrocephaly (water on the brain). Pregnant women may recall a flu-like illness during pregnancy, or may not recall any illness. LCM is usually not fatal. In general, mortality is less than 1%."
)

tot_prompt = generate_medical_tot_prompt(user_query)
response = query_gemini(tot_prompt)
print(response)


# Guard against a missing reply so the metrics below still run on an empty string
generated_answer = (response or "").strip()
reference_answer = expected_answer.strip()

# Tokenize
reference_tokens = nltk.word_tokenize(reference_answer.lower())
generated_tokens = nltk.word_tokenize(generated_answer.lower())

# BLEU Scores (each weight tuple sums to 1; BLEU-3 uses exact thirds rather than 0.33)
smoothing = SmoothingFunction().method1
print("BLEU Scores:")
print("BLEU-1:", sentence_bleu([reference_tokens], generated_tokens, weights=(1, 0, 0, 0), smoothing_function=smoothing))
print("BLEU-2:", sentence_bleu([reference_tokens], generated_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing))
print("BLEU-3:", sentence_bleu([reference_tokens], generated_tokens, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smoothing))
print("BLEU-4:", sentence_bleu([reference_tokens], generated_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing))

# ROUGE
rouge = load("rouge")
rouge_score = rouge.compute(predictions=[generated_answer], references=[reference_answer])
print("ROUGE Score:", rouge_score)

# F1 Score (word-level token overlap)
def f1_from_tokens(ref_tokens, gen_tokens):
    """Compute an F1 score over the sets of distinct tokens in two token lists.

    Args:
        ref_tokens: Tokens of the reference text.
        gen_tokens: Tokens of the generated text.

    Returns:
        The harmonic mean of set-level precision and recall, or 0 when the
        two token sets share nothing (including when either is empty).
    """
    reference_vocab = set(ref_tokens)
    generated_vocab = set(gen_tokens)
    shared = len(reference_vocab.intersection(generated_vocab))

    precision = 0
    if generated_vocab:
        precision = shared / len(generated_vocab)

    recall = 0
    if reference_vocab:
        recall = shared / len(reference_vocab)

    if (precision + recall) > 0:
        return 2 * precision * recall / (precision + recall)
    return 0

# Score the generated tokens against the reference tokens with the set-overlap F1
# and print it rounded to four decimals
f1 = f1_from_tokens(reference_tokens, generated_tokens)
print("F1 Score:", round(f1, 4))






In [None]:
from google import genai

# Initialize Gemini client.
# No key is written into the notebook: the client is expected to read the API key
# from the environment (GOOGLE_API_KEY / GEMINI_API_KEY) — set it before running this cell.
client = genai.Client()

def generate_medical_cot_prompt(question):
    """Generate a Chain-of-Thought (CoT) enhanced prompt for structured reasoning in medical queries.

    Args:
        question: The user's medical question; inserted verbatim after "**Question:**".

    Returns:
        str: The full prompt text, ending with a "**Final Answer:**" marker.
    """
    # The steps are numbered 1-6. The earlier text repeated "2.", doubled the
    # word "outline", and wrote "5.**Generate**" without a space.
    prompt = f"""
You are an expert AI medical assistant trained on verified medical knowledge.
You have access to a **structured medical dataset (MedQuad)** containing information about **diseases, symptoms, treatments, medications, and medical conditions**.

**Question:** {question}

Think step-by-step before answering:
1. **Identify** what the user is asking (e.g., is it about a disease, symptoms, treatment, medication, or diagnosis?).
2. **Outline** the chain of thought before generating.
3. **Determine** the best way to retrieve the answer using the MedQuad dataset.
4. **Extract** relevant medical facts and structure them logically.
5. **Interpret** the findings into a clear and medically accurate response.
6. **Generate** the final response, not the code.

**Final Answer:**
"""
    return prompt

def query_gemini(prompt, model="gemini-2.0-flash"):
    """Send `prompt` to a Gemini model and return the reply text.

    Args:
        prompt: Prompt text; it is sent as the single element of `contents`.
        model: Gemini model name passed to `generate_content`.

    Returns:
        str: The response text, or "" when the response carries no text, so
        callers can safely call string methods such as `.strip()` on it.
    """
    response = client.models.generate_content(
        model=model,
        contents=[prompt],
    )
    # `response.text` may be None when the reply has no text parts; normalize to "".
    return response.text or ""

# Example usage

print("Using COT technique")
user_query = "what are the symptoms for Parasites - Cysticercosis ?"
# NOTE(review): the reference text below describes LCMV while the query asks about
# cysticercosis — confirm that this is the intended reference answer for this question.
# The reference is written as adjacent literals inside parentheses because a
# single-quoted literal cannot span two physical lines; the line break is kept as "\n".
expected_answer = (
    "LCMV is most commonly recognized as causing neurological disease, as its name implies, though infection without symptoms or mild febrile illnesses are more common clinical manifestations. For infected persons who do become ill, onset of symptoms usually occurs 8-13 days after exposure to the virus as part of a biphasic febrile illness. This initial phase, which may last as long as a week, typically begins with any or all of the following symptoms: fever, malaise, lack of appetite, muscle aches, headache, nausea, and vomiting. Other symptoms appearing less frequently include sore throat, cough, joint pain, chest pain, testicular pain, and parotid (salivary gland) pain. Following a few days of recovery, a second phase of illness may occur. Symptoms may consist of meningitis (fever, headache, stiff neck, etc. ), encephalitis (drowsiness, confusion, sensory disturbances, and/or motor abnormalities, such as paralysis), or meningoencephalitis (inflammation of both the brain and meninges). LCMV has also been known to cause acute hydrocephalus (increased fluid on the brain), which often requires surgical shunting to relieve increased intracranial pressure. In rare instances, infection results in myelitis (inflammation of the spinal cord) and presents with symptoms such as muscle weakness, paralysis, or changes in body sensation. An association between LCMV infection and myocarditis (inflammation of the heart muscles) has been suggested. Previous observations show that most patients who develop aseptic meningitis or encephalitis due to LCMV survive. No chronic infection has been described in humans, and after the acute phase of illness, the virus is cleared from the body. However, as in all infections of the central nervous system, particularly encephalitis, temporary or permanent neurological damage is possible. Nerve deafness and arthritis have been reported. Women who become infected with LCMV during pregnancy may pass the infection on to the fetus. \n"
    "Infections occurring during the first trimester may result in fetal death and pregnancy termination, while in the second and third trimesters, birth defects can develop. Infants infected In utero can have many serious and permanent birth defects, including vision problems, mental retardation, and hydrocephaly (water on the brain). Pregnant women may recall a flu-like illness during pregnancy, or may not recall any illness. LCM is usually not fatal. In general, mortality is less than 1%."
)

cot_prompt = generate_medical_cot_prompt(user_query)
response = query_gemini(cot_prompt)
print(response)

# Guard against a missing reply so the metrics below still run on an empty string
generated_answer = (response or "").strip()
reference_answer = expected_answer.strip()

# Tokenize
reference_tokens = nltk.word_tokenize(reference_answer.lower())
generated_tokens = nltk.word_tokenize(generated_answer.lower())

# BLEU Scores (each weight tuple sums to 1; BLEU-3 uses exact thirds rather than 0.33)
smoothing = SmoothingFunction().method1
print("BLEU Scores:")
print("BLEU-1:", sentence_bleu([reference_tokens], generated_tokens, weights=(1, 0, 0, 0), smoothing_function=smoothing))
print("BLEU-2:", sentence_bleu([reference_tokens], generated_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing))
print("BLEU-3:", sentence_bleu([reference_tokens], generated_tokens, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smoothing))
print("BLEU-4:", sentence_bleu([reference_tokens], generated_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing))

# ROUGE
rouge = load("rouge")
rouge_score = rouge.compute(predictions=[generated_answer], references=[reference_answer])
print("ROUGE Score:", rouge_score)

# F1 Score (word-level token overlap)
def f1_from_tokens(ref_tokens, gen_tokens):
    """Return the F1 score between the distinct tokens of a reference and a generated text.

    Args:
        ref_tokens: Tokens of the reference text.
        gen_tokens: Tokens of the generated text.

    Returns:
        2 * P * R / (P + R), where P and R are the fractions of distinct
        generated and reference tokens found in the other set, or 0 when
        there is no overlap at all.
    """
    unique_ref, unique_gen = set(ref_tokens), set(gen_tokens)
    matches = len(unique_ref & unique_gen)

    # No shared token (this covers empty inputs too): precision and recall are both zero
    if matches == 0:
        return 0

    precision = matches / len(unique_gen)
    recall = matches / len(unique_ref)
    return 2 * precision * recall / (precision + recall)

# Score the generated tokens against the reference tokens with the set-overlap F1
# and print it rounded to four decimals
f1 = f1_from_tokens(reference_tokens, generated_tokens)
print("F1 Score:", round(f1, 4))




In [None]:
from google import genai

# Initialize Gemini client.
# No key is written into the notebook: the client is expected to read the API key
# from the environment (GOOGLE_API_KEY / GEMINI_API_KEY) — set it before running this cell.
client = genai.Client()

def generate_medical_got_prompt(question):
    """Build the Graph-of-Thought (GoT) prompt for a medical question.

    Args:
        question: The user's medical question; inserted verbatim after "**Question:**".

    Returns:
        str: Prompt text asking the model to map the question onto a small
        knowledge graph (central node, connected nodes, edges) before answering.
    """
    return f"""
You are an expert AI medical assistant trained on verified medical knowledge.

You have access to a **structured medical dataset (MedQuad)** containing information about **diseases, symptoms, treatments, medications, and medical conditions**.

**Question:** {question}

Think using a **Graph-of-Thought (GoT)** approach before answering:

1. **Identify** the key medical entity in the question (e.g., disease, symptom, treatment, medication, condition).
2. **Map Relationships** by constructing a knowledge graph:
   - **Central Node:** Define the core entity (e.g., disease, symptom).
   - **Connected Nodes:** Identify related concepts (e.g., symptoms, causes, treatments, medications).
   - **Edges:** Define the relationships between nodes (e.g., "causes", "treated with", "associated with").
3. **Retrieve** relevant information from the MedQuad dataset using the graph structure.
4. **Extract** medical knowledge as structured relationships.
5. **Generate** a final answer based on the graph generate as a text and not in json format, ensuring clarity and medical accuracy.

**Graph Representation:**
**Final Answer:**
"""

def query_gemini(prompt, model="gemini-2.0-flash"):
    """Send `prompt` to a Gemini model and return the reply text.

    Args:
        prompt: Prompt text; it is sent as the single element of `contents`.
        model: Gemini model name passed to `generate_content`.

    Returns:
        str: The response text, or "" when the response carries no text, so
        callers can safely call string methods such as `.strip()` on it.
    """
    response = client.models.generate_content(
        model=model,
        contents=[prompt],
    )
    # `response.text` may be None when the reply has no text parts; normalize to "".
    return response.text or ""

# Example usage

print("Using GOT technique")
user_query = "what are the symptoms for Parasites - Cysticercosis ?"
# NOTE(review): the reference text below describes LCMV while the query asks about
# cysticercosis — confirm that this is the intended reference answer for this question.
# The reference is written as adjacent literals inside parentheses because a
# single-quoted literal cannot span two physical lines; the line break is kept as "\n".
expected_answer = (
    "LCMV is most commonly recognized as causing neurological disease, as its name implies, though infection without symptoms or mild febrile illnesses are more common clinical manifestations. For infected persons who do become ill, onset of symptoms usually occurs 8-13 days after exposure to the virus as part of a biphasic febrile illness. This initial phase, which may last as long as a week, typically begins with any or all of the following symptoms: fever, malaise, lack of appetite, muscle aches, headache, nausea, and vomiting. Other symptoms appearing less frequently include sore throat, cough, joint pain, chest pain, testicular pain, and parotid (salivary gland) pain. Following a few days of recovery, a second phase of illness may occur. Symptoms may consist of meningitis (fever, headache, stiff neck, etc. ), encephalitis (drowsiness, confusion, sensory disturbances, and/or motor abnormalities, such as paralysis), or meningoencephalitis (inflammation of both the brain and meninges). LCMV has also been known to cause acute hydrocephalus (increased fluid on the brain), which often requires surgical shunting to relieve increased intracranial pressure. In rare instances, infection results in myelitis (inflammation of the spinal cord) and presents with symptoms such as muscle weakness, paralysis, or changes in body sensation. An association between LCMV infection and myocarditis (inflammation of the heart muscles) has been suggested. Previous observations show that most patients who develop aseptic meningitis or encephalitis due to LCMV survive. No chronic infection has been described in humans, and after the acute phase of illness, the virus is cleared from the body. However, as in all infections of the central nervous system, particularly encephalitis, temporary or permanent neurological damage is possible. Nerve deafness and arthritis have been reported. Women who become infected with LCMV during pregnancy may pass the infection on to the fetus. \n"
    "Infections occurring during the first trimester may result in fetal death and pregnancy termination, while in the second and third trimesters, birth defects can develop. Infants infected In utero can have many serious and permanent birth defects, including vision problems, mental retardation, and hydrocephaly (water on the brain). Pregnant women may recall a flu-like illness during pregnancy, or may not recall any illness. LCM is usually not fatal. In general, mortality is less than 1%."
)

# Named for the technique this cell exercises (the prompt was previously stored as `cot_prompt`)
got_prompt = generate_medical_got_prompt(user_query)
response = query_gemini(got_prompt)
print(response)


# Guard against a missing reply so the metrics below still run on an empty string
generated_answer = (response or "").strip()
reference_answer = expected_answer.strip()

# Tokenize
reference_tokens = nltk.word_tokenize(reference_answer.lower())
generated_tokens = nltk.word_tokenize(generated_answer.lower())

# BLEU Scores (each weight tuple sums to 1; BLEU-3 uses exact thirds rather than 0.33)
smoothing = SmoothingFunction().method1
print("BLEU Scores:")
print("BLEU-1:", sentence_bleu([reference_tokens], generated_tokens, weights=(1, 0, 0, 0), smoothing_function=smoothing))
print("BLEU-2:", sentence_bleu([reference_tokens], generated_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing))
print("BLEU-3:", sentence_bleu([reference_tokens], generated_tokens, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smoothing))
print("BLEU-4:", sentence_bleu([reference_tokens], generated_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing))

# ROUGE
rouge = load("rouge")
rouge_score = rouge.compute(predictions=[generated_answer], references=[reference_answer])
print("ROUGE Score:", rouge_score)

# F1 Score (word-level token overlap)
def f1_from_tokens(ref_tokens, gen_tokens):
    """Set-based F1 between reference and generated tokens.

    Duplicate tokens are collapsed before scoring, so only the distinct
    tokens of each side matter.

    Args:
        ref_tokens: Tokens of the reference text.
        gen_tokens: Tokens of the generated text.

    Returns:
        The F1 score, or 0 when precision and recall are both zero.
    """
    def _share(hits, pool):
        # Fraction of `pool` that was matched; an empty pool counts as 0
        return hits / len(pool) if pool else 0

    distinct_ref = set(ref_tokens)
    distinct_gen = set(gen_tokens)
    common_count = len(distinct_ref & distinct_gen)

    precision = _share(common_count, distinct_gen)
    recall = _share(common_count, distinct_ref)

    if (precision + recall) > 0:
        return 2 * precision * recall / (precision + recall)
    return 0

# Score the generated tokens against the reference tokens with the set-overlap F1
# and print it rounded to four decimals
f1 = f1_from_tokens(reference_tokens, generated_tokens)
print("F1 Score:", round(f1, 4))



In [None]:
import os
import pandas as pd
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# ✅ Load and Clean Dataset
# Read the CSV from the same relative path the preprocessing cell writes it to
csv_path = "processed_medquad.csv"  # Adjust if needed
df = pd.read_csv(csv_path)
df = df.dropna(subset=["Question_Sentences", "Answer_Sentences"])
# A missing qtype would otherwise be stored as the literal string "nan" in the metadata
df = df.fillna({"qtype": ""})

# ✅ Convert to LangChain Documents
# The question text is what gets embedded; the answer and question type ride along as metadata
docs_to_add = [
    Document(
        page_content=str(row["Question_Sentences"]).strip(),
        metadata={
            "answer": str(row["Answer_Sentences"]).strip(),
            "qtype": str(row["qtype"]).strip()
        }
    )
    for _, row in df.iterrows()
]

# ✅ Initialize Embeddings
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# ✅ Initialize Chroma Vector Store (persisted under ./chroma_db)
vector_store = Chroma(
    persist_directory="./chroma_db",
    embedding_function=embedding_model
)

# ✅ Function to Add Documents in Batches
def batch_add_documents(vector_store, docs, batch_size=5000):
    """Insert `docs` into `vector_store` in consecutive slices of at most `batch_size`.

    Args:
        vector_store: Object exposing `add_documents(list_of_docs)`.
        docs: Sequence of documents to insert, in order.
        batch_size: Maximum number of documents passed to each `add_documents` call.

    Prints one progress line per inserted batch.
    """
    batch_starts = range(0, len(docs), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        chunk = docs[start:start + batch_size]
        vector_store.add_documents(chunk)
        print(f"✅ Added batch {batch_number} with {len(chunk)} documents")

# ✅ Run Batched Insert
# NOTE(review): each run of this cell passes the full document list to add_documents on the
# store persisted under "./chroma_db" — presumably re-running appends duplicates; confirm.
batch_add_documents(vector_store, docs_to_add)

print("✅ Medical QA dataset successfully added to Chroma vector store!")


In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
import sacrebleu
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from evaluate import load
from sklearn.metrics import f1_score

# ✅ Make sure the NLTK 'punkt' tokenizer data is available
nltk.download('punkt')

# ✅ Re-open the persisted Chroma store with the MiniLM sentence-embedding model
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
vector_store = Chroma(
    embedding_function=embedding_model,
    persist_directory="./chroma_db",
)

# ✅ FLAN-T5 checkpoint used for answer generation (tokenizer and model share one name)
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def generate_medical_cot_prompt(question, retrieved_answer):
    """
    Build the retrieval-augmented Chain-of-Thought (CoT) prompt.

    Args:
        question: Question text placed under the "User Question" heading.
        retrieved_answer: Retrieved text placed under the "Retrieved Medical Information" heading.

    Returns:
        str: The prompt, ending with a "Final Answer" heading for the model to complete.
    """
    return f"""
You are an expert AI medical assistant trained on verified medical knowledge.
You have access to a **structured medical dataset (MedQuad)** containing information about **diseases, symptoms, treatments, medications, and medical conditions**.

### **User Question:**
{question}

### **Retrieved Medical Information:**
{retrieved_answer}

### **Think step-by-step before answering:**
1. **Identify** what the user is asking (e.g., is it about a disease, symptoms, treatment, medication, or diagnosis?).
2. **Outline** the chain of thought before generating.
3. **Determine** the best way to retrieve the answer using the MedQuad dataset.
4. **Extract** relevant medical facts and structure them logically.
5. **Interpret** the findings into a clear and medically accurate response.
6. **Generate** the final answer.

### **Final Answer:**
"""

def generate_answer_with_flan(question, top_k=3):
    """
    Retrieves the most similar question from ChromaDB and generates an answer using FLAN-T5
    with a Chain-of-Thought (CoT) enhanced prompt.

    Args:
        question: Query text used for the similarity search.
        top_k: Number of neighbours requested from the vector store; only the first is used.

    Returns:
        tuple[str, str]: `(generated_answer, retrieved_answer)`. When the search
        returns nothing, a pair of fallback messages is returned so callers can
        always unpack two values.
    """

    # ✅ Retrieve similar questions from ChromaDB
    search_results = vector_store.similarity_search(question, k=top_k)

    if not search_results:
        # Always return a pair: the caller unpacks two values
        return "Sorry, I couldn't find relevant information.", "No reference answer available."

    # ✅ Use the best-matching question as context
    best_match = search_results[0]
    retrieved_question = best_match.page_content
    retrieved_answer = best_match.metadata.get("answer", "No answer found.")

    # ✅ Apply CoT Prompting
    # NOTE(review): the prompt is built from the retrieved dataset question rather than
    # the caller's `question` — confirm this is intended.
    cot_prompt = generate_medical_cot_prompt(retrieved_question, retrieved_answer)

    # ✅ Generate answer using FLAN-T5
    inputs = tokenizer(cot_prompt, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        # Beam search without sampling. `temperature` and `top_p` are only used when
        # do_sample=True, so they are omitted here rather than passed and ignored.
        output = model.generate(
            **inputs,
            max_length=1024,
            min_length=100,
            num_beams=3,
            repetition_penalty=1.5,
            no_repeat_ngram_size=3
        )

    generated_answer = tokenizer.decode(output[0], skip_special_tokens=True)

    return generated_answer, retrieved_answer

# ✅ Example usage
query = "How to diagnose Crimean-Congo Hemorrhagic Fever (CCHF)?"
expected_answer = """
Laboratory tests that are used to diagnose CCHF include antigen-capture enzyme-linked immunosorbent assay (ELISA), real time polymerase chain reaction (RT-PCR), virus isolation attempts, and detection of antibody by ELISA (IgG and IgM). Laboratory diagnosis of a patient with a clinical history compatible with CCHF can be made during the acute phase of the disease by using the combination of detection of the viral antigen (ELISA antigen capture), viral RNA sequence (RT-PCR) in the blood or in tissues collected from a fatal case and virus isolation. Immunohistochemical staining can also show evidence of viral antigen in formalin-fixed tissues. Later in the course of the disease, in people surviving, antibodies can be found in the blood. But antigen, viral RNA and virus are no more present and detectable.
""".strip()

# The retrieved answer is returned as well; only the generated answer is scored below
generated_answer, retrieved_answer = generate_answer_with_flan(query)
print("Generated Answer:\n", generated_answer)
print("\nReference (Expected) Answer:\n", expected_answer)

# ✅ Tokenize answers
reference_tokens = nltk.word_tokenize(expected_answer.lower())
generated_tokens = nltk.word_tokenize(generated_answer.lower())

# ✅ SacreBLEU Score
sacrebleu_score = sacrebleu.corpus_bleu([generated_answer], [[expected_answer]])
print("\nSacreBLEU Score:", round(sacrebleu_score.score, 2))

# ✅ BLEU Scores (each weight tuple sums to 1; BLEU-3 uses exact thirds rather than 0.33)
smoothing = SmoothingFunction().method1
print("\nBLEU Scores:")
print("BLEU-1:", sentence_bleu([reference_tokens], generated_tokens, weights=(1, 0, 0, 0), smoothing_function=smoothing))
print("BLEU-2:", sentence_bleu([reference_tokens], generated_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing))
print("BLEU-3:", sentence_bleu([reference_tokens], generated_tokens, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smoothing))
print("BLEU-4:", sentence_bleu([reference_tokens], generated_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing))

# ✅ ROUGE Score
rouge = load("rouge")
rouge_score = rouge.compute(predictions=[generated_answer], references=[expected_answer])
print("\nROUGE Score:", rouge_score)

# ✅ F1 Score (word overlap, not the sklearn f1)
def f1_from_tokens(ref_tokens, gen_tokens):
    """Word-overlap F1 computed on the distinct tokens of each input.

    Args:
        ref_tokens: Tokens of the reference text.
        gen_tokens: Tokens of the generated text.

    Returns:
        The harmonic mean of precision (overlap / distinct generated tokens)
        and recall (overlap / distinct reference tokens); 0 when both are zero.
    """
    ref_vocab = set(ref_tokens)
    gen_vocab = set(gen_tokens)
    n_common = len(gen_vocab & ref_vocab)

    if gen_vocab:
        precision = n_common / len(gen_vocab)
    else:
        precision = 0

    if ref_vocab:
        recall = n_common / len(ref_vocab)
    else:
        recall = 0

    if not (precision + recall) > 0:
        return 0
    return 2 * precision * recall / (precision + recall)

# Score the generated tokens against the reference tokens with the set-overlap F1
# and print it rounded to four decimals
f1 = f1_from_tokens(reference_tokens, generated_tokens)
print("F1 Score:", round(f1, 4))


In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
import sacrebleu
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from evaluate import load

# Make sure the NLTK 'punkt' tokenizer data is available
nltk.download('punkt')

# ✅ Re-open the persisted Chroma store with the MiniLM sentence-embedding model
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
vector_store = Chroma(
    embedding_function=embedding_model,
    persist_directory="./chroma_db",
)

# ✅ FLAN-T5 checkpoint used for answer generation (tokenizer and model share one name)
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def generate_medical_tot_prompt(question, retrieved_answer):
    """Build the retrieval-augmented Tree-of-Thought (ToT) prompt.

    Args:
        question: Question text shown after "**User Question:**".
        retrieved_answer: Retrieved text shown after "**Retrieved Medical Information:**".

    Returns:
        str: Four sections (instructions, question, retrieved information,
        closing request) separated by blank lines.
    """
    instructions = (
        "Please solve the following problem by exploring multiple lines of reasoning. "
        "Outline a tree of thought where you consider different possible approaches or branches, "
        "and then converge on the best solution with your final answer."
    )
    question_section = f"**User Question:** {question}"
    context_section = f"**Retrieved Medical Information:** {retrieved_answer}"
    closing = "Now, apply structured reasoning and provide the best medically accurate response."

    return "\n\n".join((instructions, question_section, context_section, closing))

def generate_answer_with_flan(question, top_k=3):
    """
    Answer `question` with FLAN-T5, grounded on the closest ChromaDB match.

    Runs a similarity search for `question`, takes the top hit's stored answer
    (its "answer" metadata field) as context, wraps the user's question and
    that context in a Tree-of-Thought (ToT) prompt, and decodes the first
    sequence the model generates.

    Args:
        question: The user's question text.
        top_k: Number of neighbours requested from the vector store. Only the
            first result is used to build the prompt.

    Returns:
        A `(generated_answer, retrieved_answer)` tuple of strings. When the
        search returns nothing, a pair of fixed fallback messages is returned.
    """
    search_results = vector_store.similarity_search(question, k=top_k)
    if not search_results:
        return "Sorry, I couldn't find relevant information.", "No reference answer available."

    best_match = search_results[0]
    retrieved_answer = best_match.metadata.get("answer", "No answer found.")

    # ✅ Apply ToT Prompting
    # The prompt's "User Question" slot must carry what the user actually
    # asked, not the stored question text of the nearest match.
    tot_prompt = generate_medical_tot_prompt(question, retrieved_answer)
    inputs = tokenizer(tot_prompt, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        # Deterministic beam search. `temperature` / `top_p` are intentionally
        # not passed: they only apply when do_sample=True and were silently
        # ignored (with a warning) in this beam-search configuration.
        output = model.generate(
            **inputs,
            max_length=1024,
            min_length=100,
            num_beams=3,
            repetition_penalty=1.5,
            no_repeat_ngram_size=3,
        )

    generated_answer = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_answer, retrieved_answer

# ✅ Example usage
# A single hand-picked question and the reference answer the generated text
# is scored against in the evaluation statements below.
query = "How to diagnose Crimean-Congo Hemorrhagic Fever (CCHF) ?"
expected_answer = """Laboratory tests that are used to diagnose CCHF include antigen-capture enzyme-linked immunosorbent assay (ELISA), real time polymerase chain reaction (RT-PCR), virus isolation attempts, and detection of antibody by ELISA (IgG and IgM). Laboratory diagnosis of a patient with a clinical history compatible with CCHF can be made during the acute phase of the disease by using the combination of detection of the viral antigen (ELISA antigen capture), viral RNA sequence (RT-PCR) in the blood or in tissues collected from a fatal case and virus isolation. Immunohistochemical staining can also show evidence of viral antigen in formalin-fixed tissues. Later in the course of the disease, in people surviving, antibodies can be found in the blood. But antigen, viral RNA and virus are no more present and detectable."""

# ✅ Generate answer
# `retrieved_answer` is returned alongside the generation but is not used by
# the metrics in this cell; scoring is done against `expected_answer`.
generated_answer, retrieved_answer = generate_answer_with_flan(query)
print("Generated Answer:\n", generated_answer)
print("\nReference Answer:\n", expected_answer)

# ✅ Evaluation
# Lowercased word tokens, used by the NLTK BLEU and token-overlap F1 scores.
reference_tokens = nltk.word_tokenize(expected_answer.lower())
generated_tokens = nltk.word_tokenize(generated_answer.lower())

# ✅ SacreBLEU
# SacreBLEU receives the raw (untokenized, original-case) strings: one
# hypothesis and one reference stream containing one reference.
sacrebleu_score = sacrebleu.corpus_bleu([generated_answer], [[expected_answer]])
print("\nSacreBLEU Score:", round(sacrebleu_score.score, 2))

# ✅ BLEU Scores
# Cumulative BLEU-1..4 on the lowercased word tokens, with method1 smoothing so
# a missing higher-order n-gram match does not zero the whole score.
smoothing = SmoothingFunction().method1
# Each weight tuple spreads a total weight of exactly 1 over the n-gram orders
# in use; BLEU-3 uses exact thirds (0.33 * 3 sums to 0.99 and skews the
# geometric mean slightly upward).
bleu_weights = {
    "BLEU-1": (1, 0, 0, 0),
    "BLEU-2": (0.5, 0.5, 0, 0),
    "BLEU-3": (1 / 3, 1 / 3, 1 / 3, 0),
    "BLEU-4": (0.25, 0.25, 0.25, 0.25),
}
print("\nBLEU Scores:")
for bleu_label, ngram_weights in bleu_weights.items():
    bleu_value = sentence_bleu(
        [reference_tokens],
        generated_tokens,
        weights=ngram_weights,
        smoothing_function=smoothing,
    )
    print(f"{bleu_label}:", bleu_value)

# ✅ ROUGE
# Load the "rouge" metric through evaluate.load and score the raw generated
# string against the raw expected string; the result is printed unformatted.
rouge = load("rouge")
rouge_score = rouge.compute(predictions=[generated_answer], references=[expected_answer])
print("\nROUGE Score:", rouge_score)

# ✅ F1 Score (word-level)
def f1_from_tokens(ref_tokens, gen_tokens):
    """Compute F1 over the distinct tokens shared by reference and generation.

    Duplicates are ignored because both token lists are turned into sets.
    An empty side contributes a precision/recall of 0, and the function
    returns 0 when precision and recall are both 0.
    """
    def _share(matched, size):
        # Fraction of `size` distinct tokens that matched; 0 for an empty set.
        return matched / size if size else 0

    ref_unique = set(ref_tokens)
    gen_unique = set(gen_tokens)
    matched = len(ref_unique & gen_unique)

    precision = _share(matched, len(gen_unique))
    recall = _share(matched, len(ref_unique))
    total = precision + recall
    return 2 * precision * recall / total if total > 0 else 0

# Token-overlap F1 between the reference and generated token lists, printed
# rounded to four decimal places.
f1 = f1_from_tokens(reference_tokens, generated_tokens)
print("F1 Score:", round(f1, 4))


In [None]:
import os
import google.generativeai as genai
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from evaluate import load
import sacrebleu

# Make sure NLTK data is downloaded
nltk.download("punkt")

# ✅ Set up Gemini API Key
# Never paste the key into the notebook: use the GOOGLE_API_KEY already
# exported in the environment, and only prompt (without echo) when it is
# missing or empty. Unconditionally assigning "" here used to wipe out a
# correctly exported key.
if not os.environ.get("GOOGLE_API_KEY"):
    from getpass import getpass  # only needed when the key is not already set

    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# ✅ Load ChromaDB
# Opens the Chroma store persisted under ./chroma_db and embeds queries with
# all-MiniLM-L6-v2.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = Chroma(persist_directory="./chroma_db", embedding_function=embedding_model)

# ✅ Load Gemini model
# temperature=0 is requested for the chat model used by generate_answer_with_gemini.
model = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0)

def generate_medical_got_prompt(question, retrieved_answer):
    """Build the structured-knowledge prompt sent to the chat model.

    The prompt consists of four paragraphs separated by blank lines: the task
    statement, the user's question, the retrieved medical information, and the
    closing instruction describing the expected response.
    """
    task = "Build a structured medical knowledge response for the following question."
    question_section = f"**User Question:** {question}"
    context_section = f"**Retrieved Medical Information:** {retrieved_answer}"
    closing = (
        "Identify key medical concepts, explain their relationships, "
        "and provide an informative yet concise response."
    )
    return "\n\n".join((task, question_section, context_section, closing))

def generate_answer_with_gemini(question, top_k=3):
    """Answer `question` with the chat model, using the top vector-store hit as context.

    Requests `top_k` neighbours for the question but uses only the first one:
    its "answer" metadata field (or "No answer found." when absent) is placed
    in the prompt together with the question. Returns the model reply's
    `content` with surrounding whitespace stripped, or a fixed apology string
    when the search returns nothing.
    """
    hits = vector_store.similarity_search(question, k=top_k)
    if hits:
        context = hits[0].metadata.get("answer", "No answer found.")
        reply = model.invoke(generate_medical_got_prompt(question, context))
        return reply.content.strip()

    return "Sorry, I couldn't find relevant medical information."

# ✅ User query & gold-standard expected answer
# A single hand-picked question and the reference text every metric below
# compares the generated answer against.
query = "How to diagnose Crimean-Congo Hemorrhagic Fever (CCHF) ?"
expected_answer = """Laboratory tests that are used to diagnose CCHF include antigen-capture enzyme-linked immunosorbent assay (ELISA), real time polymerase chain reaction (RT-PCR), virus isolation attempts, and detection of antibody by ELISA (IgG and IgM). Laboratory diagnosis of a patient with a clinical history compatible with CCHF can be made during the acute phase of the disease by using the combination of detection of the viral antigen (ELISA antigen capture), viral RNA sequence (RT-PCR) in the blood or in tissues collected from a fatal case and virus isolation. Immunohistochemical staining can also show evidence of viral antigen in formalin-fixed tissues. Later in the course of the disease, in people surviving, antibodies can be found in the blood. But antigen, viral RNA and virus are no more present and detectable."""

# Generate the answer for the example query and show it next to the reference.
generated_answer = generate_answer_with_gemini(query)
print("🔹 Generated Answer:\n", generated_answer)
print("\n🔹 Expected Answer (Ground Truth):\n", expected_answer)

# ✅ Tokenization
# Lowercased word tokens, used by the NLTK BLEU and token-overlap F1 scores.
expected_tokens = nltk.word_tokenize(expected_answer.lower())
generated_tokens = nltk.word_tokenize(generated_answer.lower())

# ✅ BLEU Scores
# Cumulative BLEU-1..4 on the lowercased word tokens, with method1 smoothing so
# a missing higher-order n-gram match does not zero the whole score.
smoothing = SmoothingFunction().method1
# Each weight tuple spreads a total weight of exactly 1 over the n-gram orders
# in use; BLEU-3 uses exact thirds (0.33 * 3 sums to 0.99 and skews the
# geometric mean slightly upward).
bleu_weights = {
    "BLEU-1": (1, 0, 0, 0),
    "BLEU-2": (0.5, 0.5, 0, 0),
    "BLEU-3": (1 / 3, 1 / 3, 1 / 3, 0),
    "BLEU-4": (0.25, 0.25, 0.25, 0.25),
}
print("\n🔹 BLEU Scores:")
for bleu_label, ngram_weights in bleu_weights.items():
    bleu_value = sentence_bleu(
        [expected_tokens],
        generated_tokens,
        weights=ngram_weights,
        smoothing_function=smoothing,
    )
    print(f"{bleu_label}:", bleu_value)

# ✅ ROUGE Score
# Load the "rouge" metric through evaluate.load and score the raw generated
# string against the raw expected string; the result is printed unformatted.
rouge = load("rouge")
rouge_score = rouge.compute(predictions=[generated_answer], references=[expected_answer])
print("\n🔹 ROUGE Score:", rouge_score)

# ✅ F1 Score (word-level token overlap)
def f1_from_tokens(ref_tokens, gen_tokens):
    """F1 of the overlap between the distinct reference and generated tokens.

    Token lists are reduced to sets first, so frequency is ignored. When the
    two sets share no token (including when either is empty) the result is 0;
    otherwise it is the harmonic mean of set precision and set recall.
    """
    expected_types = set(ref_tokens)
    produced_types = set(gen_tokens)
    hits = len(expected_types.intersection(produced_types))

    # No shared token means precision and recall are both zero.
    if hits == 0:
        return 0

    # A non-zero overlap guarantees both sets are non-empty.
    precision = hits / len(produced_types)
    recall = hits / len(expected_types)
    return 2 * precision * recall / (precision + recall)

# Token-overlap F1 between the expected and generated token lists, printed
# rounded to four decimal places.
f1 = f1_from_tokens(expected_tokens, generated_tokens)
print("🔹 F1 Score:", round(f1, 4))

# ✅ SacreBLEU
# SacreBLEU receives the raw (untokenized, original-case) strings: one
# hypothesis and one reference stream containing one reference.
sacrebleu_score = sacrebleu.corpus_bleu([generated_answer], [[expected_answer]])
print("🔹 SacreBLEU Score:", round(sacrebleu_score.score, 2))
