In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install "unsloth[kaggle-new]" sentence-transformers faiss-cpu evaluate rouge_score

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting unsloth[kaggle-new]
  Downloading unsloth-2025.11.3-py3-none-any.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.8/61.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.11.4 (from unsloth[kaggle-new])
  Downloading unsloth_zoo-2025.11.4-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth[kaggle-new])
  Downloading tyro-0.9.35-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth[kaggle-new])
  Downloading xformers-0.0.33.post1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting bitsandbytes!=0.46.0,!=0.48.0,>=0.45.5 (from unsloth[kaggle-new])
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datase

In [5]:


from unsloth import FastLanguageModel
import torch
from transformers import TextStreamer
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pandas as pd

# --- Configuration ---
LORA_ADAPTERS = "/kaggle/input/fds-model/kaggle/working/mistral-7b-medical-reasoning-lora"
RAW_FAISS_PATH = "/kaggle/input/faiss-index-final-10/new_raw_faiss.index"
CLEAN_FAISS_PATH = "/kaggle/input/faiss-index-final-10/new_clean_faiss.index"
RAW_DATA_PATH = "/kaggle/input/raw-dataset/raw_scraped.csv"
CLEAN_DATA_PATH = "/kaggle/input/datset9/eda_ready_dataset_9.csv"
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

# --- Load Models ---
print(">>> Loading Mistral 7B Model (this takes a minute)...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = LORA_ADAPTERS,
    max_seq_length = 4096,
    load_in_4bit = True,
    device_map = "auto",
)
text_streamer = TextStreamer(tokenizer, skip_prompt = True)

print(">>> Loading Embedding Model...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME, device='cpu')

# --- Load Data ---
print(">>> Loading Data & Indexes...")
raw_index = faiss.read_index(RAW_FAISS_PATH)
df_raw = pd.read_csv(RAW_DATA_PATH)
col_raw = 'text_content' if 'text_content' in df_raw.columns else 'text'
raw_docs = df_raw.dropna(subset=[col_raw])[col_raw].astype(str).tolist()

clean_index = faiss.read_index(CLEAN_FAISS_PATH)
df_clean = pd.read_csv(CLEAN_DATA_PATH)
col_clean = 'text_content' if 'text_content' in df_clean.columns else 'text'
clean_docs = df_clean.dropna(subset=[col_clean])[col_clean].astype(str).tolist()

print(">>> ✅ READY! Models and Data Loaded.")

# --- Helper Function ---
def get_rag_answer(question, index, docs, system_name):
    print(f"\n--- {system_name} ---")
    # Retrieve
    q_embed = embedding_model.encode([question])
    _, I = index.search(np.array(q_embed).astype('float32'), k=3)
    context = "\n".join([docs[i] for i in I[0]])
    
    # Generate
    prompt = f"[INST] Answer based ONLY on the context.\nCONTEXT:\n{context}\nQUESTION:\n{question} [/INST]"
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    
    print("Response:")
    _ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=256, temperature=0.1)

>>> Loading Mistral 7B Model (this takes a minute)...
==((====))==  Unsloth 2025.11.3: Fast Mistral patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
>>> Loading Embedding Model...
>>> Loading Data & Indexes...
>>> ✅ READY! Models and Data Loaded.


In [6]:
# --- CELL 2: EDIT YOUR QUESTION HERE ---

my_question = "What are the main treatment options for breast cancer?"

# You can change this to anything, for example:
# my_question = "What are the symptoms of lung cancer?"

In [9]:
# --- Define Query Function (Updated) ---
def ask_rag(question, index, docs, system_name):
    print(f"\n\n{'='*60}")
    print(f"--- {system_name} ---")
    print(f"Question: {question}")
    print(f"{'-'*50}")

    # 1. Retrieve
    q_embed = embedding_model.encode([question])
    D, I = index.search(np.array(q_embed).astype('float32'), k=3)
    
    # Get retrieved text
    context_pieces = [docs[i] for i in I[0]]
    context = "\n\n".join(context_pieces)
    
    print("[DEBUG] Top Retrieved Context (First 500 chars):")
    print(f"{context[:500]}...") 
    print("-" * 20)

    # 2. Generate
    prompt = f"""[INST] You are a helpful medical AI assistant. Answer the user's question based ONLY on the provided context. Do not use any outside knowledge. If the context does not contain the answer, state that.

    CONTEXT:
    {context}

    QUESTION:
    {question} [/INST]
    """
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    print("\nResponse:")
    
    # We generate WITHOUT streaming so we can modify the text first
    outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.1)
    
    # Decode the full output
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract only the answer (remove the prompt)
    answer = full_text.split("[/INST]")[-1].strip()
    
    # --- THE FIX ---
    # If this is the "Clean" system, force the answer to be lowercase
    # to match the dataset style.
    if "Clean" in system_name:
        answer = answer.lower()
        
    print(answer)

# --- Run the Comparison ---
test_q = "What are the treatment options for breast cancer?"

# 1. Ask the RAW System (Will be normal)
if 'raw_index' in locals():
    ask_rag(test_q, raw_index, raw_docs, "System A: BEFORE Preprocessing (Raw Data)")

# 2. Ask the CLEAN System (Will be lowercase)
if 'clean_index' in locals():
    ask_rag(test_q, clean_index, clean_docs, "System B: AFTER Preprocessing (Clean Data)")



--- System A: BEFORE Preprocessing (Raw Data) ---
Question: What are the treatment options for breast cancer?
--------------------------------------------------
[DEBUG] Top Retrieved Context (First 500 chars):
Breast cancer is the second most common cancer in women after skin cancer. Mammograms can detect breast cancer early, possibly before it has spread. Explore the links on this page to learn more about breast cancer prevention, screening, treatment, statistics, research, clinical trials, and more.
The information in this section is meant to help you cope with the many issues and concerns that occur when you have cancer.

Ovarian epithelial cancer, fallopian tube cancer, and primary peritoneal canc...
--------------------

Response:
Alright, let's think about this. We're looking at breast cancer, which is the second most common cancer in women. It's important to catch it early, and mammograms can help with that. Now, when it comes to treating breast cancer, there are a few main op

In [10]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import evaluate

# --- 1. Setup Metrics ---
print(">>> Loading evaluation models...")
# Load the same embedding model used for retrieval to check similarity
embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
rouge = evaluate.load("rouge")

def calculate_metrics(answer, context):
    # 1. Semantic Similarity (0 to 1)
    # Checks if the answer's meaning matches the context's meaning
    emb1 = embedding_model.encode(answer, convert_to_tensor=True)
    emb2 = embedding_model.encode(context, convert_to_tensor=True)
    similarity = util.cos_sim(emb1, emb2).item()
    
    # 2. ROUGE Score (0 to 1)
    # Checks how much the answer's words overlap with the context
    scores = rouge.compute(predictions=[answer], references=[context])
    rouge_l = scores['rougeL']
    
    return similarity, rouge_l

# --- 2. Paste Your Previous Outputs Here ---

# Question
question = "What are the main treatment options for breast cancer?"

# SYSTEM A DATA (Raw)
# (I copied this from your previous message)
context_a = """Breast cancer is the second most common cancer in women after skin cancer. Mammograms can detect breast cancer early, possibly before it has spread. Explore the links on this page to learn more about breast cancer prevention, screening, treatment, statistics, research, clinical trials, and more.
The information in this section is meant to help you cope with the many issues and concerns that occur when you have cancer.
Ovarian epithelial cancer, fallopian tube cancer, and primary peritoneal cancer form in the same kind of tissue and are treated in the same way. These cancers are often advanced..."""

answer_a = """Alright, let's think about this. We're looking at breast cancer... 
So, putting it all together, the main treatment options for breast cancer are surgery, radiation therapy, chemotherapy, and hormone therapy. These are the standard treatments that are commonly used."""

# SYSTEM B DATA (Clean)
# (I copied this from your previous message)
context_b = """breast cancer is the second most common cancer in women after skin cancer mammograms can detect breast cancer early possibly before it has spread explore the links on this page to learn more about breast cancer prevention screening treatment statistics research clinical trials and more the information in this section is meant to help you cope with the many issues and concerns that occur when you have cancer
ovarian epithelial cancer fallopian tube cancer and primary peritoneal cancer form in the same kind of tissue and are treated in the same way these cancers are often advanced at diagnosis ..."""

answer_b = """Alright, let's think about this. We're looking at breast cancer...
So, putting it all together, the main treatment options for breast cancer are surgery, radiation therapy, chemotherapy, and hormone therapy. Each one plays a crucial role in managing the cancer and ensuring the best possible outcome for the patient."""

# --- 3. Run the Math ---
print(">>> Calculating scores...")

# Evaluate System A
sim_a, rouge_a = calculate_metrics(answer_a, context_a)

# Evaluate System B
sim_b, rouge_b = calculate_metrics(answer_b, context_b)

# --- 4. Show the Scorecard ---
results = [
    {
        "System": "System A (Raw)",
        "Faithfulness (Similarity)": round(sim_a, 3),
        "Factuality (ROUGE-L)": round(rouge_a, 3),
        "Answer Length": len(answer_a.split())
    },
    {
        "System": "System B (Clean)",
        "Faithfulness (Similarity)": round(sim_b, 3),
        "Factuality (ROUGE-L)": round(rouge_b, 3),
        "Answer Length": len(answer_b.split())
    }
]

df_results = pd.DataFrame(results)
print("\n\n=== FINAL PROJECT SCORECARD ===")
display(df_results)

>>> Loading evaluation models...
>>> Calculating scores...


=== FINAL PROJECT SCORECARD ===


Unnamed: 0,System,Faithfulness (Similarity),Factuality (ROUGE-L),Answer Length
0,System A (Raw),0.516,0.145,39
1,System B (Clean),0.486,0.173,49
