In [None]:
import pandas as pd
import json
from pathlib import Path

In [None]:
# Load the raw consultation CSV from the Kaggle input mount into `df`.
df = pd.read_csv("/kaggle/input/ai-medical-chatbot/ai-medical-chatbot.csv")

In [None]:
# Map the CSV's original headers onto the field names used in the JSONL export.
column_aliases = dict(
    Description="question",
    Patient="patient_text",
    Doctor="doctor_text",
)
df = df.rename(columns=column_aliases)

In [None]:
# Preview the first rows to check the renamed columns.
df.head()

In [None]:
# Target JSONL file for the cleaned export; make sure its folder exists first.
out_path = Path("/kaggle/working/data") / "clean_medical.jsonl"
out_path.parent.mkdir(exist_ok=True, parents=True)

In [None]:
def _clean_cell(value):
    """Return the stripped text of a DataFrame cell, or "" when the cell is missing.

    Missing cells are NaN in pandas and ``str(NaN)`` is the truthy string "nan",
    so missing values must be detected *before* converting to ``str``; otherwise
    the emptiness filter below never fires.
    """
    return "" if pd.isna(value) else str(value).strip()


# Write one JSON object per line for every row that has both a question and a
# doctor reply. The patient text is optional and may be an empty string.
with open(out_path, "w", encoding="utf-8") as f:
    for i, row in df.iterrows():
        q = _clean_cell(row["question"])
        p = _clean_cell(row["patient_text"])
        d = _clean_cell(row["doctor_text"])

        # Skip rows that cannot serve as a question/answer pair.
        if not (q and d):
            continue

        obj = {
            "id": f"row_{i}",  # DataFrame index label keeps ids traceable to the CSV row
            "question": q,
            "patient_text": p,
            "doctor_text": d,
        }
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

print("Saved cleaned dataset to ", out_path)

## Building the RAG index (ChromaDB + BGE-M3 embeddings)

In [None]:
!pip install fastapi uvicorn chromadb sentence-transformers torch numpy

In [None]:
!pip install langchain-nvidia-ai-endpoints langchain langchain-community

In [None]:
import json
from pathlib import Path
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

In [None]:
# Cleaned JSONL file (same path the export cell writes to).
DATA = Path("/kaggle/working/data/clean_medical.jsonl")
# Directory handed to Chroma as its persist directory.
PERSIST_DIR = "/kaggle/working/store/chroma"
# Name of the Chroma collection that is deleted and re-created below.
COLLECTION = "med_consultations"

In [None]:
# Load the BAAI/bge-m3 sentence-embedding model used to encode the consultations.
model = SentenceTransformer("BAAI/bge-m3")

In [None]:
# Open (or create) an on-disk Chroma database under PERSIST_DIR.
# On chromadb >= 0.4, Client(Settings(persist_directory=...)) does not enable
# persistence by itself; PersistentClient is the supported way to get a durable store.
client = chromadb.PersistentClient(path=PERSIST_DIR)

In [None]:
# Best-effort reset: drop a collection left over from a previous run so the
# create_collection call that follows starts from scratch.
try:
    client.delete_collection(COLLECTION)
except Exception:
    # Nothing to delete (the exact exception type differs between chromadb
    # versions). `except Exception` rather than a bare `except` so that
    # KeyboardInterrupt / SystemExit still propagate.
    pass

In [None]:
# Create the collection. The "hnsw:space": "cosine" metadata presumably selects
# cosine distance for the index — confirm against the Chroma docs.
coll = client.create_collection(COLLECTION, metadata = {"hnsw:space": "cosine"})

In [None]:
# Parallel accumulators filled by the JSONL loading loop: record ids, the text
# that gets embedded, and per-record metadata.
ids = []
texts = []
metas = []

In [None]:
# Read the cleaned JSONL back and, for each record, queue its id, the combined
# patient/doctor text to embed, and the metadata stored next to it.
with open(DATA, 'r', encoding='utf-8') as jsonl_file:
    for raw_line in jsonl_file:
        record = json.loads(raw_line)
        combined = f"Patient: {record['patient_text']} \nDoctor: {record['doctor_text']}"
        ids.append(record['id'])
        texts.append(combined)
        record_meta = {
            "question": record['question'],
            'doctor_text': record['doctor_text'],
        }
        metas.append(record_meta)

In [None]:
print("Encoding", len(texts), " consultations....")
import torch

# Use the GPU when one is attached, otherwise fall back to CPU instead of crashing.
encode_device = "cuda" if torch.cuda.is_available() else "cpu"
if encode_device == "cuda":
    # Release cached GPU memory left over from earlier cells before the big encode.
    torch.cuda.empty_cache()

# normalize_embeddings=True returns unit-length vectors (the original call
# misspelled the keyword as `normalize_embediings`, which is not a valid argument).
embeddings = model.encode(
    texts,
    batch_size=8,
    device=encode_device,
    show_progress_bar=True,
    normalize_embeddings=True,
)

In [None]:
# Insert in slices: Chroma rejects a single add() that exceeds its maximum
# batch size, which a full dataset easily does. 5000 is a conservative slice
# size — TODO confirm against the installed chromadb's limit.
ADD_BATCH_SIZE = 5000
for batch_start in range(0, len(ids), ADD_BATCH_SIZE):
    batch_stop = batch_start + ADD_BATCH_SIZE
    coll.add(
        ids=ids[batch_start:batch_stop],
        embeddings=embeddings[batch_start:batch_stop],
        documents=texts[batch_start:batch_stop],
        metadatas=metas[batch_start:batch_stop],
    )

# Older chromadb clients need an explicit persist(); newer clients no longer
# expose the method, so only call it when it exists.
if hasattr(client, "persist"):
    client.persist()

# The original f-string was missing its closing brace, which is a SyntaxError.
print(f"Indexed {len(texts)}")

## Optimized approach: smaller fp16 embedding model with character-based chunking

In [None]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
import numpy as np
import torch

In [None]:
from kaggle_secrets import UserSecretsClient
# Fetch the Hugging Face token from Kaggle's secret store (secret name "HF_TOKEN_2").
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN_2")


In [None]:
# Load all-MiniLM-L6-v2 on the GPU, passing the Hugging Face token for authentication.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name, device = "cuda", token = hf_token)
# Cast the model to half precision; as the last expression this also displays the model repr.
model.half()

In [None]:
# Tokenizer matching `model_name`.
# NOTE(review): within this notebook it is only referenced by the commented-out
# token-based chunker in the next cell.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# def chunk_text(text, max_tokens = 256):
#     tokens = tokenizer.encode(text, truncation = False)
#     for i in range(0, len(tokens), max_tokens):
#         yield tokenizer.decode(tokens[i:i+max_tokens])

In [None]:
def chunk_text(text, chunk_size = 512, overlap = 50):
    """Split ``text`` into overlapping character windows.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk (must be > 0).
        overlap: Number of characters shared by consecutive chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        List of chunks in order; an empty list for empty ``text``.

    Raises:
        ValueError: If ``chunk_size``/``overlap`` would not advance the window.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size would make the step <= 0 and loop forever.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= len(text):
            # This window already reached the end; another step would only
            # emit a tail that is fully contained in the chunk just added.
            break
        start += step
    # The original fell off the end without returning, so callers received None.
    return chunks

In [None]:
# Slurp the whole cleaned JSONL file into one string.
full_text = Path("/kaggle/working/data/clean_medical.jsonl").read_text(encoding='utf-8')

In [None]:
# Wrap the single file-sized string in a list so the chunking loop can iterate over it.
texts = [full_text]

In [None]:
# Peek at the first 100 characters of the raw JSONL text.
full_text[:100]

In [None]:
# Flatten the character chunks of every text into a single list.
chunked_texts = [
    piece
    for t in texts
    for piece in chunk_text(t, chunk_size = 512, overlap = 50)
]

In [None]:
# Embed every chunk with the model loaded above, 16 chunks per batch.
embeddings = model.encode(
    chunked_texts,
    batch_size = 16,
    show_progress_bar = True,
)

## New approach: FAISS index over doctor responses

In [None]:
!pip install pandas faiss-cpu sentence-transformers

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss

In [None]:
# Reload the raw CSV so `df` has the original column names again
# (the earlier `df` was renamed to question/patient_text/doctor_text).
df = pd.read_csv("/kaggle/input/ai-medical-chatbot/ai-medical-chatbot.csv")

In [None]:
# Collect every non-missing Description / Patient / Doctor cell, row by row,
# as plain strings.
texts = []
for _, record in df.iterrows():
    for column in ("Description", "Patient", "Doctor"):
        cell = record[column]
        if pd.notna(cell):
            texts.append(str(cell))

In [None]:
# Doctor replies as plain strings. Missing cells (NaN floats) are dropped here,
# matching the pd.notna guards used when building `texts`; otherwise the
# word-based chunker would call .split() on a float and crash.
docs = df['Doctor'].dropna().astype(str).tolist()

In [None]:
# Report both corpus sizes. The second line previously reused the
# "Total texts extracted" label although it counts doctor replies only.
print("Total texts extracted:", len(texts))
print("Total doctor responses:", len(docs))

In [None]:
def chunk_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: Text to split. ``None``/NaN yields no chunks; any other
            non-string value is converted with ``str()``.
        chunk_size: Maximum number of words per chunk (must be > 0).
        overlap: Number of words shared by consecutive chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        List of chunk strings (words re-joined with single spaces); empty
        when ``text`` has no words.

    Raises:
        ValueError: If ``chunk_size``/``overlap`` would give a non-positive step.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        # A zero step makes range() raise; a negative one silently yields nothing.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    # Missing CSV cells arrive as None or float NaN (NaN != NaN).
    if text is None or (isinstance(text, float) and text != text):
        return []
    if not isinstance(text, str):
        text = str(text)

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for i in range(0, len(words), step):
        chunks.append(" ".join(words[i:i + chunk_size]))
        if i + chunk_size >= len(words):
            # The window reached the last word; a further step would only emit
            # a tail that is already fully contained in this chunk.
            break
    return chunks

In [None]:
# Flatten the word-based chunks of every doctor reply into one list.
chunked_texts = [piece for t in docs for piece in chunk_text(t)]

In [None]:
# Sanity check: number of chunks that will be embedded.
print("Total chunked texts:", len(chunked_texts))

In [None]:
# Embedding model for the FAISS index; note this rebinds the notebook-wide `model` name.
model = SentenceTransformer('all-MiniLM-L6-v2')  # small & fast

In [None]:
# Embed every chunk, requesting a NumPy array (its .shape is read when building the index).
embeddings = model.encode(chunked_texts, convert_to_numpy=True, show_progress_bar=True)


In [None]:
# Build a flat (exhaustive) L2 index over the chunk embeddings.
dimension = embeddings.shape[1]  # embedding size (second axis of the embeddings array)
index = faiss.IndexFlatL2(dimension)  # L2 distance
# Vectors are added in chunked_texts order; search() relies on that to map hits back to text.
index.add(embeddings)
print("FAISS index built with", index.ntotal, "documents") 

In [None]:
def search(query, top_k=10):
    """Return the chunks closest to ``query`` in the FAISS index.

    Args:
        query: Free-text query string; embedded with the notebook-level ``model``.
        top_k: Maximum number of chunks to return.

    Returns:
        List of chunk strings from ``chunked_texts`` in the order FAISS returns
        them; may be shorter than ``top_k`` when the index holds fewer vectors.
    """
    query_embedding = model.encode([query], convert_to_numpy=True)
    _, indices = index.search(query_embedding, top_k)
    # FAISS pads the result with -1 when fewer than top_k vectors are available;
    # indexing chunked_texts[-1] would silently return the *last* chunk instead.
    return [chunked_texts[idx] for idx in indices[0] if idx >= 0]

In [None]:
# Smoke-test the retriever with a sample complaint.
query = "I have pain in my lower back"
print("Query:", query)
print("Top results:", search(query))

In [None]:
pip install -U langchain-community

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
# Load CSV
df = pd.read_csv("/kaggle/input/ai-medical-chatbot/ai-medical-chatbot.csv")
# Use doctor responses for embeddings. Missing replies are dropped first:
# str(NaN) is the literal text "nan", which would otherwise be embedded and
# could be retrieved as if it were a real answer.
docs = df["Doctor"].dropna().tolist()
# LangChain embedding wrapper
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# Convert docs to vector store (str() guards against non-string cells)
doc_texts = [str(d) for d in docs]
vectorstore = FAISS.from_texts(doc_texts, embedding_model)
# Build retriever that returns the 3 closest documents per query
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, RetrievalQA

In [None]:
# Prompt template with two placeholders, {context} and {question}; it is passed
# as the "prompt" of the RetrievalQA chain built further down.
custom_prompt = PromptTemplate(
   template="""
You are a professional medical doctor providing clear and accurate advice.  
Answer the patient’s question based **only on the provided context from other doctors**.  
If you don’t know, politely say you don’t know and recommend consulting a healthcare professional.  
### Context from doctors:
{context}
### Patient Question:
{question}
### Doctor’s Answer:
""",
   input_variables=["context", "question"],
)

In [None]:
# Build the RetrievalQA chain from the retriever and the custom prompt.
# NOTE(review): `llm` is only assigned in a later cell (the ChatNVIDIA cell), so on a
# fresh kernel this cell raises NameError unless that cell has been run first —
# consider moving the LLM setup above this cell.
from langchain.chains import RetrievalQA
qa_chain = RetrievalQA.from_chain_type(
   llm=llm,
   retriever=retriever,   # retriever created from the FAISS vector store
   chain_type="stuff",
   return_source_documents=False,  # sources are not returned; presumably True would include them — confirm
   chain_type_kwargs={
       "prompt": custom_prompt  # template expecting {context} and {question}
   }
)

In [None]:
# Ask the chain a sample question and print the generated answer.
query = "I have a hair fall problem how to counter it?"
response = qa_chain.run(query)
print("AI Doctor:", response)

In [None]:
from kaggle_secrets import UserSecretsClient
# Read the NVIDIA endpoint settings from Kaggle's secret store (one client, reused).
nvidia_secrets = UserSecretsClient()
nvidia_model_name = nvidia_secrets.get_secret("NVIDIA_MODEL_NAME")
nvidia_api_key = nvidia_secrets.get_secret("NVIDIA_API_KEY")
# Never echo key material: cell output is saved with the notebook and shared
# with it. Only confirm that a key was loaded.
print(nvidia_model_name, "| API key loaded:", bool(nvidia_api_key))

In [None]:
!pip install langchain-nvidia-ai-endpoints

In [None]:
!pip install --upgrade langchain langchain-core pydantic langchain-nvidia-ai-endpoints

In [None]:
from langchain_nvidia_ai_endpoints import ChatNVIDIA
# from langchain

# Chat model client for the NVIDIA endpoint, configured from the secrets read above.
# This is the `llm` that the RetrievalQA cell earlier in the notebook expects.
llm = ChatNVIDIA(
    model = nvidia_model_name,
    api_key = nvidia_api_key,
)

# from langchain_nvidia_ai_endpoints import ChatNVIDIA

# llm = ChatNVIDIA(model="mistralai/mixtral-8x22b-instruct-v0.1")