<a href="https://colab.research.google.com/github/be-great/bigdata_llm/blob/main/Chat1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chat

In [None]:
# Install a headless OpenJDK 11 quietly (-qq, stdout discarded); PySpark needs a JVM to run.
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
# PySpark is pinned to an exact release so the Spark ML cells behave reproducibly.
!pip install pyspark==3.5.3


In [None]:
!pip install torch
!pip install faiss-cpu
!pip install typing_extensions
!pip install bitsandbytes
!pip install peft
!pip install sentence_transformers


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# ====== CONFIG ======
# Root folder that every data/adapter path below is derived from.
base_file = "/content/bigdata_llm/"

# Hugging Face identifier of the base chat model.
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Folder holding the LoRA adapter files (adapter files should be inside).
ADAPTER_PATH = f"{base_file}data/output/adapters"

# Knowledge-base name -> path prefix. The loader appends ".index" and
# "_facts.npy" to each prefix.
KB_PATHS = {
    kb: f"{base_file}data/output/kb/{kb}"
    for kb in ("doctors", "patients", "appointments", "treatments", "billing")
}

# Default number of neighbours to retrieve per query.
TOP_K = 3

# ====== DEVICE ======
# Prefer the GPU when torch can see one, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print("Using device:", device)


# ===== LOAD FAISS INDEXES AND FACTS =====
# One FAISS index per knowledge base, read from "<prefix>.index".
kb_indexes = {
    name: faiss.read_index(f"{prefix}.index")
    for name, prefix in KB_PATHS.items()
}

# The matching fact arrays, read from "<prefix>_facts.npy"; the retrieval code
# looks facts up by the ids the index returns.
# NOTE(review): allow_pickle=True unpickles arbitrary objects from the file --
# only load .npy files produced by a trusted pipeline.
kb_facts = {
    name: np.load(f"{prefix}_facts.npy", allow_pickle=True)
    for name, prefix in KB_PATHS.items()
}

# ====== LOAD EMBEDDING MODEL ======
# Sentence encoder used to embed user queries, placed on the selected device.
embedder = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    device=device,
)


# ====== LOAD BASE + LoRA MODEL ======
from transformers import BitsAndBytesConfig

# Tokenizer for the base model; fall back to EOS as the padding token when the
# tokenizer does not define one.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

if torch.cuda.is_available():
    # Load in 4-bit (NF4) to save GPU memory; matmuls run in bfloat16.
    bnb = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb,
        device_map="auto",  # let the loader place the weights on the GPU
        torch_dtype=torch.bfloat16,
    )
else:
    # No CUDA device: bitsandbytes 4-bit loading has historically required a
    # GPU, so load the unquantized weights in float32 on the CPU instead of
    # failing at load time.
    bnb = None
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float32,
    )

# Attach the fine-tuned adapter weights and switch to inference mode.
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
model.eval()
# Report the device actually in use rather than always claiming "GPU".
print(f"✅ LoRA/Glora model loaded on {model.device}!")


# Ask domain-specific questions

In [None]:
from pyspark.sql import SparkSession
from sentence_transformers import SentenceTransformer
import numpy as np

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row

# Example sentences for training a simple "domain vs. general" classifier.
domain_examples = [
    "Which doctor specializes in cardiology?",
    "Patient billing amount for last visit",
    "Schedule an appointment for John Doe",
    "Treatment cost for chemotherapy",
    "Patient appointment time"
]
general_examples = [
    "Hi",
    "How are you?",
    "Tell me a joke",
    "What's the weather?",
    "Good morning"
]

# Create embeddings. This rebinds the global `embedder`, which the retrieval
# and classification helpers defined later read at call time.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
domain_emb = embedder.encode(domain_examples)
general_emb = embedder.encode(general_examples)

# Train the classifier in Spark.
spark = SparkSession.builder.appName("DomainClassifier").getOrCreate()

# Prepare labeled data: label 1.0 = domain question, 0.0 = general chit-chat.
rows = [Row(features=Vectors.dense(vec), label=1.0) for vec in domain_emb]
rows += [Row(features=Vectors.dense(vec), label=0.0) for vec in general_emb]

df = spark.createDataFrame(rows)

# Train a simple logistic regression classifier on the sentence embeddings.
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=50)
model_re = lr.fit(df)


In [None]:
from pyspark.sql import SparkSession, Row

# Bind `spark` for the helper functions below; the builder call is
# get-or-create, so it does not insist on starting a new session.
spark = (
    SparkSession.builder
    .appName("DomainQuestionChecker")
    .getOrCreate()
)

In [None]:
def get_kb_name(query):
    """Pick the knowledge base whose keywords best match ``query``.

    Matching is case-insensitive substring search. Each knowledge base is
    scored by how many of its keywords occur in the query and the highest
    score wins; ties go to the knowledge base listed first (doctors,
    appointments, treatments, billing, patients). Scoring avoids routing
    "Patient billing amount for last visit" to appointments merely because
    it mentions "visit".

    Args:
        query: The user's question as a string.

    Returns:
        The knowledge-base name, or ``None`` when no keyword matches
        (i.e. a general question with no knowledge base).
    """
    routing_rules = (
        ("doctors", ("doctor", "specialization", "years experience")),
        ("appointments", ("appointment", "visit", "schedule")),
        ("treatments", ("treatment", "cost", "description")),
        ("billing", ("bill", "payment", "amount")),
        ("patients", ("patient",)),
    )
    lowered = query.lower()

    def hit_count(rule):
        # Number of this rule's keywords present in the query.
        return sum(keyword in lowered for keyword in rule[1])

    # max() returns the first maximal element, which keeps the listed
    # priority order as the tie-breaker.
    best_rule = max(routing_rules, key=hit_count)
    if hit_count(best_rule) == 0:
        return None  # no KB for general question
    return best_rule[0]

def retrieve_context(query, top_k):
    """Retrieve the ``top_k`` most similar facts from the matching KB.

    Args:
        query: The user's question.
        top_k: Number of nearest neighbours to request from the FAISS index.

    Returns:
        The retrieved facts joined by newlines, or the fixed sentence
        "I don't know :(." when no knowledge base matches the query.
    """
    # Route first so general questions skip the embedding step entirely.
    kb_name = get_kb_name(query)
    if not kb_name:
        return "I don't know :(."

    q_vec = embedder.encode([query], convert_to_numpy=True).astype("float32")
    _, neighbor_ids = kb_indexes[kb_name].search(q_vec, top_k)

    facts = kb_facts[kb_name]
    # FAISS pads the result with -1 when fewer than top_k neighbours exist;
    # a negative id would index from the end of `facts` and return an
    # unrelated fact, so drop those placeholders.
    return "\n".join(facts[i] for i in neighbor_ids[0] if i >= 0)
def is_domain_question(query, threshold):
    """Decide whether ``query`` is a domain question.

    The query is embedded and scored by the trained logistic-regression
    model. The single vector is scored directly with ``predictProbability``
    rather than by building a one-row Spark DataFrame and running a job for
    every user message.

    Args:
        query: The user's question.
        threshold: Minimum probability of the domain class (label 1.0).

    Returns:
        ``True`` when the domain-class probability is above ``threshold``.
    """
    from pyspark.ml.linalg import Vectors

    vec = embedder.encode([query])[0]
    probabilities = model_re.predictProbability(Vectors.dense(vec))
    # Index 1 is the probability of label 1.0 (domain); bool() turns the
    # numpy comparison result into a plain Python bool.
    return bool(probabilities[1] > threshold)

# Interactive chat loop: type "exit" or "quit" (or interrupt the cell) to stop.
while True:
    try:
        query = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed input stream or an interrupted cell ends the chat cleanly.
        break
    if query.lower() in ["exit", "quit"]:
        break
    if not query:
        # Nothing to answer; ask again instead of generating from an empty prompt.
        continue

    # ====== Decide whether to retrieve context ======
    if is_domain_question(query, threshold=0.5):
        # Domain question: prepend the retrieved fact(s). retrieve_context
        # returns a fixed fallback sentence when no knowledge base matches.
        context = retrieve_context(query, top_k=1)
        prompt = f"\nContext:\n{context}\nQuestion: {query}\n\n"
    else:
        # Non-domain question: normal LLM response
        prompt = f"{query}\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        # temperature/top_p only take effect when sampling is enabled;
        # without do_sample=True generation is greedy and they are ignored.
        do_sample=True,
        temperature=0.4,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
    )

    # generate() returns prompt + continuation for a causal LM; decode only
    # the newly generated tokens so the answer does not echo the prompt.
    prompt_len = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
    print("Answer:", answer, "\n")


# Evaluate

1. Domain Classification Accuracy: Compute Precision, Recall, F1-Score, and Accuracy using a labeled dataset of domain vs. non-domain questions.
2. Retrieval Quality Metrics: used to evaluate the FAISS-based retrieval:
