<a href="https://colab.research.google.com/github/Yashmitha22/BOSE/blob/main/QAsystem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install & show versions (may require runtime restart after bitsandbytes)
!pip install -q --upgrade pip
!apt-get -y install -qq git-lfs

# Core libs
!pip install -q transformers accelerate datasets peft bitsandbytes sentence-transformers faiss-cpu pdfplumber tiktoken

# Optional evaluation tools
!pip install -q evaluate

# Print versions for debugging

import importlib
pkgs = ["transformers","accelerate","bitsandbytes","peft","datasets","sentence_transformers","faiss","pdfplumber","tiktoken"]
for p in pkgs:
    try:
        m = importlib.import_module(p)
        print(p, getattr(m, "__version__", "n/a"))
    except Exception as e:
        print(p, "NOT INSTALLED:", e)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m0.8/1.8 MB[0m [31m23.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25htransformers 4.57.2
accelerate 1.12.0
bitsandbytes 0.48.2
peft 0.18.0
datasets 4.0.0
sentence_transformers 5.1.2
faiss 1.13.0
pdfplumber 0.11.8
tiktoken 0.12.0


In [None]:
import os

# === Edit these if you used different paths ===
# Folder scanned for input PDFs; the discovery/extraction cells also scan /content/*.pdf.
# NOTE(review): no visible cell mounts Google Drive, so this path presumably does not
# exist unless Drive is mounted separately — the check printed at the end reports it.
PDF_FOLDER = "/content/drive/MyDrive/slm_pdfs"   # where you will put EX1280C.pdf, DM8SE.pdf etc.
# Directory that receives one extracted .txt file per PDF.
OUT_TEXT_DIR = "/content/pdf_texts"
# JSONL file with one chunk record (source_file, chunk_id, text) per line.
CHUNKS_JSONL = "/content/all_chunks.jsonl"
# JSONL file with the synthetic instruction/input/output examples used for fine-tuning.
SFT_JSONL = "/content/sft_dataset.jsonl"
# FAISS index file and the JSON list of chunk records written next to it.
FAISS_INDEX_PATH = "/content/faiss_index.idx"
FAISS_META_PATH = "/content/faiss_meta.json"
# Trainer output directory and the directory the LoRA adapter + tokenizer are saved to.
# NOTE: the directory names say "gemma" but BASE_MODEL below is a Qwen model;
# the names are only paths and do not select a model.
OUTPUT_DIR = "/content/gemma-qlora-output"
ADAPTER_DIR = "/content/gemma-qlora-adapter"

# Model & embedding settings
# BASE_MODEL is the causal LM that gets quantized and fine-tuned;
# EMB_MODEL_NAME is the SentenceTransformer used to embed chunks and questions.
BASE_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"
EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Chunking & training hyperparams (tune for your GPU)
# CHUNK_SIZE / CHUNK_OVERLAP are counted in whitespace-separated words:
# consecutive chunks start CHUNK_SIZE - CHUNK_OVERLAP words apart.
CHUNK_SIZE = 800
CHUNK_OVERLAP = 128
# Number of chunks retrieved from the FAISS index for each question.
TOP_K = 3
# The next four values are passed straight to TrainingArguments.
NUM_EPOCHS = 2
PER_DEVICE_BATCH_SIZE = 1
GRAD_ACCUM_STEPS = 8
LEARNING_RATE = 2e-4
# max_length used when tokenizing training prompts (longer prompts are truncated,
# shorter ones padded to this length).
MAX_SEQ_LEN = 1024

# Sanity check only: prints True/False, does not create the folder.
print("Paths set. Check PDF_FOLDER exists:", os.path.exists(PDF_FOLDER))


Paths set. Check PDF_FOLDER exists: False


In [None]:
import glob, os
# Collect candidate PDFs: first from the configured folder, then from /content.
pdfs = []
for pattern in (os.path.join(PDF_FOLDER, "*.pdf"), "/content/*.pdf"):
    pdfs.extend(glob.glob(pattern))
print("Found PDF files:", pdfs)

# Stop the run early when there is nothing to process.
if not pdfs:
    raise SystemExit("No PDF files found. Upload your datasheet PDFs (EX1280C.pdf, DM8SE.pdf) to the path in PDF_FOLDER or /content/ and re-run.")


Found PDF files: ['/content/loud speaker dataset.pdf', '/content/DSP dataset.pdf']


In [None]:
import os, pdfplumber, re, glob, json
# Create the directory for extracted text files; exist_ok=True makes re-running this cell harmless.
os.makedirs(OUT_TEXT_DIR, exist_ok=True)

def clean_text(t):
    """Return ``t`` with every whitespace run collapsed to one space and both ends trimmed."""
    # Splitting on whitespace drops leading/trailing runs and collapses inner
    # ones; re-joining with a single space yields the normalized text.
    return " ".join(t.split())

# Extract the text of every discovered PDF into OUT_TEXT_DIR (one .txt per PDF).
pdf_paths = glob.glob(os.path.join(PDF_FOLDER, "*.pdf")) + glob.glob("/content/*.pdf")
print("Extracting from:", pdf_paths)
for p in pdf_paths:
    # Output file is named after the PDF, with the extension swapped for .txt.
    fname, _ = os.path.splitext(os.path.basename(p))
    outp = os.path.join(OUT_TEXT_DIR, fname + ".txt")

    # Earlier extractions are reused rather than overwritten.
    if os.path.exists(outp):
        print("Skipping existing:", outp)
        continue

    try:
        with pdfplumber.open(p) as pdf:
            # Pages for which extract_text() returns nothing contribute an empty string.
            text_parts = [(page.extract_text() or "") for page in pdf.pages]
        full = clean_text(" ".join(text_parts))
        with open(outp, "w", encoding="utf-8") as f:
            f.write(full)
        print("Saved:", outp)
    except Exception as e:
        # Best effort: report the failing PDF and carry on with the others.
        print("Failed to extract", p, ":", e)
print("Extraction complete. Text files in:", OUT_TEXT_DIR)


Extracting from: ['/content/loud speaker dataset.pdf', '/content/DSP dataset.pdf']
Saved: /content/pdf_texts/loud speaker dataset.txt
Saved: /content/pdf_texts/DSP dataset.txt
Extraction complete. Text files in: /content/pdf_texts


In [None]:
import glob, json
def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces). Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is outside
            ``[0, chunk_size)``. An overlap >= chunk_size would make the window
            stop advancing (previously an infinite loop).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()
    n = len(words)
    step = chunk_size - overlap
    chunks = []
    for start in range(0, n, step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the end of the text, any further window would
        # only repeat words already contained in this chunk, so stop here.
        if start + chunk_size >= n:
            break
    return chunks

# Chunk every extracted text file and persist the chunk records as JSONL.
text_files = glob.glob(os.path.join(OUT_TEXT_DIR, "*.txt"))
all_chunks = []
for tfile in text_files:
    source_name = os.path.basename(tfile)
    with open(tfile, "r", encoding="utf-8") as f:
        txt = f.read()
    chunks = chunk_text(txt)
    # chunk_id is "<file name>_<position of the chunk within that file>".
    all_chunks.extend(
        {"source_file": source_name, "chunk_id": f"{source_name}_{ci}", "text": c}
        for ci, c in enumerate(chunks)
    )

print("Total chunks:", len(all_chunks))
with open(CHUNKS_JSONL, "w", encoding="utf-8") as f:
    # One JSON object per line; non-ASCII characters are written as-is.
    f.writelines(json.dumps(record, ensure_ascii=False) + "\n" for record in all_chunks)
print("Saved chunks to", CHUNKS_JSONL)


Total chunks: 4
Saved chunks to /content/all_chunks.jsonl


In [None]:
import random, json
# Shuffle the chunk list in place. No seed is set in the visible cells, so the
# order differs between runs. The FAISS cell later reads this same shuffled list
# for both the embeddings and the saved metadata, so the two stay aligned.
random.shuffle(all_chunks)

def make_sft_example(chunk):
    """Build one synthetic instruction-tuning record from a chunk record.

    Args:
        chunk: Mapping with a ``"text"`` entry holding the chunk's text.

    Returns:
        A dict with ``"instruction"``, ``"input"`` and ``"output"`` keys.
    """
    passage = chunk["text"]
    return {
        "instruction": (
            "You are a technical domain expert. Read the passage and produce a concise summary "
            "followed by 5 key facts or specification items in bullet form."
        ),
        # The "input" field is left empty: the passage only appears in "output".
        "input": "",
        # NOTE: the chunk itself (first 2000 characters) stands in for the
        # 'answer' summary. Replace with human-written Q/A later.
        "output": "SUMMARY:\n" + passage[:2000],
    }

# Write one synthetic SFT example per chunk, one JSON object per line.
with open(SFT_JSONL, "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(make_sft_example(c), ensure_ascii=False) + "\n"
        for c in all_chunks
    )
print("Saved SFT dataset (synthetic) to", SFT_JSONL)


Saved SFT dataset (synthetic) to /content/sft_dataset.jsonl


In [None]:
from sentence_transformers import SentenceTransformer
import faiss, numpy as np, json

# Embed every chunk, then build and persist an inner-product FAISS index.
print("Loading embedding model:", EMB_MODEL_NAME)
emb_model = SentenceTransformer(EMB_MODEL_NAME)

chunk_texts = [record["text"] for record in all_chunks]
print("Encoding", len(chunk_texts), "chunks...")
embeddings = emb_model.encode(chunk_texts, show_progress_bar=True, convert_to_numpy=True)

# Normalize the vectors before indexing: on unit-length vectors the inner
# product used by IndexFlatIP is the cosine similarity.
faiss.normalize_L2(embeddings)
d = embeddings.shape[1]
index = faiss.IndexFlatIP(d)
index.add(embeddings)
faiss.write_index(index, FAISS_INDEX_PATH)

# Store the chunk records in the same order as the indexed vectors so a
# search result position can be mapped back to its text.
with open(FAISS_META_PATH, "w", encoding="utf-8") as f:
    f.write(json.dumps(all_chunks))
print("Saved FAISS index at", FAISS_INDEX_PATH)


Loading embedding model: sentence-transformers/all-MiniLM-L6-v2


special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Encoding 4 chunks...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved FAISS index at /content/faiss_index.idx


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling

print("Preparing for QLoRA training with", BASE_MODEL)

# 4-bit BitsAndBytes settings: NF4 quantization, double quantization enabled,
# float16 compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Tokenizer; when it has no pad token, the EOS token is registered as padding.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})

# Keyword arguments shared by the 4-bit attempt and the 8-bit fallback.
load_kwargs = {"device_map": "auto", "torch_dtype": torch.float16}

# First choice is 4-bit; any failure falls back to 8-bit quantization.
try:
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL, quantization_config=quant_config, **load_kwargs
    )
    print("Loaded base model in 4-bit.")
except Exception as e:
    print("4-bit loading failed:", e)
    print("Falling back to 8-bit quantization (safe).")
    quant_config2 = BitsAndBytesConfig(load_in_8bit=True)
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL, quantization_config=quant_config2, **load_kwargs
    )

# Prepare for k-bit LoRA training and attach LoRA.
# The quantized base model is passed through prepare_model_for_kbit_training
# first and only then wrapped with the adapter; keep this order.
model = prepare_model_for_kbit_training(model)

# LoRA settings: rank 16, alpha 32, dropout 0.05, applied to the modules named
# q_proj / k_proj / v_proj / o_proj, for a causal-LM task.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
# From here on `model` is the PEFT-wrapped model that the Trainer receives.
model = get_peft_model(model, lora_config)

# Load dataset: the JSONL file at SFT_JSONL, taking its "train" split.
dataset = load_dataset("json", data_files=SFT_JSONL)["train"]

def format_prompt(ins, inp, out):
    """Render one training example as instruction / input / response sections.

    Each section is a ``### <Name>:`` header line followed by its body, and the
    sections are separated by a blank line.
    """
    sections = (
        ("### Instruction:", ins),
        ("### Input:", inp),
        ("### Response:", out),
    )
    return "\n\n".join(f"{header}\n{body}" for header, body in sections)

def tokenize_fn(batch):
    """Tokenize a batch of instruction/input/output rows for causal-LM training.

    Args:
        batch: Mapping of column name to list of values, with
            ``"instruction"``, ``"input"`` and ``"output"`` columns.

    Returns:
        The tokenizer output for the formatted prompts (truncated and padded to
        MAX_SEQ_LEN) with a ``"labels"`` entry that copies ``"input_ids"``.
    """
    rows = zip(batch["instruction"], batch["input"], batch["output"])
    prompts = [format_prompt(ins, inp, out) for ins, inp, out in rows]
    encoded = tokenizer(
        prompts,
        truncation=True,
        padding="max_length",
        max_length=MAX_SEQ_LEN,
    )
    # The labels start out as a copy of the input ids.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

# Tokenize the whole dataset in batches; the original instruction/input/output
# columns are dropped so only the tokenizer's features remain.
print("Tokenizing dataset...")
tokenized = dataset.map(tokenize_fn, batched=True, remove_columns=dataset.column_names)

# Collator configured with masked-language-modeling disabled (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training configuration; batch size, accumulation, epochs and learning rate
# come from the constants in the configuration cell.
# NOTE(review): the recorded output shows an empty "Step,Training Loss" table;
# with logging_steps=10 and only 4 examples this is presumably because fewer
# than 10 optimizer steps ran — confirm, and lower logging_steps if loss
# values are wanted.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    fp16=True,
    logging_steps=10,
    save_total_limit=3,
    optim="paged_adamw_32bit",
    remove_unused_columns=False,
    report_to="none"
)

# The Trainer receives the PEFT-wrapped model and the tokenized dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=data_collator,
)

print("Start training. If Colab kernel crashes, lower batch size or gradient accumulation.")
trainer.train()

# Persist the result next to the tokenizer so the inference cell can reload both
# from ADAPTER_DIR.
# NOTE(review): `model` is the PEFT-wrapped model, so this presumably writes only
# the adapter weights rather than the full base model — confirm.
print("Saving LoRA adapter to", ADAPTER_DIR)
model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)
print("Saved.")


Preparing for QLoRA training with Qwen/Qwen2.5-1.5B-Instruct


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Loaded base model in 4-bit.


Generating train split: 0 examples [00:00, ? examples/s]

Tokenizing dataset...


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Start training. If Colab kernel crashes, lower batch size or gradient accumulation.


  return fn(*args, **kwargs)


Step,Training Loss


Saving LoRA adapter to /content/gemma-qlora-adapter
Saved.


In [None]:
import json, faiss
from sentence_transformers import SentenceTransformer
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Retrieval side: embedding model, FAISS index and the chunk records saved with it.
emb_model = SentenceTransformer(EMB_MODEL_NAME)
index = faiss.read_index(FAISS_INDEX_PATH)
with open(FAISS_META_PATH, "r", encoding="utf-8") as f:
    meta = json.load(f)

# Same 4-bit quantization settings as used for training.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Tokenizer; when it has no pad token, the EOS token is registered as padding.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})

# Keyword arguments shared by the 4-bit attempt and the 8-bit fallback.
load_kwargs = {"device_map": "auto", "torch_dtype": torch.float16}

# Base model: 4-bit first, 8-bit if that fails for any reason.
try:
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL, quantization_config=quant_config, **load_kwargs
    )
except Exception as e:
    print("4-bit load failed at inference, falling back to 8-bit:", e)
    quant_config2 = BitsAndBytesConfig(load_in_8bit=True)
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL, quantization_config=quant_config2, **load_kwargs
    )

# Wrap the base model with the adapter saved in ADAPTER_DIR and switch to eval mode.
model = PeftModel.from_pretrained(base, ADAPTER_DIR)
model.eval()

def retrieve(question, k=TOP_K):
    """Return the texts of the chunks most similar to ``question``.

    Args:
        question: The user question to embed and search for.
        k: Maximum number of chunks to return. It is capped at the number of
            indexed vectors, so fewer texts may come back.

    Returns:
        A list of chunk texts in the order returned by the index search. The
        list is empty when ``k`` is not positive or the index holds no vectors.
    """
    # Asking FAISS for more neighbours than it holds pads the result with -1,
    # and meta[-1] would then silently return the last chunk. Cap k up front.
    k = min(k, index.ntotal, len(meta))
    if k <= 0:
        return []

    q_emb = emb_model.encode([question], convert_to_numpy=True)
    # The indexed vectors were L2-normalized, so normalize the query as well.
    faiss.normalize_L2(q_emb)
    _scores, ids = index.search(q_emb, k)
    # Defensive filter: drop any position that does not map to a stored chunk.
    return [meta[i]["text"] for i in ids[0] if 0 <= i < len(meta)]

def make_prompt(question, contexts):
    """Assemble the generation prompt from a question and its retrieved contexts.

    The prompt consists of a fixed system sentence, one ``[CONTEXT]`` block per
    context (in the given order), the question, and a trailing ``Answer:`` cue.
    """
    header = "You are a domain expert assistant. Use only the context to answer.\n\n"
    context_block = "\n".join(f"[CONTEXT]\n{c}" for c in contexts)
    footer = f"\n\nQuestion: {question}\nAnswer:"
    return f"{header}{context_block}{footer}"

def extract_answer(full_output):
    """Extract only the generated answer after 'Answer:'.

    When the marker is absent the whole text is returned, stripped. Otherwise
    the text after the LAST 'Answer:' is taken and cut at the first occurrence
    of each prompt-like marker, checked in a fixed order.
    """
    marker = "Answer:"
    if marker not in full_output:
        return full_output.strip()

    # Everything after the last marker is the candidate answer.
    _, _, tail = full_output.rpartition(marker)
    answer = tail.strip()

    # Cut off anything that looks like the model repeating the prompt.
    for stop in ("### Instruction", "[CONTEXT", "Question:"):
        head, found, _ = answer.partition(stop)
        if found:
            answer = head.strip()

    return answer

# Interactive loop: read a question, retrieve supporting chunks, generate an
# answer and print it. Typing 'exit' or 'quit' (any case) ends the loop.
while True:
    q = input("\nEnter question (or 'exit'): ").strip()
    if q.lower() in ("exit", "quit"):
        break
    if not q:
        # Blank input: ask again instead of running retrieval and a full
        # generation pass on an empty question.
        continue

    contexts = retrieve(q, k=TOP_K)
    prompt = make_prompt(q, contexts)

    # Move the encoded prompt to the device the model lives on.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Greedy decoding (do_sample=False), capped at 256 new tokens.
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=256, do_sample=False)

    # The decoded text is passed through extract_answer to isolate the part
    # after the prompt's trailing "Answer:" cue.
    full_output = tokenizer.decode(out[0], skip_special_tokens=True)
    final_answer = extract_answer(full_output)

    print("\n=== ANSWER ===\n", final_answer)



Enter question (or 'exit'): "What is the maximum number of analog inputs on the EX-1280C?"

=== ANSWER ===
 Based on the provided technical data, the EX-1280C has 12 analog inputs. This can be determined from the following relevant excerpt:

"Analog Input Voltage Range 0 V to 3.3 V (threshold voltage = 1.6 V)" 
This indicates that each analog input has an associated threshold voltage, but it does not specify the total number of inputs. However, since there are 12 separate entries under "Input Channels", this suggests that the device has 12 analog inputs. Therefore, the correct answer is 12.

Enter question (or 'exit'): "What is the Dynamic Range of the analog signal path?"

=== ANSWER ===
 The maximum dynamic range of the analog signal path is greater than 115 dB, as indicated by the statement "Dynamic Range > 115 dB, A-weighted 20 Hz – 20 kHz, analog input to analog output". This value represents the highest possible difference between the loudest and quietest parts of the signal bef

In [None]:
# Helpful diagnostics (run only if you get an AcceleratorError)
import os, torch
# Report whether a CUDA device is visible and, if so, the name of device 0.
cuda_ok = torch.cuda.is_available()
print("CUDA available:", cuda_ok)
if cuda_ok:
    device_name = torch.cuda.get_device_name(0)
    print(device_name)
print("If device-side asserts persist, set CUDA_LAUNCH_BLOCKING=1 and rerun the failing cell.")
# To use it (restart kernel after setting):
# import os
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
