In [None]:
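# Colab setup: install the Python dependencies used below.
# NOTE: pytesseract is only a wrapper around the Tesseract OCR engine; the
# engine binary must also be available in the runtime (e.g. via
# `!apt-get install -y tesseract-ocr`) if it is not already installed.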
!pip install langchain faiss-cpu sentence-transformers pytesseract pdfplumber opencv-python-headless

import os
import pytesseract
import pdfplumber
import cv2
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from tqdm import tqdm
from pathlib import Path

# Input documents are read from DATA_DIR; the FAISS index is written to INDEX_DIR.
DATA_DIR = "/content/dataset"
INDEX_DIR = "/content/faiss_indexes"

# Create the output directory (and any missing parents) up front; a
# pre-existing directory is not an error.
Path(INDEX_DIR).mkdir(parents=True, exist_ok=True)

# Sentence-embedding model used to vectorize every text chunk.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def ocr_image(img_path):
    """Run Tesseract OCR on an image file and return the recognized text.

    The image is converted to grayscale and binarized with a fixed
    threshold of 180 before being handed to pytesseract.

    Args:
        img_path: Path to the image file, as a string.

    Returns:
        The text returned by ``pytesseract.image_to_string``.

    Raises:
        OSError: If OpenCV cannot read ``img_path``.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports failure (missing, unreadable or undecodable
        # file) by returning None instead of raising; fail with a message
        # that names the file rather than an opaque cvtColor assertion.
        raise OSError(f"Could not read image: {img_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Binarize: pixels above 180 become white (255), the rest black.
    _, thresh = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY)
    return pytesseract.image_to_string(thresh)

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined by newlines.

    Pages for which ``extract_text()`` yields nothing contribute an empty
    string, so the number of joined segments equals the page count.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A single string with one segment per page, separated by ``"\\n"``.
    """
    with pdfplumber.open(pdf_path) as document:
        page_texts = [page.extract_text() or "" for page in document.pages]
    return "\n".join(page_texts)

def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart. Chunking
    stops as soon as a chunk reaches the last word, so no trailing chunk is
    emitted that consists only of words already present in the previous one.

    Args:
        text: Input string; it is split on any whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings whose words are joined by single spaces.
        The list is empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size would give a zero or negative stride;
        # a negative overlap would silently skip words between chunks.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        if start + chunk_size >= len(words):
            # This window already covers the tail of the text; any further
            # window would only repeat overlap words.
            break
    return chunks

# Parallel lists: sources[i] is the path of the file that texts[i] came from.
texts = []
sources = []

# sorted() makes the traversal order (and therefore the chunk/vector order)
# reproducible; rglob() alone yields entries in filesystem-dependent order.
for file in tqdm(sorted(Path(DATA_DIR).rglob("*"))):
    # rglob("*") also yields directories; skip anything that is not a
    # regular file, even if its name ends in a supported suffix.
    if not file.is_file():
        continue

    suffix = file.suffix.lower()
    if suffix in (".png", ".jpg", ".jpeg"):
        txt = ocr_image(str(file))
    elif suffix == ".pdf":
        txt = extract_text_from_pdf(str(file))
    else:
        # Unsupported file type: ignore.
        continue

    chunks = chunk_text(txt)
    texts.extend(chunks)
    sources.extend([str(file)] * len(chunks))

# Embed and store in FAISS
if not texts:
    # Without any chunks there is no embedding matrix to read a dimension
    # from; stop with a message that points at the input directory.
    raise RuntimeError(
        f"No text chunks were extracted from {DATA_DIR}; nothing to index."
    )

embeddings = model.encode(texts, show_progress_bar=True)
# FAISS expects a contiguous float32 matrix of shape (n_chunks, dim).
vectors = np.ascontiguousarray(embeddings, dtype="float32")
dim = vectors.shape[1]
index = faiss.IndexFlatL2(dim)
# Vectors are added in the order of `texts`, so index row i corresponds to
# texts[i] / sources[i]. Only the vectors are written to disk here; the
# texts and sources lists are not persisted by this cell.
index.add(vectors)
index_path = os.path.join(INDEX_DIR, "ncert_jee_neet.index")
faiss.write_index(index, index_path)

print(f"Indexed {len(texts)} chunks → {index_path}")
