FINAL

In [1]:
!pip install -q sentence-transformers faiss-cpu transformers pdf2image Pillow pytesseract bitsandbytes

!apt-get update -qq
!apt-get install -y poppler-utils tesseract-ocr

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os, glob
import numpy as np
from pdf2image import convert_from_path
import faiss
import torch
from sentence_transformers import SentenceTransformer
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
import pytesseract
import re
import pickle
import time
from pypdf import PdfReader

2025-09-12 16:31:30.832514: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757694691.023077      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757694691.084970      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [10]:
FOLDER_PATH = "/kaggle/input/trial5"   
CHUNK_SIZE = 1500
OVERLAP = 200
TOP_K = 2
LLM_MAX_TOKENS = 200

def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=OVERLAP):
    chunks = []
    step = chunk_size - overlap if chunk_size > overlap else chunk_size
    start = 0
    while start < len(text):
        chunk = text[start:start + chunk_size].strip()
        if chunk:
            chunks.append(chunk)
        start += step
    return chunks

def clean_ocr_text(text: str) -> str:

    if not text:
        return ""
    # remove common OCR garbage
    text = text.replace('\r', '\n')
    # remove hyphenation at end of lines: 'exam-\nple' -> 'example'
    text = re.sub(r'(\w)-\n(\w)', r'\1\2', text)
    # join lines that don't end with sentence punctuation
    lines = text.splitlines()
    new_lines = []
    for i, line in enumerate(lines):
        line = line.strip()
        if not line:
            new_lines.append("")  # preserve paragraph breaks
            continue
        if new_lines and not re.search(r'[.!?:"\']$', new_lines[-1]):
            # previous line didn't end with punctuation -> likely broken sentence
            new_lines[-1] = new_lines[-1] + " " + line
        else:
            new_lines.append(line)
    cleaned = "\n\n".join([ln for ln in new_lines if ln.strip() != ""])
    # normalize whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned


def smart_chunk_text(text: str, chunk_size=CHUNK_SIZE, overlap=OVERLAP):
    
    if not text:
        return []
    # split into sentences (simple heuristic)
    sentences = re.split(r'(?<=[\.\?\!])\s+', text)
    chunks = []
    current = ""
    for sent in sentences:
        if len(current) + len(sent) + 1 <= chunk_size:
            current = (current + " " + sent).strip()
        else:
            if current:
                chunks.append(current)
            current = sent
    if current:
        chunks.append(current)

    # create overlap by sliding window of characters
    if overlap <= 0:
        return chunks

    merged = []
    step = chunk_size - overlap if chunk_size > overlap else chunk_size
    # re-join into one long string and slice with step (safer for sentence chunks)
    long_text = "\n\n".join(chunks)
    start = 0
    while start < len(long_text):
        part = long_text[start:start + chunk_size].strip()
        if part:
            merged.append(part)
        start += step
    return merged


def load_and_chunk_pdfs_simple(folder_path):

    all_chunks = []
    file_paths = sorted(glob.glob(os.path.join(folder_path, "*.pdf")))
    if not file_paths:
        raise FileNotFoundError(f"No PDFs found in {folder_path}")

    id_counter = 0
    for file_path in file_paths:
        basename = os.path.basename(file_path)
        try:
            from pypdf import PdfReader
            reader = PdfReader(file_path)
        except Exception as e:
            print(f"❌ Could not open {basename} with pypdf: {e}")
            continue

        for page_index, page in enumerate(reader.pages, start=1):
            try:
                text = page.extract_text() or ""
            except Exception:
                text = ""

            if not text.strip():
                continue

            # you can reuse smart_chunk_text for better splits
            page_chunks = smart_chunk_text(text)

            for ci, chunk in enumerate(page_chunks):
                uid = f"{basename.replace('.','_')}_p{page_index}_c{ci}_{id_counter}"
                id_counter += 1
                all_chunks.append({
                    "id": uid,
                    "content": chunk,
                    "metadata": {"source": basename, "page": page_index}
                })

        print(f"Simple text extraction done for {basename}, total chunks so far: {len(all_chunks)}")

    print(f"Total chunks created (simple): {len(all_chunks)}")
    return all_chunks

def load_and_chunk_pdfs(folder_path):
    all_chunks = []
    file_paths = sorted(glob.glob(os.path.join(folder_path, "*.pdf")))
    if not file_paths:
        raise FileNotFoundError(f"No PDFs found in {folder_path}")

    id_counter = 0
    for file_path in file_paths:
        basename = os.path.basename(file_path)
        try:
            pages = convert_from_path(file_path, dpi=300)  # convert each page to image
        except Exception as e:
            print(f"❌ Could not read {basename}: {e}")
            continue

        for page_index, page_image in enumerate(pages, start=1):
            try:
                raw_text = pytesseract.image_to_string(page_image, lang='eng')
            except Exception:
                raw_text = ""

            if not raw_text.strip():
                continue

            # clean OCR output and chunk
            page_text = clean_ocr_text(raw_text)
            page_chunks = smart_chunk_text(page_text)

            for ci, chunk in enumerate(page_chunks):
                uid = f"{basename.replace('.','_')}_p{page_index}_c{ci}_{id_counter}"
                id_counter += 1
                all_chunks.append({
                    "id": uid,
                    "content": chunk,
                    "metadata": {"source": basename, "page": page_index}
                })

        print(f"OCR completed for {basename}, total chunks so far: {len(all_chunks)}")

    print(f"Total chunks created: {len(all_chunks)}")
    return all_chunks

def load_and_chunk_pdfs_combined(folder_path):
    # First, get OCR chunks
    ocr_chunks = load_and_chunk_pdfs(folder_path)
    
    # Then, get simple/native text chunks
    simple_chunks = load_and_chunk_pdfs_simple(folder_path)
    
    # Combine both
    all_chunks = ocr_chunks + simple_chunks
    print(f"Total chunks after combining OCR and simple extraction: {len(all_chunks)}")
    return all_chunks


def create_faiss_index(chunks):
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    print("Encoding chunks...")
    texts = [chunk['content'] for chunk in chunks]
    embeddings = embedder.encode(texts, show_progress_bar=True, convert_to_numpy=True)

    faiss.normalize_L2(embeddings)

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension) 
    index.add(embeddings)

    print(f"FAISS index created with {len(chunks)} vectors.")

    metadata = [chunk['metadata'] for chunk in chunks]
    return embedder, index, texts, metadata


def load_falcon_generator():
    try:
        print("Loading Falcon 7B")
        tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b", use_fast=True)
        quant_config = BitsAndBytesConfig(load_in_8bit=True)
        model = AutoModelForCausalLM.from_pretrained(
            "tiiuae/falcon-7b",
            quantization_config=quant_config,
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True
        )
        generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device_map="auto"
        )
        print("Falcon 7B generator ready.")
        return generator
    except Exception:
        print("❌ Falcon-7B could not be loaded. Falling back to Falcon-7B-Instruct...")
        tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct", use_fast=True)
        model = AutoModelForCausalLM.from_pretrained(
            "tiiuae/falcon-7b-instruct",
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True
        )
        generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")
        print("Falcon-7B-Instruct generator ready.")
        return generator

def generate_answer_from_context(generator, query, retrieved_documents_with_meta):
    formatted = []
    for doc, meta in retrieved_documents_with_meta:
        src = meta.get("source", "unknown")
        pg = meta.get("page", "unknown")
        formatted.append(f"[{src} | page:{pg}]\n{doc}")
    context_block = "\n\n---\n\n".join(formatted)
    prompt = f"""Use ONLY the provided context to answer the question.
If the context does not contain enough info, say:
"not enough information"

Context:
{context_block}

Question:
{query}

Answer:"""
    output = generator(prompt, max_new_tokens=LLM_MAX_TOKENS, do_sample=False)
    text = output[0]["generated_text"]
    idx = text.find("Answer:")
    if idx != -1:
        return text[idx + len("Answer:"):].strip()
    return text.strip()


def answer_query_faiss(embedder, index, contents, metadata, generator, query, top_k=TOP_K):
    # embed and normalize query
    query_embedding = embedder.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(query_embedding)
    D, I = index.search(query_embedding, top_k)
    retrieved_chunks = [(contents[i], metadata[i]) for i in I[0]]
    return generate_answer_from_context(generator, query, retrieved_chunks)

In [12]:
if __name__ == "__main__":
    # 1) OCR + chunk
    chunks = load_and_chunk_pdfs_combined(FOLDER_PATH)

    # 2) build embeddings + index
    if len(chunks) == 0:
        raise SystemExit("No chunks were created. Check OCR output.")
    embedder, index, contents, metadata = create_faiss_index(chunks)
    # 3) load generator
    generator = load_falcon_generator()

    print("\nRAG system ready. Type 'quit' to exit.")
    while True:
        query = input("\nEnter your query: ").strip()
        if query.lower() == "quit":
            break
        answer = answer_query_faiss(embedder, index, contents, metadata, generator, query)
        print("\nFinal Answer:\n", answer)

OCR completed for Bajaj Hindusthan Sugar Limited 2022-2023 Business Responsibility and Sustainability Report (SustainabilityReports.com) 6.pdf, total chunks so far: 74
Total chunks created: 74
Simple text extraction done for Bajaj Hindusthan Sugar Limited 2022-2023 Business Responsibility and Sustainability Report (SustainabilityReports.com) 6.pdf, total chunks so far: 75
Total chunks created (simple): 75
Total chunks after combining OCR and simple extraction: 149
Encoding chunks...


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

FAISS index created with 149 vectors.
Loading Falcon 7B


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

❌ Falcon-7B could not be loaded. Falling back to Falcon-7B-Instruct...


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration_falcon.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b-instruct:
- configuration_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_falcon.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b-instruct:
- modeling_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 564.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 242.12 MiB is free. Process 2968 has 14.50 GiB memory in use. Of the allocated memory 13.53 GiB is allocated by PyTorch, and 856.94 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)