In [1]:
!pip install pypdf sentence-transformers langchain langchain-community faiss-cpu google-generativeai



Collecting pypdf
  Downloading pypdf-6.4.0-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.0-py3-none-any.whl.metadata (3.9 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7.0,>=0.6.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7.0,>=0.6.7->langchain-community)
  Downloading typi

In [4]:
import os
import re
from pypdf import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
import google.generativeai as genai


In [5]:
import os
import re
from pypdf import PdfReader

# Directory that holds the source PDF files
pdf_dir = "/content/data"

# Keep only the directory entries whose names end in ".pdf"
pdf_files = list(filter(lambda name: name.endswith(".pdf"), os.listdir(pdf_dir)))
print("Total PDFs found:", len(pdf_files))

# Text clean-up helper for raw PDF page text
def clean_text(t):
    """Return `t` with noisy tokens replaced by spaces and whitespace collapsed.

    `None` yields an empty string. Otherwise the text is round-tripped through
    UTF-8 (dropping anything that cannot be encoded), then three substitutions
    run in order: backslash-commands (LaTeX), runs of non-ASCII characters, and
    word-character runs of 40 or more. Finally all whitespace runs are collapsed
    to single spaces and the ends are trimmed.
    """
    if t is None:
        return ""

    text = t.encode("utf-8", "ignore").decode("utf-8", "ignore")

    # Order matters: each pattern sees the output of the previous one
    for pattern in (r"\\[a-zA-Z]+", r"[^\x00-\x7F]+", r"\w{40,}"):
        text = re.sub(pattern, " ", text)

    return " ".join(text.split())

# Pull the text out of every page of every PDF, clean it, and keep the
# non-empty pages as newline-terminated pieces of one big string.
page_texts = []

for pdf in pdf_files:
    print("Reading:", pdf)
    try:
        reader = PdfReader(os.path.join(pdf_dir, pdf))
        for page in reader.pages:
            cleaned = clean_text(page.extract_text())
            if cleaned.strip():
                page_texts.append(cleaned + "\n")
    except Exception as e:
        # Best effort: report the failing file and continue with the next one;
        # pages already collected from this file are kept.
        print(f"Error reading {pdf}: {e}")

all_text = "".join(page_texts)

print("\nTotal characters extracted:", len(all_text))


Total PDFs found: 3
Reading: ipc.pdf
Reading: crpc.pdf
Reading: india_cons.pdf

Total characters extracted: 2146089


In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Configure the splitter with a chunk size of 130 and an overlap of 50
splitter_options = {"chunk_size": 130, "chunk_overlap": 50}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the whole extracted corpus into chunks
chunks = text_splitter.split_text(all_text)

print("Total chunks created:", len(chunks))
print("Example chunk:\n", chunks[0][:500], "...")

Total chunks created: 26730
Example chunk:
 1 THE INDIAN PENAL CODE ___________ ARRANGEMENT OF SECTIONS __________ CHAPTER I INTRODUCTION PREAMBLE SECTIONS 1. Title and ...


In [9]:
def _is_usable_chunk(text):
    """Return True when an already-stripped chunk passes the length and symbol filters."""
    # Reject tiny garbage chunks (< 50 chars) and extremely large ones (> 3000 chars)
    if not 50 <= len(text) <= 3000:
        return False

    # Count characters that are neither alphanumeric nor one of " .,-()"
    symbol_count = sum(1 for ch in text if not (ch.isalnum() or ch in " .,-()"))
    return symbol_count <= 50


# Non-string entries are skipped; surviving chunks are stored stripped
clean_chunks = [
    stripped
    for stripped in (c.strip() for c in chunks if isinstance(c, str))
    if _is_usable_chunk(stripped)
]

print("Original chunks:", len(chunks))
print("Clean usable chunks:", len(clean_chunks))
print("Example cleaned chunk:\n", clean_chunks[0][:500], "...")


Original chunks: 26730
Clean usable chunks: 26721
Example cleaned chunk:
 1 THE INDIAN PENAL CODE ___________ ARRANGEMENT OF SECTIONS __________ CHAPTER I INTRODUCTION PREAMBLE SECTIONS 1. Title and ...


In [11]:
from sentence_transformers import SentenceTransformer
from langchain.embeddings.base import Embeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# 1. LangChain-compatible wrapper around a SentenceTransformer model
class MiniLMEmbeddings(Embeddings):
    """Embeddings implementation backed by a sentence-transformers model."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Load the SentenceTransformer identified by `model_name`."""
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Encode a batch of texts and return the vectors as nested Python lists."""
        vectors = self.model.encode(texts)
        return vectors.tolist()

    def embed_query(self, text):
        """Encode a single text by sending it through the batch path as a one-item list."""
        return self.embed_documents([text])[0]

# 2. Initialize embedding model
embeddings = MiniLMEmbeddings()

# 3. Convert chunks → LangChain Documents
documents = [Document(page_content=c) for c in clean_chunks]

# 4. Build FAISS vector DB
print("Embedding & building FAISS… this may take ~1–3 minutes.")

vectorstore = FAISS.from_documents(documents, embeddings)

# 5. Create retriever
# The number of results has to be passed through `search_kwargs`; a bare `k=5`
# keyword is not a retriever setting, so the search would fall back to the
# vector store's default result count instead of 5.
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

print("FAISS Vector Database created successfully!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding & building FAISS… this may take ~1–3 minutes.
FAISS Vector Database created successfully!


In [12]:
# Persist the in-memory vector store to a folder on disk
vectorstore.save_local("/content/faiss_index")
print("FAISS vector database saved to /content/faiss_index")

FAISS vector database saved to /content/faiss_index


In [13]:
import os

# Second copy of the index in its own folder; the later loading cells read
# index.faiss and index.pkl from this location.
alt_path = "/content/faiss_index_alt"
os.makedirs(alt_path, exist_ok=True)  # no error if the folder already exists
vectorstore.save_local(alt_path)
print(f"FAISS vector database saved to {alt_path}")

FAISS vector database saved to /content/faiss_index_alt


In [14]:
import json
import os

# `documents` is the list of langchain_core Document objects built in an
# earlier cell; each one is read through `page_content` and `metadata`.
json_output_path = os.path.join(alt_path, "documents.json")

# One plain dict per Document so the list can be serialised as JSON
documents_as_dicts = []
for doc in documents:
    documents_as_dicts.append(
        {"page_content": doc.page_content, "metadata": doc.metadata}
    )

# Serialise with 4-space indentation and write the result in one go
with open(json_output_path, "w") as f:
    f.write(json.dumps(documents_as_dicts, indent=4))

print(f"All document content saved to {json_output_path} in JSON format.")

All document content saved to /content/faiss_index_alt/documents.json in JSON format.


In [19]:
!pip install -q google-generativeai  ## CREATE GOOGLE DEPENTENCY


In [20]:
import faiss
import pickle
import numpy as np
import google.generativeai as genai
import os

# Read the API key from the environment instead of writing it into the
# notebook; when GOOGLE_API_KEY is unset this falls back to the empty string
# that was passed before.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))

# rag_answer() reads the client from the name `gemini_model`, which was never
# assigned; define it here and keep `model` as an alias for the shorter name.
gemini_model = genai.GenerativeModel("gemini-2.0-flash")   # or 1.5-pro
model = gemini_model


In [34]:
import faiss
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
import google.generativeai as genai


In [52]:
import os

# Read the API key from the environment instead of writing it into the
# notebook; when GOOGLE_API_KEY is unset this falls back to the empty string
# that was passed before.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))

# rag_answer() reads the client from the name `gemini_model`, which was never
# assigned; define it here and keep `model` as an alias for the shorter name.
gemini_model = genai.GenerativeModel("gemini-2.0-flash")   # or 1.5-pro
model = gemini_model

In [53]:
# Load the raw FAISS index file written by the alternate save and report the
# two attributes the following cells rely on.
faiss_index = faiss.read_index("/content/faiss_index_alt/index.faiss")
print("FAISS dimension:", faiss_index.d)
print("FAISS vectors:", faiss_index.ntotal)


FAISS dimension: 384
FAISS vectors: 26721


In [54]:
def load_documents(path, expected_count=None):
    """Load a pickle file and return the list in it that lines up with the FAISS index.

    Args:
        path: Path of the pickle file to read.
        expected_count: Length the document list must have. When omitted, the
            module-level ``faiss_index.ntotal`` is used.

    Returns:
        The first list whose length equals ``expected_count``: a value of a
        dict payload, an element of a tuple/list payload, or the payload itself
        when it is a plain list of that length.

    Raises:
        ValueError: If nothing in the payload has the expected length.
    """
    if expected_count is None:
        expected_count = faiss_index.ntotal

    # NOTE(security): pickle.load can execute arbitrary code stored in the
    # file; only open pickles written by this notebook or another trusted source.
    with open(path, "rb") as f:
        data = pickle.load(f)

    def _matches(candidate):
        # A candidate qualifies only if it is a list of exactly the expected size
        return isinstance(candidate, list) and len(candidate) == expected_count

    # Case 1 – Data is dict
    if isinstance(data, dict):
        for key, value in data.items():
            if _matches(value):
                print("Matched documents inside key:", key)
                return value
        raise ValueError("No matching list in dict for FAISS size.")

    if isinstance(data, (list, tuple)):
        # Case 2 – Data is tuple/list of lists
        for item in data:
            if _matches(item):
                print("Matched documents inside tuple/list")
                return item

        # Case 3 – Data itself is the list. This has to be tested before the
        # raise below: as a separate branch after it, a plain list could never
        # be returned because every list already ended in the error.
        if _matches(data):
            print("index.pkl is a plain list.")
            return data

        raise ValueError("No matching list found in tuple/list.")

    raise ValueError("index.pkl structure does not match FAISS index.")


In [55]:
# Read the same index file again (repeats the earlier read_index cell) and
# print its dimension and vector count.
faiss_index = faiss.read_index("/content/faiss_index_alt/index.faiss")
print("FAISS dim:", faiss_index.d)
print("FAISS vectors:", faiss_index.ntotal)


FAISS dim: 384
FAISS vectors: 26721


In [56]:
# index.pkl unpacks into a pair: (docstore, index_to_docstore_id)
# NOTE: pickle.load can run code stored in the file; only load trusted files.
with open("/content/faiss_index_alt/index.pkl", "rb") as f:
    unpickled = pickle.load(f)
docstore, index_to_id = unpickled

# The docstore keeps its Documents in the `_dict` attribute: {uuid: Document}
real_docstore = docstore._dict

print("Documents in docstore:", len(real_docstore))


Documents in docstore: 26721


In [57]:
# Walk the FAISS positions in order and resolve each one to its text:
# position → docstore id → Document → page_content
documents = [
    real_docstore[index_to_id[position]].page_content
    for position in range(faiss_index.ntotal)
]


In [58]:
# Sanity check: the rebuilt text list is expected to have one entry per FAISS vector
print("Final document count:", len(documents))  # should match FAISS vectors


Final document count: 26721


In [59]:
# Query-side encoder; same model name as the default of MiniLMEmbeddings,
# which was used when the index was built.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")


In [60]:
def get_embedding(text):
    """Encode `text` with `embed_model` (normalized) and return it as a float32 array."""
    # The encoder is called with a one-item batch; only its first row is kept
    batch = embed_model.encode([text], normalize_embeddings=True)
    first_row = batch[0]
    return np.array(first_row, dtype="float32")


In [61]:
def search_faiss(query, k=5):
    """Return the texts of the `k` indexed chunks closest to `query`.

    Args:
        query: Text to embed and search for.
        k: Maximum number of results; clamped to the number of vectors in
            `faiss_index`.

    Returns:
        A list of strings taken from `documents`, in the order FAISS returned
        them. Empty when the index holds no vectors or `k` is not positive.
    """
    k = min(k, faiss_index.ntotal)
    if k <= 0:
        # Nothing to retrieve; the FAISS search call requires a positive k,
        # so return early instead of letting it fail.
        return []

    # FAISS expects a 2-D batch of queries, hence the single-row reshape
    query_vec = get_embedding(query).reshape(1, -1)
    _, indices = faiss_index.search(query_vec, k)

    # -1 marks an empty result slot and must not be used as a list index
    return [documents[i] for i in indices[0] if i != -1]


In [62]:
def rag_answer(query):
    """Answer `query` with Gemini, using the top FAISS matches as context.

    The five closest chunks from `search_faiss` are joined with blank lines,
    placed in the NyayaGPT instruction prompt ahead of the user question, and
    sent to the module-level `gemini_model`. The text of the reply is returned.
    """
    context = "\n\n".join(search_faiss(query, k=5))

    prompt = f"""
You are NyayaGPT, an Indian Law RAG-based Legal Assistant.

Your responsibilities:

1. Use ONLY content retrieved from the RAG system (Acts, Articles, Sections, Rules, Case Laws).
2. If information is not retrieved, clearly say you don't have specific data and give a general explanation without hallucinating.
3. NEVER create fake legal sections, years, or case laws.

Always format the answer in the following structure:

A) Summary (Simple & clear)

B) Relevant Laws (Acts, Articles, Sections retrieved)

C) Detailed Legal Explanation (professional tone, include definitions, exceptions, burden of proof)

D) Case Laws (ONLY if provided by RAG; include year, ratio decidendi, principle)

E) Procedure / Steps (if applicable)

F) Penalties / Remedies (if applicable)

G) Risk & Severity Indicator (bailable? cognizable? civil/criminal? low/medium/high severity)

H) Simple Mode (3–4 beginner-friendly points)

Safety Rules:
- Add a small disclaimer: "This is for information only, not legal advice."
- Do not predict outcomes of real cases.
- Do not provide illegal or unethical guidance.
- If user asks something unlawful, refuse politely and explain legal consequences.

Additional Features:
- Support comparison mode when asked.
- Support translation to Indian languages.
- Maintain context across follow-up questions.
- Tag key points as (Source: Retrieved Document) without inventing citations.

Goal:
Give accurate, structured, legally correct answers with zero hallucination.


{context}

User Question: {query}
"""

    # Single generation call; only the text of the reply is handed back
    return gemini_model.generate_content(prompt).text


In [71]:
# Smoke test: run one query through the full retrieve-then-generate path
print(rag_answer("SECTION FOR MURDER"))


I don't have specific data about a section *for* murder. However, I can provide information about sections related to murder and culpable homicide.

A) Summary:
*   **Murder:** Defined as causing death with specific intentions or knowledge (Source: Retrieved Document).
*   **Culpable Homicide:** Causing death without necessarily intending to cause death, but with some degree of recklessness or knowledge (Source: Retrieved Document).
*   **Attempt to Murder:** Doing an act with the intention to cause death, even if death does not occur (Source: Retrieved Document).

B) Relevant Laws:
*   Section 300 of the Indian Penal Code (IPC)
*   Section 304 of the Indian Penal Code (IPC)
*   Section 309 of the Indian Penal Code (IPC)

C) Detailed Legal Explanation:

*   **Murder (Section 300, IPC):**
    *   Defines murder as culpable homicide with specific aggravating factors.
    *   Culpable homicide is murder if the act causing death is done with the intention of causing death; or if it is done