In [1]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [2]:
!pip install huggingface_hub -q

from huggingface_hub import login
login(token="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")

In [4]:
import json
import faiss
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import csv
import os

# Step 0: Config / Paths
IPC_JSON_PATH = "/content/ipc_sections.json"
RAG_OUTPUT_CSV = "/content/fbopt_baseline_outputs.csv"
TOP_K = 3 # Reduced TOP_K to shorten context
MAX_NEW_TOKENS = 100

# Step 1: Load IPC sections
with open(IPC_JSON_PATH, "r", encoding="utf-8") as f:
    ipc_sections = json.load(f)

print(f"Loaded {len(ipc_sections)} sections")

# Prepare text corpus
section_texts = [
    f"{s['section_number']}. {s['title']}. {s['body_text']}"
    for s in ipc_sections
]

# Step 2: Create Embeddings
print("Loading sentence-transformers model for embeddings...")
embed_model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

print("Generating embeddings for IPC sections...")
embeddings = embed_model.encode(section_texts, show_progress_bar=True, convert_to_numpy=True)
embedding_dim = embeddings.shape[1]

# Step 3: Build FAISS Index
print("Building FAISS index...")
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)
print(f"FAISS index contains {index.ntotal} vectors")

# Step 4: Load Model
print("Loading model...")
model_name = "facebook/opt-1.3b" # Reverted to opt-1.3b as Gemma had CUDA issues

# Configure quantization for potentially lower memory usage (optional for opt-1.3b but good practice)
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.float16,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
# )


tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    # quantization_config=quantization_config,
)

# Step 5: Retrieval + Generation
def retrieve_sections(query, top_k=TOP_K):
    q_emb = embed_model.encode([query], convert_to_numpy=True)
    D, I = index.search(q_emb, top_k)
    return [section_texts[i] for i in I[0]]

def rag_generate(query, top_k=TOP_K, max_new_tokens=MAX_NEW_TOKENS):
    retrieved = retrieve_sections(query, top_k)
    context = "\n".join(retrieved)

    # Manually cap input length
    max_input_length = 2048  # OPT models handle ~2048 tokens safely
    inputs = tokenizer(
        f"Using the following IPC sections, answer the query:\n{context}\n\nQuery: {query}\nAnswer:",
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length
    ).to(model.device)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer


# Step 6: Run on sample queries
sample_queries = [
    "Explain IPC Section 420",
    "Summarize IPC Section 302 in simple words",
    "A person forcibly entered someone’s house at night with intent to harm"
]

results = []
print("Running baseline...")
for q in sample_queries:
    print(f"\nQuery: {q}")
    a = rag_generate(q)
    print(f"Answer: {a}\n")
    results.append({"query": q, "answer": a})

# Step 7: Save outputs
print(f"Saving results to {RAG_OUTPUT_CSV}")
with open(RAG_OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["query", "answer"])
    writer.writeheader()
    writer.writerows(results)

print("Done")

Loaded 1433 sections
Loading sentence-transformers model for embeddings...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating embeddings for IPC sections...


Batches:   0%|          | 0/45 [00:00<?, ?it/s]

Building FAISS index...
FAISS index contains 1433 vectors
Loading model...


tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Running baseline...

Query: Explain IPC Section 420
Answer: Using the following IPC sections, answer the query:
121A. Conspiracy to commit offences punishable by section 121.. 
376C. Sexual intercourse by a person in authority.. 
50. “Section”.—The word “section” denotes one of those portions of a Chapter of this Code which.  are distinguished by prefixed numeral figures.

Query: Explain IPC Section 420
Answer:

Section 420 of the Indian Penal Code (IPC) is a criminal offence which provides for imprisonment for a term which may extend to ten years, or with fine, or with both, to any person who, having knowledge of the commission of an offence punishable by section 420, commits an offence punishable by that section.

The offence of “conspiracy to commit offences punishable by section 420” is a cognizable offence.

The offence of “sexual intercourse by a


Query: Summarize IPC Section 302 in simple words
Answer: Using the following IPC sections, answer the query:
3. Subs. by Act 13 of 20

In [5]:
import json
import faiss
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
import csv

# Step 0: Config / Paths
IPC_JSON_PATH = "/content/ipc_sections.json"
RAG_OUTPUT_CSV = "/content/distilgpt2_baseline_outputs.csv"
TOP_K = 3
MAX_NEW_TOKENS = 80

# Step 1: Load IPC sections
with open(IPC_JSON_PATH, "r", encoding="utf-8") as f:
    ipc_sections = json.load(f)

print(f"Loaded {len(ipc_sections)} sections")

# Prepare text corpus
section_texts = [
    f"{s['section_number']}. {s['title']}. {s['body_text']}"
    for s in ipc_sections
]

# Step 2: Create Embeddings
print("Loading sentence-transformers model for embeddings...")
embed_model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

print("Generating embeddings for IPC sections...")
embeddings = embed_model.encode(section_texts, show_progress_bar=True, convert_to_numpy=True)
embedding_dim = embeddings.shape[1]

# Step 3: Build FAISS Index
print("Building FAISS index...")
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)
print(f"FAISS index contains {index.ntotal} vectors")

# Step 4: Load Small Model (distilgpt2)
print("Loading distilgpt2...")
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Step 5: Retrieval + Generation
def retrieve_sections(query, top_k=TOP_K):
    q_emb = embed_model.encode([query], convert_to_numpy=True)
    D, I = index.search(q_emb, top_k)
    return [section_texts[i] for i in I[0]]

def rag_generate(query, top_k=TOP_K, max_new_tokens=MAX_NEW_TOKENS):
    retrieved = retrieve_sections(query, top_k)
    context = "\n".join(retrieved)

    prompt = f"Use the following IPC sections to answer:\n{context}\n\nQuery: {query}\nAnswer:"

    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Step 6: Run on sample queries
sample_queries = [
    "Explain IPC Section 420",
    "Summarize IPC Section 302 in simple words",
    "A person forcibly entered someone’s house at night with intent to harm"
]

results = []
print("Running baseline with distilgpt2...")
for q in sample_queries:
    print(f"\nQuery: {q}")
    a = rag_generate(q)
    print(f"Answer: {a}\n")
    results.append({"query": q, "answer": a})

# Step 7: Save outputs
print(f"Saving results to {RAG_OUTPUT_CSV}")
with open(RAG_OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["query", "answer"])
    writer.writeheader()
    writer.writerows(results)

print("Done ")


Loaded 1433 sections
Loading sentence-transformers model for embeddings...
Generating embeddings for IPC sections...


Batches:   0%|          | 0/45 [00:00<?, ?it/s]

Building FAISS index...
FAISS index contains 1433 vectors
Loading distilgpt2...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Running baseline with distilgpt2...

Query: Explain IPC Section 420
Answer: Use the following IPC sections to answer:
121A. Conspiracy to commit offences punishable by section 121.. 
376C. Sexual intercourse by a person in authority.. 
50. “Section”.—The word “section” denotes one of those portions of a Chapter of this Code which.  are distinguished by prefixed numeral figures.

Query: Explain IPC Section 420
Answer: “Section” is a section of the Code which is used to describe the offence.
Query: Explain IPC Section 420
Answer: “Section” is a section of the Code which is used to describe the offence.
Query: Explain IPC Section 420
Answer: “Section” is a section of the Code which is used to describe the offence.



Query: Summarize IPC Section 302 in simple words
Answer: Use the following IPC sections to answer:
3. Subs. by Act 13 of 2013, s. 4, for “offence under section 376, section 376A, section 376B, section 376C or section 376D”.  (w.e.f. 3-2-2013).
1. Subs. by Act 13 of 2013, s. 9