In [None]:
# Make the project's packages importable from this notebook.
import sys
from pathlib import Path

# The project root is taken as two levels above the kernel's current working
# directory, so the result depends on where the kernel was started.
# NOTE(review): assumes the kernel runs from the notebooks folder — confirm.
project_root = Path().resolve().parent.parent

# Only add the root once: an unconditional insert stacks a duplicate entry at
# the front of sys.path every time this cell is re-run.
project_root_str = str(project_root)
if project_root_str not in sys.path:
    sys.path.insert(0, project_root_str)

# Project-local import; it relies on the sys.path entry added above.
from retrieval.indexing.neural_rag import RAGIndexer

print(f"Project root: {project_root}")
print("RAGIndexer imported successfully!")

  from .autonotebook import tqdm as notebook_tqdm


Project root: /mnt/Extension/Workspace/uni/Fall 2025/CS 646/Final/code
SimpleRAGIndexer imported successfully!


# RAG Indexing for Bar Exam QA Dataset


**Model Used:** `BAAI/bge-small-en-v1.5` (384 dimensions)  
**Index Type:** FAISS with L2 distance

In [20]:
# Input and output locations used by the rest of the notebook.
# Despite the `_dir` suffix, the two output values below are file paths.
data_path = "../data/downloads/barexam_qa/barexam_qa_passages"
index_output_dir = "../data/processed/faiss_index/faiss_index.bin"
metadata_output_dir = "../data/processed/faiss_index/metadata.pkl"

# Echo the two locations that matter most when checking a run.
for label, location in (("Data path", data_path), ("Index output", index_output_dir)):
    print(f"{label}: {location}")

Data path: ../data/downloads/barexam_qa/barexam_qa_passages
Index output: ../data/processed/faiss_index/faiss_index.bin


In [13]:
# Inspect the saved dataset before indexing it: splits, columns, one sample row.
from datasets import load_from_disk

dataset = load_from_disk(data_path)
print("Dataset splits:", dataset.keys())

# Every remaining check looks at the train split, so fetch it once.
train_split = dataset["train"]
print("\nTrain split info:", train_split, sep="\n")
print("\nColumn names:", train_split.column_names)
print("\nFirst example:", train_split[0], sep="\n")

Dataset splits: dict_keys(['train', 'validation', 'test'])

Train split info:
Dataset({
    features: ['idx', 'source', 'faiss_id', 'case_id', 'absolute_paragraph_id', 'opinion_id', 'relative_paragraph_id', 'text'],
    num_rows: 686324
})

Column names: ['idx', 'source', 'faiss_id', 'case_id', 'absolute_paragraph_id', 'opinion_id', 'relative_paragraph_id', 'text']

First example:
{'idx': 'mbe_4', 'source': 'mbe', 'faiss_id': '4', 'case_id': 'nan', 'absolute_paragraph_id': 'nan', 'opinion_id': 'nan', 'relative_paragraph_id': 'nan', 'text': "The res ipsa loquitur doctrine “enables a jury presented only with circumstantial evidence to infer negligence simply from the fact that an event happened.” St. Paul Fire & Marine Ins. Co. v. City of New York, 907 F.2d 299, 302 (2d Cir.1990).  The criteria for applying res ipsa loquitur include  “(1) the event must be of a kind which ordinarily does not occur in the absence of someone's negligence; (2) it must be caused by an agency or instrumentali

## Step 1: Build the Index

Create the FAISS index from the Bar Exam passages dataset.
- Load the passages from disk
- Generate embeddings using the HuggingFace model
- Build and save the FAISS index

In [None]:
# Create the indexer (RAGIndexer and Path are imported in the setup cell).
indexer = RAGIndexer()

# Building the index is the expensive step of this notebook, so reuse the
# saved artifacts when both are already on disk and rebuild only when either
# one is missing. Delete the files to force a fresh build.
index_file = Path(index_output_dir)
metadata_file = Path(metadata_output_dir)

if index_file.exists() and metadata_file.exists():
    indexer.load_index(index_path=index_output_dir, metadata_path=metadata_output_dir)
else:
    # NOTE(review): presumably build_index also writes the index (and its
    # metadata) under the given output path — confirm against RAGIndexer.
    indexer.build_index(data_path, index_output_dir)

Loading index from ../data/processed/faiss_index/faiss_index.bin...
Index loaded successfully. Total passages: 686324


## Step 2: Search the Index

In [23]:
# Single-query check: ask for the top three results and print each one.
query = "What is contract law?"
results = indexer.search(query, k=3)

print(f"Query: {query}\n")
print("=" * 80)

divider = "-" * 80
for hit in results:
    print(f"\nRank {hit['rank']}:")
    print(f"Passage ID: {hit['passage_id']}")
    print(f"Distance: {hit['distance']:.4f}")
    # Only the first 200 characters are shown to keep the output readable.
    snippet = hit['passage'][:200]
    print(f"Text: {snippet}...")
    print(divider)

Query: What is contract law?


Rank 1:
Passage ID: caselaw_12561526_20
Distance: 0.5026
Text: The meaning of a contract is a question of law, in connection with which an appellate court has an obligation to reach its conclusions independently of the determinations made by the court below....
--------------------------------------------------------------------------------

Rank 2:
Passage ID: caselaw_12563894_99
Distance: 0.5026
Text: The meaning of a contract is a question of law, in connection with which an appellate court has an obligation to reach its conclusions independently of the determinations made by the court below....
--------------------------------------------------------------------------------

Rank 3:
Passage ID: caselaw_12521851_37
Distance: 0.5685
Text: "Contract" is defined as "any contract, agreement, deed, lease or similar instrument, and any legally binding obligation, commitment, arrangement or understanding, whether written or oral."...
-------------------------

In [17]:
# Run a small batch of sample questions, showing the top two results for each.
queries = [
    "How do you prove negligence?",
    "What are the elements of a valid contract?",
    "What is criminal liability?",
]
banner = "=" * 80

for query in queries:
    # Header block that separates one query's results from the next.
    print(f"\n{banner}")
    print(f"Query: {query}")
    print(banner)

    results = indexer.search(query, k=2)

    for hit in results:
        print(f"\nRank {hit['rank']}: (Distance: {hit['distance']:.4f})")
        # Passage text is cut to its first 250 characters.
        excerpt = hit['passage'][:250]
        print(f"{excerpt}...")
        print()


Query: How do you prove negligence?

Rank 1: (Distance: 0.4867)
In general, to prove medical negligence, a party must establish (1) the applicable standard of care, (2) a breach of the standard of care, and (3) harm resulting from the breach. See , e.g. , Armacost v. Davis , 462 Md. 504, 526, 200 A.3d 859 (2019) ...


Rank 2: (Distance: 0.5142)
We begin with CACI 418, which states the basic doctrine of negligence per se. This instruction does not mention the burden of proof. In material part, it stated:...


Query: What are the elements of a valid contract?

Rank 1: (Distance: 0.6590)
(1) there was a valid contract, express or implied, between the parties whereby the defendant...


Rank 2: (Distance: 0.7319)
the essential elements of a contract are (1) competent parties, (2) subject matter, (3) legal consideration, (4) mutual agreement, (5) and mutual obligations.3 Consideration is any benefit conferred or agreed to be conferred upon the promisor to whic...


Query: What is criminal l

In [26]:
from src.models.openrouter_client import OpenRouterClient

In [31]:
# Send one sample prompt through the OpenRouter client and keep the reply.
model_name = "meta-llama/llama-3.3-70b-instruct:free"
client = OpenRouterClient(model=model_name)

prompt = "What is AI?"
response = client.query(prompt)

In [32]:
# Show the reply returned by client.query; print() is used so the text is
# written out as-is rather than as an escaped repr.
print(response)

Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that would typically require human intelligence, such as learning, problem-solving, decision-making, and perception. These systems use algorithms and data to make predictions, classify objects, and generate insights, often in a way that mimics human thought processes.

The term "Artificial Intelligence" was coined in 1956 by John McCarthy, a computer scientist and cognitive scientist, who defined it as "the science and engineering of making intelligent machines." Since then, AI has evolved to encompass a broad range of subfields, including:

1. **Machine Learning (ML)**: a type of AI that enables systems to learn from data and improve their performance over time.
2. **Natural Language Processing (NLP)**: a subfield of AI that deals with the interaction between computers and humans in natural language.
3. **Computer Vision**: a field of AI that enables computers to interpret and understand 