In [21]:
import os
import requests
import json
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama

In [22]:
# Download PDF
PDF_URL = "https://www.archives.gov/files/about/laws/basic-laws-book-2016.pdf"
PDF_PATH = "basic_laws_2016.pdf"

In [23]:
if not os.path.exists(PDF_PATH):
    print("Downloading PDF...")
    response = requests.get(PDF_URL)
    with open(PDF_PATH, "wb") as f:
        f.write(response.content)
    print("Download complete!")

In [24]:
# Load PDF and Chunk Text
print("Loading and chunking PDF...")
loader = PyPDFLoader(PDF_PATH)
raw_docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(raw_docs)
documents


Loading and chunking PDF...


[Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign CC 2015 (Macintosh)', 'creationdate': '2016-01-20T11:30:16-05:00', 'author': 'National Archives and Records Administration, Office of the General Counsel', 'moddate': '2016-03-08T11:00:40-05:00', 'subject': 'Basic Laws and Authorities', 'title': 'Basic Laws and Authorities of the National Archives and Records Administration Book - 2016', 'trapped': '/False', 'source': 'basic_laws_2016.pdf', 'total_pages': 170, 'page': 0, 'page_label': 'i'}, page_content='2016  edition\n \nBASIC\nLAWS\nand AUTHORITIES  of the NATIONAL ARCHIVES   \nand RECORDS ADMINISTR ATION\nOffice of General Counsel\nNational Archives and Records Administration\nAdditional materials can be found on the web at: www.archives.gov'),
 Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign CC 2015 (Macintosh)', 'creationdate': '2016-01-20T11:30:16-05:00', 'author': 'National Archives and Records Administration,

In [25]:
# Create Embeddings 
print("🔗 Creating embeddings...")
embedding = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large",
    query_instruction="Represent the legal query for retrieval:"
)


No sentence-transformers model found with name hkunlp/instructor-large. Creating a new one with mean pooling.


🔗 Creating embeddings...


In [26]:
# Create or Load Vector DB 
VECTOR_DB_DIR = "faiss_basic_laws"
if os.path.exists(VECTOR_DB_DIR):
    print("📚 Loading existing vector database...")
    vector_db = FAISS.load_local(VECTOR_DB_DIR, embeddings=embedding, allow_dangerous_deserialization=True)
else:
    print("📚 Creating new vector database...")
    vector_db = FAISS.from_documents(documents, embedding=embedding)
    vector_db.save_local(VECTOR_DB_DIR)


📚 Loading existing vector database...


In [27]:
# Setup LLM 
print("Initializing LLM (llama3.2)...")
llm = Ollama(model="llama3.2", temperature=0.6)

Initializing LLM (llama3.2)...


In [28]:
from langchain.chains import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.prompts import PromptTemplate


# Prompt
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a knowledgeable and precise legal assistant tasked with answering questions based solely on the provided legal text.

Use ONLY the information explicitly found in the CONTEXT to answer the QUESTION.  
If the answer is not clearly stated or directly inferable from the context, respond with:  
"I don't know."

Guidelines:
- Reference specific legal titles, sections, or clauses where applicable.
- Quote directly from the source document when possible, especially when citing laws, dates, or legal definitions.
- Do NOT speculate or provide interpretations not grounded in the context.
- Maintain a formal, concise, and objective tone appropriate for legal communication.

CONTEXT:
{context}

QUESTION:
{question}

Answer:
"""
)


# Build QA chain
llm_chain = LLMChain(llm=llm, prompt=prompt_template)
combine_chain = StuffDocumentsChain(
    llm_chain=llm_chain, 
    document_variable_name="context"
)

retriever = vector_db.as_retriever(search_kwargs={"k": 8})

chain = RetrievalQA(
    retriever=retriever, 
    combine_documents_chain=combine_chain, 
    return_source_documents=True
)




  llm_chain = LLMChain(llm=llm, prompt=prompt_template)
  combine_chain = StuffDocumentsChain(
  chain = RetrievalQA(


In [33]:
query = "what rule does the context talk about?"
chain.invoke({"query": query})

`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.


{'query': 'what rule does the context talk about?',
 'result': "I don't know. The context appears to be a legal document, but it doesn't specify which rule is being discussed. It mentions several sections and clauses, such as paragraph (1), section 5 of the United States Code, and subsections (C) and (I). However, it does not clearly state what specific rule or regulation is being referred to.",
 'source_documents': [Document(id='e0daddf8-fd1a-46ce-a2a8-6549dc93bd90', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign CC 2015 (Macintosh)', 'creationdate': '2016-01-20T11:30:16-05:00', 'author': 'National Archives and Records Administration, Office of the General Counsel', 'moddate': '2016-03-08T11:00:40-05:00', 'subject': 'Basic Laws and Authorities', 'title': 'Basic Laws and Authorities of the National Archives and Records Administration Book - 2016', 'trapped': '/False', 'source': 'basic_laws_2016.pdf', 'total_pages': 170, 'page': 2, 'page_label': 'iii'}, page_

In [34]:
query = "What are the primary responsibilities of the Archivist of the United States under 44 U.S.C."
chain.invoke({"query": query})

`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.


{'query': 'What are the primary responsibilities of the Archivist of the United States under 44 U.S.C.',
 'result': "I don't know. The question is not clearly stated in the provided context, and there is no specific information about the primary responsibilities of the Archivist of the United States under 44 U.S.C. that can be directly inferred from the text.",
 'source_documents': [Document(id='4b3a7c4f-475d-4d61-ac5f-168644e85b5a', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign CC 2015 (Macintosh)', 'creationdate': '2016-01-20T11:30:16-05:00', 'author': 'National Archives and Records Administration, Office of the General Counsel', 'moddate': '2016-03-08T11:00:40-05:00', 'subject': 'Basic Laws and Authorities', 'title': 'Basic Laws and Authorities of the National Archives and Records Administration Book - 2016', 'trapped': '/False', 'source': 'basic_laws_2016.pdf', 'total_pages': 170, 'page': 12, 'page_label': '4'}, page_content='of copy reading and indexin

`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.


{'query': 'How is the National Archives and Records Administration (NARA) structured, and what are its key functions?',
 'result': "According to Section 2904(a) of the BASIC LAWS AND AUTHORITIES of the NATIONAL ARCHIVES and RECORDS ADMINISTRATION, the Archivist shall provide guidance and assistance to Federal agencies with respect to ensuring adequate and effective records management.\n\nHowever, I don't know. The provided context does not explicitly state how NARA is structured or its key functions beyond the general responsibilities outlined in Section 2904.",
 'source_documents': [Document(id='a8907366-d783-4bbd-b2d2-d1e34469ffd5', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign CC 2015 (Macintosh)', 'creationdate': '2016-01-20T11:30:16-05:00', 'author': 'National Archives and Records Administration, Office of the General Counsel', 'moddate': '2016-03-08T11:00:40-05:00', 'subject': 'Basic Laws and Authorities', 'title': 'Basic Laws and Authorities of the N

In [36]:
query = "What is the former Presidents Acts?"
chain.invoke({
    "query":query
})

`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.


{'query': 'What is the former Presidents Acts?',
 'result': 'I don\'t know. The question "What is the former President\'s Act?" is not clearly stated in the provided context, and I couldn\'t find any information on it being mentioned or referenced in the text.',
 'source_documents': [Document(id='a8907366-d783-4bbd-b2d2-d1e34469ffd5', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign CC 2015 (Macintosh)', 'creationdate': '2016-01-20T11:30:16-05:00', 'author': 'National Archives and Records Administration, Office of the General Counsel', 'moddate': '2016-03-08T11:00:40-05:00', 'subject': 'Basic Laws and Authorities', 'title': 'Basic Laws and Authorities of the National Archives and Records Administration Book - 2016', 'trapped': '/False', 'source': 'basic_laws_2016.pdf', 'total_pages': 170, 'page': 0, 'page_label': 'i'}, page_content='2016  edition\n \nBASIC\nLAWS\nand AUTHORITIES  of the NATIONAL ARCHIVES   \nand RECORDS ADMINISTR ATION\nOffice of General Couns

In [37]:
query = "What institution pays Former Presidents?"
chain.invoke({
    "query":query
})

`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.


{'query': 'What institution pays Former Presidents?',
 'result': "I don't know.",
 'source_documents': [Document(id='f38e6f97-dbf8-4f7e-861b-f778d71827fb', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign CC 2015 (Macintosh)', 'creationdate': '2016-01-20T11:30:16-05:00', 'author': 'National Archives and Records Administration, Office of the General Counsel', 'moddate': '2016-03-08T11:00:40-05:00', 'subject': 'Basic Laws and Authorities', 'title': 'Basic Laws and Authorities of the National Archives and Records Administration Book - 2016', 'trapped': '/False', 'source': 'basic_laws_2016.pdf', 'total_pages': 170, 'page': 58, 'page_label': '50'}, page_content='the Chairman of the Committee, the head of such de -\npartment or agency shall furnish such information to the \nCommittee.\n§ 2705. COMPENSATION AND TRAVEL \nEXPENSES'),
  Document(id='a8907366-d783-4bbd-b2d2-d1e34469ffd5', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign CC 2015 