In [6]:
import os
import requests
import numpy as np
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain.chains import RetrievalQA, LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import Ollama

# Download PDF
PDF_URL = "https://www.archives.gov/files/about/laws/basic-laws-book-2016.pdf"
PDF_PATH = "basic_laws_2016.pdf"

if not os.path.exists(PDF_PATH):
    print("Downloading PDF...")
    response = requests.get(PDF_URL)
    with open(PDF_PATH, "wb") as f:
        f.write(response.content)
    print("Download complete!")

# Load PDF and Chunk Text
print("Loading and chunking PDF...")
loader = PyPDFLoader(PDF_PATH)
raw_docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=300,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
)
documents = text_splitter.split_documents(raw_docs)

# Create Embeddings
print("Creating embeddings...")
embedding = HuggingFaceInstructEmbeddings(
    model_name="nlpaueb/legal-bert-base-uncased",  # Legal-specific model
    query_instruction="Represent this legal question for searching legislation:"
)

# Create or Load Vector DB
VECTOR_DB_DIR = "faiss_basic_laws2"
if os.path.exists(VECTOR_DB_DIR):
    print("Loading existing vector database...")
    vector_db = FAISS.load_local(VECTOR_DB_DIR, embeddings=embedding, allow_dangerous_deserialization=True)
else:
    print("Creating new vector database...")
    vector_db = FAISS.from_documents(documents, embedding=embedding)
    vector_db.save_local(VECTOR_DB_DIR)

# Setup LLM
print("Initializing LLM...")
llm = Ollama(
    model="llama3",  
    temperature=0.3,  
    top_p=0.9,
    repeat_penalty=1.1
)

# Enhanced Prompt Template
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
As a legal reference assistant, analyze the provided CONTEXT from the National Archives laws document
and provide the most precise answer to the QUESTION.

RULES:
1. MUST answer using ONLY the CONTEXT provided
2. Cite exact section numbers (e.g., '44 U.S.C. § 2102') when available
3. If uncertain, say "The document does not specify"
4. For definitions: provide the exact quoted definition
5. For procedures: list steps in order

CONTEXT:
{context}

QUESTION:
{question}

STRUCTURED RESPONSE:
[Summary Answer]
[Relevant Section]
[Direct Quote (if applicable)]
"""
)

# Build QA chain
llm_chain = LLMChain(llm=llm, prompt=prompt_template)
combine_chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_variable_name="context"
)

retriever = vector_db.as_retriever(
    search_type="mmr",  # Max marginal relevance for better diversity
    search_kwargs={"k": 5}  
)

chain = RetrievalQA(
    retriever=retriever,
    combine_documents_chain=combine_chain,
    return_source_documents=True
)

# Test Query
def ask_question(question):
    response = chain.invoke({"query": question})
    print("\nQUESTION:", question)
    print("\nANSWER:", response["result"])
    print("\nSOURCES:")
    for i, doc in enumerate(response["source_documents"][:2]):  # Show top 2 sources
        print(f"\nSource {i+1} (Page {doc.metadata.get('page', 'N/A')}):")
        print(doc.page_content[:500] + "...")


Loading and chunking PDF...


No sentence-transformers model found with name nlpaueb/legal-bert-base-uncased. Creating a new one with mean pooling.


Creating embeddings...


`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.


Creating new vector database...
Initializing LLM...


In [None]:
# Example Queries
questions = [
    "What does the 'Presidential records' under 44 U.S.C. Chapter 22 talk about?",
    "What are the Archivist's responsibilities regarding custody of records?",
    "How are Vice-Presidential and Presidential records handled?",
    "What penalties exist for unlawful removal of federal records?"
]

for q in questions:
    ask_question(q)
    print("\n" + "="*80 + "\n")

# # Interactive mode
# print("\nEnter 'quit' to exit interactive mode")
# while True:
#     user_question = input("\nYour question about NARA laws: ")
#     if user_question.lower() == 'quit':
#         break
#     ask_question(user_question)

`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.
`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.



QUESTION: What is the definition of 'Presidential records' under 44 U.S.C. Chapter 22?

ANSWER: Based on the provided context, I found that there is no mention of "Presidential records" in 44 U.S.C. Chapter 22.

However, I did find a relevant section in 44 U.S.C. § 2102, which states:

"The term 'presidential record' means information created or received by the President, whether in writing or in another form, including—

(1) messages and correspondence;
(2) memoranda;
(3) letters;
(4) diaries;
(5) calendars;
(6) phone logs; and
(7) other documents that are specifically authorized to be kept by the President."

Since this definition is not specific to Chapter 22, I will provide a summary answer:

**Summary Answer:** The definition of "Presidential records" under 44 U.S.C. § 2102 includes various types of information created or received by the President, such as messages, correspondence, memoranda, letters, diaries, calendars, phone logs, and other authorized documents.

**Relevant Sec

`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.



QUESTION: What are the Archivist's responsibilities regarding custody of records?

ANSWER: **Summary Answer:** The Archivist is responsible for the custody, use, and withdrawal of records transferred to him.

**Relevant Section:** § 2108. RESPONSIBILITY FOR CUSTODY, USE, AND WITHDRAWAL OF RECORDS

**Direct Quote:** "(a) The Archivist shall be responsible for the custody, use, and withdrawal of records transferred to him."

According to the provided context, the Archivist is solely responsible for the custody, use, and withdrawal of records transferred to him. This responsibility is explicitly stated in § 2108(a).

SOURCES:

Source 1 (Page 154):
handbook issued under subsection (g), and the agency’s 
annual report on this section, and by providing an over -
view, where appropriate, of certain general categories of 
agency records to which those exemptions apply; and
(6) designate one or more FOIA Public Liaisons.
(l) FOIA Public Liaisons shall report to the agency Chief 
FOIA Officer a

`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.



QUESTION: How are Vice-Presidential and Presidential records handled?

ANSWER: **Summary Answer:** According to the provided context, Vice-Presidential and Presidential records are handled as follows:

* The Archivist may maintain and preserve Presidential records on behalf of the President during a President's term of office.
* The President remains exclusively responsible for custody, control, and access to such Presidential records.
* The Archivist may not disclose any such records except under direction of the President until the conclusion of a President's term of office.

**Relevant Section:** (f) During a President’s term of office, the Archivist may maintain and preserve Presidential records on behalf of the President, including records in digital or electronic form. The President shall remain exclusively responsible for custody, control, and access to such Presidential records.

**Direct Quote:** "The President shall remain exclusively responsible for custody, control, and ac