In [None]:
# Original script by J.Tharsen 03-2025
# modified from www.datacamp.com/tutorial/llama-3-1-rag

# Install libraries (if needed)
#!pip install langchain langchain_community langchain-openai scikit-learn langchain-ollama sentence-transformers

In [157]:
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os, glob
import pandas as pd

# Load documents from local files
# Each *.txt file in source_dir becomes one entry in docs_list; the matching row of
# metadata.csv (looked up by file name) becomes the entry at the same index in metadata_list.
docs_list = []
metadata_list = []
source_dir = "./cases"
df = pd.read_csv(os.path.join(source_dir, "metadata.csv"))

# sorted() gives a deterministic document order; glob's order depends on the file system.
for filename in sorted(glob.glob(os.path.join(source_dir, "*.txt"))):
    # Context manager closes the handle; explicit UTF-8 avoids platform-dependent decoding
    # of the curly quotes and dashes that appear in the opinions.
    with open(filename, "r", encoding="utf-8") as case_file:
        docs_list.append(case_file.read())

    # Match on the bare file name so the lookup does not depend on the path separator
    # glob returns (the 'filename' column holds names relative to source_dir).
    row = df[df["filename"] == os.path.basename(filename)]
    if not row.empty:
        # Extract metadata as a dictionary
        record = row.iloc[0]
        metadata = {
            'name': record['name'],
            'year': record['year'],
            'legal_issue': record['legal_issue'],
        }
        metadata_list.append(metadata)
    else:
        print(f"Warning: No metadata found for file {filename}")
        # Use an empty dict rather than None: the metadata list is later handed to the
        # text splitter, which attaches each entry to a Document and expects a dict.
        metadata_list.append({})

print(f"{len(docs_list)} documents loaded.")

16 documents loaded.


In [None]:
# Separator candidates handed to the splitter, listed from the coarsest
# (paragraph break) down to the empty string.
chunk_separators = [
    "\n\n",
    "\n",
    " ",
    ".",
    ",",
    "\u200b",  # Zero-width space
    "\uff0c",  # Fullwidth comma
    "\u3001",  # Ideographic comma
    "\uff0e",  # Fullwidth full stop
    "\u3002",  # Ideographic full stop
    "",
]

# Build the splitter: chunks of at most 500 characters with no overlap between them.
# Further optional arguments that could be passed here:
#    length_function=len,
#    is_separator_regex=False,
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=500,
    separators=chunk_separators,
)

# Cut every loaded document into chunks, pairing each with its metadata entry
doc_splits = text_splitter.create_documents(docs_list, metadata_list)

In [159]:
# Check the first document shard
# Displays the metadata dict attached to the first chunk in doc_splits, as a quick
# sanity check that the per-file metadata was carried over onto the chunks.
doc_splits[0].metadata

{'name': 'Thompson v. United States, 604 U.S.',
 'year': np.int64(2025),
 'legal_issue': 'Statutory Interpretation'}

In [None]:
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_openai import OpenAIEmbeddings


# Read the OpenAI API key from a local file so the key never appears in the notebook.
# Create a file in this directory titled "secret_key.txt" and add your key.
with open("secret_key.txt", "r") as key_file:
    my_api_key = key_file.read().strip()

# Create embeddings for documents and store them in a vector store
vectorstore = SKLearnVectorStore.from_documents(
    documents=doc_splits,
    embedding=OpenAIEmbeddings(openai_api_key=my_api_key),
)
# The number of chunks to retrieve is a search parameter and is passed through
# search_kwargs; a bare k=4 keyword is not the documented way to set it.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

In [161]:
from langchain_ollama import ChatOllama
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Prompt text sent to the LLM; the {question} and {documents} placeholders are the
# two input variables declared on the template below.
roberts_template = """You are modeled after chief justice John Roberts.
    Use the following documents to answer the question.
    If you don't know the answer, just say that you don't know.
    Answer in the style of justice John Roberts.
    Remain brief with a maximum of 7 sentences:
    Question: {question}
    Documents: {documents}
    Answer:
    """

# Wrap the text in a PromptTemplate so it can be composed into a chain
prompt = PromptTemplate(
    input_variables=["question", "documents"],
    template=roberts_template,
)

In [162]:
# Chat model served through Ollama: the llama3.2 model with temperature fixed at 0
llm = ChatOllama(temperature=0, model="llama3.2")

In [163]:
# Assemble the chain in two steps: the prompt feeds the LLM,
# then the LLM's reply is passed through the string output parser.
prompt_then_llm = prompt | llm
rag_chain = prompt_then_llm | StrOutputParser()

In [None]:
# Define the RAG application class
class RAGApplication:
    """Retrieve-then-generate wrapper around a retriever and an LLM chain.

    Attributes:
        retriever: Object whose ``invoke(question)`` returns an iterable of
            documents, each exposing a ``page_content`` string.
        rag_chain: Object whose ``invoke(mapping)`` accepts a dict with the keys
            ``"question"`` and ``"documents"`` and returns the answer.
    """

    def __init__(self, retriever, rag_chain):
        """Store the retriever and the chain used by :meth:`run`."""
        self.retriever = retriever
        self.rag_chain = rag_chain

    def run(self, question, verbose=True):
        """Answer ``question`` using documents fetched by the retriever.

        Args:
            question: The question text, passed both to the retriever and to the chain.
            verbose: When true (the default), print each retrieved document
                before generating the answer.

        Returns:
            A tuple ``(answer, doc_texts)`` where ``answer`` is whatever the chain
            returns and ``doc_texts`` is the retrieved page contents joined by
            newline characters.
        """
        # Retrieve relevant documents
        documents = self.retriever.invoke(question)
        if verbose:
            for doc in documents:
                print(doc)
        # Join the chunk texts with a real newline. The previous "\\n" literal inserted
        # a backslash followed by "n" between chunks instead of a line break.
        doc_texts = "\n".join(doc.page_content for doc in documents)
        # Get the answer from the language model
        answer = self.rag_chain.invoke({"question": question, "documents": doc_texts})
        return answer, doc_texts

In [165]:
# Wire the retriever and the chain together into a RAG application instance
rag_application = RAGApplication(retriever=retriever, rag_chain=rag_chain)

In [166]:
# Ask one question, then print the question, the generated answer and the
# concatenated text of the retrieved documents, in that order.
question = "Should Chevron Deference remain?"
answer, doc_texts = rag_application.run(question)
for label, value in (
    ("Question:", question),
    ("Answer:", answer),
    ("Documents:", doc_texts),
):
    print(label, value)

page_content='Under that rule, ambiguities of all stripes trigger deference. Indeed, the Government and, seemingly, the dissent continue to defend the proposition that Chevron applies even in cases having little to do with an agency’s technical subject matter expertise. See Brief for Respondents in No. 22–1219, p. 17; post, at 10.' metadata={'id': '9582199e-b750-4576-96ad-4c0b88816c74', 'name': 'Loper Bright Enterprises v. Raimondo, 603 U.S.', 'year': np.int64(2024), 'legal_issue': 'Government Agencies'}
page_content='(quoting Janus v. State, County, and Municipal Employees, 585 U.S. 878, 917 (2018))—all weigh in favor of letting Chevron go.' metadata={'id': '6a424245-c91d-4e09-9777-9ef91f7e6ca6', 'name': 'Loper Bright Enterprises v. Raimondo, 603 U.S.', 'year': np.int64(2024), 'legal_issue': 'Government Agencies'}
page_content='those cases where it might appear to be applicable. See W. Eskridge & L. Baer, The Continuum of Deference: Supreme Court Treatment of Agency Statutory Interpre