In [1]:
import getpass
import os
from dotenv import load_dotenv

# # Get API key from .env
# # Load environment variables from .env file
# load_dotenv()

# # Get API key from environment variable
# api_key = os.getenv("OPENAI_API_KEY")

In [2]:
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_ollama import OllamaLLM

In [47]:
# Resolve the input PDF and the vector-store directory relative to the
# current working directory.
current_dir = os.getcwd()
db_dir = os.path.join(current_dir, "db")
file_path = os.path.join(current_dir, "data", "codes-greenville-ny.pdf")

# Stop early with a readable message when the source PDF is missing.
if not os.path.exists(file_path):
    missing_pdf_message = f"The file {file_path} does not exist. Please check the path."
    raise FileNotFoundError(missing_pdf_message)


In [48]:
# Read the PDF at file_path into LangChain documents.
pdf_loader = PyPDFLoader(file_path)
pages = pdf_loader.load()

In [49]:
# Embedding model that create_vector_store uses to build the persisted index.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [26]:
def create_vector_store(docs, store_name):
    """Build a FAISS index from `docs` and save it under `db_dir/store_name`.

    When that directory already exists nothing is embedded or written; a
    notice is printed instead. Returns None in both cases.
    """
    target_dir = os.path.join(db_dir, store_name)

    # Guard clause: an existing directory is treated as an already-built store.
    if os.path.exists(target_dir):
        print(
            f"Vector store {store_name} already exists. No need to initialize.")
        return

    print(f"\n--- Creating vector store {store_name} ---")
    index = FAISS.from_documents(docs, embeddings)
    index.save_local(target_dir)
    print(f"--- Finished creating vector store {store_name} ---")

In [50]:
# Split the loaded documents into chunks (chunk_size 1000, chunk_overlap 100).
# `pages` is rebound to the resulting chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
pages = text_splitter.split_documents(pages)

In [11]:
# Build and persist the "FAISS" store from `pages`; the helper only prints a
# notice when the store directory already exists.
create_vector_store(pages, "FAISS")


--- Creating vector store FAISS ---
--- Finished creating vector store FAISS ---


In [43]:
def query_vector_store(store_name, k=3, score_threshold=0.8):
    """Load a persisted FAISS store and return a score-threshold retriever.

    Args:
        store_name: Sub-directory of ``db_dir`` that holds the saved index.
        k: Maximum number of documents the retriever returns per query.
        score_threshold: Minimum relevance score a document needs in order
            to be returned.

    Returns:
        The retriever, or ``None`` (after printing a notice) when the store
        directory does not exist.
    """
    persistent_directory = os.path.join(db_dir, store_name)
    if not os.path.exists(persistent_directory):
        print(f"Vector store {store_name} does not exist.")
        return None

    print(f"\n--- Initializing Vectorstore: {store_name} ---")
    print(f"Persistent directory: {persistent_directory}")

    # Load from the directory that was just checked, not a hard-coded relative
    # path, so `store_name` is honoured regardless of the working directory.
    # Queries must be embedded with the same model the store was built with
    # (the module-level `embeddings`); a default-constructed OpenAIEmbeddings()
    # is not guaranteed to use that model, and mismatched vectors score too
    # low to pass the threshold.
    # NOTE(security): allow_dangerous_deserialization=True should only be used
    # on index directories this notebook wrote itself.
    db = FAISS.load_local(
        persistent_directory,
        embeddings,
        allow_dangerous_deserialization=True,
    )
    return db.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"k": k, "score_threshold": score_threshold},
    )

In [44]:
# Load the persisted "FAISS" store and run a sample query against it.
# NOTE(review): query_vector_store returns None when the store directory is
# missing, in which case the .invoke call below raises AttributeError.
retriever = query_vector_store("FAISS")
retriever.invoke("Tell me about the town of Greenville")


--- Initializing Vectorstore: FAISS ---
Persistent directory: /Users/alex.labuda/Documents/alex_labuda/scripts/python/LLMs/rag-citations/db/FAISS




[]

In [30]:
# Run another sample query and keep the returned documents in `result`.
result = retriever.invoke("What do I need to know about septics?")




In [31]:
# Display whatever is currently bound to `result`.
result

[]

In [51]:
# Embed `pages` with a default-configured OpenAIEmbeddings() and index the
# vectors in FAISS.
faiss_index = FAISS.from_documents(
    pages,
    OpenAIEmbeddings(),
)

# Bind `retriever` to this index using the default retriever settings.
retriever = faiss_index.as_retriever()

# Persist the index under "faiss_index" so it can be reloaded later.
faiss_index.save_local("faiss_index")

In [13]:
# Reload the index saved under "faiss_index".
# NOTE(review): allow_dangerous_deserialization=True should only be used on
# index files this notebook wrote itself — confirm the directory is trusted.
# NOTE(review): `new_db` is not referenced again in the visible cells.
new_db = FAISS.load_local("faiss_index", OpenAIEmbeddings(), allow_dangerous_deserialization = True)

In [14]:
# Local Ollama model used for answer generation; temperature is set to 0.
llm = OllamaLLM(model="llama3", temperature=0)

# System instructions; {context} is filled with the formatted documents.
system_prompt = (
    "You're a helpful AI assistant. "
    "Given a user question and some county code pages, answer the user question. "
    "If none of the pages can answer the question, just say you don't know."
    "\n\nHere are the county code pages:{context}"
)

# Two-message chat template: the system instructions, then the user's question.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{question}"),
])
prompt.pretty_print()


You're a helpful AI assistant. Given a user question and some county code pages, answer the user question. If none of the pages can answer the question, just say you don't know.

Here are the county code pages:[33;1m[1;3m{context}[0m


[33;1m[1;3m{question}[0m


In [15]:
from operator import itemgetter
from typing import List

from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import (
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)


def format_docs(docs: "List[Document]") -> str:
    """Convert Documents to a single string for the prompt's context slot.

    Each document becomes a "Page Number / Page Content" section. Sections are
    separated by a blank line and the result starts with a blank line, so an
    empty ``docs`` yields just ``"\\n\\n"``.

    The page label is taken from the ``page_number`` metadata key when
    present, otherwise from ``page``, otherwise it is ``"Unknown"``.
    """
    def page_label(doc):
        meta = doc.metadata
        # Documents produced by PyPDFLoader carry their page under "page", so
        # looking only at "page_number" labelled every section "Unknown".
        # NOTE(review): the "page" value appears to be a 0-based index —
        # confirm before presenting it to users as a page number.
        return meta.get("page_number", meta.get("page", "Unknown"))

    formatted = [
        f"Page Number: {page_label(doc)}\nPage Content: {doc.page_content}"
        for doc in docs
    ]
    return "\n\n" + "\n\n".join(formatted)


# Sub-chain that pulls the retrieved docs out of the intermediate dict and
# renders them to one string. It is named `format_context` so the builtin
# `format()` stays usable in later cells.
format_context = itemgetter("docs") | RunnableLambda(format_docs)
# Sub-chain for generating an answer once retrieval and formatting are done.
answer = prompt | llm | StrOutputParser()
# Complete chain: runs the retriever on the question -> formats docs to a
# string -> runs the answer sub-chain -> returns just the answer and the
# retrieved docs.
chain = (
    RunnableParallel(question=RunnablePassthrough(), docs=retriever)
    .assign(context=format_context)
    .assign(answer=answer)
    .pick(["answer", "docs"])
)

In [23]:
# Run the full chain on a sample question and keep its output in `result`.
result = chain.invoke("Tell me about bulk mailing from the provided information.")

In [24]:
# Check which entries the chain returned.
print(result.keys())

dict_keys(['answer', 'docs'])


In [25]:
# Show the generated answer text.
print(result['answer'])

Based on the provided county code pages, I can tell you that "bulk mailings" refer to any letter or document sent to or intended to be sent to 10 or more residents of the Town of Greenville from any member of the Town Board concerning official business of the Town of Greenville in which any portion thereof is paid for by the Town of Greenville through goods, job and/or services. This includes a mailing by any member of the Town Board in connection with any political campaign for elective office currently being sought by such Town Board member.

According to §8-2 of the code, approval of the Town Board is required before transmitting any bulk mailing. Additionally, no Town official or employee shall participate in the distribution or transmission of any bulk mailing without prior approval from the Town Board. Any violation of this chapter by a Town official or employee shall render such official or employee liable to the Town for any damages resulting from such unauthorized activity.


In [19]:
# Inspect the first retrieved document returned alongside the answer.
print(result['docs'][0])

page_content='Chapter 8
BULK MAILINGS
§8-1. Definitions.
§8-2. Approval of Town Board
required; distribution;
transmittal.§8-3. Copies distributed.
§8-4. Penalties for offenses.
[HISTORY: Adopted by the Town Board of the Town of Greenville 10-4-1999 . Amendments noted
where applicable.]
§8-1. Definitions.
As used in this chapter, the following terms shall have the meanings indicated:
BULK MAILING —Any letter or other document sent to or intended to be sent to 10 or more residents
of the Town of Greenville from any member of the Town Board concerning official business of the Town
of Greenville in which any portion thereof is paid for by the Town of Greenville through goods, job and
or services, and includes a mailing by any member of the Town Board in connection with any political
campaign for elective office currently being sought by such Town Board member.
§8-2. Approval of Town Board required; distribution; transmittal.
§8-3. Copies distributed.
A copy of this chapter shall be distri

In [36]:
# Ask a second question; this rebinds `result` to the new chain output.
result = chain.invoke("What soil types are suitable for building new homes?")

In [37]:
# Show the generated answer text for the current `result`.
print(result['answer'])

Based on the provided county code pages, it appears that soils in Group I and Group IV are suitable for building new homes.

Group I soils are developed in sands and gravel, are well-drained, and have moderately rapid to very rapid permeability. These soils can be used for on-site septic systems and have slight limitations for septic systems on A and B slopes and moderate limitations on C slopes. Buildings with and without basements may be installed on these soils.

Group IV soils are developed in coarse silts and loam, are well to moderately well-drained, and have a firm fragipan. These soils can also be used for on-site septic systems that are adequately designed to overcome the noted limitations.

Please note that it's always recommended to conduct an on-site investigation or consult with a registered soils engineer to determine the specific soil type and its properties before building new homes.


In [35]:
# Inspect the first retrieved document in the current `result`.
# NOTE(review): this cell's execution count (35) precedes the query cell
# above (36), so the recorded output may belong to an earlier `result` —
# re-run in order to confirm.
print(result['docs'][0])

page_content='site septic systems may also be used in cases where the extreme stoniness of the Bath
Swartswood (020) soils is found to be only a surface condition. On-site investigation
is necessary to determine the extent of stoniness or the bedrock condition of these
soils. The soils have severe limitations for septic systems.
[2] Buildings with and without basements may be installed on these soils if adequate
foundation drainage is provided to a free-flowing outlet. These soils have moderate
to severe limitations for homesites.
[3] Erodability on these soils is low to medium. Erosion may be a problem on the C
slopes.
(7) Group VII: soils developed in silts, clays and very fine sands that are wet, nearly level to gently
sloping.
(a) Characteristics. The soils in this group are somewhat poorly drained and occur in concave
areas in the glacial till upland. These soils are very fine sands through silty clay loam
texture. Permeability is slow to very slow. The seasonal high water table i