In [None]:
!wget https://sgp.fas.org/crs/misc/IF10244.pdf

In [None]:
###################################
########### INGESTION #############
###################################

In [None]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from unstructured.partition.utils.constants import PartitionStrategy


# Partition the PDF into elements, then chunk those elements by title.
loader = UnstructuredPDFLoader(
    file_path="./IF10244.pdf",
    mode="elements",  # Split the document into elements such as Title and NarrativeText.
    # --- partitioning options ---
    strategy=PartitionStrategy.HI_RES,
    infer_table_structure=True,
    extract_images_in_pdf=True,
    # --- chunking options ---
    chunking_strategy="by_title",
    combine_text_under_n_chars=2000,  # Merge chunks shorter than 2000 chars
    new_after_n_chars=4000,  # Soft maximum chunk size
    max_characters=4000,  # Hard maximum chunk size
)

# Run the loader; `data` holds the resulting chunks.
data = loader.load()

In [None]:
# How many items loader.load() returned.
len(data)

In [None]:
# Peek at the first loaded item.
data[0]

In [None]:
# Element category recorded in the metadata of every loaded item.
[item.metadata['category'] for item in data]

In [None]:
# Text of the third loaded item; the author's inline note marks it as the table chunk.
data[2].page_content  # Table

In [None]:
###################################
########### RETRIEVER #############
###################################

In [None]:
from getpass import getpass

# Ask for the key interactively so it is never written into the notebook source.
OPENAI_API_KEY = getpass(prompt='Enter OpenAI Key: ')

In [None]:
from langchain_openai import OpenAIEmbeddings


# Embedding client, authenticated with the key entered above.
OPENAI_EMBEDDING_MODEL = OpenAIEmbeddings(
    api_key=OPENAI_API_KEY,
    model='text-embedding-ada-002',
)

In [None]:
from langchain_chroma import Chroma

# Chroma collection that will hold the embedded chunks.
vectorstore = Chroma(
    embedding_function=OPENAI_EMBEDDING_MODEL,
    collection_name='OSM-21-Oct-2024-tradition',
    # NOTE(review): presumably selects cosine distance for the HNSW index — confirm against Chroma docs.
    collection_metadata={"hnsw:space": "cosine"},
)

In [None]:
# Wrap the vector store in a retriever; no search options are passed, so the
# library defaults apply.
retriever = vectorstore.as_retriever()

In [None]:
# `langchain.vectorstores` is a deprecated re-export shim; the helper lives in
# langchain_community, which this notebook already depends on.
from langchain_community.vectorstores.utils import filter_complex_metadata

# Filter complex metadata before indexing the chunks.
data = filter_complex_metadata(data)

# Embed and store the chunks (same store the retriever wraps).
# NOTE(review): no explicit ids are passed, so re-running this cell presumably
# inserts the same chunks a second time — confirm, or recreate the collection first.
vectorstore.add_documents(data)

In [None]:
# Dump everything in the store: metadata, document text and embedding vectors.
# The output can be large because full embeddings are included.
vectorstore.get(include=["metadatas", "documents", "embeddings"])

In [None]:
# Number of records in the collection.
# NOTE(review): this reaches into the private `_collection` attribute.
vectorstore._collection.count()

In [None]:
query = "Which year has the highest acres burned?"

# `limit` is not a retrieval option; the number of hits is controlled by `k`
# in the retriever's search_kwargs. A dedicated 5-hit retriever is built here
# so the shared `retriever` used elsewhere keeps its default settings.
docs = vectorstore.as_retriever(search_kwargs={"k": 5}).invoke(query)

In [None]:
# Inspect the text of the chunks retrieved for the query (`docs`), not the
# entire ingested corpus (`data`).
[doc.page_content for doc in docs]

In [None]:
###################################
########### SYNTHESIS #############
###################################

In [None]:
from langchain_openai import ChatOpenAI

# Chat model used to synthesize answers; temperature is set to 0.
CHAT_MODEL = ChatOpenAI(
    model_name='gpt-4o-mini',
    temperature=0,
    api_key=OPENAI_API_KEY,
)

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


# Prompt text for the RAG chain. The {context} and {question} placeholders are
# filled in when the prompt is formatted.
RAG_TEMPLATE = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context 
to answer the question. If you don't know the answer, just say that you don't know.

<context>
{context}
</context>

Answer the following question:

{question}"""

# Build the chat prompt object from the template string.
rag_prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)

def format_docs(docs):
    """Join the ``page_content`` of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The contents concatenated in order, separated by a blank line
        (``"\\n\\n"``); an empty string when ``docs`` is empty.
    """
    separator = "\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)


# RAG pipeline:
#   1. "context": the input goes to the retriever, and the hits are joined by format_docs;
#      "question": the input is passed through unchanged.
#   2. The resulting dict fills the prompt defined above (`rag_prompt`; the
#      original referenced an undefined name `prompt`).
#   3. The chat model answers and the reply is parsed to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | rag_prompt
    | CHAT_MODEL
    | StrOutputParser()
)

In [None]:
# Run the full chain on the first question.
query = "Which year has the highest acres burned?"
response = rag_chain.invoke(query)

In [None]:
# Show the chain's answer.
response

In [None]:
# Known issue noted by the author: the chain's answer to this question is
# incorrect; the correct answer is 4.1 million acres.

query = "Tell me about the number of acres burned by wildfires for the forest service in 2021"
response = rag_chain.invoke(query)

In [None]:
# Show the chain's answer.
response