## Setup Environment

In [10]:
%pip install -qU nltk langchain_astradb pypdf langchain-text-splitters keybert gliner unstructured langchain-openai python-dotenv "unstructured[pdf]"


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [11]:
import os
import requests
import nltk
from langchain_openai import OpenAIEmbeddings
from langchain_astradb import AstraDBVectorStore
from dotenv import load_dotenv
from langchain_core.documents.base import Document
from unstructured.partition.auto import partition
from collections import namedtuple


# Download the NLTK tokenizer data.
# NOTE(review): presumably required by unstructured's partitioning — confirm.
nltk.download('punkt')
nltk.download('punkt_tab')

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
ASTRADB_TOKEN = os.environ.get('ASTRA_DB_APPLICATION_TOKEN')
ASTRADB_ENDPOINT = os.environ.get('ASTRA_DB_API_ENDPOINT')

# Fail fast with an actionable message instead of a less obvious error
# raised later from inside the OpenAI / Astra DB clients.
_missing_settings = [
    name
    for name, value in (
        ('OPENAI_API_KEY', OPENAI_API_KEY),
        ('ASTRA_DB_APPLICATION_TOKEN', ASTRADB_TOKEN),
        ('ASTRA_DB_API_ENDPOINT', ASTRADB_ENDPOINT),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_settings)
    )

# Vector store backing the notebook. The Astra DB credentials read above are
# passed explicitly so the configuration in use is visible in this cell.
VSTORE = AstraDBVectorStore(
    collection_name='test_team_metadata',
    embedding=OpenAIEmbeddings(model='text-embedding-3-small'),
    api_endpoint=ASTRADB_ENDPOINT,
    token=ASTRADB_TOKEN,
    metric='cosine',
)


# Local directory that receives the downloaded PDFs.
SAVE_DIR = 'data/'

# One record per PDF: the URL to fetch and the team label attached to it.
PdfUrl = namedtuple('PdfUrl', 'url team')

PDF_URLS = [
    PdfUrl(url=source_url, team=team_name)
    for source_url, team_name in (
        ("https://abc.xyz/assets/9a/bd/838c917c4b4ab21f94e84c3c2c65/goog-10-k-q4-2022.pdf", "google"),
        ("https://abc.xyz/assets/43/44/675b83d7455885c4615d848d52a4/goog-10-k-2023.pdf", "google"),
        ("https://d18rn0p25nwr6d.cloudfront.net/CIK-0000320193/b4266e40-1de6-4a34-9dfb-8632b8bd57e0.pdf", "apple"),
        ("https://d18rn0p25nwr6d.cloudfront.net/CIK-0000320193/faab4555-c69b-438a-aaf7-e09305f87ca3.pdf", "apple"),
    )
]


def download_file(url: str, save_path: str, timeout: float = 30.0) -> bool:
    """
    Download the file at ``url`` and write it to ``save_path``.

    Failures (network errors, timeouts, non-200 responses) are reported with a
    printed message rather than an exception, so one bad URL does not abort
    the rest of the notebook run.

    Args:
        url: Address of the file to fetch.
        save_path: Local path the response body is written to.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True when the file was written, False otherwise.
    """
    try:
        # Without an explicit timeout the request can block indefinitely
        # on an unresponsive server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        print(f"Failed to download: {url} ({error})")
        return False

    if response.status_code != 200:
        print(f"Failed to download: {url}")
        return False

    with open(save_path, 'wb') as file:
        file.write(response.content)
    print(f"Downloaded: {save_path}")
    return True


os.makedirs(SAVE_DIR, exist_ok=True)


# Maps each local PDF path to the PdfUrl record it came from.
pdf_files = {}
for pdf_url in PDF_URLS:
    file_name = pdf_url.url.split("/")[-1]
    save_path = os.path.join(SAVE_DIR, file_name)
    # Reuse an existing local copy so re-running the notebook does not
    # fetch every PDF again.
    if not os.path.isfile(save_path):
        download_file(pdf_url.url, save_path)
    # download_file reports failures by printing, so check the disk: only
    # files that actually exist are handed to the partitioning step.
    if os.path.isfile(save_path):
        pdf_files[save_path] = pdf_url
    else:
        print(f"Skipping {file_name}: no local copy available")


# Partitioned elements grouped by team. Several PDFs share a team, so the
# elements are accumulated per team; plain assignment would keep only the
# elements of the last PDF processed for each team.
team_elements = {}
for pdf_file_path, pdfurl in pdf_files.items():
    elements = partition(filename=pdf_file_path)
    team_elements.setdefault(pdfurl.team, []).extend(elements)


def _to_document(team, element):
    """Wrap one partitioned element in a Document tagged with its team."""
    metadata = element.metadata.to_dict()
    # Add the fields used later for filtering and inspection.
    metadata.update(
        team=team,
        element_id=element.id,
        category=element.to_dict()["type"],
    )
    # NOTE(review): confirm Document honours the `mimetype` argument; it is
    # passed through unchanged here.
    return Document(
        id=element.id,
        mimetype=metadata["filetype"],
        metadata=metadata,
        page_content=element.text,
    )


# Flat list of Documents, one per element, across all teams.
pages = []
for team, elements in team_elements.items():
    pages.extend(_to_document(team, element) for element in elements)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/brian.ogrady/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/brian.ogrady/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Downloaded: data/goog-10-k-q4-2022.pdf
Downloaded: data/goog-10-k-2023.pdf
Downloaded: data/b4266e40-1de6-4a34-9dfb-8632b8bd57e0.pdf
Downloaded: data/faab4555-c69b-438a-aaf7-e09305f87ca3.pdf


In [12]:
# Write every prepared Document to the vector store and keep the returned IDs.
# NOTE(review): re-running this cell presumably re-embeds and re-writes all
# documents — confirm the cost before re-running.
inserted_ids = VSTORE.add_documents(pages)


## RAG Query

In [13]:
from langchain_openai import ChatOpenAI

# Chat model used to generate answers, both inside the RAG chain and in the
# direct vector-store search example.
llm = ChatOpenAI(model_name="gpt-4-turbo-2024-04-09")

In [14]:
from langchain_core.runnables import (
    RunnablePassthrough,
    ConfigurableField,
)
from langchain_core.prompts import ChatPromptTemplate

# The chat prompt template. Adjacent string literals are concatenated
# verbatim, so each sentence carries its own trailing space.
# Placeholders: {question} is the user's question, {context} the retrieved text.
ANSWER_PROMPT = (
    "The original question is given below. "
    "This question has been used to retrieve information from a vector store. "
    "The matching results are shown below. "
    "Use the information in the results to provide a concise answer to the original question.\n\n"
    "Original Question: {question}\n\n"
    "Vector Store Results:\n{context}\n\n"
    "Response:"
)

# A function for formatting docs before adding to the chat prompt template
def format_docs(docs):
    """
    Print each document's metadata, then return the documents' text joined
    by blank lines for insertion into the prompt's context slot.
    """
    for retrieved in docs:
        print(retrieved.metadata)
    separator = "\n\n"
    contents = [retrieved.page_content for retrieved in docs]
    return separator.join(contents)

# Similarity retriever over the vector store, returning the 10 best matches.
retriever = VSTORE.as_retriever(search_type="similarity", search_kwargs={"k": 10})

# Allow search filters to be applied dynamically: expose `search_kwargs` as a
# field that can be overridden through the runnable config at call time.
search_kwargs_field = ConfigurableField(
    id="search_kwargs",
    name="Search keyword args",
    description="Runtime config of search parameters",
)
configurable_retriever = retriever.configurable_fields(search_kwargs=search_kwargs_field)

# Construct the LLM execution chain in three stages:
#   1. retrieve documents for the question and format them as context,
#      passing the question itself through unchanged;
#   2. fill the answer prompt;
#   3. call the chat model.
retrieval_inputs = {
    "context": configurable_retriever | format_docs,
    "question": RunnablePassthrough(),
}
answer_prompt = ChatPromptTemplate.from_messages([ANSWER_PROMPT])
chain = retrieval_inputs | answer_prompt | llm

In [15]:
# Example invocation: ask the unfiltered RAG chain, then list the team of
# each document the retriever returns for the same question.
question = "Please compare advertising profit and expenses between 2022 and 2023"
resp = chain.invoke(question)
print(resp.content)

# `invoke` is the supported retriever entry point; `get_relevant_documents`
# is deprecated.
context = configurable_retriever.invoke(question)
for doc in context:
    print(doc.metadata['team'])



{'coordinates': {'points': [[43.2, 501.85913059999996], [43.2, 535.8591306], [571.52999, 535.8591306], [571.52999, 501.85913059999996]], 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': 'data', 'filename': 'goog-10-k-2023.pdf', 'languages': ['eng'], 'last_modified': '2024-11-06T16:57:16', 'page_number': 59, 'parent_id': '907483c1fb13277da2342b2d94d9256f', 'filetype': 'application/pdf', 'team': 'google', 'element_id': '92eabafa7df52924f8648a0bbc3bb0b1', 'category': 'NarrativeText'}
{'coordinates': {'points': [[18.224997320625082, 451.9413989050448], [18.224997320625082, 460.04139799154484], [310.43798613549063, 460.04139799154484], [310.43798613549063, 451.9413989050448]], 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': 'data', 'filename': 'faab4555-c69b-438a-aaf7-e09305f87ca3.pdf', 'languages': ['eng'], 'last_modified': '2024-11-06T16:57:16', 'page_number': 26, 'parent_id': 'eb667ebf640f97c44e9f4f9a7fa0dd2f', 'f

Next, filter the similarity search so that it only includes documents related to Google. We need to call the vector store directly because LangChain's `invoke` applies the filter after the similarity search.

## Next Level - Search Tool

In [17]:
# Question shared by the filtered search and the prompt.
query = "Please compare advertising profit and expenses between 2022 and 2023"

# Restrict the similarity search to documents whose metadata has team == "google".
docs = VSTORE.similarity_search(
    query=query,
    k=10,
    filter={"team": "google"},
)
for doc in docs:
    print(doc.metadata['team'])


# Build the prompt from the retrieved passages and ask the model directly.
context = "\n\n".join(doc.page_content for doc in docs)
prompt = f"Based on the following documents, answer the question: {query}\n\nContext:\n{context}"
response = llm.invoke(prompt)

# Output the response
print(response.content)

google
google
google
google
google
google
google
google
google
google
To compare the advertising profit and expenses between 2022 and 2023, we can analyze the provided information as follows:

**Advertising and Promotional Expenses:**
- In 2022, advertising and promotional expenses totaled approximately $9.2 billion.
- In 2023, these expenses decreased to $8.7 billion.
- This represents a decrease in advertising and promotional expenses of $500 million from 2022 to 2023.

**Advertising Revenues:**
- YouTube ads revenues, a significant component of advertising revenues, increased by $2.3 billion from 2022 to 2023.
- The document does not provide explicit figures for total Google advertising revenues for 2022 and 2023, but the increase in YouTube ads revenues suggests a substantial growth in overall advertising revenues.

**Analysis:**
- The decrease in advertising and promotional expenses and the increase in advertising revenues, particularly from YouTube, suggest an improvement in adve