In [1]:
import os
# Set the LANGCHAIN_TRACING_V2 flag for this process before any chain is built.
os.environ["LANGCHAIN_TRACING_V2"] = "true"

# Credentials are read from the environment rather than hardcoded.
# A variable that is not set yields None here (no exception is raised).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")
SEC_API_KEY = os.getenv("SEC_API_KEY")

In [2]:
import html2text
import json 
from sec_api import QueryApi, RenderApi
from langchain import hub
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.documents.base import Document
from langchain_openai import ChatOpenAI


In [3]:
from langchain_core.runnables import RunnableLambda, RunnablePassthrough, RunnableParallel


In [4]:

# Build (or refresh) the persisted vector index for a company's latest 10-K.
#
# Steps: look up the newest filing, download it, convert the HTML to text,
# split the text into overlapping chunks, embed the chunks and store them in
# a Chroma collection on disk.
#
# The values a reader is most likely to change are named here instead of
# being buried inside string literals further down.
TICKER = "MSFT"
FORM_TYPE = "10-K"
PERSIST_DIR = "./chroma_msft"

queryApi = QueryApi(api_key=SEC_API_KEY)
# Request exactly one filing, newest first.
query = {
  "query": f'ticker:{TICKER} AND formType:"{FORM_TYPE}"',
  "from": "0",
  "size": "1",
  "sort": [{ "filedAt": { "order": "desc" } }]
}

response = queryApi.get_filings(query)

# An empty result would otherwise surface as a bare IndexError on `[0]`;
# raise something that says what actually went wrong.
filings = response["filings"]
if not filings:
    raise ValueError(
        f"No {FORM_TYPE} filings returned for ticker {TICKER!r}; "
        "check the ticker symbol and SEC_API_KEY."
    )
link_to_10k = filings[0]["linkToFilingDetails"]

renderApi = RenderApi(api_key=SEC_API_KEY)

# Download the filing and flatten its HTML to text before splitting.
target_10k = renderApi.get_filing(link_to_10k)
target_10k_full_text = html2text.html2text(target_10k)
docs = [Document(page_content=target_10k_full_text)]

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
all_splits = text_splitter.split_documents(docs)

# Deterministic ids (filing link + chunk position). Without explicit ids every
# run of this cell stores the chunks under fresh random ids, so re-running
# appends a second copy of the whole filing to the persisted collection and
# the retriever starts returning duplicate passages. With stable ids a re-run
# overwrites the existing entries instead.
# NOTE: chunks from a *different* filing (other link) are still kept alongside.
split_ids = [f"{link_to_10k}#{position}" for position in range(len(all_splits))]

vectorstore = Chroma.from_documents(
    documents=all_splits,
    embedding=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
    ids=split_ids,
    persist_directory=PERSIST_DIR,
)


# save to disk
# db2 = Chroma.from_documents(docs, embedding_function, persist_directory="./chroma_db")




In [5]:
# Chat model that will answer the question from the retrieved context.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

# Re-open the Chroma collection stored under "chroma_msft", using the same
# embedding class that was used when the collection was written.
vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
    persist_directory="chroma_msft",
)

# Similarity search returning the 6 closest chunks per query.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 6},
    search_type="similarity",
)

In [6]:
# Fetch the prompt published as "rlm/rag-prompt" via langchain's `hub` module.
# NOTE(review): the chain below feeds it `context` and `question` keys —
# presumably those are the template's input variables; confirm against the hub entry.
prompt = hub.pull("rlm/rag-prompt")
def format_docs(docs):
    """Merge retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` of every document, in order, separated by a
        blank line. An empty iterable yields an empty string.
    """
    separator = "\n\n"
    page_texts = [document.page_content for document in docs]
    return separator.join(page_texts)



In [7]:
# Fan the incoming question out into the two inputs passed on to the prompt:
#   context  -> chunks returned by the retriever, joined by format_docs
#   question -> the original input, passed through untouched
context_and_question = RunnableParallel(
    context=retriever | format_docs,
    question=RunnablePassthrough(),
)

In [8]:
# Full pipeline: gather context + question, fill the prompt, call the chat
# model, then reduce the model's message to a plain string.
rag_chain = context_and_question.pipe(prompt, llm, StrOutputParser())

In [9]:
# for chunk in rag_chain.stream("What is the average price AMZN paid per share for the repurchases made between april 1 and april 30, 2023?"):
# Stream the answer and echo each piece as it arrives, without newlines
# between pieces.
# NOTE(review): the index in this notebook is built from the MSFT 10-K, while
# this question uses "North America net sales" wording and the neighbouring
# commented-out question names AMZN — confirm the question matches the filing.
answer_stream = rag_chain.stream("What were net sales in 2023 in north america?")
for chunk in answer_stream:
    print(chunk, end="", flush=True)

Net sales in North America for 2023 were $211,915 million.

In [10]:
# Ask the same question in a single blocking call. As the last expression in
# the cell, the returned string is what the notebook shows as the cell output.
rag_chain.invoke("What were net sales in 2023 in north america?")

'Net sales in North America for 2023 were $211,915 million.'