In [2]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.chat_models import init_chat_model

import getpass
import os

In [3]:
llm = init_chat_model("gpt-4o-mini", model_provider="openai")

In [4]:
from langchain_core.messages import HumanMessage, SystemMessage
messages = [
    SystemMessage("Translate the following from English into Uzbek"),
    HumanMessage("How are you?"),
]

llm.invoke(messages)

AIMessage(content='Qalaysiz?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 6, 'prompt_tokens': 22, 'total_tokens': 28, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_3267753c5d', 'finish_reason': 'stop', 'logprobs': None}, id='run-c96f89ca-b685-4aa3-b14e-3f1196fdb144-0', usage_metadata={'input_tokens': 22, 'output_tokens': 6, 'total_tokens': 28, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [6]:
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [7]:
from langchain_core.vectorstores import InMemoryVectorStore
vector_store = InMemoryVectorStore(embeddings)


In [8]:
import bs4
from langchain import hub
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict

In [9]:
file_path = "/home/iskandar/Downloads/tax_return_unchartered.pdf"
loader = PyPDFLoader(file_path)

In [10]:
docs = loader.load()

In [11]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
all_splits = text_splitter.split_documents(docs)

In [12]:
vs = vector_store.add_documents(documents=all_splits)

In [13]:
os.environ['LANGSMITH_TRACING']="true"
os.environ['LANGSMITH_API_KEY']=getpass.getpass()

 ········


In [14]:
prompt = hub.pull("rlm/rag-prompt")


# Define state for application
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str


# Define application steps
def retrieve(state: State):
    retrieved_docs = vector_store.similarity_search(state["question"])
    return {"context": retrieved_docs}


def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response.content}

In [15]:
from langgraph.graph import START, StateGraph

graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [16]:
result = graph.invoke({"question": "What is this document about?"})

#print(f'Context: {result["context"]}\n\n')
print(f'Answer: {result["answer"]}')

Answer: The document provides supplemental information regarding Schedule K-1 and notifies partners of a partnership that they are exempt from filing Schedule K-3 for tax year 2024 due to specific criteria. It details the conditions under which partners, all of whom are U.S. citizens or residents, will not receive Schedule K-3 unless specifically requested. Additionally, it includes sections for e-file authorization and partnership identification details for tax reporting.


In [20]:
result = graph.invoke({"question": "What is the name of the company this tax form used for?"})

#print(f'Context: {result["context"]}\n\n')
print(f'Answer: {result["answer"]}')

Answer: The company associated with this tax form is UNCHARTERED LLC. Their employer identification number is 93-2773352. This form is likely related to an application for extending the time to file certain business income tax returns.


In [21]:
result = graph.invoke({"question": "Is the company profitable?"})

#print(f'Context: {result["context"]}\n\n')
print(f'Answer: {result["answer"]}')

Answer: The company appears to be operating at a loss, as indicated by the repeated figures of (29,860) and other negative values throughout the context. Therefore, it is not profitable.


In [20]:
result = graph.invoke({"question": "what is the total rental loss in K-1 Rental Real Estate Activity?"})

print(f'Context: {result["context"]}\n\n')
print(f'Answer: {result["answer"]}')

Context: [Document(id='b6e27aa7-a79f-4176-b614-81d40f749166', metadata={'producer': 'Drake Software LLC', 'creator': 'Drake PDF Distiller', 'creationdate': '2025-03-14T03:59:17+00:00', 'author': 'Drake Software LLC', 'keywords': '', 'moddate': '2025-03-14T03:59:42-06:00', 'subject': 'Tax Documents', 'title': '2024 Tax Documents', 'source': '/home/iskandar/Downloads/tax_return_unchartered.pdf', 'total_pages': 84, 'page': 12, 'page_label': '13'}, page_content="Line 17 Line 19 Line 20a Line 21Description Type Income/Loss 4797 Pass-thru Net\nTotals\nK-1 Rental Real Estate Activity\n8825 2024Information provided to determine participation level in each rental property\n(For shareholder's and partner's records only)\nAccumulated differences may occur as a result of rounding individual properties.\nEIN/SSN\nK1_8825~.LD\nShareholder/Partner Name Ownership percentage\n.......................................................\nISKANDAR ATAKHODJAEV  33.34 271-59-3993\n3617 CONGER RD\nHuntsville, AL