In [2]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
)

loader = Docx2txtLoader('./tax.docx')
document_list = loader.load_and_split(text_splitter=text_splitter)

In [3]:
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings

load_dotenv()

embedding = OpenAIEmbeddings(model='text-embedding-3-large')

In [None]:
import os
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone

pinecone_api_key = os.environ.get("PINECONE_API_KEY")

pc = Pinecone(api_key=pinecone_api_key)

index_name = "tax-index"

database = PineconeVectorStore.from_documents(document_list, embedding, index_name=index_name)

In [31]:
query = "5000만원인 세금 알려줘"
retreived_docs = database.similarity_search(query)

In [33]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model='gpt-4o')

In [34]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import RetrievalQA

rag_prompt = ChatPromptTemplate.from_messages([
    ('human', '''You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
                Question: {question} 
                Context: {context} 
                Answer:''')
])

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=database.as_retriever(),
    chain_type_kwargs={"prompt": rag_prompt}
)

In [35]:
ai_message = qa_chain({"query": query})

  ai_message = qa_chain({"query": query})


In [36]:
print(ai_message)

{'query': '5000만원인 세금 알려줘', 'result': '해당 질문에 대한 직접적인 답변은 제시된 문서에 포함되어 있지 않습니다. 세금 금액인 "5000만원"에 관한 정보는 제공된 법률 조항들에 포함되어 있지 않으며, 세금 계산에 필요한 구체적인 과세 표준이나 세율표 등의 정보가 필요할 수 있습니다. 따라서, 상세한 세금 계산이 필요하다면 국세청이나 관련 기관에 문의하는 것이 좋습니다.'}
