In [None]:
## law_1.docx
- pinecone store 저장
    - index name: 임의
- RetrievalQA 구현
    - prompt: rlm/rag-prompt
    - 질문: 전세사기

In [None]:
import os

from dotenv import load_dotenv
from langchain import hub
from langchain_community.document_loaders import Docx2txtLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone import Pinecone


## Load environment variables (from .env via python-dotenv)
load_dotenv()
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
# Read for completeness; not passed explicitly to anything below.
LANGCHAIN_API_KEY = os.getenv('LANGCHAIN_API_KEY')

## Embedding model, Pinecone client and target index
embedding = OpenAIEmbeddings(model='text-embedding-3-large')
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = 'law-1-quiz03'

## Ingestion switch.
## False (default): only attach to the index that is already stored.
## True: load law_1.docx, split it, embed the chunks and store them in the
## index before attaching. Intended as a one-off run.
## NOTE(review): running the ingestion path more than once presumably adds the
## same chunks again -- confirm before re-running with True.
REBUILD_INDEX = False

if REBUILD_INDEX:
    ## Read the document and split it into chunks
    loader = Docx2txtLoader('law_1.docx')

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=200,
    )

    document_list = loader.load_and_split(text_splitter=text_splitter)

    ## Embed the chunks and store them in the Pinecone vector store
    database = PineconeVectorStore.from_documents(
        documents=document_list,
        embedding=embedding,
        index_name=index_name,
    )
else:
    ## Attach to the already stored index (method 2)
    database = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embedding,
    )

    ## Alternative way to attach to the stored index (method 1), kept for reference:
    # database = PineconeVectorStore(
    #     index=pc.Index(index_name),
    #     embedding=embedding,
    # )

## RetrievalQA building blocks: chat model and the shared RAG prompt
llm = ChatOpenAI(model='gpt-4o')
prompt = hub.pull('rlm/rag-prompt')

def format_docs(docs):
    """Merge documents into one text block.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values, in input order, separated by an empty
        line (two newline characters). An empty input yields ``''``.
    """
    contents = [doc.page_content for doc in docs]
    return '\n\n'.join(contents)

# The incoming question feeds two prompt inputs: 'context' (retriever output
# flattened to text by format_docs) and 'question' (passed through unchanged).
retriever = database.as_retriever()
rag_inputs = {
    'context': retriever | format_docs,
    'question': RunnablePassthrough(),
}

# inputs -> prompt -> chat model -> output parser
qa_chain = rag_inputs | prompt | llm | StrOutputParser()

# Kept as the last expression so the answer is shown as the cell output.
question = '전세사기피해자 임대인을 알려주세요.'
qa_chain.invoke(question)

In [1]:
# 저장 없이 데이터베이스를 읽기만 할 때

import os

from dotenv import load_dotenv
from langchain import hub
from langchain_community.document_loaders import Docx2txtLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone import Pinecone

# Load environment variables (dotenv), then pick up the API keys.
load_dotenv()
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
LANGCHAIN_API_KEY = os.environ.get('LANGCHAIN_API_KEY')

# Name of the index this cell reads from; nothing is written to it here.
index_name = 'law-index'

# Pinecone client and the embedding model handed to the vector store.
pc = Pinecone(api_key=PINECONE_API_KEY)
embedding = OpenAIEmbeddings(model='text-embedding-3-large')

# Chat model (library default model, none is specified) and the RAG prompt
# pulled from the hub.
llm = ChatOpenAI()
prompt = hub.pull('rlm/rag-prompt')

# Attach to the existing index.
database = PineconeVectorStore.from_existing_index(embedding=embedding, index_name=index_name)

def format_docs(docs):
    """Join the ``page_content`` of every document with a blank line in between.

    Args:
        docs: Iterable of objects that each carry a ``page_content`` attribute.

    Returns:
        A single string; ``''`` when ``docs`` is empty.
    """
    separator = '\n\n'
    pieces = []
    for doc in docs:
        pieces.append(doc.page_content)
    return separator.join(pieces)

# Build the two prompt inputs from the single question string that is passed
# to the chain: retrieved text under 'context', the raw question under 'question'.
retriever = database.as_retriever()
rag_inputs = {
    'context': retriever | format_docs,
    'question': RunnablePassthrough(),
}

# inputs -> prompt -> chat model -> output parser
qa_chain = rag_inputs | prompt | llm | StrOutputParser()

# Last expression of the cell: its return value is rendered as the cell output.
question = '전세사기피해자에 대한 정의를 알려주세요'
qa_chain.invoke(question)

  from .autonotebook import tqdm as notebook_tqdm


'전세사기피해자란 전세사기로 피해를 입은 임차인으로, 법률에 따라 전세사기피해자로 결정된 자를 말합니다. 이 법은 전세사기피해자에게 경ㆍ공매 절차와 조세 징수에 관한 특례를 부여하며, 전세사기피해자를 지원하고 주거안정을 도모하는 것이 목적입니다. "주택"은 주택임대차보호법에 따른 주거용 건물을 말하며, "전세사기피해주택"은 전세사기피해자가 임차인인 주택을 포함한 임대차계약의 목적물로 정의됩니다.'