In [1]:
# these three lines swap the stdlib sqlite3 lib with the pysqlite3 package
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader # UnstructuredFileLoader는 pdf, txt, docx를 다 열 수 있음
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores.faiss import FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough

# Chat model used at the end of the chain; a low temperature is requested.
llm = ChatOpenAI(temperature=0.1)

# Local file store used as the backing store for cached embedding vectors.
cache_dir = LocalFileStore("../.cache/")

# Text splitter built via the tiktoken-encoder constructor: it breaks the text
# on the given separator string, using the chunk size / overlap set below.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n", chunk_size=600, chunk_overlap=100
)

# Load the source document and cut it into chunks with the splitter above.
loader = UnstructuredFileLoader("../../files/chapter_one.docx")
docs = loader.load_and_split(splitter)

# Wrap the OpenAI embeddings with the cache store. Per the original author's
# note: when embedding, the cache is checked first, and the underlying
# OpenAIEmbeddings is only used for entries that are not cached yet.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=embeddings,
    document_embedding_cache=cache_dir,
)

# Build the FAISS vector store from the chunks using the cached embeddings.
vectorstore = FAISS.from_documents(documents=docs, embedding=cached_embeddings)

"""
 [retriever]
   - retriever는 여러 장소에서 document들을 가져오는 클래스의 interface임 (vector store보다 더 일반화된 형태)
"""

# Expose the vector store through the retriever interface so it can be used
# as a step of the chain below.
retriever = vectorstore.as_retriever()
# Backward-compatible alias: the original (misspelled) name is kept in case
# other cells still reference it.
retriver = retriever

# The system message carries the retrieved context; the human message carries
# the user's question. The context placeholder is separated from the
# instructions by a blank line (the original had an invalid "\{" escape here,
# which put a stray backslash into the prompt).
prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "You are a helpful assistant. Answer questions using only the "
        "following context. If you don't know the answer just say you "
        "don't know, don't make it up:\n\n{context}",
    ),
    ("human", "{question}"),
])

# RunnablePassthrough forwards the chain's initial input unchanged to the
# next step (here: the prompt's "question" variable).
chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | llm

# How the chain runs:
#  1) The query sent to the chain is assigned to the prompt's "question" slot.
#  2) The retriever searches for documents and returns a list of related ones.
#  3) The returned documents are fed to the prompt as the "context" value.
#  4) Finally the LLM produces an answer from the documents and the question.

chain.invoke("Describe Victory Mansions")



Authentication failed for https://api.smith.langchain.com/runs. HTTPError('401 Client Error: Unauthorized for url: https://api.smith.langchain.com/runs', '{"detail":"Invalid auth"}')
Authentication failed for https://api.smith.langchain.com/runs/773779a6-91b2-483a-a15e-8cccae3325d2. HTTPError('401 Client Error: Unauthorized for url: https://api.smith.langchain.com/runs/773779a6-91b2-483a-a15e-8cccae3325d2', '{"detail":"Invalid auth"}')


AIMessage(content='Victory Mansions is a building where Winston Smith resides. It is described as having glass doors at the entrance, which allow gritty dust to enter along with people. The hallway of Victory Mansions has a smell of boiled cabbage and old rag mats. There is a large colored poster on one end of the hallway, depicting the face of a man in his forties with a black mustache. The building has seven floors, and the elevator is rarely functional due to the ongoing electricity cuts. The poster with the enormous face, bearing the caption "BIG BROTHER IS WATCHING YOU," is displayed on each landing opposite the elevator shaft.')