In [2]:
# these three lines swap the stdlib sqlite3 lib with the pysqlite3 package
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader # UnstructuredFileLoader는 pdf, txt, docx를 다 열 수 있음
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores.faiss import FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

# Chat model shared by the per-document (map) step and the final answer step.
llm = ChatOpenAI(
    temperature=0.1
)

# Local file store used to cache embedding vectors once they have been computed.
cache_dir = LocalFileStore("../.cache/")

# Splits the text on the given separator string (here: newlines).
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)

loader = UnstructuredFileLoader("../../files/chapter_one.docx")

docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

# With cache-backed embeddings, each text is first looked up in the cache;
# only texts that are missing are sent to OpenAIEmbeddings when the vector
# store is built.
# The namespace keeps vectors produced by different embedding models from
# colliding with each other inside the same cache directory.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,
    cache_dir,
    namespace=embeddings.model,
)


vectorstore = FAISS.from_documents(docs, cached_embeddings)

"""
 [retriever]
   - A retriever is the interface of a class that fetches documents from a variety of sources (a more general abstraction than a vector store).
"""

"""
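  To implement a Map Reduce chain, follow these steps:
  1) Pass the question to the retriever.
  2) The retriever returns a list of documents related to the question.
  3) Build a prompt for every document in the list.
  4) The LLM receives those prompts and returns one response per prompt.
  5) Combine all of the responses into a single document.
  6) Pass that final document to the LLM as a prompt to obtain the result.

  Q) When should you use stuff, and when should you use map reduce?
  A) When the retriever returns many documents, stuff cannot be used,
     because not all of the documents fit into the stuff prompt.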
"""
# A retriever takes a query string as input and returns the related documents.
retriever = vectorstore.as_retriever()

# Prompt for the map step: one document portion goes in as {context} and the
# model is asked to return, verbatim, whatever text is relevant to {question}.
map_doc_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
        Use the following portion of a long document to see if any of the text is relevant to answer the question. Return any relevant text verbatim.
        ------
        {context}
        """,
        ),
        ("human", "{question}"),
    ]
)

# Map step chain: fill in the prompt, then send it to the chat model.
map_doc_chain = map_doc_prompt | llm

def map_docs(inputs):
    """Run the map step over every document and merge the results.

    Args:
        inputs: Mapping with a ``"documents"`` entry (an iterable of objects
            exposing ``page_content``) and a ``"question"`` entry.

    Returns:
        One string made of the ``content`` of each ``map_doc_chain``
        response, in document order, separated by blank lines.
    """
    retrieved = inputs["documents"]
    query = inputs["question"]

    extracts = []
    for document in retrieved:
        # page_content is the attribute that carries the document's text.
        response = map_doc_chain.invoke(
            {"context": document.page_content, "question": query}
        )
        extracts.append(response.content)

    # A blank line separates the extract taken from each document.
    return "\n\n".join(extracts)

# RunnableLambda makes it possible to call a plain function from a chain,
# at any position inside it.
map_chain = {
    "documents": retriever,
    "question": RunnablePassthrough(),
} | RunnableLambda(map_docs)


# Prompt for the reduce step: the merged extracts go in as {context}.
final_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
    Given the following extracted parts of a long document and a question, create a final answer.
    If you don't know the answer, just say that you don't know. Don't try to make up an answer.
    ------
    {context}
    """,
        ),
        ("human", "{question}"),
    ]
)

# Full pipeline: map step output as context, the raw question passed through,
# then the final prompt and the chat model.
chain = (
    {"context": map_chain, "question": RunnablePassthrough()}
    | final_prompt
    | llm
)

chain.invoke("Describe Victory Mansions")



Authentication failed for https://api.smith.langchain.com/runs/4cb30958-846d-4bcb-98e5-ea74ab02927c. HTTPError('401 Client Error: Unauthorized for url: https://api.smith.langchain.com/runs/4cb30958-846d-4bcb-98e5-ea74ab02927c', '{"detail":"Invalid auth"}')
Authentication failed for https://api.smith.langchain.com/runs. HTTPError('401 Client Error: Unauthorized for url: https://api.smith.langchain.com/runs', '{"detail":"Invalid auth"}')


AIMessage(content='Victory Mansions is a building complex located in London, specifically in Airstrip One, which is the chief city of Oceania. It is described as a grimy landscape with rotting nineteenth-century houses. The houses are in a state of disrepair, with their sides supported by timber, windows patched with cardboard, and roofs made of corrugated iron. The garden walls are sagging in all directions. There are also bombed sites with rubble and plaster dust in the air, and in some areas, sordid colonies of wooden dwellings resembling chicken-houses have sprung up.\n\nVictory Mansions itself is overshadowed by the Ministry of Truth, which is an enormous pyramidal structure made of glittering white concrete. The building has terraces that soar 300 meters into the air. From the roof of Victory Mansions, one can see the Ministry of Truth and three other buildings of similar appearance and size. These four buildings are the homes of the four Ministries that make up the government ap

In [3]:
# Ask a second question through the same map-reduce chain.
chain.invoke("Where does Winston go to work?")

AIMessage(content='Winston goes to work at the Ministry of Truth.')