In [29]:
from dotenv import load_dotenv
load_dotenv('../.env')

from langchain import PromptTemplate, LLMChain, OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings

In [5]:
loader = PyPDFLoader("../data/pdf/economic_manuscript.pdf")
# loader = UnstructuredPDFLoader("./data/pdf/economic_manuscript.pdf")
pages = loader.load()

In [7]:
len(pages)

174

In [8]:
# loader = TextLoader('../data/text/buyo_knowledge_base_cleaned.txt')
# pages = loader.load()

In [10]:
len(pages[4].page_content)

1310

In [13]:
# Split text 
def split_chunks(sources):
    chunks = []
    splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=32)
    for chunk in splitter.split_documents(sources):
        chunks.append(chunk)
    return chunks

In [14]:
pages_chunked = split_chunks(pages)

In [15]:
len(pages_chunked)

922

In [18]:
# faiss_index = FAISS.from_documents(pages_chunked, OpenAIEmbeddings())
# faiss_index.save_local(folder_path='../embeddings/economic-manuscripts')

In [19]:
embeddings = OpenAIEmbeddings()
faiss_index = FAISS.load_local('../embeddings/economic-manuscripts', embeddings)

In [23]:
question = 'What is the Economic Manuscript?'

In [24]:
matched_docs = faiss_index.similarity_search(question)

In [35]:
template = """
Please use the following context to answer questions. 
If you can't find the information in the context respond that you don't know.
Context: {context}
---
Question: {question}
Answer: Let's think step by step."""

context = "\n".join([doc.page_content for doc in matched_docs])
prompt = PromptTemplate(template=template, input_variables=["context", "question"]).partial(context=context)

In [39]:
# llm = OpenAI(temperature=0.4, model_name="text-davinci-003", max_tokens=800)

llm = ChatOpenAI(temperature=0.4, model_name='gpt-3.5-turbo', max_tokens=2048, request_timeout=10, max_retries=2)

In [40]:
llm_chain = LLMChain(llm=llm, prompt=prompt)

In [41]:
llm_chain.run(question)

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=10.0).


'The Economic Manuscript refers to the Economic and Philosophic Manuscripts of 1844, which is an unfinished work by Karl Marx. It is a criticism of the bourgeois political economy and the bourgeois economic system. The manuscript is divided into three parts, with the first and earliest being largely of a preparatory nature. The manuscript is published in the sequence in which Marx put them down, and the title and headings were given by the Institute of Marxism-Leninism.'