In [82]:
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQAWithSourcesChain

In [127]:
chunk_size = 900
chunk_overlap = 0
embedding_model = 'text-embedding-ada-002'
llm_model = 'text-davinci-003'
max_tokens = 256
temperature = 0.7

prompt = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful answer in less than 50 words:"""

In [136]:
loader = TextLoader('../test.txt')
documents = loader.load()

for d in documents:
    d.metadata['url'] = 'https://xyz'

[doc.metadata for doc in documents]

[{'source': '../test.txt', 'url': 'https://xyz'}]

In [137]:
text_splitter = CharacterTextSplitter(chunk_size=chunk_size, 
                                      chunk_overlap=chunk_overlap) 
texts = text_splitter.split_documents(documents)
embeddings = OpenAIEmbeddings(model=embedding_model, 
                              chunk_size=chunk_size,)


Created a chunk of size 2885, which is longer than the specified 900
Created a chunk of size 5284, which is longer than the specified 900


In [138]:
db = Chroma.from_documents(texts, embeddings)

In [139]:
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k":4})
llm = OpenAI(model=llm_model, max_tokens=max_tokens, temperature=temperature)
prompt_template = PromptTemplate(
    template=prompt, input_variables=["context", "question"]
)
chain_type_kwargs = {'prompt': prompt_template}
qa = RetrievalQA.from_chain_type(llm=llm, 
                                 chain_type="stuff", 
                                 retriever=retriever,
                                 chain_type_kwargs=chain_type_kwargs,
                                 return_source_documents=True)

In [140]:
query = "How are you doing?"
qa({'query': query})

{'query': 'How are you doing?',
 'result': "\nI'm doing well. I'm feeling optimistic about 2023 and all the main parts of my life are stable. I'm continuing to focus on my goals of learning, health, and happiness.",
 'source_documents': [Document(page_content='Other than the standard things one does with a toddler, not much has changed on the family side. Just enjoying all of it. We’ve been spending a bit more time with the extended family too as COVID worries wane and hope to do more of it next year.\n\nFriends: I’ve been terrible at keeping in touch with my friends these past couple of years. I use the excuse of not being able to meet in-person combined with having a small child, but that’s a weak excuse. Recently, I’ve been trying to get in touch with some of my friends now that I’m more comfortable with limited in-person meetings. I hope I can do more of it next year.', metadata={'source': '../test.txt', 'url': 'https://xyz'}),
  Document(page_content='Health & Fitness: On the heal