In [1]:
"""
RESOURCES
https://python.langchain.com/v0.1/docs/use_cases/question_answering/quickstart/
https://python.langchain.com/docs/tutorials/rag/
https://scalexi.medium.com/implementing-a-retrieval-augmented-generation-rag-system-with-openais-api-using-langchain-ab39b60b4d9f
"""

'\nRESOURCES\nhttps://python.langchain.com/v0.1/docs/use_cases/question_answering/quickstart/\nhttps://python.langchain.com/docs/tutorials/rag/\nhttps://scalexi.medium.com/implementing-a-retrieval-augmented-generation-rag-system-with-openais-api-using-langchain-ab39b60b4d9f\n'

In [2]:
## Imports
import os

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [3]:
## Function for reading text files containing information for RAG
def read_txt_files_in_folder(folder_path):
    all_texts = []
    
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.txt'):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                    filtered_content = ''.join([char for char in content if char not in ['**','#','##','###']])
                    all_texts.append(filtered_content)
    
    return all_texts

In [4]:
## Reading the text files
text = read_txt_files_in_folder('data/')
print(len(text))
print(len(text[0])) 

4
5676


`RecursiveCharacterTextSplitter` recursively splits the text data using common separators until each chunk is a certain `chunk_size` 

In [5]:
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n","\n", " "], ## CHECK: Separators not making a difference?
    chunk_size=1000, 
    chunk_overlap=200, 
    add_start_index=True
)

In [None]:
## Converting text data into documents
docs = text_splitter.create_documents(text)
vectorstore = Chroma.from_documents(documents=docs, embedding=OpenAIEmbeddings())

retriever = vectorstore.as_retriever()

#prompt = hub.pull("rlm/rag-prompt")
## Based on hub.pull("rlm/rag-prompt") # Do not include single-quotation marks or double-quotation marks in the answer.
template = """Use the following pieces of context to answer the questions related to interior design. Please respond without using double-quotation marks. 
If the question is not related to interior design, politely say that your are an assistant helping with interior design and tell the user to ask relavant questions, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
{context}

Question: {question}

Helpful Answer:"""

custom_rag_prompt = PromptTemplate.from_template(template)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)