In [35]:
import logging
import os

import dotenv
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter

In [36]:
# Configurable parameters
data_dir = "./data"
chunk_size = 128     # passed to CharacterTextSplitter.from_tiktoken_encoder
chunk_overlap = 32   # passed to CharacterTextSplitter.from_tiktoken_encoder

# Ensure the data directory exists; exist_ok makes this a single idempotent
# call instead of a separate exists-check followed by a create.
os.makedirs(data_dir, exist_ok=True)

# Keep only regular files (sub-directories cannot be opened as text) and sort
# them, because os.listdir returns entries in arbitrary order and the chunk
# order would otherwise vary between runs.
files = sorted(
    entry for entry in os.listdir(data_dir)
    if os.path.isfile(os.path.join(data_dir, entry))
)
print(f"List of files: {files}")

text_data_from_files = []

if files:
    # The splitter depends only on the parameters above, so build it once
    # rather than once per file. A failure here is a configuration problem
    # and is raised immediately instead of being reported once per file.
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

for file in files:
    try:
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(os.path.join(data_dir, file), "r", encoding="utf-8") as f:
            file_data = f.read()
        split_text = text_splitter.split_text(file_data)
        print("Split text: ", len(split_text))

        # Use the full stem as the title so "notes.v2.md" becomes "notes.v2"
        # instead of being cut at the first dot.
        title = os.path.splitext(file)[0]

        for index, chunked_text in enumerate(split_text):
            text_data_from_files.append(Document(
                page_content=chunked_text,
                metadata={
                    "source": file,
                    # NOTE(review): duplicates page_content inside metadata;
                    # kept so the metadata keys stay unchanged for consumers.
                    "page_content": chunked_text,
                    "title": title,
                    "chunk_index": index,
                },
            ))
    except Exception as e:
        # Best effort: report the failing file and continue with the rest.
        print(f"Error processing file {file}: {e}")

print("Text data from files: ", [doc.page_content for doc in text_data_from_files])

List of files: ['product.md']
Split text:  7
Text data from files:  ['Synthetic Product Data for RAG App\nBelow is a collection of synthetic data representing imaginary products for use in a sample Retrieval-Augmented Generation (RAG) application. Each product includes a name, description, category, price, and unique identifier.\nProduct 1\n\nID: PRD-001\nName: Quantum Widget\nCategory: Electronics\nPrice: $149.99\nDescription: A cutting-edge quantum widget designed to optimize energy flow in smart home devices. Features a sleek design with voice-activated controls and compatibility with all major IoT platforms.\n\nProduct 2', 'Product 2\n\nID: PRD-002\nName: EcoGlow Lamp\nCategory: Home Decor\nPrice: $79.50\nDescription: An eco-friendly lamp that adjusts brightness based on ambient light. Made from recycled materials, it offers a warm, inviting glow and a battery life of up to 48 hours.\n\nProduct 3', 'Product 3\n\nID: PRD-003\nName: HyperFit Tracker\nCategory: Wearables\nPrice: $199.

In [37]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Fail early with a clear message instead of letting index construction fail
# on an empty document list.
if not text_data_from_files:
    raise ValueError("No documents to index: text_data_from_files is empty")

# Name the embedding model explicitly (this is the model the default
# constructor was observed to load) so the index does not change if the
# library default changes.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

# Embed every chunk and build the in-memory FAISS index used for retrieval.
vector_store = FAISS.from_documents(text_data_from_files, embeddings)

  embeddings = HuggingFaceEmbeddings()
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


In [38]:
# Sample question, reused by the later cells that run the full chain.
query = "How much is AeroBrew Coffee Maker?"

# Return only the single closest chunk (default is 4).
search_options = {"k": 1}
retriever = vector_store.as_retriever(search_kwargs=search_options)

# Bare last expression so the notebook displays the retrieved documents.
retriever.invoke(query)

[Document(metadata={'source': 'product.md', 'page_content': 'Product 4\n\nID: PRD-004\nName: AeroBrew Coffee Maker\nCategory: Kitchen Appliances\nPrice: $129.95\nDescription: A compact coffee maker that brews barista-quality coffee in under 2 minutes. Features customizable brew strength and a self-cleaning mechanism for easy maintenance.\n\nProduct 5\n\nID: PRD-005\nName: SkyVault Storage Device\nCategory: Tech Accessories\nPrice: $249.99\nDescription: A high-capacity, cloud-integrated storage device with advanced encryption for secure data management. Supports seamless file sharing across multiple devices.\n\nProduct 6', 'title': 'product', 'chunk_index': 3}, page_content='Product 4\n\nID: PRD-004\nName: AeroBrew Coffee Maker\nCategory: Kitchen Appliances\nPrice: $129.95\nDescription: A compact coffee maker that brews barista-quality coffee in under 2 minutes. Features customizable brew strength and a self-cleaning mechanism for easy maintenance.\n\nProduct 5\n\nID: PRD-005\nName: Sky

In [39]:
def load_environment(env_file=".env"):
    """Load variables from a dotenv file and check the required ones are set.

    Args:
        env_file: Path to the dotenv file. It is resolved to an absolute
            path before use. Defaults to ".env".

    Returns:
        True when the file exists and OPENAI_API_KEY, OPENAI_BASE_URL and
        OPENAI_MODEL all have non-empty values after loading. False when the
        file is missing, when any required variable is unset or empty, or
        when an exception is raised while loading.
    """
    # Resolve the logger locally instead of relying on a notebook-global
    # `logger`: if that global is missing, the NameError would be raised
    # inside the try block below and misreported as a loading failure.
    log = logging.getLogger(__name__)
    required_env_vars = ["OPENAI_API_KEY", "OPENAI_BASE_URL", "OPENAI_MODEL"]

    try:
        env_file = os.path.abspath(env_file)
        if not os.path.exists(env_file):
            print(f"Error: {env_file} does not exist")
            return False

        dotenv.load_dotenv(dotenv_path=env_file)

        # Check every variable before returning so that a single run reports
        # all missing settings rather than only the first one.
        missing = []
        for var in required_env_vars:
            value = os.getenv(var)
            # Only the set/unset status is logged, never the value itself.
            log.info(f"{var}: {'is set' if value else 'is not set'}")
            if not value:
                print(f"Error: {var} is not set")
                missing.append(var)

        return not missing

    except Exception as e:
        print(f"Error loading environment: {e}")
        return False
    

# Load the default .env file and log an error if it could not be loaded or
# a required variable is missing.
environment_loaded = load_environment()
if not environment_loaded:
    logger.error("Failed to load environment variables")

INFO:__main__:OPENAI_API_KEY: is set
INFO:__main__:OPENAI_BASE_URL: is set
INFO:__main__:OPENAI_MODEL: is set


In [40]:
from langchain_openai import ChatOpenAI

# Connection settings come from the process environment; os.getenv yields
# None for any variable that is not set.
openai_settings = {
    "model_name": os.getenv("OPENAI_MODEL"),
    "openai_api_key": os.getenv("OPENAI_API_KEY"),
    "openai_api_base": os.getenv("OPENAI_BASE_URL"),
}

# Chat model used as the generation step of the chain.
llm = ChatOpenAI(**openai_settings)

In [41]:
# Import from langchain_core, which this notebook already depends on, rather
# than through the langchain.prompts compatibility path.
from langchain_core.prompts import ChatPromptTemplate

# Prompt with two placeholders: {context} receives the retrieved text and
# {question} receives the user's query. The wording is unchanged.
template = """ You are a helpful assistant. Use the following context to answer the question. If the question is not related to the context, answer with 'I don't know'. 

Context: {context}

Question: {question}

Answer: """

prompt = ChatPromptTemplate.from_template(template)


In [42]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


def format_docs(docs):
    """Join the text of retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a `page_content` string, such as
            the documents returned by the retriever.

    Returns:
        The `page_content` values separated by blank lines; an empty string
        when `docs` is empty.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Pipe the retriever through format_docs so the prompt receives plain chunk
# text. Passing the retriever output directly would interpolate the string
# form of the Document list, metadata included, into {context}.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

response = chain.invoke(query)
print(f"Response: {response}")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Response: The AeroBrew Coffee Maker is $129.95.
