## Requirements:
1. Add a `.env` file that defines the `OPENAI_API_KEY` variable with a valid OpenAI API key.
2. Add the `knowledge_base.csv` dataset at `../data/knowledge_base.csv`; it must contain a `Text` column.

In [None]:
!pip install openai faiss-cpu numpy pandas python-dotenv langchain-openai 

In [3]:
import os
import numpy as np
import pandas as pd
import faiss
import openai
import logging
from typing import List, Any
from dotenv import load_dotenv
from dataclasses import dataclass, field

# Load environment variables (e.g. OPENAI_API_KEY) from the .env file so that
# os.getenv() can see them when the configuration is built below.
load_dotenv()
# Configure root logging: INFO and above, formatted as "<time> [<LEVEL>] <message>".
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
# Module-level logger for this file.
logger = logging.getLogger(__name__)

def get_env_variable(name: str) -> str:
    """
    Looks up a required environment variable and returns its value.

    The value is returned exactly as stored (it is not stripped).

    Raises:
        EnvironmentError: if the variable is missing, empty, or whitespace-only.
    """
    raw = os.getenv(name)
    # Guard clause: anything with at least one non-whitespace character is valid.
    if raw is not None and raw.strip() != "":
        return raw
    raise EnvironmentError(f"Required environment variable '{name}' is not set.")

@dataclass(frozen=True)
class Config:
    """
    Configuration settings for the RAG pipeline.

    Instances are immutable (frozen dataclass). Every field has a default, so
    ``Config()`` works without arguments as long as ``OPENAI_API_KEY`` is set
    in the environment; otherwise building the default raises EnvironmentError.
    """
    # Secret API key, read from the environment when not passed explicitly.
    # repr=False keeps the key out of repr()/str() so that printing or logging
    # a Config instance does not leak the credential.
    OPENAI_API_KEY: str = field(
        default_factory=lambda: get_env_variable("OPENAI_API_KEY"),
        repr=False,
    )
    # CSV file holding the knowledge base.
    KNOWLEDGE_BASE_PATH: str = "../data/knowledge_base.csv"
    # File the FAISS index is written to and read from.
    FAISS_INDEX_PATH: str = "../data/faiss_index.idx"
    # OpenAI model used to embed texts and queries.
    EMBEDDING_MODEL: str = "text-embedding-ada-002"
    # OpenAI chat model used to generate answers.
    CHAT_MODEL: str = "gpt-3.5-turbo"
    # Sampling temperature passed to the chat model.
    TEMPERATURE: float = 0.1

# Build the shared configuration once at import time; this raises
# EnvironmentError if OPENAI_API_KEY is not set.
CONFIG = Config()
# Set the key on the openai module so module-level calls such as
# openai.embeddings.create() are authenticated.
openai.api_key = CONFIG.OPENAI_API_KEY

class EmbeddingService:
    """
    Thin wrapper around OpenAI's embeddings endpoint.
    """
    def __init__(self, model: str = CONFIG.EMBEDDING_MODEL) -> None:
        # Name of the embedding model sent with every request.
        self.model = model

    def get_embedding(self, text: str) -> np.ndarray:
        """
        Embeds a single piece of text and returns it as a float32 NumPy array.
        """
        # The request carries a one-element batch; only its first result is used.
        batch = [text]
        result = openai.embeddings.create(model=self.model, input=batch)
        first_item = result.data[0]
        return np.array(first_item.embedding, dtype=np.float32)

class DataLoader:
    """
    Reads the knowledge base from CSV and writes DataFrames back to CSV.
    """
    def __init__(self, csv_path: str = CONFIG.KNOWLEDGE_BASE_PATH) -> None:
        # Location of the knowledge-base CSV read by load().
        self.csv_path = csv_path

    def load(self) -> pd.DataFrame:
        """
        Reads the CSV at ``self.csv_path`` into a DataFrame and returns it.
        """
        frame = pd.read_csv(self.csv_path)
        return frame

    def save(self, df: pd.DataFrame, path: str) -> None:
        """
        Writes ``df`` to ``path`` as CSV, without the DataFrame index.
        """
        include_index = False
        df.to_csv(path, index=include_index)

class FaissIndexer:
    """
    Builds and manages a FAISS index for fast text retrieval.

    Vector ``i`` of the index is the embedding of row ``i`` of the DataFrame,
    so search results must be looked up in this same DataFrame.
    """
    def __init__(self, df: pd.DataFrame) -> None:
        self.df = df
        self.embedding_service = EmbeddingService()

    def build_index(self) -> "faiss.IndexFlatIP":
        """
        Constructs a FAISS index from text embeddings and writes it to disk.

        Every value of the "Text" column is embedded, L2-normalised (so the
        inner product used by IndexFlatIP equals cosine similarity), added to
        a new index, and the index is saved to CONFIG.FAISS_INDEX_PATH.

        Raises:
            ValueError: if the knowledge base has no rows.
        """
        if len(self.df) == 0:
            # Without this check the failure surfaces later as an opaque
            # NumPy axis error on the empty array.
            raise ValueError("Cannot build a FAISS index from an empty knowledge base.")

        # Iterate the column directly; row order is preserved so index
        # positions keep matching DataFrame positions.
        embeddings = [self.embedding_service.get_embedding(text) for text in self.df["Text"]]
        # FAISS expects a C-contiguous float32 matrix of shape (n, dim).
        vectors = np.ascontiguousarray(np.array(embeddings), dtype=np.float32)

        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        # A zero vector cannot be normalised; leave it as-is instead of
        # dividing by zero and storing NaNs in the index.
        norms[norms == 0] = 1.0
        vectors = vectors / norms

        index = faiss.IndexFlatIP(vectors.shape[1])
        index.add(vectors)
        faiss.write_index(index, CONFIG.FAISS_INDEX_PATH)
        return index

    def load_index(self) -> "faiss.IndexFlatIP":
        """
        Loads the FAISS index stored at CONFIG.FAISS_INDEX_PATH.
        """
        return faiss.read_index(CONFIG.FAISS_INDEX_PATH)

class Retriever:
    """
    Retrieves the most relevant texts from the FAISS index.

    Index positions returned by a search are used as row positions in ``df``,
    so ``df`` must be the DataFrame the index was built from.
    """
    def __init__(self, df: pd.DataFrame, index: "faiss.IndexFlatIP") -> None:
        self.df = df
        self.index = index
        self.embedding_service = EmbeddingService()

    def search(self, query: str, top_k: int = 3) -> List[str]:
        """
        Searches the FAISS index for relevant documents.

        Args:
            query: Text to look up.
            top_k: Maximum number of documents to return.

        Returns:
            The "Text" values of the best-matching rows, best match first.
            Fewer than ``top_k`` items are returned when the index holds fewer
            vectors; an empty list is returned when ``top_k`` is not positive.
        """
        if top_k <= 0:
            return []

        query_vector = self.embedding_service.get_embedding(query)
        norm = np.linalg.norm(query_vector)
        # Normalise so inner product equals cosine similarity; skip a zero
        # vector rather than dividing by zero and searching with NaNs.
        if norm > 0:
            query_vector = query_vector / norm
        # FAISS expects a C-contiguous float32 matrix of shape (1, dim).
        query_matrix = np.ascontiguousarray(query_vector.reshape(1, -1), dtype=np.float32)

        _scores, indices = self.index.search(query_matrix, top_k)

        row_count = len(self.df)
        # FAISS pads missing results with label -1. The lower bound matters:
        # without it, iloc[-1] would silently return the last row as a hit.
        return [
            self.df.iloc[int(idx)]["Text"]
            for idx in indices[0]
            if 0 <= idx < row_count
        ]

class ResponseGenerator:
    """
    Produces an answer from the chat model, grounded in retrieved documents.
    """
    def __init__(self, model_name: str = CONFIG.CHAT_MODEL, temperature: float = CONFIG.TEMPERATURE) -> None:
        # Chat model name and sampling temperature sent with every request.
        self.model_name = model_name
        self.temperature = temperature
        # Dedicated OpenAI client used for chat completions.
        self.client = openai.Client()

    def generate(self, query: str, relevant_docs: List[str]) -> str:
        """
        Builds a prompt from the documents and the query, sends it to the
        chat completion API, and returns the content of the first choice.
        """
        # One retrieved document per line.
        context = "\n".join(relevant_docs)
        prompt = f"Use the following context to answer the query:\n\nContext: {context}\n\nQuery: {query}"
        logger.debug("Generating response for query: %s", query)

        # The whole prompt (context + query) is sent as a single system message.
        messages = [{"role": "system", "content": prompt}]
        completion = self.client.chat.completions.create(
            model=self.model_name,
            messages=messages,
            temperature=self.temperature,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content

class RAGPipeline:
    """
    Orchestrates the RAG pipeline for text retrieval and response generation.
    """
    def __init__(self) -> None:
        self.data_loader = DataLoader()
        self.df = self.data_loader.load()
        self.indexer = FaissIndexer(self.df)
        self.response_generator = ResponseGenerator()
        # Set by build_or_load_index(); None until the index is available.
        self.index = None

    def build_or_load_index(self) -> None:
        """
        Loads an existing FAISS index or builds a new one if needed.

        A new index is built when loading fails, or when the loaded index
        holds a different number of vectors than the knowledge base has rows
        (index positions would then no longer match DataFrame rows).
        The result is stored in ``self.index``.
        """
        # Same logger the module configures under __name__.
        log = logging.getLogger(__name__)
        try:
            index = self.indexer.load_index()
        except Exception as exc:
            # Deliberate best-effort fallback: any load failure (missing or
            # unreadable file) leads to a rebuild, but is no longer silent.
            log.warning("Could not load FAISS index (%s); building a new one.", exc)
            index = self.indexer.build_index()
        else:
            if index.ntotal != len(self.df):
                log.warning(
                    "FAISS index holds %d vectors but the knowledge base has %d rows; rebuilding.",
                    index.ntotal,
                    len(self.df),
                )
                index = self.indexer.build_index()
        self.index = index

    def run(self, query: str) -> None:
        """
        Executes the full RAG pipeline for a given query and prints the answer.

        The index is loaded (or built) on the first call only and reused on
        subsequent calls.
        """
        # getattr keeps this safe for instances created without __init__.
        if getattr(self, "index", None) is None:
            self.build_or_load_index()
        retriever = Retriever(self.df, self.index)
        relevant_docs = retriever.search(query)
        response = self.response_generator.generate(query, relevant_docs)
        print("Response:", response)

# Example usage: constructing the pipeline loads the knowledge-base CSV.
pipeline = RAGPipeline()
query = "Who invented NNs?"
# run() prints the generated answer to stdout and returns None.
pipeline.run(query)


2025-02-24 14:02:39,379 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-02-24 14:02:40,066 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Response: Warren McCulloch and Walter Pitts invented neural networks.
