## RAG Pipeline with Vector Database


## Load ENV

In [None]:
import os
from dotenv import load_dotenv
# Read variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# os.environ only accepts string values, so assigning the result of
# os.getenv() directly raises an opaque TypeError when the key is missing.
# Check explicitly and fail with a message that says what to fix.
openai_api_key = os.getenv("OPENAI_API_KEY")
if openai_api_key is None:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key

## Load Text file for RAG

In [None]:

import os
from langchain_community.document_loaders import TextLoader
# Read the plain-text source file into LangChain documents.
loader = TextLoader(file_path="ekv_content.txt")
text_docs = loader.load()

## Load website content, filtered by HTML element class, for RAG

In [None]:
# Web-based loader
from langchain_community.document_loaders import WebBaseLoader
import bs4

# Load the web page, restricting parsing to the elements selected by the
# SoupStrainer (matched on the class attribute string given below).
loader = WebBaseLoader(
    web_paths=("https://ekaivakriti.com",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_="py-16 bg-gradient-to-br from-white to-blue-50"
        )
    },
)

web_docs = loader.load()

## Load PDF for RAG

In [None]:
from langchain_community.document_loaders import PyPDFLoader
# Read the pitch-deck PDF into LangChain documents.
loader = PyPDFLoader(file_path="ekaivakriti_pitch_deck.pdf")
pdf_docs = loader.load()

## Load JSON for RAG


In [None]:
import requests
import json
from langchain_core.documents import Document
from typing import Dict, Any, List
import uuid

# Helper function to flatten nested dictionaries
def flatten_dict(d: Dict[str, Any], parent_key: str = '', sep: str = '_') -> Dict[str, Any]:
    """Collapse nested dictionaries into one level, joining key paths with ``sep``.

    Args:
        d: Dictionary to flatten.
        parent_key: Prefix prepended to every key of ``d``; empty for no prefix.
        sep: Separator placed between the prefix and each key.

    Returns:
        A new single-level dictionary. Nested dict values are flattened
        recursively under ``<parent_key><sep><key>``. A list whose elements
        are all dictionaries (an empty list included) is left out of the
        result. Every other value is kept unchanged.
    """
    flat: Dict[str, Any] = {}
    for name, val in d.items():
        path = name if not parent_key else f"{parent_key}{sep}{name}"
        if isinstance(val, dict):
            # Merge the flattened child; a later duplicate path overwrites an earlier value.
            flat.update(flatten_dict(val, path, sep))
            continue
        if isinstance(val, list) and all(isinstance(elem, dict) for elem in val):
            # Lists of dictionaries are not flattened here; they are skipped.
            continue
        flat[path] = val
    return flat

# Helper function to create a Document
def create_document(content: str, metadata: Dict[str, Any], default_type: str = "generic") -> Document:
    """Build a LangChain Document from text and a metadata mapping.

    Args:
        content: Text for the document; converted with ``str()`` and cut to
            at most 10000 characters.
        metadata: Metadata mapping; entries whose value is ``None`` are dropped.
        default_type: Accepted for interface compatibility; not used by this
            function.

    Returns:
        A Document holding the truncated text and the filtered metadata.
    """
    text = str(content)[:10000]  # Cap the length of a single document
    kept: Dict[str, Any] = {}
    for key, val in metadata.items():
        if val is None:
            continue
        kept[key] = val
    return Document(page_content=text, metadata=kept)

# Main processing logic
# Fixed URL for ekaivakriti.com
url = "https://ekaivakriti.com/ekv_chatbot.json"

# Bound before any network access so that later cells can still reference
# json_docs (as an empty list) when the download or parsing fails.
json_docs: List[Document] = []


def process_json(obj: Any, parent_key: str = '', depth: int = 0) -> None:
    """Recursively walk parsed JSON and append Documents to ``json_docs``.

    For every dictionary, the scalar fields become the document metadata
    (plus ``source`` and ``type``), and the first non-empty string among the
    ``content``/``description``/``quote``/``text``/``body`` fields becomes the
    document text. Dictionaries without such a field produce no document,
    except ``company_info``, for which a summary sentence is built.

    Args:
        obj: Parsed JSON value; only dicts and lists are processed.
        parent_key: Underscore-joined path of keys leading to ``obj``.
        depth: Recursion depth; passed along but not otherwise used.
    """
    if isinstance(obj, dict):
        # Flatten dictionary for metadata, excluding nested dictionaries or lists
        metadata = flatten_dict({k: v for k, v in obj.items() if not isinstance(v, (dict, list))})
        metadata['source'] = url
        # NOTE(review): for list items parent_key ends in "_item_<i>", so the
        # type recorded here is the item index; confirm that is intended.
        metadata['type'] = parent_key.split('_')[-1] if parent_key else 'root'

        # Create Document for textual content if present
        content_fields = ['content', 'description', 'quote', 'text', 'body']
        content = next(
            (obj[field] for field in content_fields
             if obj.get(field) and isinstance(obj.get(field), str)),
            '',
        )

        # Special handling for company_info to create a summary Document
        if parent_key == 'company_info' and not content:
            # Tolerate a missing or non-dict contact_info and non-string
            # industry entries instead of raising in the middle of the walk.
            contact = obj.get('contact_info')
            if not isinstance(contact, dict):
                contact = {}
            industries = obj.get('industries_served') or []
            content = (
                f"{obj.get('name', '')} is based in {obj.get('location', '')}. "
                f"Tagline: {obj.get('tagline', '')}. "
                f"Serving industries: {', '.join(str(item) for item in industries)}. "
                f"Contact: {contact.get('email', '')}, "
                f"{contact.get('phone', '')}, "
                f"{contact.get('address', '')}."
            )

        if content:
            json_docs.append(create_document(content, metadata))

        # Recursively process nested dictionaries and lists
        for key, value in obj.items():
            if isinstance(value, (dict, list)):
                process_json(value, f"{parent_key}_{key}" if parent_key else key, depth + 1)

    elif isinstance(obj, list):
        # Process each item in the list
        for i, item in enumerate(obj):
            process_json(item, f"{parent_key}_item_{i}", depth + 1)


try:
    # Download JSON; the timeout keeps the cell from hanging on a stalled connection.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()
except (requests.RequestException, ValueError) as exc:
    # RequestException covers connection and HTTP-status errors; ValueError
    # covers a response body that is not valid JSON.
    print(f"Could not download or parse JSON from {url}: {exc}")
else:
    # Save JSON with a unique filename
    filename = f"website_data_{uuid.uuid4().hex[:8]}.json"
    try:
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
    except OSError as exc:
        # A failed local copy should not prevent building the documents.
        print(f"Could not save JSON to {filename}: {exc}")
    else:
        print(f"JSON saved to {filename}")

    # Process JSON into Documents
    process_json(data)
    print(f"Created {len(json_docs)} documents from JSON")

## Split the document


In [None]:
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configured for chunks of up to 1000 characters, with 200 characters
# of overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Documents loaded by the earlier cells, keyed by a short source label
# (swap in your own variables here if they are named differently).
data_sources = dict(
    json=json_docs,
    pdf=pdf_docs,
    web=web_docs,
    text=text_docs,
)

# Convert data to Document format if needed
def make_documents(data, source):
    """Normalize ``data`` into a list of Documents.

    Args:
        data: A list of Documents, a single string, or a list of strings.
        source: Label stored in the ``source`` metadata of created Documents.

    Returns:
        ``data`` itself when it is already a list of Documents (an empty list
        included); one Document for a single string; one Document per string
        for a list of strings, labelled ``<source>_<index>``; otherwise an
        empty list.
    """
    # A lone string becomes exactly one Document
    if isinstance(data, str):
        return [Document(page_content=data, metadata={"source": source})]

    # Anything that is neither a string nor a list is unsupported
    if not isinstance(data, list):
        return []

    # Already Documents: hand the same list back untouched
    if all(isinstance(entry, Document) for entry in data):
        return data

    # Plain strings: wrap each one, tagging it with its position
    if all(isinstance(entry, str) for entry in data):
        wrapped = []
        for idx, piece in enumerate(data):
            wrapped.append(Document(page_content=piece, metadata={"source": f"{source}_{idx}"}))
        return wrapped

    # Mixed or otherwise unsupported list contents
    return []

# Collect all documents
all_docs = []
for source, data in data_sources.items():
    docs = make_documents(data, source)
    if not docs:
        # make_documents returns an empty list for empty or unsupported input;
        # say so here instead of letting a source vanish silently.
        print(f"Warning: no documents collected from source '{source}'")
    all_docs.extend(docs)

# Split all documents into chunks
chunked_documents = text_splitter.split_documents(all_docs)

# Show results
print("Number of chunks created:", len(chunked_documents))
if chunked_documents:
    print("First chunk (first 200 chars):", chunked_documents[0].page_content[:200])

## Vector Embedding & Vector Store - FAISS



In [None]:
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
# Embed every chunk with OpenAI embeddings and build a FAISS index from them.
db = FAISS.from_documents(
    documents=chunked_documents,
    embedding=OpenAIEmbeddings(),
)

## Query from stored data

In [None]:
# Ask a question and print the text of the single closest chunk (k=1).
query = "How soon can we see ROI?"
retrieved_results = db.similarity_search(query=query, k=1)
for result in retrieved_results:
    print(result.page_content)