## RAG Pipeline with a Vector Database


## Load ENV

In [117]:
import os
from dotenv import load_dotenv
# Read variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Fail early with a readable message when the key is missing. Assigning the result of
# os.getenv() back into os.environ is unnecessary when the key exists and raises an
# opaque TypeError when it does not, so the value is only checked here.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it in the environment."
    )

## Load Text file for RAG

In [118]:

import os
from langchain_community.document_loaders import TextLoader
# Plain-text source for the pipeline; the path is relative to the notebook's working directory.
SPEECH_PATH = "speech.txt"

loader = TextLoader(SPEECH_PATH)
text_docs = loader.load()

# Print the loaded documents as a quick sanity check.
print(text_docs)

[Document(metadata={'source': 'speech.txt'}, page_content="The Eiffel Tower, located in Paris, France, was completed in 1889 for the World's Fair. It stands approximately 300 metres tall and was originally criticised by many artists and intellectuals for its design. Today, it is one of the most recognised and visited landmarks in the world.Python is a high-level, interpreted programming language known for its readability and versatility. It's widely used in web development, data analysis, machine learning, automation, and scripting. Key features include dynamic typing, a vast standard library, and a large ecosystem of third-party packages.Mangoes are tropical stone fruits enjoyed for their sweet and juicy flesh. India is the largest producer of mangoes in the world. The most popular variety is Alphonso, known for its rich flavour and aroma.In Greek mythology, Athena was the goddess of wisdom, war, and strategy. She was said to have sprung fully grown and armoured from the forehead of Z

## Load Website content using HTML Tag for RAG

In [119]:
# Web-based loader
from langchain_community.document_loaders import WebBaseLoader
import bs4

# Restrict parsing to the page elements selected by this class string, so the
# rest of the page is not turned into document text.
section_filter = bs4.SoupStrainer(class_="py-16 bg-gradient-to-br from-white to-blue-50")

# Load the web page through the filter above.
loader = WebBaseLoader(
    web_paths=("https://ekaivakriti.com",),
    bs_kwargs={"parse_only": section_filter},
)

web_docs = loader.load()

# Print the loaded documents.
print(web_docs)

[Document(metadata={'source': 'https://ekaivakriti.com'}, page_content="\n\n\n\n\n\nWho We Are\nAbout EkaivaKriti\nEkaivaKriti (एकैवकृति,\n                        pronounced /ˈeh-KAI-va-kree-tee/) means a\n                        unique creation in Sanskrit,\n                    a name that captures our purpose of building digital solutions that stand apart.\nWe're a small, flexible team that helps businesses grow with digital\n                    solutions that work. From startups to global enterprises across e‑commerce, solar, healthcare, tech,\n                    and finance, we turn bold ideas into results that matter. No complexity, no fluff, just outcomes\n                    that help your business shine.\n\n\nLet's build something truly\n                            unique, together.\n\n\n\n")]


## Load PDF for RAG

In [120]:
from langchain_community.document_loaders import PyPDFLoader
# PDF source for the pipeline; the path is relative to the notebook's working directory.
PITCH_DECK_PATH = "ekaivakriti_pitch_deck.pdf"

loader = PyPDFLoader(PITCH_DECK_PATH)
pdf_docs = loader.load()

# Leave the list as the cell's last expression so the notebook displays it.
pdf_docs

[Document(metadata={'producer': 'pypdf', 'creator': 'PyPDF', 'creationdate': '', 'source': 'ekaivakriti_pitch_deck.pdf', 'total_pages': 8, 'page': 0, 'page_label': '1'}, page_content='EKekaivakriti\nYour AI Innovation Partner\nTransforming businesses with intelligent agents, automation, and data\ninsights for companies across all industries\n\uf544 AI Agent Development \uf085 Process Automation \uf201 Advanced Data Analytics\nMade with Genspark'),
 Document(metadata={'producer': 'pypdf', 'creator': 'PyPDF', 'creationdate': '', 'source': 'ekaivakriti_pitch_deck.pdf', 'total_pages': 8, 'page': 1, 'page_label': '2'}, page_content='Business Challenges Today\nWhy organizations need AI-powered solutions\n\uf252\nInefficient Business Processes\n● 40% of employee time wasted on manual, repetitive tasks\n● Human errors increase operational costs by 20-30%\n● Slow approval workflows and process bottlenecks\n\uf086\nPoor Customer Experience\n● 76% of customers expect personalized interactions\n● 

## Load JSON for RAG


In [121]:
import requests
import json
from langchain_core.documents import Document
from typing import Dict, Any, List
import uuid

# Helper function to flatten nested dictionaries
def flatten_dict(d: Dict[str, Any], parent_key: str = '', sep: str = '_') -> Dict[str, Any]:
    """Flatten a nested dictionary into a single-level dictionary with concatenated keys.

    Args:
        d: Dictionary to flatten.
        parent_key: Prefix prepended (followed by ``sep``) to every key of ``d``;
            an empty prefix leaves top-level keys unchanged.
        sep: Separator placed between a parent key and a child key.

    Returns:
        A new dictionary with no nested dictionaries. Non-empty lists made up
        entirely of dictionaries are omitted (the caller is expected to handle
        them separately); every other value, including empty lists, is kept as is.
    """
    flat: Dict[str, Any] = {}
    for key, value in d.items():
        new_key = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, dict):
            flat.update(flatten_dict(value, new_key, sep))
        elif isinstance(value, list) and value and all(isinstance(item, dict) for item in value):
            # Skip lists of dictionaries; handle separately.
            # The explicit `value` check matters: all() is True for an empty
            # list, which would otherwise drop keys whose value is [].
            continue
        else:
            flat[new_key] = value
    return flat

# Helper function to create a Document
def create_document(content: str, metadata: Dict[str, Any], default_type: str = "generic") -> Document:
    """Build a LangChain Document from a piece of content and its metadata.

    Args:
        content: Value used as the page content; it is converted with ``str()``
            and cut to at most 10000 characters.
        metadata: Metadata mapping; entries whose value is ``None`` are left out.
        default_type: Accepted for interface compatibility; not read by this function.

    Returns:
        A Document holding the truncated text and the filtered metadata.
    """
    # Truncate to avoid excessive length
    text = str(content)[:10000]

    cleaned_metadata = {}
    for field, value in metadata.items():
        if value is not None:
            cleaned_metadata[field] = value

    return Document(page_content=text, metadata=cleaned_metadata)

# Main processing logic
# Start from an empty list *before* the try block: if the download fails, later cells
# still find `json_docs` defined, and a re-run never reuses documents from a previous run.
json_docs: List[Document] = []

try:
    # Fixed URL for ekaivakriti.com
    url = "https://ekaivakriti.com/ekv_chatbot.json"

    # Download JSON. The timeout keeps the cell from waiting forever on an unresponsive server.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()

    # Save JSON with a unique filename (a new file is written on every run)
    filename = f"website_data_{uuid.uuid4().hex[:8]}.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"JSON saved to {filename}")

    def process_json(obj: Any, parent_key: str = '', depth: int = 0) -> None:
        """Recursively walk a JSON value and append Documents to ``json_docs``.

        Args:
            obj: Current JSON value. Dictionaries and lists are traversed;
                any other value is ignored.
            parent_key: Underscore-joined path of keys leading to ``obj``
                (list items contribute ``item_<index>``); empty for the root.
            depth: Nesting level of ``obj``; passed along on recursion but not
                otherwise read.
        """
        if isinstance(obj, dict):
            # Scalar fields of this dictionary become the metadata; nested
            # dictionaries and lists are excluded and visited recursively below.
            metadata = flatten_dict({k: v for k, v in obj.items() if not isinstance(v, (dict, list))})
            metadata['source'] = url
            # NOTE(review): this takes the last underscore-separated piece of the path,
            # so list items get their index (e.g. '0') and 'company_info' gets 'info'.
            # Confirm whether a more descriptive type is wanted before relying on it.
            metadata['type'] = parent_key.split('_')[-1] if parent_key else 'root'

            # Use the first non-empty string among the known text fields as the content
            content_fields = ['content', 'description', 'quote', 'text', 'body']
            content = next((obj.get(field) for field in content_fields if obj.get(field) and isinstance(obj.get(field), str)), '')

            # Special handling for company_info to create a summary Document
            if parent_key == 'company_info' and not content:
                # `or` fallbacks also cover keys that are present but null in the JSON
                contact_info = obj.get('contact_info') or {}
                industries_served = obj.get('industries_served') or []
                content = (
                    f"{obj.get('name', '')} is based in {obj.get('location', '')}. "
                    f"Tagline: {obj.get('tagline', '')}. "
                    f"Serving industries: {', '.join(industries_served)}. "
                    f"Contact: {contact_info.get('email', '')}, "
                    f"{contact_info.get('phone', '')}, "
                    f"{contact_info.get('address', '')}."
                )

            if content:
                json_docs.append(create_document(content, metadata))

            # Recursively process nested dictionaries and lists
            for key, value in obj.items():
                if isinstance(value, (dict, list)):
                    process_json(value, f"{parent_key}_{key}" if parent_key else key, depth + 1)

        elif isinstance(obj, list):
            # Process each item in the list
            for i, item in enumerate(obj):
                process_json(item, f"{parent_key}_item_{i}", depth + 1)

    # Process the JSON data
    process_json(data)

    # Preview results
    print("Total Documents Loaded:", len(json_docs))
    for doc in json_docs[:20]:
        print("Content:", doc.page_content[:100])
        print("Metadata:", doc.metadata)
        print()

# The JSON handler comes first: the decode error raised by recent versions of
# requests is also a RequestException, so the opposite order would report a
# malformed body as a fetch error.
except json.JSONDecodeError as e:
    print(f"Error decoding JSON: {e}")
except requests.RequestException as e:
    print(f"Error fetching URL: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")

JSON saved to website_data_a6755b5c.json
Total Documents Loaded: 13
Content: EkaivaKriti is based in Gurgaon, India. Tagline: Build, Automate, Grow. Serving industries: solar, h
Metadata: {'name': 'EkaivaKriti', 'location': 'Gurgaon, India', 'tagline': 'Build, Automate, Grow', 'source': 'https://ekaivakriti.com/ekv_chatbot.json', 'type': 'info'}

Content: EkaivaKriti offers AI‑powered solutions, web & mobile development, digital marketing, e‑commerce, an
Metadata: {'url': 'https://ekaivakriti.com/', 'title': 'Home', 'content': 'EkaivaKriti offers AI‑powered solutions, web & mobile development, digital marketing, e‑commerce, and business automation. Based in Gurgaon, we serve ambitious brands worldwide to help them build, automate and grow.', 'source': 'https://ekaivakriti.com/ekv_chatbot.json', 'type': '0'}

Content: EkaivaKriti (एकैवकृति, pronounced /ˈeh‑KAI‑va‑kree‑tee/) means “a unique creation” in Sanskrit. We’r
Metadata: {'url': 'https://ekaivakriti.com/about-us', 'title': 'About 

## Split the document


In [122]:
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk size and overlap handed to the splitter
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Documents produced by the loading cells, keyed by the kind of source they came from
data_sources = dict(
    json=json_docs,
    pdf=pdf_docs,
    web=web_docs,
    text=text_docs,
)

# Convert data to Document format if needed
def make_documents(data, source):
    """Normalise ``data`` into a list of Documents.

    Args:
        data: A single string, a list of strings, or a list of Documents.
        source: Label stored under the ``"source"`` metadata key of Documents
            created here (suffixed with ``_<index>`` for lists of strings).

    Returns:
        ``data`` itself when it is already a list of Documents, a list of newly
        created Documents for string input, or an empty list for anything else
        (including lists that mix types).
    """
    # A single string becomes one Document
    if isinstance(data, str):
        return [Document(page_content=data, metadata={"source": source})]

    # Everything that is neither a string nor a list is treated as invalid
    if not isinstance(data, list):
        return []

    # Already a list of Documents: hand it back untouched
    if all(isinstance(entry, Document) for entry in data):
        return data

    # A list of strings: one Document per string, tagged with its position
    if all(isinstance(entry, str) for entry in data):
        return [
            Document(page_content=text, metadata={"source": f"{source}_{idx}"})
            for idx, text in enumerate(data)
        ]

    return []

# Collect all documents. A comprehension keeps the iteration variables local to the
# expression, so they cannot overwrite notebook-level names such as `data`
# (the JSON payload assigned in the JSON-loading cell).
all_docs = [
    doc
    for source_name, source_data in data_sources.items()
    for doc in make_documents(source_data, source_name)
]

# Split all documents into chunks
chunked_documents = text_splitter.split_documents(all_docs)

# Show results
print("Number of chunks created:", len(chunked_documents))
if chunked_documents:
    print("First chunk (first 200 chars):", chunked_documents[0].page_content[:200])

Number of chunks created: 27
First chunk (first 200 chars): EkaivaKriti is based in Gurgaon, India. Tagline: Build, Automate, Grow. Serving industries: solar, healthcare, finance, e-commerce, education, tourism, enterprise. Contact: info@ekaivakriti.com, +91-1


## Vector Embedding & Vector Store - FAISS



In [123]:
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
# Embed every chunk with the OpenAI embedding model and build a FAISS vector store from them.
embedding_model = OpenAIEmbeddings()
db = FAISS.from_documents(chunked_documents, embedding_model)

# Leave the store as the cell's last expression so the notebook displays it.
db

<langchain_community.vectorstores.faiss.FAISS at 0x14721fe30>

## Query from stored data

In [130]:
# Ask the vector store for the single best match (k=1) and print its text.
query = "consultancy?"
retrieved_results = db.similarity_search(query, k=1)

for match in retrieved_results:
    print(match.page_content)

Our tech consultancy helps businesses with digital strategy, cloud and CTO services. We offer proven frameworks and actionable advice as a trusted partner in growth.
