## Basic Setup, API Keys, and Libraries

In [1]:
%%capture
!pip install langchain langchain_core langchain_community faiss-cpu openai==1.56.2 langchain_openai langchain-huggingface cohere langchain_cohere

In [2]:
import os
from google.colab import userdata

# Set environment variables
os.environ["OPENAI_API_KEY"] = userdata.get('OPEN_AI_KEY')
HF_TOKEN  = userdata.get('HF_TOKEN')
os.environ["COHERE_API_KEY"] = userdata.get('COHERE_KEY')

In [3]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# LangChain imports
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from pprint import pprint
from pathlib import Path
from typing import List, Dict
import re
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from urllib.parse import urljoin



## Configuration Dictionary

In [5]:
defaultConfig = {
    # Document processing settings
    "chunkSize": 500,
    "chunkOverlap": 50,
    "userAgentHeader": "YourCompany-ResearchBot/1.0 (your@email.com)",

    # embedding model
    "embeddingModel": "BAAI/bge-base-en-v1.5",

    # Vector store settings
    "numRetrievedDocuments": 12,

    # Document formater settings
    "numSelectedDocuments": 5,

    # Model settings
    "ragAnswerModel": "gpt-4o",
    "ragAnswerModelTemeprature": 0.7,

    #Reranker setting
    "rerankerModel": "rerank-english-v3.0",
    "numRerankedDocuments": 5,

   # URLs to process
    "companyFilingUrls": [
        ("Tesla", "https://www.sec.gov/Archives/edgar/data/1318605/000162828024002390/tsla-20231231.htm"),
        ("General Motors", "https://www.sec.gov/Archives/edgar/data/1467858/000146785824000031/gm-20231231.htm")
    ],

       # RAG prompt template
    "ragPromptTemplate": """
    Give an answer for the `Question` using only the given `Context`. Use information relevant to the query from the entire context.
    Provide a detailed answer with thorough explanations, avoiding summaries.

    Question: {question}

    Context: {context}

    Answer:
    """,

    # Decomposer settings
    "queryDecomposerModel": "gpt-4o-mini",
    "queryDecomposerModelTemperature": 0.8,

    # SubQuery prompt template
    "subqueryPromptTemplate": """
    Break down the `Question` into multiple sub-queries. Use the guidelines given below to help in the task.

    1. The set of sub-queries together capture the complete information needed to answer the question.
    2. Each sub-query should ask for just one piece of information about one specific company.
    3. For each sub-query, only mention the information you're trying to get. Don't use verbs like "retrieve" or "find".
    4. Include the company name mentioned in each sub-query.
    5. Do not include any references to data sources in your sub-queries.

    Enclose the sub-query in angle brackets. For example:
    <sub-query 1>
    <sub-query 2>

    Question: {question}

    Begin:
    """
}

In [6]:
config = defaultConfig.copy() # Creates a separate copy of the default configuration dictionary (defaultConfig) so that any subsequent changes won't alter the original default settings.

##  Embeddings Model

In [7]:
embeddingFunction = HuggingFaceEmbeddings(
    model_name= config.get('embeddingModel','BAAI/bge-base-en-v1.5' ),
    model_kwargs={"device": "cuda"},
    encode_kwargs={
        "normalize_embeddings": True,
        "query_instruction": "Represent this question for searching relevant passages: "
    }
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Load  vector store

In [8]:
# Set up paths
gdrive_root = Path('/content/drive/My Drive')
faiss_dir = gdrive_root/"content/drive/My Drive"
# Ensure the directory exists
faiss_dir.mkdir(parents=True, exist_ok=True)

In [9]:
# Load the vectorstore from Google Drive
from pathlib import Path
loaded_vectorstore = FAISS.load_local(str(faiss_dir), embeddingFunction, allow_dangerous_deserialization=True)
print("Vector store loaded successfully from Google Drive")

Vector store loaded successfully from Google Drive


##Query Decomposition (Sub-query Generation)

In [10]:
def decompose_query(question: str, config: Dict) -> List[str]:
    print(f"\n🧠 Main Question:\n{question}")

    llm = ChatOpenAI(model=config["queryDecomposerModel"], temperature=config["queryDecomposerModelTemperature"])
    prompt = PromptTemplate.from_template(config["subqueryPromptTemplate"])
    chain = prompt | llm | StrOutputParser()

    response = chain.invoke(question)
    subqueries = re.findall(r"<(.*?)>", response, re.DOTALL)

    print("\n🔍 Sub-Queries Decomposed:")
    for i, sq in enumerate(subqueries):
        print(f"  {i+1}. {sq.strip()}")

    return subqueries


## Create Reranker with Contextual Compression

In [11]:
def create_reranker(config: Dict, base_retriever):
    compressor = CohereRerank(model=config["rerankerModel"], top_n=config["numRerankedDocuments"])
    return ContextualCompressionRetriever(base_compressor=compressor, base_retriever=base_retriever)

## Format Documents for Context

In [12]:
def format_docs(docs: List[Document], limit: int = 5) -> str:
    def _get_prefix(doc):
        return doc.metadata.get("company", "")

    return "\n\n".join(f"{_get_prefix(doc)}\n{doc.page_content}" for doc in docs[:limit])

## RAG Retrieval Logic with Reranking and Sub-querying

In [24]:
def retrieve_context(question: str, config: Dict, retriever, reranker=None, decomposer=False) -> str:
    context = ""

    if decomposer:
        subqueries = decompose_query(question, config)
        print("\n⚙️ Running retrieval + reranker per sub-query...\n")

        for i, subq in enumerate(subqueries):
            print(f"\n🔹 Sub-query {i+1}: {subq}")

            # 🔍 Step 1: Determine target company from sub-query
            company = None
            if "Tesla" in subq:
                company = "Tesla"
            elif "GM" in subq or "General Motors" in subq:
                company = "General Motors"

            search_filter = {"company": company} if company else {}
            print(f"🔍 Applying metadata filter: {search_filter}")

            # 🔎 Step 2: Perform metadata-filtered retrieval
            raw_docs = retriever.invoke(subq, search_kwargs={"filter": search_filter})
            print(f"📥 RAG retrieved {len(raw_docs)} documents for sub-query {i+1}. Preview:")
            for j, doc in enumerate(raw_docs[:12]):
                print(f"   RAG → Doc {j+1} (company={doc.metadata.get('company', 'N/A')}): {doc.page_content[:120]}...\n")

            # ✅ Step 3: Filter docs by metadata before reranker
            filtered_docs = [doc for doc in raw_docs if doc.metadata.get("company") == company]

            # 🔁 Step 4: Rerank (with filtered input)
            if reranker:
                docs = reranker.base_compressor.compress_documents(filtered_docs, query=subq)
                print(f"🧠 Reranker selected {len(docs)} top documents.")
                for j, doc in enumerate(docs[:10]):
                    print(f"   RRK → Doc {j+1} (company={doc.metadata.get('company', 'N/A')}): {doc.page_content[:120]}...\n")
            else:
                docs = filtered_docs

            # 📚 Step 5: Add to context
            context += format_docs(docs, config["numSelectedDocuments"]) + "\n\n"

    else:
        print("\n⚙️ Running single-shot RAG without decomposition\n")
        docs = retriever.invoke(question)
        print(f"📥 Retrieved {len(docs)} documents for full query.")
        for j, doc in enumerate(docs[:3]):
            print(f"   Doc {j+1} (company={doc.metadata.get('company', 'N/A')}): {doc.page_content[:120]}...\n")

        context += format_docs(docs, config["numSelectedDocuments"])

    return context


## Run Final RAG Answer with OpenAI

In [25]:
retriever = loaded_vectorstore.as_retriever(
    search_kwargs={"k": config["numRetrievedDocuments"]}
)

In [26]:
def answer_with_rag(question: str, config: Dict, retriever, reranker=None, decomposer=False) -> str:
    context = retrieve_context(question, config, retriever, reranker, decomposer)

    prompt = PromptTemplate.from_template(config["ragPromptTemplate"])
    rendered_prompt = prompt.format(question=question, context=context)

    print("\n🧾 Prompt Sent to OpenAI:")
    print("=" * 100)
    print(rendered_prompt)
    print("=" * 100)

    llm = ChatOpenAI(model=config["ragAnswerModel"], temperature=config["ragAnswerModelTemeprature"])
    chain = {
        "context": lambda x: context,
        "question": RunnablePassthrough()
    } | prompt | llm | StrOutputParser()

    final_answer = chain.invoke(question)

    print("\n✅ Final Answer from OpenAI:\n")
    return final_answer


In [27]:
query = "How do Tesla and GM's approaches to manufacturing and production compare, particularly for electric vehicles? Where are their vehicles produced? What are the safety standards followed in their vehicles?"

reranker = create_reranker(config, retriever)
response = answer_with_rag(query, config, retriever, reranker=reranker, decomposer=True)

print(response)



🧠 Main Question:
How do Tesla and GM's approaches to manufacturing and production compare, particularly for electric vehicles? Where are their vehicles produced? What are the safety standards followed in their vehicles?

🔍 Sub-Queries Decomposed:
  1. Tesla's approach to manufacturing and production for electric vehicles
  2. GM's approach to manufacturing and production for electric vehicles
  3. Tesla's vehicle production locations
  4. GM's vehicle production locations
  5. Tesla's safety standards for vehicles
  6. GM's safety standards for vehicles

⚙️ Running retrieval + reranker per sub-query...


🔹 Sub-query 1: Tesla's approach to manufacturing and production for electric vehicles
🔍 Applying metadata filter: {'company': 'Tesla'}
📥 RAG retrieved 12 documents for sub-query 1. Preview:
   RAG → Doc 1 (company=Tesla): of our initial Model 3 manufacturing processes. In addition, we may introduce in the future new or unique manufacturing ...

   RAG → Doc 2 (company=General Motors):