In [2]:
from dotenv import load_dotenv
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain.chat_models import init_chat_model
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec

import os

load_dotenv()

True

In [6]:
# Pinecone vector store setup: make sure the index exists, then wrap it for LangChain.

pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

index_name = "aayushbot"
serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")

# Create the index only on the first run; later runs reuse the existing one.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=1536,  # presumably the output size of the embedding model below - confirm
        metric="cosine",
        spec=serverless_spec,
    )

index = pc.Index(index_name)

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# All documents are stored under a single namespace of the index.
vector_store = PineconeVectorStore(
    index=index,
    embedding=embeddings,
    namespace="aayush-docs",
)


In [9]:
'''
A hash is a fingerprint of a file's contents.
If the content changes by even one character, the hash changes completely.
If the file is unchanged, the hash is always the same.

'''
import hashlib
import os
import json

# JSON file (relative to the working directory) mapping each indexed filename
# to the content hash that was recorded for it.
INDEXED_FILE = "indexed_docs.json"

def get_file_hash(path, chunk_size=1 << 20):
    """Return the hex MD5 digest of the file at ``path``.

    The digest is used only as a change-detection fingerprint, not for
    security.

    Args:
        path: Location of the file to hash.
        chunk_size: Number of bytes read per iteration, so large files are
            never loaded into memory in one piece.

    Returns:
        The 32-character lowercase hexadecimal MD5 digest of the file's bytes.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.md5()
    # The context manager closes the handle deterministically instead of
    # leaving it open until garbage collection.
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def is_already_indexed(filename, file_hash):
    """Check whether ``filename`` was last indexed with exactly ``file_hash``.

    Args:
        filename: Key under which the file is tracked in ``INDEXED_FILE``.
        file_hash: Hash of the file's current contents.

    Returns:
        True when the tracking file exists and stores ``file_hash`` for
        ``filename``; False when the tracking file is missing, the filename is
        not recorded, or the recorded hash differs.
    """
    try:
        # Open directly rather than checking existence first: this avoids a
        # check-then-open race, and the handle is always closed.
        with open(INDEXED_FILE, encoding="utf-8") as fh:
            db = json.load(fh)
    except FileNotFoundError:
        # No tracking file yet, so nothing has been indexed.
        return False
    return db.get(filename) == file_hash

def is_pinecone_empty(vector_store):
    """Report whether the index behind ``vector_store`` holds no vectors.

    The index is reached through the store's private ``_index`` attribute, and
    the ``total_vector_count`` entry of its stats is compared with zero.

    Returns:
        True when ``total_vector_count`` equals 0, otherwise False.
    """
    index_stats = vector_store._index.describe_index_stats()
    vector_total = index_stats["total_vector_count"]
    return vector_total == 0


def mark_indexed(filename, file_hash):
    """Record ``file_hash`` as the indexed hash for ``filename``.

    Loads the existing mapping from ``INDEXED_FILE`` (or starts an empty one
    when the file does not exist), sets ``filename -> file_hash`` and writes
    the whole mapping back as JSON.

    Args:
        filename: Key under which the file is tracked.
        file_hash: Hash of the contents that were just indexed.
    """
    try:
        with open(INDEXED_FILE, encoding="utf-8") as fh:
            db = json.load(fh)
    except FileNotFoundError:
        db = {}

    db[filename] = file_hash

    # Close the write handle explicitly so the JSON is flushed to disk before
    # anything reads the tracking file again.
    with open(INDEXED_FILE, "w", encoding="utf-8") as fh:
        json.dump(db, fh)

# Index the markdown knowledge base once, using its content hash to skip
# re-runs on an unchanged file.
filename = "test.md"
file_hash = get_file_hash(filename)

if is_already_indexed(filename, file_hash):
    # Same hash as the one recorded earlier: nothing to do.
    print("already indexed, skipping")
else:
    # Split on level-2 / level-3 headers. The labels given here ("section",
    # "subsection") are the metadata keys that rag_tool later filters on.
    headers_to_split_on = [
        ("##", "section"),
        ("###", "subsection"),
    ]
    splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    # Read explicitly as UTF-8: section names contain non-ASCII characters
    # (e.g. the em dash in "Recruiter FAQ — AI & Applied AI Experience"), and
    # the platform default encoding could garble them so that metadata filters
    # never match.
    with open(filename, "r", encoding="utf-8") as f:
        md_text = f.read()

    chunks = splitter.split_text(md_text)
    if is_pinecone_empty(vector_store):
        vector_store.add_documents(chunks)
        mark_indexed(filename, file_hash)
        print(f"indexed {len(chunks)} chunks")
    else:
        # NOTE(review): when the file has changed but the index already holds
        # vectors, the new content is NOT upserted, yet its hash is recorded,
        # so later runs skip it as well. Re-indexing changed content would
        # need the old vectors replaced first; confirm whether that is wanted.
        mark_indexed(filename, file_hash)  # sync the hash so future runs skip correctly
        print(f"Pinecone already has data ({index.describe_index_stats()['total_vector_count']} vectors), skipping upsert")



already indexed, skipping


In [10]:

from typing import Literal, Optional, List
from langchain_openai import ChatOpenAI
# from langchain.prompts import PromptTemplate
from pydantic import BaseModel

# query = "what did aayushmaan study at unsw?"

# Section names the routing model is allowed to pick from.
_SectionName = Literal[
    "Projects",
    "Technical Skills",
    "Work Experience",
    "Education",
    "Frequently Asked Questions (for RAG)",
    "Recruiter FAQ — AI & Applied AI Experience",
    "Life Journey (Timeline)",
    "Family",
    "Sports Achievements",
]


class Filter(BaseModel):
    # Structured routing decision produced by the filter model.
    # sections: one or more of the allowed section names above.
    sections: List[_SectionName]
    # subsection: free-form and optional; left as None unless the query is specific.
    subsection: Optional[str] = None

# General chat model that drives the agent.
llm = init_chat_model("gpt-4o")

# Routing model whose output is constrained to the Filter schema.
filter_llm = init_chat_model("o3-mini").with_structured_output(Filter)


def get_filter(query):
    """Ask the routing model which knowledge-base sections fit ``query``.

    The query is embedded in a fixed routing prompt that lists the available
    sections, and the prompt is sent to ``filter_llm``.

    Args:
        query: The user's question, inserted verbatim into the prompt.

    Returns:
        Whatever ``filter_llm.invoke`` returns for the prompt.
    """
    routing_prompt = f"""
You are a routing assistant for a personal knowledge base about Aayushmaan Hooda.

Given this query: "{query}"

Pick one or more relevant sections. Pick subsection only if the query is very specific.

Available sections:
- Projects
- Technical Skills
- Work Experience
- Education
- Frequently Asked Questions (for RAG)
- Recruiter FAQ — AI & Applied AI Experience
- Life Journey (Timeline)
- Family
- Sports Achievements
"""
    return filter_llm.invoke(routing_prompt)

# result = get_filter(query)

# search_filter = {"section": {"$in": result.sections}}
# if result.subsection:
#     search_filter["subsection"] = result.subsection

# retriever = vector_store.as_retriever(
#     search_kwargs={"k": 3, "filter": search_filter}
# )

# retriever.invoke(query)

In [16]:
from langchain.tools import tool
from langchain_tavily import TavilySearch
from datetime import datetime

# Shared Tavily search client used by web_search_tool; configured with max_results=2.
web_search = TavilySearch(max_results=2)

@tool(response_format="content_and_artifact")
def rag_tool(query: str):
    """Retrieve information about Aayushmaan from his personal knowledge base.

    Use this tool for any question about Aayushmaan himself, such as his
    projects, technical skills, work experience, education, family, sports
    achievements or life journey.
    Input: query (str) - the user's question.
    """
    # Ask the routing model which sections (and optionally which subsection)
    # the query is about.
    result = get_filter(query)

    # Build the section-level filter. An empty section list gives the filter
    # nothing to match on, so the search falls back to being unfiltered.
    section_filter = {}
    if result.sections:
        section_filter["section"] = {"$in": result.sections}

    search_filter = dict(section_filter)
    if result.subsection:
        search_filter["subsection"] = result.subsection

    retrieved_docs = vector_store.similarity_search(
        query,
        k=3,
        filter=search_filter or None,
    )

    # The subsection is free-form text guessed by the routing model, and the
    # filter matches it exactly; if that guess matches nothing, retry with the
    # section-level filter instead of returning an empty answer.
    if not retrieved_docs and result.subsection:
        retrieved_docs = vector_store.similarity_search(
            query,
            k=3,
            filter=section_filter or None,
        )

    serialized = "\n\n".join(
        f"Source: {doc.metadata}\nContent: {doc.page_content}"
        for doc in retrieved_docs
    )
    # With response_format="content_and_artifact" the first element becomes the
    # tool message text and the second is attached as the artifact; without it
    # the whole tuple would be stringified into the message.
    return serialized, retrieved_docs


@tool
def web_search_tool(query: str) -> str:
    """
    Use this tool to search the web when the user asks about current events,
    news, or things not in the local knowledge.
    Input: query (str) - the search term.
    Output: short text with the title, url and content snippet of each result.
    """
    res = web_search.invoke({"query": query})

    # A non-dict response (presumably an error message from the search client;
    # confirm) is passed through as text rather than raising.
    if not isinstance(res, dict):
        return str(res)

    results = res.get("results")
    if not results:
        return "No web results found."

    # NOTE(review): assumes each result is a dict carrying "title", "url" and
    # "content"; missing keys fall back to an empty string. Confirm against the
    # Tavily response format.
    return "\n\n".join(
        f"Title: {item.get('title', '')}\n"
        f"URL: {item.get('url', '')}\n"
        f"Content: {item.get('content', '')}"
        for item in results
    )

@tool
def age_calculator() -> str:
    """
    Use when age is asked
    Calculate age from a hardcoded date of birth (30 August 1999).
    Returns the current age as an integer of aayushmaan.
    """
    # The docstring above is left verbatim; presumably @tool uses it as the
    # description shown to the model.
    birth = datetime(1999, 8, 30)
    now = datetime.now()

    # True (counted as 1) while this year's birthday is still ahead, which
    # takes one year off the plain difference of calendar years.
    birthday_pending = (now.month, now.day) < (birth.month, birth.day)
    age = now.year - birth.year - int(birthday_pending)

    return f"Current age: {age} years old (DOB: 30 August 1999)"

@tool
def calendar_tool(query: str) -> str:
    """
    Use this tool for any date/time/calendar related questions.
    Provides current date, time, day of week, and can calculate
    days between dates or upcoming events.
    """
    # `query` is part of the tool signature but is not consulted; the function
    # only reports the current local date and time.
    now = datetime.now()
    date_text = now.strftime("%A, %d %B %Y")
    time_text = now.strftime("%I:%M %p")
    weekday_text = now.strftime("%A")
    week_text = now.strftime("%W")

    return f"""
        Current date: {date_text}
        Current time: {time_text}
        Day of week: {weekday_text}
        Week number: {week_text}
        """

In [None]:
from langchain.agents import create_agent
from app.prompts import system_prompt

# Tools the agent is allowed to call.
tools = [
    rag_tool,
    web_search_tool,
    age_calculator,
    calendar_tool,
]

# System instructions come from the project's prompt module; assign custom
# text to `prompt` here if desired.
prompt = system_prompt

agent = create_agent(model=llm, tools=tools, system_prompt=prompt)

In [18]:
# Sample user question sent to the agent.
query = "what is dat etoday?"

agent_input = {"messages": [{"role": "user", "content": query}]}

# Each streamed event carries the message list; print the newest message of
# every event as it arrives.
for event in agent.stream(agent_input, stream_mode="values"):
    latest_message = event["messages"][-1]
    latest_message.pretty_print()


what is dat etoday?
Tool Calls:
  calendar_tool (call_uFzQ8OuUjZ7smokNmIumP76v)
 Call ID: call_uFzQ8OuUjZ7smokNmIumP76v
  Args:
    query: what is the date today?
Name: calendar_tool


        Current date: Saturday, 21 February 2026
        Current time: 10:45 AM
        Day of week: Saturday
        Week number: 07
        

Today is Saturday, 21st February 2026.


In [None]:
# # Identity / basic
# q1 = "where is aayushmaan from?"
# q2 = "what is aayushmaan's email and github?"

# # Work experience
# q3 = "what did aayushmaan do at annalect?"
# q4 = "tell me about the stoik internship"

# # Projects
# q5 = "tell me about the voice rag project"
# q6 = "what is the leave manager mcp server?"

# # Skills
# q7 = "what databases does aayushmaan use?"

# # Education
# q8 = "what did aayushmaan study at unsw?"

# # Recruiter style
# q9 = "can aayushmaan build end to end AI applications?"
# q10 = "what makes aayushmaan different from other AI engineers?"