In [1]:
import os
import json
import re
import uuid
from typing import List, Dict

In [2]:
from openai import OpenAI

import gradio as gr

from gradio.components import ChatMessage


In [3]:
from pypdf import PdfReader

In [4]:
from pathlib import Path

In [5]:
# Directory that holds the source PDFs. It can be overridden with the
# INTRODUCEME_KNOWLEDGE_DIR environment variable so the notebook is not tied to
# a single machine; the original local Windows path remains the default.
KNOWLEDGE_DIR = Path(
    os.environ.get(
        "INTRODUCEME_KNOWLEDGE_DIR",
        "D:\\SCRIPTING\\AGENTS\\No_Framework\\Project1_IntroduceMe\\Knewledge_Sources",
    )
)
# JSON file (relative to the working directory) that receives the generated chunks.
OUTPUT_FILE = Path("chunks.json")

In [6]:
# Canonical section name -> lower-case heading keywords that introduce it.
# Insertion order matters: detect_section returns the first section whose
# keyword matches, scanning in this order.
SECTION_HEADERS = dict(
    summary=["summary", "about"],
    experience=["experience", "work experience", "professional experience"],
    education=["education"],
    skills=["skills", "technologies", "technical skills"],
    projects=["projects"],
)

In [7]:
def extract_text_from_pdf(pdf_path: Path) -> str:
    """Return the text of every page of a PDF, joined with newlines.

    Pages for which ``extract_text()`` yields a falsy value (``None`` or an
    empty string) are left out of the result.
    """
    document = PdfReader(pdf_path)
    return "\n".join(
        page_text
        for page in document.pages
        if (page_text := page.extract_text())
    )

In [8]:
def normalize_text(text: str) -> str:
    """Trim whitespace around every line and drop lines that end up empty."""
    trimmed = map(str.strip, text.splitlines())
    # filter(None, ...) keeps only the non-empty strings.
    return "\n".join(filter(None, trimmed))

In [9]:
def detect_section(line: str) -> str | None:
    lower = line.lower()
    for section, keywords in SECTION_HEADERS.items():
        for kw in keywords:
            if lower == kw or lower.startswith(kw):
                return section
    return None

In [10]:
def split_into_sections(text: str) -> dict:
    """Group the lines of ``text`` under the most recent section heading.

    Lines that appear before any heading are collected under ``"other"``.
    Heading lines themselves are not stored. A section that occurs more than
    once keeps accumulating into the same list.
    """
    active = "other"
    grouped = {active: []}

    for raw_line in text.split("\n"):
        heading = detect_section(raw_line)
        if not heading:
            grouped[active].append(raw_line)
            continue
        # Switch to the detected section, creating its bucket on first sight.
        active = heading
        if active not in grouped:
            grouped[active] = []

    return grouped

In [11]:
# Heuristic marker for the first line of a new role: the word "at", an "@"
# sign, or a four-digit year. "@" is matched without word boundaries because
# "\b@\b" can only match when "@" sits between two word characters (as in an
# e-mail address) and therefore never fired for "Engineer @ Company".
_ROLE_HEADER_RE = re.compile(r"\bat\b|@|\b\d{4}\b", re.IGNORECASE)


def chunk_experience(lines: list[str], source: str) -> list[dict]:
    """Split the experience section into one chunk per detected role.

    A new chunk starts at every line that matches the role-header heuristic,
    provided some lines have already been collected; the matching line becomes
    the first line of the next chunk.

    Args:
        lines: Lines of the experience section, in document order.
        source: Identifier of the originating document, stored on each chunk.

    Returns:
        Chunks built with ``build_chunk`` using section ``"experience"``.
        An empty ``lines`` list yields an empty result.
    """
    chunks = []
    current_block = []

    for line in lines:
        if current_block and _ROLE_HEADER_RE.search(line):
            chunks.append(build_chunk(source, "experience", current_block))
            current_block = []

        current_block.append(line)

    # Flush whatever is left after the last header.
    if current_block:
        chunks.append(build_chunk(source, "experience", current_block))

    return chunks

In [12]:
def build_chunk(source: str, section: str, lines: list[str]) -> dict:
    """Assemble one chunk record with a fresh random UUID as its id.

    The chunk text is the given lines joined with single spaces.
    """
    chunk_id = str(uuid.uuid4())
    joined_text = " ".join(lines)
    return dict(id=chunk_id, source=source, section=section, text=joined_text)

In [13]:
def chunk_sections(sections: dict, source: str) -> list[dict]:
    """Turn a section -> lines mapping into a flat list of chunks.

    Empty sections are skipped. The "experience" section is split further by
    ``chunk_experience``; every other section becomes exactly one chunk.
    """
    collected = []

    for name, body in sections.items():
        if body:
            if name != "experience":
                collected.append(build_chunk(source, name, body))
            else:
                collected.extend(chunk_experience(body, source))

    return collected

In [14]:
# Build the chunk file from every PDF in KNOWLEDGE_DIR.
if not KNOWLEDGE_DIR.is_dir():
    # Fail before touching OUTPUT_FILE: globbing a missing directory yields
    # nothing, which would silently overwrite an existing chunk file with [].
    raise FileNotFoundError(f"Knowledge directory not found: {KNOWLEDGE_DIR}")

all_chunks = []

# Sorted so the chunk order (and the CHUNK numbers shown in the prompt) is the
# same on every run instead of depending on filesystem listing order.
for pdf_file in sorted(KNOWLEDGE_DIR.glob("*.pdf")):
    print(f"Processing {pdf_file.name}")
    raw_text = extract_text_from_pdf(pdf_file)
    normalized = normalize_text(raw_text)
    sections = split_into_sections(normalized)
    chunks = chunk_sections(sections, source=pdf_file.stem)
    all_chunks.extend(chunks)

with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_chunks, f, indent=2)

print(f"Created {len(all_chunks)} chunks → {OUTPUT_FILE}")

Processing Ankit_Tripathi_AI_Engineer_Resume.pdf
Processing Ankit_Tripathi_Test_Manager_Resume.pdf
Processing Profile.pdf
Created 21 chunks → chunks.json


In [15]:
# System prompt that defines the assistant's persona. It is passed to call_llm
# as the first system message; the text covers grounding rules (answer only
# from the provided context), reasoning rules, tone, and topic boundaries.
PERSONA_PROMPT = '''
You are an AI assistant representing Ankit Tripathi, a Test Manager with hands-on experience in quality engineering and a strong interest in AI/ML. You represent this person in a professional, public-facing context.

You answer questions as Ankit would in a professional setting.

Knowledge usage

You are provided with personal knowledge extracted from documents such as LinkedIn profiles, resumes, and personal notes.

When stating facts about Ankit’s experience, skills, or background, use only the provided context.

If the context is insufficient or unclear, explicitly say so.

Do not invent experience, achievements, or opinions.

Reasoning rules

Base answers on evidence from the provided knowledge.

You may summarize or paraphrase, but do not add new factual claims.

You may extrapolate only for opinion-based or hypothetical questions, and only when the extrapolation is consistent with stated principles or past behavior.

Clearly signal uncertainty when extrapolating.

Communication style

Clear, structured, and professional

Concise by default; expand only when helpful

Neutral, factual tone

Avoid marketing language and exaggeration

Boundaries

Do not provide legal, medical, or financial advice.

Do not speculate about private, sensitive, or unverified matters.

Do not present assumptions as facts.
'''

In [16]:
def load_chunks(json_path: str) -> List[Dict]:
    """
    Load document chunks from a JSON file and validate their structure.

    Args:
        json_path (str): Path to chunk JSON file

    Returns:
        List[Dict]: List of chunk objects with id, source, section, text

    Raises:
        ValueError: If the file's top-level value is not a list, if an entry
            is not a JSON object, or if an entry lacks one of the required keys.
    """

    with open(json_path, "r", encoding="utf-8") as f:
        chunks = json.load(f)

    # A JSON object or scalar at the top level would otherwise fail later with
    # an unrelated AttributeError/TypeError instead of a clear validation error.
    if not isinstance(chunks, list):
        raise ValueError(
            f"Expected a JSON list of chunks, got {type(chunks).__name__}"
        )

    required_keys = {"id", "source", "section", "text"}
    for index, chunk in enumerate(chunks):
        if not isinstance(chunk, dict):
            raise ValueError(f"Chunk at index {index} is not an object: {chunk!r}")
        missing = required_keys - chunk.keys()
        if missing:
            raise ValueError(
                f"Chunk missing required keys {sorted(missing)}: {chunk}"
            )

    return chunks


In [17]:
def extract_chunk_texts(chunks: List[Dict]) -> List[str]:
    """
    Collect the "text" field of every chunk, preserving chunk order.

    Args:
        chunks (List[Dict]): Chunk objects; each must contain a "text" key.

    Returns:
        List[str]: The text of each chunk, in input order.
    """
    texts: List[str] = []
    for entry in chunks:
        texts.append(entry["text"])
    return texts

In [18]:
def build_knowledge_prompt(chunks: List[Dict]) -> str:
    """
    Render the chunks as a delimited, numbered knowledge section for the LLM.

    Args:
        chunks (List[Dict]): Loaded document chunks. Missing "source" or
            "section" fall back to "unknown"; missing "text" to an empty string.

    Returns:
        str: The formatted knowledge prompt, or "NO KNOWLEDGE PROVIDED." when
        no chunks are given.
    """

    if not chunks:
        return "NO KNOWLEDGE PROVIDED."

    block_template = """
[CHUNK {idx}]
Source: {source}
Section: {section}
Content:
{text}
"""

    # One stripped block per chunk, numbered from 1.
    rendered = [
        block_template.format(
            idx=position,
            source=entry.get("source", "unknown"),
            section=entry.get("section", "unknown"),
            text=entry.get("text", ""),
        ).strip()
        for position, entry in enumerate(chunks, start=1)
    ]
    body = "\n\n".join(rendered)

    return f"""
=== PROVIDED KNOWLEDGE (AUTHORITATIVE SOURCE) ===
The following information is extracted from Ankit Tripathi's resume and LinkedIn profile.
Use ONLY this information to answer factual questions about Ankit.
If the answer is not present, state that clearly.

{body}

=== END PROVIDED KNOWLEDGE ===
""".strip()


In [19]:
# Read back the file written by the chunking step. OUTPUT_FILE is reused so the
# writer and the reader cannot drift apart if the file name changes.
chunks = load_chunks(str(OUTPUT_FILE))
# Full prompt section containing every chunk; consumed by respond().
knowledge_prompt = build_knowledge_prompt(chunks)


In [20]:
# Inspect the assembled knowledge prompt.
# NOTE(review): the printed text includes the contact details extracted from
# the resume (phone number, e-mail) — clear this output before sharing.
print(knowledge_prompt)

=== PROVIDED KNOWLEDGE (AUTHORITATIVE SOURCE) ===
The following information is extracted from Ankit Tripathi's resume and LinkedIn profile.
Use ONLY this information to answer factual questions about Ankit.
If the answer is not present, state that clearly.

[CHUNK 1]
Source: Ankit_Tripathi_AI_Engineer_Resume
Section: other
Content:
Ankit Tripathi - AI Engineer Resume Hyderabad, India Phone: +91 7755914062 | Email: ankittripathi2402@gmail.com LinkedIn: linkedin.com/in/ankit-tripathi-71a48245/ Career Summary Innovative and technically skilled Test Manager with 12+ years of experience in software quality engineering, now pivoting into AI engineering. Hands-on experience with GenAI, Retrieval-Augmented Generation (RAG), LangChain, and cloud-native development. Successfully built and deployed AI/ML solutions such as predictive maintenance dashboards, NLP-based tweet analysis pipelines, and RAG-based chatbots. Strong foundational skills in Python, machine learning, cloud architecture (AWS), 

In [21]:
from dotenv import load_dotenv
from openai import OpenAI

# Presumably loads variables from a local .env file into the environment
# (e.g. the OpenAI API key) — confirm against the project's .env.
load_dotenv()

# Module-level OpenAI client used by call_llm. It is created with no explicit
# arguments, so no credentials are hard-coded in the notebook.
client = OpenAI()

In [22]:



def call_llm(
    persona: str,
    knowledge: str,
    question: str,
    model: str = "gpt-4o-mini",
    temperature: float = 0.2,
    max_tokens: int = 500,
) -> str:
    """
    Call OpenAI chat model with grounded persona and knowledge.

    The persona and the knowledge section are sent as two separate system
    messages, followed by the user's question. All three are stripped of
    surrounding whitespace first.

    Args:
        persona (str): Persona/system prompt
        knowledge (str): Knowledge prompt built from chunks
        question (str): User question
        model (str): OpenAI model
        temperature (float): Response randomness
        max_tokens (int): Max tokens in response

    Returns:
        str: Assistant reply with surrounding whitespace removed. An empty
        string is returned when the API gives no choices or a choice whose
        message content is ``None`` (instead of raising AttributeError).
    """

    messages = [
        {
            "role": "system",
            "content": persona.strip()
        },
        {
            "role": "system",
            "content": knowledge.strip()
        },
        {
            "role": "user",
            "content": question.strip()
        }
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )

    if not response.choices:
        return ""

    # message.content is optional in the API response; guard before .strip().
    content = response.choices[0].message.content
    return (content or "").strip()


In [25]:
def respond(user_message, history):
    """
    Gradio submit handler: answer ``user_message`` and extend the chat history.

    Returns a pair ``("", history)``: the empty string clears the textbox and
    the history (role/content dicts) refreshes the chatbot. Whitespace-only
    messages leave the history unchanged and do not call the LLM.
    """
    history = [] if history is None else history

    if user_message.strip():
        # Ask the model first; the history is only extended once a reply exists.
        answer = call_llm(
            persona=PERSONA_PROMPT,
            knowledge=knowledge_prompt,
            question=user_message,
        )
        history.extend(
            (
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": answer},
            )
        )

    return "", history

In [26]:
# Chat UI: a title, the conversation view, an input box and a clear button.
with gr.Blocks() as demo:
    gr.Markdown("# Ankit Tripathi – AI Avatar")

    chatbot = gr.Chatbot()
    msg = gr.Textbox(
        placeholder="Ask about my experience, skills, or projects...",
        show_label=False,
    )
    clear = gr.Button("Clear")

    # Pressing Enter sends the text and current history to respond(), which
    # returns the cleared textbox value and the updated history.
    msg.submit(fn=respond, inputs=[msg, chatbot], outputs=[msg, chatbot])

    # The clear button resets the conversation to an empty history.
    clear.click(fn=lambda: [], inputs=None, outputs=chatbot)

demo.launch()


* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.


