In [1]:
import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
assert OPENAI_API_KEY, "Add your OpenAI key to a .env file"


In [2]:
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector
from langchain.text_splitter import RecursiveCharacterTextSplitter

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)

In [3]:
from langchain.document_loaders import (
    TextLoader, CSVLoader, JSONLoader, UnstructuredPDFLoader,
    UnstructuredExcelLoader, UnstructuredHTMLLoader,
)
FILE_LOADERS = {
    ".txt": TextLoader,
    ".md":  TextLoader,
    ".pdf": UnstructuredPDFLoader,
    ".csv": CSVLoader,            # returns BOTH text & DataFrame
    ".json": JSONLoader,
    ".xls": UnstructuredExcelLoader,
    ".xlsx": UnstructuredExcelLoader,
    ".html": UnstructuredHTMLLoader,
}


In [4]:
import json, inspect, pandas as pd, numpy as np
from typing import TypedDict, List

class Tool(TypedDict):
    name: str
    description: str
    parameters: dict     # JSON-Schema

tool_registry: List[Tool] = []

def tool(fn=None, *, name=None, description=""):
    "Decorator to auto-register a function as an LLM-callable tool"
    def wrap(f):
        sig = inspect.signature(f)
        schema = {
            "type": "object",
            "properties": {
                p.name: {"type": "string"}  # keep simple; parse inside
                for p in sig.parameters.values()
            },
            "required": list(sig.parameters),
        }
        tool_registry.append(
            {"name": name or f.__name__, "description": description, "parameters": schema}
        )
        return f
    return wrap if fn is None else wrap(fn)

# --- Examples -----------------------------------------------------

@tool(description="Run an SQL-style query on a pandas DataFrame")
def df_query(df_json: str, query: str) -> str:
    df = pd.read_json(df_json)
    result = df.query(query)
    return result.to_markdown(index=False)

@tool(description="Aggregate a numeric column in a DataFrame")
def df_agg(df_json: str, column: str, agg: str) -> str:
    df = pd.read_json(df_json)
    val = getattr(df[column], agg)()
    return json.dumps({agg: val})


In [5]:
from openai import OpenAI
import tiktoken, os, json

client = OpenAI()

def agent_qa(question: str, docs):
    messages = [
        {"role": "system", "content": "You are a research assistant."},
        {
            "role": "user",
            "content": question
                     + "\n\n---\nHere are relevant documents:\n"
                     + "\n\n".join(d.page_content for d in docs[:10])
        },
    ]
    while True:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            tools=tool_registry,        # <--- attaches our functions
            tool_choice="auto",
            temperature=0.2,
        )
        msg = resp.choices[0].message
        if msg.tool_calls:
            for call in msg.tool_calls:
                fn = next(t for t in tool_registry if t["name"] == call.function.name)
                args = json.loads(call.function.arguments)
                result = globals()[fn["name"]](**args)   # run local Python
                messages.append(
                    {
                        "role": "tool",
                        "tool_call_id": call.id,
                        "name": fn["name"],
                        "content": str(result),
                    }
                )
            continue     # let the LLM see results & finish
        else:
            return msg.content


In [6]:
print(agent_qa("What are the top 3 countries by population?", [
    {"page_content": "China has a population of 1.4 billion, India has 1.4 billion, and the USA has 331 million."},
    {"page_content": "China is the most populous country with 1.4 billion people, followed closely by India with 1.4 billion, and the USA with 331 million."},
    {"page_content": "The top three countries by population are China, India, and the USA."},
]))

AttributeError: 'dict' object has no attribute 'page_content'