In [26]:
%pip install langchain langchain-openai langchain-core

Note: you may need to restart the kernel to use updated packages.


In [27]:
import os
from langchain_openai import ChatOpenAI
from langchain_core.tools import tool
from langchain_core.messages import HumanMessage
from langchain.agents import create_tool_calling_agent, AgentExecutor
from langchain_core.prompts import ChatPromptTemplate
from datetime import datetime
import random
import pandas as pd
import numpy as np
import json

from dotenv import load_dotenv
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    print("OpenAI API Key not found")

# Tools

In [28]:
# Global DataFrame variable that tools can access
current_df = None

@tool
def get_dataframe_info() -> str:
    """Get basic information about the DataFrame including shape, columns, and data types."""
    global current_df
    if current_df is None:
        return "No DataFrame loaded. Please load a DataFrame first."
    
    info = {
        "shape": current_df.shape,
        "columns": list(current_df.columns),
        "dtypes": {col: str(dtype) for col, dtype in current_df.dtypes.items()},
        "memory_usage": f"{current_df.memory_usage(deep=True).sum()} bytes"
    }
    return json.dumps(info, indent=2)

@tool
def get_column_stats(column_name: str) -> str:
    """Get statistical summary for a specific column."""
    global current_df
    if current_df is None:
        return "No DataFrame loaded. Please load a DataFrame first."
    
    if column_name not in current_df.columns:
        return f"Column '{column_name}' not found. Available columns: {list(current_df.columns)}"
    
    col = current_df[column_name]
    
    if col.dtype in ['int64', 'float64']:
        stats = {
            "count": int(col.count()),
            "mean": float(col.mean()),
            "std": float(col.std()),
            "min": float(col.min()),
            "25%": float(col.quantile(0.25)),
            "50%": float(col.median()),
            "75%": float(col.quantile(0.75)),
            "max": float(col.max())
        }
    else:
        stats = {
            "count": int(col.count()),
            "unique": int(col.nunique()),
            "top": str(col.mode().iloc[0]) if not col.mode().empty else "N/A",
            "freq": int(col.value_counts().iloc[0]) if len(col.value_counts()) > 0 else 0
        }
    
    return json.dumps(stats, indent=2)

@tool
def get_missing_values() -> str:
    """Check for missing values in the DataFrame."""
    global current_df
    if current_df is None:
        return "No DataFrame loaded. Please load a DataFrame first."
    
    missing = current_df.isnull().sum()
    missing_dict = {col: int(count) for col, count in missing.items()}
    total_missing = missing.sum()
    
    return json.dumps({
        "total_missing_values": int(total_missing),
        "missing_by_column": missing_dict,
        "percentage_missing": {col: round((count/len(current_df))*100, 2) for col, count in missing_dict.items()}
    }, indent=2)

# Code

In [29]:
# Create a sample DataFrame for testing
sample_data = {
    'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'age': [25, 30, 35, 28, 22],
    'salary': [50000, 75000, 90000, 60000, 45000],
    'department': ['Engineering', 'Sales', 'Engineering', 'Marketing', 'Sales'],
    'years_experience': [3, 8, 12, 5, 1]
}
df = pd.read_csv("../datasets/smoke.csv")
current_df = df  # Set the global DataFrame for tools to access
print(df.head())

   Year       State Smoke everyday Smoke some days Former smoker Never smoked
0  2010          AL         15.60%           6.30%        23.90%       54.20%
1  2010          AK         13.50%           6.80%        26.10%       53.60%
2  2010          AZ         10.70%           4.40%        27.90%       57.10%
3  2100    Arkansas         17.30%           5.60%        24.10%          53%
4  2010  California          7.50%           4.60%        23.10%       64.80%


In [30]:
# Initialize GPT-4o-mini with tool calling enabled
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.1,
    api_key=os.environ["OPENAI_API_KEY"]
)

# Check if the model supports tool calling
print(f"Model: {llm.model_name}")
print(f"Supports tool calling: {hasattr(llm, 'bind_tools')}")

Model: gpt-4o-mini
Supports tool calling: True


In [31]:
# Collect all tools imported from pandas_tools.ipynb (doesn't work)
# tools = pandas_tools.tools

# Local tools
tools = [get_dataframe_info, get_column_stats, get_missing_values]

In [32]:
# Bind tools to the LLM
llm_with_tools = llm.bind_tools(tools)

print("Available pandas tools:")
for tool_func in tools:
    print(f"- {tool_func.name}: {tool_func.description}")

Available pandas tools:
- get_dataframe_info: Get basic information about the DataFrame including shape, columns, and data types.
- get_column_stats: Get statistical summary for a specific column.
- get_missing_values: Check for missing values in the DataFrame.


In [33]:
prompt = ChatPromptTemplate.from_messages([
    ("system", """You are a helpful data analysis assistant. You have access to pandas DataFrame tools that can help analyze data.
    
    The current DataFrame contains data from a CSV file. Use the appropriate tools to explore and understand the data structure.

    When users ask about data analysis, use the appropriate tools to get the information they need.
    Always provide clear, helpful explanations of the results."""),
    ("placeholder", "{chat_history}"),
    ("human", "{input}"),
    ("placeholder", "{agent_scratchpad}")
])

In [34]:
# Create the agent using create_tool_calling_agent
agent = create_tool_calling_agent(llm, tools, prompt)

# Create the agent executor
agent_executor = AgentExecutor(
    agent=agent, 
    tools=tools, 
    verbose=True,  # Shows which tools are being called
    handle_parsing_errors=True
)

print("Agent and AgentExecutor created successfully!")

Agent and AgentExecutor created successfully!


In [35]:
# response = agent_executor.invoke({"input": "What's the shape and structure of the DataFrame?"})
# print(response['output'])

# response = agent_executor.invoke({"input": "Are there any missing values in the dataset?"})
# print(response['output'])

messages = []

query = "Give me statistics of smoke everyday column"
messages.append(HumanMessage(content=query))

response = agent_executor.invoke(messages)
print(response['output'])

StopIteration: 