In [58]:
# LangChain imports
from langchain.chat_models import ChatOpenAI  
from langchain.schema import HumanMessage, SystemMessage
from dotenv import load_dotenv
from pathlib import Path

# Get API KEY
import os

# Load variables from a local .env file into the process environment, then
# look up the OpenAI key; a missing or empty key is only reported, not raised.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    print("OpenAI API Key not found")

# Import libraries
import pandas as pd
import numpy as np

# **IMPORTING FEATURES**
<hr>


Import features from other notebooks using the `import_ipynb` library.

In [68]:
# Import Features
import import_ipynb
from features.summaries import get_summaries
from features.missing_vals import missing_vals

In [24]:
# Feature catalogue that route_query interpolates into the agent's system prompt.
# Built from adjacent literals so each advertised feature sits on its own line.
features = (
    "\n"
    "Available features:\n"
    "- get_summaries(): data summaries, visualise data \n"
    "- missing_vals(): handling missing data, imputation recommendations\n"
)
# - handle_missing_vals(): imputation, missing values 
# - detect_outliers(): outliers

# **QUERY ROUTER**
<hr>

The query router uses an LLM to interpret your query and route it to the features you have defined.

In [2]:
def _strip_code_fences(text):
    """Return `text` without a surrounding Markdown ``` fence, if it has one."""
    text = text.strip()
    if not text.startswith("```"):
        return text
    lines = text.splitlines()
    if len(lines) == 1:
        # Everything on one line: just peel the backticks off both ends
        return text.strip("`").strip()
    lines = lines[1:]  # drop the opening fence (it may carry a language tag)
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    return "\n".join(lines).strip()


def route_query(user_query, df):
    """Ask the LLM for Python code that answers `user_query`, then run it on `df`.

    The system prompt describes the dataset (shape and first three rows) and the
    module-level `features` catalogue. The model's reply is executed as Python
    with `df` and `user_query` in scope.

    Args:
        user_query: Natural-language request from the user.
        df: DataFrame the generated code operates on.

    Returns:
        The object bound to `df` once the generated code has finished, so both
        in-place edits and `df = ...` rebinding are reflected. If the generated
        code raises, the error and the code are printed and a copy of `df`
        taken before execution is returned instead.
    """
    # Create message chain
    # NOTE(review): the first example below still mentions detect_outliers, which
    # is not in the `features` catalogue visible in this notebook — confirm.
    messages = []
    messages.append(SystemMessage(content=f"""
    You are a data cleaning agent
                                  
    Dataset info: Shape: {df.shape}, Sample: {df.head(3).to_string()}

    {features}

    Rules:
    - Each function takes (df, user_query) and returns modified df
    - TRY TO HANDLE AS MANY QUERIES AS POSSIBLE DO WHAT USER ASKS YOU TO DO
    - TRY TO HANDLE QUERIES BEFORE ROUTING (e.g. printing simple pandas code)
    - Each function call should have a targeted query explaining exactly what to do
    - Return only executable Python code, no explanations, NO MARKDOWN BLOCKS
    - Only if no actions can be taken, print a descriptive message why
    - In order to generate a response/message to the user use print statements
    print("message")
    - ASSUME DF IS STORED IN DF

    Examples:
    HANDLE QUERIES  THROUGH FEATURES:
    - User: Find outliers for price and stock, Generated: Single: df = detect_outliers(df, "find outliers in price, stock")
    - User: Impute numeric columns and generate mean and std for age, Generated: df = missing_vals(df, "impute numeric columns"); df = get_summaries(df, "calculate mean and std for age")
    HANDLE QUERIES  WITHOUT FEATURES (THROUGH PANDAS):
    - User: What object columns are still remaining, Generated: print(df.select_dtypes(include='object'))
    - User: Print out the nuniques in each column, Generated: print([(col, df[col].nunique()) for col in df])
    """))
    messages.append(HumanMessage(content=f"User request: {user_query}"))

    # Call LLM
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
    response = llm.invoke(messages)
    # Models sometimes wrap code in ``` fences despite the instructions
    generated_code = _strip_code_fences(response.content)

    # Snapshot taken before anything runs, so a failure can roll back even if
    # the generated code mutated `df` in place before raising.
    original_df = df.copy()

    # exec() inside a function cannot rebind the function's local variables, so
    # a generated `df = ...` would be silently lost. Run the code in an explicit
    # namespace (a copy of the module globals, so the feature functions and
    # pd/np stay reachable) and read `df` back out afterwards.
    namespace = dict(globals())
    namespace["df"] = df
    namespace["user_query"] = user_query

    # Execute AI generated code
    # NOTE(security): this runs model-produced code with the notebook's full
    # privileges; only use with trusted prompts and data.
    try:
        exec(generated_code, namespace)
    except Exception as e:
        print(f"Error: {e}")
        print(f"Generated Code:{generated_code}")
        return original_df

    # Fall back to the snapshot if the generated code deleted `df`
    return namespace.get("df", original_df)

# **TEST QUERIES**

In [69]:
# Name of the CSV file to load from the project's "datasets" folder
dataset_name = "smoke.csv"

# Anchor the path on PROJECT_ROOT (read from the environment after loading .env)
# rather than on a relative path, to avoid import errors
load_dotenv()
PROJECT_ROOT = Path(os.environ["PROJECT_ROOT"])
path = PROJECT_ROOT.joinpath("datasets", dataset_name)

# Keep an independent copy of the loaded frame alongside the working one
df = pd.read_csv(path)
test_df = df.copy()

In [70]:
# Example natural-language request to hand to route_query
user_query = "Help me clean formatting errors"

# Manual test call. Keep it commented out when this notebook is imported from
# main — presumably so the LLM call does not run at import time (confirm).
# route_query(user_query, df) 