In [1]:
# LangChain imports
from langchain_openai import ChatOpenAI  
from langchain.schema import HumanMessage, SystemMessage
from dotenv import load_dotenv
from pathlib import Path

# Get API KEY
import os

# load_dotenv() runs before the lookup so that a key supplied through it is
# visible in os.environ. A missing or empty key only prints a notice here;
# execution continues.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    print("OpenAI API Key not found")

# Import libraries
import pandas as pd
import numpy as np

# **IMPORTING FEATURES**
<hr>


Import the feature functions from the other notebooks using the `import_ipynb` library.

In [None]:
# Import Features - Restart kernel if this doesn't work
import import_ipynb

# Clear any cached imports
import sys
# Snapshot the cached `features.*` entries first, then evict each one so the
# imports below are loaded again instead of being served from sys.modules.
module_names = [key for key in list(sys.modules) if key.startswith('features.')]
for name in module_names:
    # pop() with a default tolerates an entry that has already disappeared.
    sys.modules.pop(name, None)

from features.summaries import get_summaries
from features.missing_vals import missing_vals
from features.duplicates import duplicates
from features.data_types import data_types
from features.outlier_detection import outlier_detection

In [None]:
# Tool catalogue for the agent: one line per feature function, giving its
# call signature and the kind of query it handles.
# route_query() interpolates this text verbatim into the LLM system prompt,
# so each entry should match a function imported from the `features` package.
features = """
Available features (TOOLS):
- get_summaries(df, query): for queries regarding generating summaries and statistical analysis
- missing_vals(df, query): for queries regarding missing values, imputation etc.
- duplicates(df, query): for queries regarding duplicate detection, removal, and analysis
- data_types(df, query): for queries regarding data type optimization, memory management, and type conversions
- outlier_detection(df, query): for queries regarding outlier detection, analysis, and handling using IQR, Z-score, or Isolation Forest methods
"""

# **QUERY ROUTER**
<hr>

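The query router sends your query to an LLM, which replies with Python code that calls one of the features listed above (or plain pandas when no feature applies). That code is then executed against a copy of the DataFrame, and the resulting DataFrame is returned.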

In [None]:
def _strip_code_fences(text):
    """Return ``text`` without a surrounding Markdown code fence.

    The system prompt forbids Markdown blocks, but a chat model may still wrap
    its reply in one, which would make ``exec`` fail with a SyntaxError. Only
    a fence that wraps the whole reply is removed (first line starts with
    three backticks, last line is exactly three backticks); anything else is
    returned stripped but otherwise unchanged.
    """
    stripped = text.strip()
    lines = stripped.splitlines()
    if len(lines) >= 2 and lines[0].startswith("```") and lines[-1].strip() == "```":
        return "\n".join(lines[1:-1]).strip()
    return stripped


def route_query(user_query, df):
    """Route a natural-language request to a feature function via an LLM.

    Builds a system prompt containing the DataFrame's shape, its first three
    rows and the module-level ``features`` tool list, asks ``gpt-4o-mini`` for
    Python code, prints that code and executes it against a copy of ``df``.

    Args:
        user_query: The user's request in plain language.
        df: The pandas DataFrame to work on. The generated code receives a
            copy, never this object itself.

    Returns:
        Whatever the generated code left bound to ``df`` (expected to be the
        modified DataFrame). If executing the code raises, the error and the
        code are printed and an untouched copy of the input is returned.
    """
    # Create message chain
    messages = []
    messages.append(SystemMessage(content=f"""
    You are a data cleaning agent
                                  
    Dataset info: Shape: {df.shape}, Sample: {df.head(3).to_string()}

    {features}
    - These functions (TOOLS) are available to call to assist with queries
    
    Rules:
    - Each function takes (df, user_query) and returns modified df
    - Each function call should have a targeted query explaining exactly what to do
    - Return only executable Python code, no explanations, NO MARKDOWN BLOCKS
    - Only if no actions can be taken, print a descriptive message why
    - ASSUME DF IS STORED IN DF
    - ALWAYS assign the result back to df when calling features: df = get_summaries(df, "query")

    Examples:
    HANDLE QUERIES THROUGH FEATURES:
    - User: Find means for price and stock, Generated: df = get_summaries(df, "find mean in price, stock")
    - User: Suggest how to handle missing values, Generated: df = missing_vals(df, "how to handle missing values")
    - User: Find duplicate rows, Generated: df = duplicates(df, "find duplicate rows")
    - User: Remove duplicates keeping first occurrence, Generated: df = duplicates(df, "remove duplicates keeping first")
    - User: Optimize data types, Generated: df = data_types(df, "optimize data types to save memory")
    - User: Find outliers in price column, Generated: df = outlier_detection(df, "find outliers in price column")
    - User: Remove extreme values, Generated: df = outlier_detection(df, "remove extreme values using IQR method")
    HANDLE QUERIES WITHOUT FEATURES (THROUGH PANDAS):
    - User: What object columns are still remaining, Generated: print(df.select_dtypes(include='object').columns.tolist())
    - User: Print out the nuniques in each column, Generated: print([df[col].nunique() for col in df.columns])
    """))
    messages.append(HumanMessage(content=f"User request: {user_query}"))

    # Call LLM
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
    response = llm.invoke(messages)
    # Tolerate a reply wrapped in a Markdown fence despite the prompt's rule.
    generated_code = _strip_code_fences(response.content)

    print(generated_code)

    # Taken before the try block so the except branch can always return it.
    original_df = df.copy()

    # A single dict is used as the execution namespace. With separate globals
    # and locals, exec() runs the code as if it were a class body, so nested
    # scopes in the generated code (generator expressions, lambdas, function
    # definitions, and comprehensions before Python 3.12) cannot see `df` or
    # the feature functions. Starting from a shallow copy of globals() keeps
    # every notebook-level name reachable, as before, while assignments made
    # by the generated code stay out of the real module namespace.
    namespace = dict(globals())
    namespace.update({
        'df': df.copy(),
        'original_df': original_df,
        'pd': pd,
        'np': np,
        'get_summaries': get_summaries,
        'missing_vals': missing_vals,
        'duplicates': duplicates,
        'data_types': data_types,
        'outlier_detection': outlier_detection,
        'print': print
    })

    # Execute AI generated code
    # SECURITY: this runs model-generated code with the kernel's full
    # privileges (file system, network, environment variables). Only use it
    # with trusted prompts and data, or move execution into a sandbox.
    try:
        exec(generated_code, namespace)
        return namespace['df']
    except Exception as e:
        print(f"Error: {e}")
        print(f"Generated Code:{generated_code}")
        return original_df

# **TEST QUERIES**

In [None]:
# # Enter CSV filename from "datasets" folder
# dataset_name = "Life Expectancy Data.csv"

# # Build CSV path (to avoid import errors)
# load_dotenv()
# PROJECT_ROOT = Path(os.environ["PROJECT_ROOT"])
# path = PROJECT_ROOT / "datasets" / dataset_name

# df = pd.read_csv(path)
# test_df = df.copy()

In [None]:
# user_query = "Optimize data types to save memory"
# result = route_query(user_query, df)

df = data_types(df, "optimize data types to save memory")
=== NUMERIC TYPE OPTIMIZATION ===
Columns optimized: 17
Total memory saved: 0.21 MB
\nOptimization details:
  Year: int64 → int16 (saved 0.02 MB)
  Life expectancy : float64 → float32 (saved 0.01 MB)
  Adult Mortality: float64 → float32 (saved 0.01 MB)
  infant deaths: int64 → int16 (saved 0.02 MB)
  Alcohol: float64 → float32 (saved 0.01 MB)
  Hepatitis B: float64 → float32 (saved 0.01 MB)
  Measles : int64 → int32 (saved 0.01 MB)
   BMI : float64 → float32 (saved 0.01 MB)
  under-five deaths : int64 → int16 (saved 0.02 MB)
  Polio: float64 → float32 (saved 0.01 MB)
  Total expenditure: float64 → float32 (saved 0.01 MB)
  Diphtheria : float64 → float32 (saved 0.01 MB)
   HIV/AIDS: float64 → float32 (saved 0.01 MB)
   thinness  1-19 years: float64 → float32 (saved 0.01 MB)
   thinness 5-9 years: float64 → float32 (saved 0.01 MB)
  Income composition of resources: float64 → float32 (saved 0.01 MB)
  Schooling: float64 → float32 (