In [1]:
# LangChain imports
from langchain_openai import ChatOpenAI  
from langchain.schema import HumanMessage, SystemMessage
from dotenv import load_dotenv
from pathlib import Path

# Get API KEY
import os

# Merge variables from a local .env file into the process environment, then
# read the OpenAI key. A missing or empty key only produces a warning here;
# nothing is raised at this point.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    print("OpenAI API Key not found")

# Import libraries
import pandas as pd
import numpy as np

# **IMPORTING FEATURES**
<hr>


Import the feature functions from the other notebooks using the `import_ipynb` library.

In [None]:
# Import Features - Restart kernel if this doesn't work
import import_ipynb

# Drop every already-imported "features.*" submodule from the module cache so
# the imports below re-execute the feature notebooks instead of reusing stale
# copies left over from an earlier run of this cell.
import sys
module_names = list(filter(lambda cached: cached.startswith('features.'), tuple(sys.modules)))
for name in module_names:
    # pop() with a default tolerates an entry that has already disappeared
    sys.modules.pop(name, None)

from features.summaries import get_summaries
from features.missing_vals import missing_vals
from features.duplicates import duplicates
from features.data_types import data_types
from features.outlier_detection import outlier_detection
from features.text_processing import text_processing
from features.validation import validation
from features.feature_engineering import feature_engineering
from features.standardization import standardization
from features.export_tools import export_tools
from features.profiling import profiling

In [None]:
# Tool catalogue for the agent. This text is interpolated verbatim into the
# system prompt built by route_query, so every name listed here has to match
# one of the feature functions imported from the `features` package above.
features = """
Available features (TOOLS):
- get_summaries(df, query): for queries regarding generating summaries and statistical analysis
- missing_vals(df, query): for queries regarding missing values, imputation etc.
- duplicates(df, query): for queries regarding duplicate detection, removal, and analysis
- data_types(df, query): for queries regarding data type optimization, memory management, and type conversions
- outlier_detection(df, query): for queries regarding outlier detection, analysis, and handling using IQR, Z-score, or Isolation Forest methods
- text_processing(df, query): for queries regarding text cleaning, case standardization, special character handling, and text pattern validation
- validation(df, query): for queries regarding data validation, email/phone format checking, range validation, and data quality assessment
- feature_engineering(df, query): for queries regarding creating new features, binning, ratios, interactions, lag features, and rolling statistics
- standardization(df, query): for queries regarding data normalization, scaling, categorical encoding, and ML preprocessing
- export_tools(df, query): for queries regarding data export, formatting, Excel output, data dictionaries, summary reports, and codebook generation
- profiling(df, query): for queries regarding comprehensive data profiling, quality metrics calculation, correlation analysis, pattern detection, and data drift analysis between datasets
"""

# **QUERY ROUTER**
<hr>

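The query router sends your query to an LLM, which replies with Python code that calls the matching feature function (or plain pandas when no feature applies). That code is then executed against a copy of your DataFrame, and the resulting DataFrame is returned.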

In [None]:
def _strip_code_fences(code):
    """Return ``code`` without a wrapping Markdown fence (``` or ```python).

    The system prompt asks the model for bare code, but a fenced reply would
    otherwise reach ``exec`` and fail with a SyntaxError.
    """
    lines = code.strip().splitlines()
    if not lines or not lines[0].startswith("```"):
        return code.strip()
    lines = lines[1:]  # drop the opening fence, including any language tag
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    return "\n".join(lines).strip()


def route_query(user_query, df):
    """Turn a natural-language request into code via the LLM and run it on ``df``.

    A system prompt (dataset shape, a 3-row sample, the tool catalogue in
    ``features`` and worked examples) plus the user request is sent to the
    chat model. The reply is treated as Python source, printed, and executed
    with ``df`` bound to a copy of the input frame.

    Args:
        user_query: The user's request in plain language.
        df: The pandas DataFrame to operate on. It is copied before execution.

    Returns:
        Whatever the executed code left bound to ``df``. If execution raises,
        the error and the generated code are printed and an unmodified copy
        of the input frame is returned instead.
    """
    # Create message chain
    messages = []
    messages.append(SystemMessage(content=f"""
    You are a data cleaning agent
                                  
    Dataset info: Shape: {df.shape}, Sample: {df.head(3).to_string()}

    {features}
    - These functions (TOOLS) are available to call to assist with queries
    
    Rules:
    - Each function takes (df, user_query) and returns modified df
    - Each function call should have a targeted query explaining exactly what to do
    - Return only executable Python code, no explanations, NO MARKDOWN BLOCKS
    - Only if no actions can be taken, print a descriptive message why
    - ASSUME DF IS STORED IN DF
    - ALWAYS assign the result back to df when calling features: df = get_summaries(df, "query")

    Examples:
    HANDLE QUERIES THROUGH FEATURES:
    - User: Find means for price and stock, Generated: df = get_summaries(df, "find mean in price, stock")
    - User: Suggest how to handle missing values, Generated: df = missing_vals(df, "how to handle missing values")
    - User: Find duplicate rows, Generated: df = duplicates(df, "find duplicate rows")
    - User: Remove duplicates keeping first occurrence, Generated: df = duplicates(df, "remove duplicates keeping first")
    - User: Optimize data types, Generated: df = data_types(df, "optimize data types to save memory")
    - User: Find outliers in price column, Generated: df = outlier_detection(df, "find outliers in price column")
    - User: Remove extreme values, Generated: df = outlier_detection(df, "remove extreme values using IQR method")
    - User: Clean text columns, Generated: df = text_processing(df, "clean text columns")
    - User: Standardize city names, Generated: df = text_processing(df, "standardize city names")
    - User: Validate email addresses, Generated: df = validation(df, "validate email addresses")
    - User: Check data quality, Generated: df = validation(df, "find data quality issues")
    - User: Create age groups, Generated: df = feature_engineering(df, "create age groups in 10-year bins")
    - User: Calculate ratios, Generated: df = feature_engineering(df, "calculate price per square foot ratio")
    - User: Normalize columns, Generated: df = standardization(df, "normalize numeric columns")
    - User: One-hot encode categories, Generated: df = standardization(df, "one-hot encode categorical variables")
    - User: Export to Excel, Generated: df = export_tools(df, "export to Excel with proper formatting")
    - User: Create data dictionary, Generated: df = export_tools(df, "create a data dictionary")
    - User: Generate summary report, Generated: df = export_tools(df, "generate summary report")
    - User: Export to CSV, Generated: df = export_tools(df, "export to CSV")
    - User: Create codebook, Generated: df = export_tools(df, "create codebook for documentation")
    - User: Generate data profile report, Generated: df = profiling(df, "generate a data profile report")
    - User: Calculate data quality metrics, Generated: df = profiling(df, "calculate data quality metrics")
    - User: Analyze correlations, Generated: df = profiling(df, "analyze correlations in the data")
    - User: Detect patterns, Generated: df = profiling(df, "detect patterns in the data")
    - User: Check for data drift, Generated: df = profiling(df, "check for data drift between datasets")
    HANDLE QUERIES WITHOUT FEATURES (THROUGH PANDAS):
    - User: What object columns are still remaining, Generated: print(df.select_dtypes(include='object').columns.tolist())
    - User: Print out the nuniques in each column, Generated: print([df[col].nunique() for col in df.columns])
    """))
    messages.append(HumanMessage(content=f"User request: {user_query}"))

    # Call LLM
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
    response = llm.invoke(messages)
    # Models sometimes wrap code in a Markdown fence despite the prompt rule
    generated_code = _strip_code_fences(response.content)

    print(generated_code)

    # Take the fallback copy before the try block so the except branch can
    # always return it.
    original_df = df.copy()

    # Execute AI generated code
    # SECURITY NOTE(review): this runs model-generated code with full
    # interpreter privileges (file system, network, notebook state). Only use
    # it with trusted prompts/data, or move execution into a sandbox.
    try:
        # Use ONE namespace dict. Passing separate globals and locals to
        # exec() makes the code run with class-body scoping, where
        # comprehensions and nested functions cannot see `df` and fall back
        # to whatever `df` happens to exist in the notebook globals.
        # Copying globals() keeps notebook names readable without letting
        # generated assignments leak back into the notebook.
        namespace = dict(globals())
        namespace.update({
            'df': df.copy(),
            'original_df': original_df,
            'pd': pd,
            'np': np,
            'get_summaries': get_summaries,
            'missing_vals': missing_vals,
            'duplicates': duplicates,
            'data_types': data_types,
            'outlier_detection': outlier_detection,
            'text_processing': text_processing,
            'validation': validation,
            'feature_engineering': feature_engineering,
            'standardization': standardization,
            'export_tools': export_tools,
            'profiling': profiling,
            'print': print
        })

        exec(generated_code, namespace)
        return namespace['df']
    except Exception as e:
        print(f"Error: {e}")
        print(f"Generated Code:{generated_code}")
        return original_df

# **TEST QUERIES**

In [None]:
# # Enter CSV filename from "datasets" folder
# dataset_name = "Life Expectancy Data.csv"

# # Build CSV path (to avoid import errors)
# load_dotenv()
# PROJECT_ROOT = Path(os.environ["PROJECT_ROOT"])
# path = PROJECT_ROOT / "datasets" / dataset_name

# df = pd.read_csv(path)
# test_df = df.copy()

In [None]:
# # Test the new integrated features
# print("=== TESTING INTEGRATED FEATURES ===")

# # Create a simple test dataset
# test_data = {
#     'name': ['John Doe', 'jane smith', 'BOB JOHNSON', '  Mary Brown  ', 'ALICE WHITE'],
#     'email': ['john@email.com', 'jane.invalid', 'bob@test.co.uk', 'mary@domain.org', 'alice@company.com'],
#     'age': [25, 35, 45, 30, 28],
#     'salary': [50000, 75000, 90000, 60000, 55000],
#     'department': ['IT', 'HR', 'IT', 'Finance', 'IT'],
#     'score': [85.5, 92.3, 78.1, 88.9, 91.2]
# }
# test_df = pd.DataFrame(test_data)
# print(f"Test dataset shape: {test_df.shape}")
# print("\\nTest dataset:")
# print(test_df)

# # Test 1: Text processing
# print("\\n1. Testing text processing:")
# query1 = "Clean and standardize the name column to title case"
# result1 = route_query(query1, test_df.copy())

# # Test 2: Validation
# print("\\n\\n2. Testing data validation:")
# query2 = "Validate email addresses and check data quality"
# result2 = route_query(query2, test_df.copy())

# # Test 3: Feature engineering
# print("\\n\\n3. Testing feature engineering:")
# query3 = "Create age groups in 3 bins and calculate salary ratio per age"
# result3 = route_query(query3, test_df.copy())

# # Test 4: Standardization
# print("\\n\\n4. Testing standardization:")
# query4 = "Normalize salary and score columns for machine learning"
# result4 = route_query(query4, test_df.copy())

# print("\\n=== ALL FEATURE INTEGRATION TESTS COMPLETE ===")
# print("If no errors appeared above, all features are properly integrated!")