## **Feature:** Feature Engineering

**Names:** Gia Bao Ngo

### **What it does**
Creates new features from existing data through various transformations including categorical binning, ratio calculations, feature interactions, time series lag features, rolling statistics, and polynomial transformations to enhance machine learning model performance.

In [None]:
# Load dotenv
import os
from dotenv import load_dotenv
load_dotenv()

# Read the OpenAI API key from the process environment (load_dotenv() has
# already been called above). A missing key is only reported with a printed
# message; nothing is raised here.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    print("OpenAI API Key not found")

# Import libraries
from pathlib import Path
import pandas as pd
import numpy as np
# Additional imports for feature engineering
import math
import re
import datetime
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings('ignore')

# Langchain imports
from langchain_openai import ChatOpenAI  
from langchain.schema import HumanMessage, SystemMessage

### **Helper Functions**
- `create_bins(df, column, bins=None, labels=None, method='equal_width')` - Create categorical bins from numeric columns
- `create_ratios(df, numerator_col, denominator_col, new_name=None)` - Calculate ratios between two numeric columns
- `create_interaction_features(df, col1, col2, operation='multiply')` - Create interaction features between columns
- `create_lag_features(df, column, lags=[1], group_by=None)` - Create time series lag features
- `create_rolling_features(df, column, window=3, operations=['mean'], group_by=None)` - Create rolling window statistics, optionally computed within groups
- `create_polynomial_features(df, columns, degree=2)` - Generate polynomial transformations

In [None]:
def create_bins(df, column, bins=None, labels=None, method='equal_width'):
    """
    Bin a numeric column into categories, stored in a new '<column>_binned' column.

    Parameters:
    - df: pandas DataFrame (not modified; a copy is returned)
    - column: name of the numeric column to bin
    - bins: number of bins (int) or bin edges (array-like); defaults to 5 for
      'equal_width' and 'equal_frequency', and is required for 'custom'
    - labels: labels for the bins (default: generated by pandas)
    - method: 'equal_width' (pd.cut), 'equal_frequency' (pd.qcut) or
      'custom' (pd.cut with caller-supplied edges)

    Returns:
    - Copy of df with the binned column added. When the column has no non-null
      values the binned column is all NaN. When binning fails (including an
      unknown method, or 'custom' without bins) the error is printed and the
      original df is returned unchanged.

    Raises:
    - ValueError: if the column is missing from df or is not numeric
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame")

    source = df[column]
    if not pd.api.types.is_numeric_dtype(source):
        raise ValueError(f"Column '{column}' must be numeric for binning")

    binned_name = f"{column}_binned"
    output = df.copy()

    # Nothing to bin when every value is missing
    observed = source.dropna()
    if observed.empty:
        print(f"Warning: Column '{column}' has no valid data for binning")
        output[binned_name] = np.nan
        return output

    try:
        if method in ('equal_width', 'custom'):
            # Both methods go through pd.cut; they differ only in how a
            # missing `bins` argument is treated.
            if bins is None:
                if method == 'custom':
                    raise ValueError("Custom method requires bin edges to be specified")
                bins = 5  # Default number of bins
            output[binned_name] = pd.cut(
                source,
                bins=bins,
                labels=labels,
                include_lowest=True,
                duplicates='drop',
            )
        elif method == 'equal_frequency':
            # Quantile-based binning
            quantiles = 5 if bins is None else bins
            output[binned_name] = pd.qcut(
                source,
                q=quantiles,
                labels=labels,
                duplicates='drop',
            )
        else:
            raise ValueError("Method must be 'equal_width', 'equal_frequency', or 'custom'")

        # Summarise what was produced
        per_bin = output[binned_name].value_counts().sort_index()
        binned_total = per_bin.sum()
        missing_total = output[binned_name].isnull().sum()

        print(f"=== BINNING RESULTS FOR '{column}' ===")
        print(f"Method: {method}")
        print(f"Number of bins created: {len(per_bin)}")
        print(f"Values binned: {binned_total}")
        print(f"Null values: {missing_total}")

        print("\nBin distribution:")
        for label, size in per_bin.items():
            if binned_total > 0:
                share = (size / binned_total) * 100
            else:
                share = 0
            print(f"  {label}: {size} values ({share:.1f}%)")

        # Range of the original (non-null) values
        lowest = observed.min()
        highest = observed.max()
        print(f"\nOriginal range: [{lowest:.2f}, {highest:.2f}]")

    except Exception as err:
        print(f"Error creating bins for column '{column}': {err}")
        return df

    return output

In [None]:
def create_ratios(df, numerator_col, denominator_col, new_name=None):
    """
    Calculate the ratio of two numeric columns and add it as a new column.

    Zero denominators are replaced with NaN before dividing, so those rows
    yield NaN rather than +/-inf. Any infinite results that still occur (for
    example from an infinite numerator) are converted to NaN *before* the
    summary is computed, so the printed counts and statistics describe the
    column that is actually returned.

    Parameters:
    - df: pandas DataFrame (not modified; a copy is returned)
    - numerator_col: column name for numerator
    - denominator_col: column name for denominator
    - new_name: name for new ratio column
      (default: '<numerator_col>_per_<denominator_col>')

    Returns:
    - Copy of df with the new ratio column. If an unexpected error occurs
      while computing or reporting, the error is printed and the original df
      is returned unchanged.

    Raises:
    - ValueError: if either column is missing from df or is not numeric
    """
    # Validate inputs (existence first, then dtype, numerator before denominator)
    if numerator_col not in df.columns:
        raise ValueError(f"Numerator column '{numerator_col}' not found in DataFrame")
    if denominator_col not in df.columns:
        raise ValueError(f"Denominator column '{denominator_col}' not found in DataFrame")

    if not pd.api.types.is_numeric_dtype(df[numerator_col]):
        raise ValueError(f"Numerator column '{numerator_col}' must be numeric")
    if not pd.api.types.is_numeric_dtype(df[denominator_col]):
        raise ValueError(f"Denominator column '{denominator_col}' must be numeric")

    result_df = df.copy()

    # Generate column name if not provided
    if new_name is None:
        new_name = f"{numerator_col}_per_{denominator_col}"

    try:
        # Replace 0 with NaN so division by zero produces NaN instead of inf
        denominator_values = result_df[denominator_col].replace(0, np.nan)
        ratio = result_df[numerator_col] / denominator_values

        # Clean up infinities first so every statistic below is computed on
        # the final column values.
        infinite_ratios = int(np.isinf(ratio).sum())
        if infinite_ratios > 0:
            ratio = ratio.replace([np.inf, -np.inf], np.nan)
        result_df[new_name] = ratio

        # Calculate statistics for reporting
        ratio_stats = result_df[new_name].describe()
        zero_denominators = (df[denominator_col] == 0).sum()
        null_ratios = result_df[new_name].isnull().sum()

        print(f"=== RATIO CREATION RESULTS ===")
        print(f"New column: '{new_name}'")
        print(f"Formula: {numerator_col} / {denominator_col}")
        print(f"Values calculated: {len(df) - null_ratios}")
        print(f"Null values (including zero denominators): {null_ratios}")
        print(f"Zero denominators handled: {zero_denominators}")

        print(f"\nRatio statistics:")
        print(f"  Mean: {ratio_stats['mean']:.4f}")
        print(f"  Median (50%): {ratio_stats['50%']:.4f}")
        print(f"  Min: {ratio_stats['min']:.4f}")
        print(f"  Max: {ratio_stats['max']:.4f}")
        print(f"  Std: {ratio_stats['std']:.4f}")

        # Check for potential issues. The list is named `issues` so it does
        # not shadow the `warnings` module imported at file level.
        issues = []
        if zero_denominators > 0:
            issues.append(f"{zero_denominators} zero denominators converted to NaN")
        if infinite_ratios > 0:
            issues.append(f"{infinite_ratios} infinite ratios converted to NaN")
        if ratio_stats['max'] > 1000 or ratio_stats['min'] < -1000:
            issues.append("Extreme ratio values detected - consider data validation")
        # Compare against the magnitude of the mean: with a negative mean the
        # unsigned comparison would always fire because std is never negative.
        if ratio_stats['std'] > abs(ratio_stats['mean']) * 10:
            issues.append("High variance in ratios - consider outlier analysis")

        if issues:
            print("\nWarnings:")
            for issue in issues:
                print(f"  - {issue}")

    except Exception as e:
        print(f"Error creating ratio column: {str(e)}")
        return df

    return result_df

In [None]:
def create_interaction_features(df, col1, col2, operation='multiply'):
    """
    Combine two columns into a single interaction feature.

    The new column is named '<col1>_<symbol>_<col2>', where the symbol depends
    on the operation (e.g. 'x' for multiply, 'plus' for add). When both
    columns are numeric the operation is applied directly. Otherwise 'add'
    joins the string forms with an underscore, and every other operation
    prints a warning and works on values coerced with pd.to_numeric
    (unparseable entries become NaN). Division treats a zero denominator as NaN.

    Parameters:
    - df: pandas DataFrame (not modified; a copy is returned)
    - col1: first column name
    - col2: second column name
    - operation: 'multiply', 'add', 'subtract', 'divide', 'mean', 'max', 'min'

    Returns:
    - Copy of df with the new interaction feature. If the operation is not
      supported or the computation fails, the error is printed and the
      original df is returned unchanged.

    Raises:
    - ValueError: if col1 or col2 is missing from df
    """
    # Validate inputs
    for required in (col1, col2):
        if required not in df.columns:
            raise ValueError(f"Column '{required}' not found in DataFrame")

    # Symbol used in the generated column name for each operation
    name_symbols = {
        'multiply': 'x',
        'add': 'plus',
        'subtract': 'minus',
        'divide': 'div',
        'mean': 'mean',
        'max': 'max',
        'min': 'min',
    }
    feature_name = f"{col1}_{name_symbols.get(operation, operation)}_{col2}"
    output = df.copy()

    # How each operation combines two (numeric) Series
    combiners = {
        'multiply': lambda left, right: left * right,
        'add': lambda left, right: left + right,
        'subtract': lambda left, right: left - right,
        # Zero denominators become NaN to avoid division by zero
        'divide': lambda left, right: left / right.replace(0, np.nan),
        'mean': lambda left, right: (left + right) / 2,
        'max': np.maximum,
        'min': np.minimum,
    }

    # Message printed before falling back to numeric coercion
    # ('add' is absent: it concatenates strings instead of coercing)
    coercion_notices = {
        'multiply': "Warning: Multiply operation on non-numeric columns may not be meaningful",
        'subtract': "Warning: Subtract operation requires numeric columns",
        'divide': "Warning: Divide operation requires numeric columns",
        'mean': "Warning: Mean operation requires numeric columns",
        'max': "Warning: Max operation on non-numeric columns may not be meaningful",
        'min': "Warning: Min operation on non-numeric columns may not be meaningful",
    }

    try:
        if operation not in combiners:
            raise ValueError(f"Unsupported operation: {operation}")

        first, second = df[col1], df[col2]
        both_numeric = (
            pd.api.types.is_numeric_dtype(first)
            and pd.api.types.is_numeric_dtype(second)
        )

        if both_numeric:
            output[feature_name] = combiners[operation](first, second)
        elif operation == 'add':
            # String concatenation for non-numeric
            output[feature_name] = first.astype(str) + "_" + second.astype(str)
        else:
            print(coercion_notices[operation])
            output[feature_name] = combiners[operation](
                pd.to_numeric(first, errors='coerce'),
                pd.to_numeric(second, errors='coerce'),
            )

        # Gather the figures for the report before printing anything
        feature = output[feature_name]
        numeric_result = pd.api.types.is_numeric_dtype(feature)
        if numeric_result:
            summary = feature.describe()
        else:
            distinct_values = feature.nunique()
        missing = feature.isnull().sum()

        print("=== INTERACTION FEATURE RESULTS ===")
        print(f"New feature: '{feature_name}'")
        print(f"Operation: {col1} {operation} {col2}")
        print(f"Data type: {feature.dtype}")

        if numeric_result:
            print(f"Valid values: {len(df) - missing}")
            print(f"Null values: {missing}")

            if not summary.empty:
                print("\nFeature statistics:")
                print(f"  Mean: {summary['mean']:.4f}")
                print(f"  Median (50%): {summary['50%']:.4f}")
                print(f"  Min: {summary['min']:.4f}")
                print(f"  Max: {summary['max']:.4f}")
                print(f"  Std: {summary['std']:.4f}")
        else:
            print(f"Unique values: {distinct_values}")
            print(f"Null values: {missing}")

    except Exception as err:
        print(f"Error creating interaction feature: {err}")
        return df

    return output

In [None]:
def create_lag_features(df, column, lags=[1], group_by=None):
    """
    Create time series lag features (previous values) for a column.

    One column named '<column>_lag_<n>' is added per lag period. Lags are
    taken in the current row order of df; with `group_by` they are computed
    independently inside each group.

    Parameters:
    - df: pandas DataFrame (not modified; a copy is returned)
    - column: column name to create lags for
    - lags: lag periods, either a list such as [1, 2, 3] or a single int.
      The default list is only read, never mutated.
    - group_by: column name to group by (for panel data)

    Returns:
    - Copy of df with the new lag columns. With no lag periods the copy is
      returned without new columns. If an unexpected error occurs the error
      is printed and the original df is returned unchanged.

    Raises:
    - ValueError: if `column` or `group_by` is missing from df
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame")

    if group_by and group_by not in df.columns:
        raise ValueError(f"Group by column '{group_by}' not found in DataFrame")

    result_df = df.copy()
    created_cols = []

    try:
        # Accept a bare int (e.g. lags=1) as well as any iterable of ints, and
        # work on a private list so the caller's object is left untouched.
        if isinstance(lags, (int, np.integer)):
            lag_periods = [lags]
        else:
            lag_periods = list(lags)

        if not lag_periods:
            # Nothing to do; avoids dividing by zero in the summary below
            print("No lag periods specified - no lag features created")
            return result_df

        # Shift either within each group or over the whole column
        if group_by:
            source = result_df.groupby(group_by)[column]
        else:
            source = result_df[column]

        for lag in lag_periods:
            lag_col_name = f"{column}_lag_{lag}"
            result_df[lag_col_name] = source.shift(lag)
            created_cols.append(lag_col_name)

        # Calculate statistics for reporting
        total_rows = len(result_df)
        print(f"=== LAG FEATURES CREATION RESULTS ===")
        print(f"Source column: '{column}'")
        if group_by:
            print(f"Grouped by: '{group_by}' ({result_df[group_by].nunique()} unique groups)")
        print(f"Lag periods created: {lag_periods}")
        print(f"New columns: {created_cols}")

        # The expected-null hint only mentions groups when grouping is in use
        null_location = "at start of each group" if group_by else "at start of data"

        print(f"\nLag features summary:")
        for lag_col, lag_period in zip(created_cols, lag_periods):
            valid_values = result_df[lag_col].notna().sum()
            null_values = result_df[lag_col].isna().sum()
            valid_share = valid_values / total_rows if total_rows else 0.0

            print(f"  {lag_col}:")
            print(f"    Valid values: {valid_values} ({valid_share:.1%})")
            print(f"    Null values: {null_values} (expected {lag_period} {null_location})")

            if pd.api.types.is_numeric_dtype(result_df[lag_col]):
                stats = result_df[lag_col].describe()
                print(f"    Mean: {stats['mean']:.4f}, Std: {stats['std']:.4f}")

        # Check for potential issues. Named `issues` so the file-level
        # `warnings` module is not shadowed.
        issues = []
        if group_by:
            min_group_size = result_df.groupby(group_by).size().min()
            max_lag = max(lag_periods)
            if min_group_size <= max_lag:
                issues.append(f"Some groups have {min_group_size} rows but max lag is {max_lag}")

        total_cells = total_rows * len(created_cols)
        total_nulls = sum(result_df[col].isna().sum() for col in created_cols)
        total_null_ratio = total_nulls / total_cells if total_cells else 0.0
        if total_null_ratio > 0.3:
            issues.append(f"High null ratio ({total_null_ratio:.1%}) in lag features")

        if issues:
            print("\nWarnings:")
            for issue in issues:
                print(f"  - {issue}")

    except Exception as e:
        print(f"Error creating lag features: {str(e)}")
        return df

    return result_df

In [None]:
def create_rolling_features(df, column, window=3, operations=['mean'], group_by=None):
    """
    Create rolling window statistics for a column.

    One column named '<column>_rolling_<window>_<operation>' is added per
    operation. Windows follow the current row order of df; with `group_by`
    they are computed independently inside each group. 'std' and 'var' need
    two observations in the window, all other operations need one, so the
    leading rows of a window are filled as soon as enough values exist.

    Parameters:
    - df: pandas DataFrame (not modified; a copy is returned)
    - column: numeric column name to create rolling features for
    - window: size of rolling window
    - operations: list of operations from
      ['mean', 'sum', 'std', 'min', 'max', 'median', 'var', 'count'];
      a single operation may also be given as a plain string.
      The default list is only read, never mutated.
    - group_by: column name to group by (for panel data)

    Returns:
    - Copy of df with the new rolling feature columns. If an unexpected error
      occurs the error is printed and the original df is returned unchanged.

    Raises:
    - ValueError: if `column` or `group_by` is missing, `column` is not
      numeric, or an operation is not in the supported list
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame")

    if group_by and group_by not in df.columns:
        raise ValueError(f"Group by column '{group_by}' not found in DataFrame")

    if not pd.api.types.is_numeric_dtype(df[column]):
        raise ValueError(f"Column '{column}' must be numeric for rolling calculations")

    # A bare string is one operation, not a sequence of characters
    if isinstance(operations, str):
        operations = [operations]
    else:
        operations = list(operations)

    valid_operations = ['mean', 'sum', 'std', 'min', 'max', 'median', 'var', 'count']

    # Validate operations
    invalid_ops = [op for op in operations if op not in valid_operations]
    if invalid_ops:
        raise ValueError(f"Invalid operations: {invalid_ops}. Valid operations: {valid_operations}")

    result_df = df.copy()
    created_cols = []

    def _rolling_stat(series, operation):
        """Apply one rolling statistic to a Series using this call's window."""
        if operation in ('std', 'var'):
            # Dispersion needs two observations, but min_periods may not
            # exceed an integer window: with window=1 pandas would raise and
            # every requested feature would be lost.
            if isinstance(window, (int, np.integer)):
                min_periods = min(2, max(window, 1))
            else:
                min_periods = 2
        else:
            min_periods = 1
        roller = series.rolling(window, min_periods=min_periods)
        # Every supported operation name is also the Rolling method name
        return getattr(roller, operation)()

    try:
        grouped = result_df.groupby(group_by)[column] if group_by else None

        for operation in operations:
            col_name = f"{column}_rolling_{window}_{operation}"

            if grouped is not None:
                # Rolling statistic within each group
                result_df[col_name] = grouped.transform(
                    lambda s, op=operation: _rolling_stat(s, op)
                )
            else:
                # Rolling statistic over the entire column
                result_df[col_name] = _rolling_stat(result_df[column], operation)

            created_cols.append(col_name)

        # Calculate and report statistics
        total_rows = len(result_df)
        print(f"=== ROLLING FEATURES CREATION RESULTS ===")
        print(f"Source column: '{column}'")
        print(f"Window size: {window}")
        if group_by:
            print(f"Grouped by: '{group_by}' ({result_df[group_by].nunique()} unique groups)")
        print(f"Operations: {operations}")
        print(f"New columns: {created_cols}")

        print(f"\nRolling features summary:")
        for col_name in created_cols:
            valid_values = result_df[col_name].notna().sum()
            null_values = result_df[col_name].isna().sum()
            valid_share = valid_values / total_rows if total_rows else 0.0

            print(f"  {col_name}:")
            print(f"    Valid values: {valid_values} ({valid_share:.1%})")
            print(f"    Null values: {null_values}")

            if valid_values > 0:
                stats = result_df[col_name].describe()
                print(f"    Mean: {stats['mean']:.4f}, Std: {stats['std']:.4f}")
                print(f"    Range: [{stats['min']:.4f}, {stats['max']:.4f}]")

        # Check for potential issues. Named `issues` so the file-level
        # `warnings` module is not shadowed.
        issues = []

        if group_by:
            min_group_size = result_df.groupby(group_by).size().min()
            if min_group_size < window:
                issues.append(f"Some groups have {min_group_size} rows but window size is {window}")

        # Check for high correlation between rolling features
        if len(created_cols) > 1:
            rolling_corr = result_df[created_cols].corr()
            n_features = len(rolling_corr.columns)
            highly_correlated = any(
                abs(rolling_corr.iloc[i, j]) > 0.95
                for i in range(n_features)
                for j in range(i + 1, n_features)
            )
            if highly_correlated:
                issues.append(f"High correlation detected between rolling features")

        if issues:
            print("\nWarnings:")
            for issue in issues:
                print(f"  - {issue}")

    except Exception as e:
        print(f"Error creating rolling features: {str(e)}")
        return df

    return result_df

In [None]:
def create_polynomial_features(df, columns, degree=2):
    """
    Add polynomial terms (powers and cross-products) of numeric columns.

    The terms are produced by sklearn's PolynomialFeatures without a bias
    column. Terms whose generated name equals one of the input column names
    are skipped; every other term is added as 'poly_<name>', with spaces in
    the generated name replaced by '_' and '^' by '_pow_'. Missing values are
    filled with the column median before the transformation.

    Parameters:
    - df: pandas DataFrame (not modified; a copy is returned)
    - columns: column name, or list of column names, to build terms from
    - degree: polynomial degree (2 = quadratic, 3 = cubic, etc.)

    Returns:
    - Copy of df with the new polynomial feature columns. If the
      transformation or the report fails, the error is printed and the
      original df is returned unchanged.

    Raises:
    - ValueError: if a column is missing or not numeric, or degree < 1
    """
    if isinstance(columns, str):
        columns = [columns]

    # Validate inputs
    for name in columns:
        if name not in df.columns:
            raise ValueError(f"Column '{name}' not found in DataFrame")
        if not pd.api.types.is_numeric_dtype(df[name]):
            raise ValueError(f"Column '{name}' must be numeric for polynomial features")

    if degree < 1:
        raise ValueError("Degree must be at least 1")

    output = df.copy()
    added = []

    try:
        # Work on the requested columns only
        inputs = output[columns].copy()

        # PolynomialFeatures doesn't handle NaN, so fill gaps with the median
        inputs_filled = inputs.fillna(inputs.median())

        transformer = PolynomialFeatures(degree=degree, include_bias=False, interaction_only=False)
        expanded = transformer.fit_transform(inputs_filled)

        for position, term in enumerate(transformer.get_feature_names_out(columns)):
            # Degree-1 terms are the original columns; leave them out
            if term in columns:
                continue
            readable = term.replace(' ', '_').replace('^', '_pow_')
            target = f"poly_{readable}"
            output[target] = expanded[:, position]
            added.append(target)

        # Number of cells that had to be imputed
        imputed_cells = inputs.isnull().sum().sum()
        preview = added[:5]
        n_added = len(added)

        print("=== POLYNOMIAL FEATURES CREATION RESULTS ===")
        print(f"Source columns: {columns}")
        print(f"Polynomial degree: {degree}")
        print(f"Features created: {n_added}")
        print(f"New columns: {preview}{'...' if n_added > 5 else ''}")

        if imputed_cells > 0:
            print(f"Note: {imputed_cells} missing values were filled with median before polynomial transformation")

        print("\nPolynomial features summary:")
        for target in preview:  # Only the first 5 features are detailed
            if output[target].notna().sum() > 0:
                summary = output[target].describe()
                print(f"  {target}:")
                print(f"    Mean: {summary['mean']:.4f}, Std: {summary['std']:.4f}")
                print(f"    Range: [{summary['min']:.4f}, {summary['max']:.4f}]")

        if n_added > 5:
            print(f"  ... and {n_added - 5} more polynomial features")

        # Collect potential issues to report
        notes = []

        # Only the first feature with extreme values is reported
        extreme = next(
            (
                target
                for target in added
                if output[target].max() > 1e10 or output[target].min() < -1e10
            ),
            None,
        )
        if extreme is not None:
            notes.append(f"Extreme values in {extreme} - consider scaling original features")

        if n_added > 50:
            notes.append(f"Large number of features created ({n_added}) - consider feature selection")

        if len(columns) > 1 and degree > 2:
            notes.append("High-degree polynomials with multiple variables may cause multicollinearity")

        if notes:
            print("\nWarnings:")
            for note in notes:
                print(f"  - {note}")

        # Memory usage information (in MB)
        bytes_per_mb = 1024**2
        inputs_mb = inputs.memory_usage(deep=True).sum() / bytes_per_mb
        added_mb = sum(output[target].memory_usage(deep=True) for target in added) / bytes_per_mb
        print("\nMemory impact:")
        print(f"  Original features: {inputs_mb:.2f} MB")
        print(f"  New polynomial features: {added_mb:.2f} MB")
        print(f"  Total increase: {added_mb:.2f} MB")

    except Exception as err:
        print(f"Error creating polynomial features: {err}")
        return df

    return output

In [None]:
# Prompt text listing the helper functions defined in this notebook, together
# with example "request -> call" mappings. feature_engineering() sends it to
# the model as the first SystemMessage, so the text is part of runtime
# behavior and should be kept in sync with the helper signatures.
helper_docs = """ Helper functions available:
- create_bins(df, column, bins=None, labels=None, method='equal_width'): Create categorical bins from numeric columns. Methods: 'equal_width', 'equal_frequency', 'custom'. Returns DataFrame with new binned column.
- create_ratios(df, numerator_col, denominator_col, new_name=None): Calculate ratios between two numeric columns with zero-division handling. Returns DataFrame with new ratio column.
- create_interaction_features(df, col1, col2, operation='multiply'): Create interaction features between columns. Operations: 'multiply', 'add', 'subtract', 'divide', 'mean', 'max', 'min'. Returns DataFrame with new interaction feature.
- create_lag_features(df, column, lags=[1], group_by=None): Create time series lag features (previous values). Supports grouping for panel data. Returns DataFrame with lag columns.
- create_rolling_features(df, column, window=3, operations=['mean'], group_by=None): Create rolling window statistics. Operations: 'mean', 'sum', 'std', 'min', 'max', 'median', 'var', 'count'. Returns DataFrame with rolling feature columns.
- create_polynomial_features(df, columns, degree=2): Generate polynomial transformations using sklearn. Creates interaction terms and powers up to specified degree. Returns DataFrame with polynomial features.

Examples:
- "Create age groups in 5 bins" -> df = create_bins(df, 'age', bins=5)
- "Calculate price per square foot" -> df = create_ratios(df, 'price', 'square_feet')
- "Create interaction between income and education" -> df = create_interaction_features(df, 'income', 'education', 'multiply')
- "Add 1 and 2 period lags for sales" -> df = create_lag_features(df, 'sales', lags=[1, 2])
- "Create 7-day rolling average" -> df = create_rolling_features(df, 'value', window=7, operations=['mean'])
- "Generate quadratic features" -> df = create_polynomial_features(df, ['feature1', 'feature2'], degree=2)
"""

# **MAIN FEATURE FUNCTION**

In [None]:
def _strip_code_fences(text):
    """Return *text* without a surrounding Markdown code fence, if present."""
    text = text.strip()
    # Only unwrap when the whole response is one fenced block (```lang ... ```).
    fenced = re.match(r"^```[\w+-]*[ \t]*\r?\n(.*?)\r?\n?```$", text, re.DOTALL)
    return fenced.group(1).strip() if fenced else text


def feature_engineering(df, user_query):
    """
    Main function that gets called by the main router.
    MUST take (df, user_query) and return df

    Asks the LLM to translate ``user_query`` into Python code that uses the
    feature-engineering helpers, executes that code against a copy of ``df``
    and returns the resulting DataFrame.

    Parameters:
    - df: pandas DataFrame to transform (never modified in place)
    - user_query: natural-language description of the features to create

    Returns:
    - The DataFrame left in ``df`` by the generated code, or an unmodified
      copy of the input if the generated code raises or does not leave a
      DataFrame in ``df``.
    """

    # Create message chain
    messages = []
    messages.append(SystemMessage(content=helper_docs))
    messages.append(SystemMessage(content=f"""
    You are a data cleaning agent focused on feature engineering and transformation.
    
    Dataset info: Shape: {df.shape}, Sample: {df.head(3).to_string()}

    Libraries available:
    - pd (pandas), np (numpy)
    - math, re, datetime
    - sklearn.preprocessing, PolynomialFeatures
    - All helper functions listed above
    
    Rules:
    - Return only executable Python code, no explanations, no markdown blocks
    - Use helper functions for feature engineering tasks
    - ASSUME "df" IS ALREADY DEFINED
    - ALWAYS assign the result back to df when modifying: df = create_bins(df, 'column_name')
    - For multiple operations, chain them: df = create_bins(df, 'age'); df = create_ratios(df, 'price', 'sqft')
    - In order to generate a response/message to the user use print statements
    print("message")
    - Write a detailed print message to summarise actions taken and reasons
    
    Common query patterns and approaches:
    - "Create age groups" or "bin ages" -> df = create_bins(df, 'age', bins=5, method='equal_width')
    - "Calculate ratio" or "price per unit" -> df = create_ratios(df, 'price', 'square_feet')
    - "Create interaction" or "multiply features" -> df = create_interaction_features(df, 'col1', 'col2', 'multiply')
    - "Add lag features" or "previous values" -> df = create_lag_features(df, 'sales', lags=[1, 2, 3])
    - "Rolling average" or "moving average" -> df = create_rolling_features(df, 'price', window=7, operations=['mean'])
    - "Polynomial features" or "quadratic terms" -> df = create_polynomial_features(df, ['feature1', 'feature2'], degree=2)
    - "Engineering features" -> Use multiple functions as appropriate
    
    Feature engineering best practices:
    - For categorical binning: Use equal_width for uniform ranges, equal_frequency for balanced distributions
    - For ratios: Common patterns are price/unit, value/time, rate calculations
    - For interactions: Multiply for amplification effects, add for combined effects
    - For time series: Use lags for autoregressive patterns, rolling for smoothing trends
    - For polynomials: Degree 2-3 usually sufficient, higher degrees risk overfitting
    """))
    messages.append(HumanMessage(content=f"User request: {user_query}"))

    # Call LLM with message chain
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
    response = llm.invoke(messages)
    # Models frequently wrap code in ```python fences despite the prompt;
    # unwrap them so exec() does not fail with a SyntaxError.
    generated_code = _strip_code_fences(response.content)

    # Snapshot taken before the try block so the except handler can always
    # return it, even if building the namespace itself fails.
    original_df = df.copy()

    # Execute code
    try:
        # Run in ONE namespace (a copy of the module globals plus our
        # variables). Passing separate globals/locals to exec() makes the code
        # behave like a class body: lambdas, nested functions and (before
        # Python 3.12) comprehensions in the generated code could not see
        # 'df' or the helpers. Copying globals() also keeps the generated code
        # from rebinding names in the real module namespace.
        namespace = dict(globals())
        namespace.update({
            'df': df.copy(),
            'original_df': original_df,
            'pd': pd,
            'np': np,
            'create_bins': create_bins,
            'create_ratios': create_ratios,
            'create_interaction_features': create_interaction_features,
            'create_lag_features': create_lag_features,
            'create_rolling_features': create_rolling_features,
            'create_polynomial_features': create_polynomial_features,
            'PolynomialFeatures': PolynomialFeatures,
            'print': print,
        })

        # SECURITY NOTE: this executes LLM-generated code with full
        # interpreter privileges. Only use with trusted queries/data, or move
        # execution into a sandbox.
        exec(generated_code, namespace)

        result = namespace['df']
        # The router contract is "return df": reject anything that is not a
        # DataFrame (e.g. the code reduced df to a Series or None).
        if not isinstance(result, pd.DataFrame):
            raise TypeError(
                f"generated code left 'df' as {type(result).__name__}, "
                "expected a pandas DataFrame"
            )
        return result
    except Exception as e:
        print(f"Error: {e}")
        print(f"Generated Code:{generated_code}")
        return original_df

# **Testing**

In [None]:
# # Create sample data for feature engineering testing
# np.random.seed(42)
# n_samples = 100

# test_data = {
#     'age': np.random.randint(18, 80, n_samples),  # For binning
#     'price': np.random.uniform(100, 1000, n_samples),  # For ratios
#     'square_feet': np.random.uniform(500, 3000, n_samples),  # For ratios
#     'income': np.random.uniform(30000, 150000, n_samples),  # For interactions
#     'education_years': np.random.randint(8, 20, n_samples),  # For interactions
#     'sales': np.cumsum(np.random.normal(100, 20, n_samples)),  # For time series features
#     'customer_id': np.repeat(range(1, 21), 5),  # For grouped operations
#     'time_period': np.tile(range(1, 6), 20),  # Time dimension
#     'feature1': np.random.uniform(-2, 2, n_samples),  # For polynomial features
#     'feature2': np.random.uniform(-1, 3, n_samples),  # For polynomial features
# }

# test_df = pd.DataFrame(test_data)
# print("Test DataFrame created for feature engineering:")
# print(f"Shape: {test_df.shape}")
# print("\\nColumns and data types:")
# print(test_df.dtypes)
# print("\\nSample data:")
# print(test_df.head())

In [None]:
# # Test various feature engineering operations
# print("=== TESTING FEATURE ENGINEERING FUNCTIONS ===\\n")

# # Test 1: Binning
# print("1. Testing binning:")
# query1 = "Create age groups in 5 equal-width bins"
# result1 = feature_engineering(test_df.copy(), query1)

# # Test 2: Ratios
# print("\\n2. Testing ratios:")
# query2 = "Calculate price per square foot ratio"
# result2 = feature_engineering(test_df.copy(), query2)

# # Test 3: Interaction features
# print("\\n3. Testing interaction features:")
# query3 = "Create interaction between income and education by multiplying them"
# result3 = feature_engineering(test_df.copy(), query3)

# # Test 4: Lag features
# print("\\n4. Testing lag features:")
# query4 = "Add 1 and 2 period lag features for sales grouped by customer_id"
# result4 = feature_engineering(test_df.copy(), query4)

# # Test 5: Rolling features
# print("\\n5. Testing rolling features:")
# query5 = "Create 3-period rolling mean and standard deviation for sales by customer"
# result5 = feature_engineering(test_df.copy(), query5)

In [None]:
# # Test 6: Polynomial features
# print("\\n6. Testing polynomial features:")
# query6 = "Generate quadratic polynomial features for feature1 and feature2"
# result6 = feature_engineering(test_df.copy(), query6)

# print("\n=== FEATURE ENGINEERING TESTING COMPLETE ===")
# print(f"Original columns: {len(test_df.columns)}")
# print(f"Columns after polynomial test (result6): {len(result6.columns)}")
# print(f"New columns added by polynomial test: {len(result6.columns) - len(test_df.columns)}")