## **Feature:** Outlier Detection & Handling

**Names:** Gia Bao Ngo

### **What it does**
Intelligently detects and handles outlier records in datasets using statistical methods (IQR, Z-score) and machine learning approaches (Isolation Forest). Provides multiple strategies for outlier handling including capping, removal, and transformation to help users understand and clean data quality issues.

In [2]:
# Load dotenv
import os
from dotenv import load_dotenv
load_dotenv()

# Get API Key
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    print("OpenAI API Key not found")

# Import libraries
from pathlib import Path
import pandas as pd
import numpy as np
# Additional imports for outlier detection
import math
import re
import datetime
from sklearn import preprocessing, impute
from sklearn.ensemble import IsolationForest
from scipy import stats

# Langchain imports
from langchain_openai import ChatOpenAI  
from langchain.schema import HumanMessage, SystemMessage

### **Helper Functions**
- `detect_outliers_iqr(df, columns=None, multiplier=1.5)` - IQR method outlier detection
- `detect_outliers_zscore(df, columns=None, threshold=3)` - Z-score method outlier detection  
- `detect_outliers_isolation_forest(df, columns=None, contamination=0.1)` - ML-based outlier detection
- `handle_outliers(df, columns=None, method='cap')` - Cap, remove, or transform outliers
- `outlier_summary(df, columns=None)` - Summary of outliers per column

In [3]:
def detect_outliers_iqr(df, columns=None, multiplier=1.5):
    """
    Detect outliers using the Interquartile Range (IQR) method.
    
    Parameters:
    - df: pandas DataFrame
    - columns: list of column names to check (None = all numeric columns)
    - multiplier: IQR multiplier for outlier threshold (default 1.5)
    
    Returns:
    - DataFrame with outlier indices and information
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns.tolist()
    
    outliers_info = []
    
    for col in columns:
        if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            
            lower_bound = Q1 - multiplier * IQR
            upper_bound = Q3 + multiplier * IQR
            
            # Find outlier indices
            outlier_mask = (df[col] < lower_bound) | (df[col] > upper_bound)
            outlier_indices = df[outlier_mask].index.tolist()
            
            if outlier_indices:
                for idx in outlier_indices:
                    outliers_info.append({
                        'index': idx,
                        'column': col,
                        'value': df.loc[idx, col],
                        'lower_bound': lower_bound,
                        'upper_bound': upper_bound,
                        'method': 'IQR',
                        'multiplier': multiplier
                    })
    
    if outliers_info:
        outliers_df = pd.DataFrame(outliers_info)
        print(f"Found {len(outliers_info)} outliers using IQR method (multiplier={multiplier})")
        print(f"Outliers detected in columns: {outliers_df['column'].unique().tolist()}")
        return outliers_df
    else:
        print(f"No outliers found using IQR method (multiplier={multiplier})")
        return pd.DataFrame()

In [4]:
def detect_outliers_zscore(df, columns=None, threshold=3):
    """
    Detect outliers using Z-score method (suitable for normally distributed data).
    
    Parameters:
    - df: pandas DataFrame
    - columns: list of column names to check (None = all numeric columns)
    - threshold: Z-score threshold for outlier detection (default 3)
    
    Returns:
    - DataFrame with outlier indices and information
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns.tolist()
    
    outliers_info = []
    
    for col in columns:
        if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
            # Calculate Z-scores
            z_scores = np.abs(stats.zscore(df[col].dropna()))
            
            # Find outlier indices (where Z-score > threshold)
            outlier_mask = z_scores > threshold
            outlier_indices = df[col].dropna().iloc[outlier_mask].index.tolist()
            
            if outlier_indices:
                for idx in outlier_indices:
                    z_score = abs(stats.zscore(df[col].dropna())[df[col].dropna().index.get_loc(idx)])
                    outliers_info.append({
                        'index': idx,
                        'column': col,
                        'value': df.loc[idx, col],
                        'z_score': z_score,
                        'threshold': threshold,
                        'method': 'Z-score'
                    })
    
    if outliers_info:
        outliers_df = pd.DataFrame(outliers_info)
        print(f"Found {len(outliers_info)} outliers using Z-score method (threshold={threshold})")
        print(f"Outliers detected in columns: {outliers_df['column'].unique().tolist()}")
        return outliers_df
    else:
        print(f"No outliers found using Z-score method (threshold={threshold})")
        return pd.DataFrame()

In [5]:
def detect_outliers_isolation_forest(df, columns=None, contamination=0.1):
    """
    Detect outliers using Isolation Forest method (ML-based, multivariate).
    
    Parameters:
    - df: pandas DataFrame
    - columns: list of column names to check (None = all numeric columns)
    - contamination: proportion of outliers in the dataset (default 0.1)
    
    Returns:
    - DataFrame with outlier indices and anomaly scores
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns.tolist()
    
    # Filter to only include specified numeric columns
    numeric_cols = [col for col in columns if col in df.columns and pd.api.types.is_numeric_dtype(df[col])]
    
    if not numeric_cols:
        print("No numeric columns found for Isolation Forest analysis")
        return pd.DataFrame()
    
    # Prepare data for Isolation Forest
    data_for_analysis = df[numeric_cols].dropna()
    
    if len(data_for_analysis) < 2:
        print("Insufficient data for Isolation Forest analysis")
        return pd.DataFrame()
    
    # Fit Isolation Forest
    iso_forest = IsolationForest(contamination=contamination, random_state=42)
    outlier_labels = iso_forest.fit_predict(data_for_analysis)
    anomaly_scores = iso_forest.score_samples(data_for_analysis)
    
    # Find outliers (labeled as -1)
    outlier_indices = data_for_analysis.index[outlier_labels == -1].tolist()
    
    if outlier_indices:
        outliers_info = []
        for idx in outlier_indices:
            score_idx = data_for_analysis.index.get_loc(idx)
            outliers_info.append({
                'index': idx,
                'anomaly_score': anomaly_scores[score_idx],
                'method': 'Isolation Forest',
                'contamination': contamination,
                'features_used': numeric_cols
            })
        
        outliers_df = pd.DataFrame(outliers_info)
        print(f"Found {len(outlier_indices)} outliers using Isolation Forest (contamination={contamination})")
        print(f"Features analyzed: {numeric_cols}")
        return outliers_df
    else:
        print(f"No outliers found using Isolation Forest (contamination={contamination})")
        return pd.DataFrame()

In [6]:
def handle_outliers(df, columns=None, method='cap', detection_method='iqr', **kwargs):
    """
    Handle outliers using various strategies: cap, remove, or transform.
    
    Parameters:
    - df: pandas DataFrame
    - columns: list of column names to process (None = all numeric columns)
    - method: 'cap', 'remove', 'log_transform'
    - detection_method: 'iqr', 'zscore', or 'isolation_forest'
    - **kwargs: additional parameters for detection methods
    
    Returns:
    - DataFrame with outliers handled
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns.tolist()
    
    result_df = df.copy()
    handled_info = []
    
    for col in columns:
        if col not in df.columns or not pd.api.types.is_numeric_dtype(df[col]):
            continue
            
        original_count = len(result_df)
        
        if detection_method == 'iqr':
            multiplier = kwargs.get('multiplier', 1.5)
            Q1 = result_df[col].quantile(0.25)
            Q3 = result_df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - multiplier * IQR
            upper_bound = Q3 + multiplier * IQR
            outlier_mask = (result_df[col] < lower_bound) | (result_df[col] > upper_bound)
            
        elif detection_method == 'zscore':
            threshold = kwargs.get('threshold', 3)
            z_scores = np.abs(stats.zscore(result_df[col].dropna()))
            # Create a mask for the entire dataframe, not just non-null values
            outlier_mask = pd.Series(False, index=result_df.index)
            non_null_mask = result_df[col].notna()
            outlier_mask.loc[non_null_mask] = z_scores > threshold
            
        else:  # isolation_forest would be more complex for single column
            continue
            
        outlier_count = outlier_mask.sum()
        
        if outlier_count > 0:
            if method == 'cap':
                if detection_method == 'iqr':
                    result_df.loc[result_df[col] < lower_bound, col] = lower_bound
                    result_df.loc[result_df[col] > upper_bound, col] = upper_bound
                    action = f"Capped {outlier_count} outliers to bounds [{lower_bound:.3f}, {upper_bound:.3f}]"
                elif detection_method == 'zscore':
                    # Cap to percentile values instead for zscore
                    lower_cap = result_df[col].quantile(0.05)
                    upper_cap = result_df[col].quantile(0.95)
                    result_df.loc[outlier_mask, col] = np.clip(result_df.loc[outlier_mask, col], lower_cap, upper_cap)
                    action = f"Capped {outlier_count} Z-score outliers"
                    
            elif method == 'remove':
                result_df = result_df[~outlier_mask]
                action = f"Removed {outlier_count} outlier rows"
                
            elif method == 'log_transform':
                # Only apply to positive values
                positive_mask = result_df[col] > 0
                transform_mask = outlier_mask & positive_mask
                if transform_mask.sum() > 0:
                    result_df.loc[transform_mask, col] = np.log1p(result_df.loc[transform_mask, col])
                    action = f"Log-transformed {transform_mask.sum()} positive outliers"
                else:
                    action = "No positive outliers found for log transformation"
            
            handled_info.append({
                'column': col,
                'method': method,
                'detection': detection_method,
                'outliers_found': outlier_count,
                'action': action
            })
    
    # Print summary
    if handled_info:
        print(f"=== OUTLIER HANDLING SUMMARY ===")
        for info in handled_info:
            print(f"{info['column']}: {info['action']}")
        
        if method == 'remove':
            removed_rows = original_count - len(result_df)
            print(f"Total rows removed: {removed_rows}")
            print(f"Dataset size: {original_count} → {len(result_df)} rows")
    else:
        print("No outliers found to handle")
    
    return result_df

In [7]:
def outlier_summary(df, columns=None):
    """
    Generate comprehensive summary of outliers per column using multiple methods.
    
    Parameters:
    - df: pandas DataFrame
    - columns: list of column names to analyze (None = all numeric columns)
    
    Returns:
    - DataFrame (unchanged, prints analysis)
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns.tolist()
    
    print("=== OUTLIER ANALYSIS SUMMARY ===\\n")
    
    summary_data = []
    
    for col in columns:
        if col not in df.columns or not pd.api.types.is_numeric_dtype(df[col]):
            continue
            
        col_data = df[col].dropna()
        if len(col_data) == 0:
            continue
            
        # Basic statistics
        mean_val = col_data.mean()
        median_val = col_data.median()
        std_val = col_data.std()
        
        # IQR method
        Q1 = col_data.quantile(0.25)
        Q3 = col_data.quantile(0.75)
        IQR = Q3 - Q1
        iqr_lower = Q1 - 1.5 * IQR
        iqr_upper = Q3 + 1.5 * IQR
        iqr_outliers = ((col_data < iqr_lower) | (col_data > iqr_upper)).sum()
        
        # Z-score method
        z_scores = np.abs(stats.zscore(col_data))
        zscore_outliers = (z_scores > 3).sum()
        
        # Data distribution assessment
        skewness = col_data.skew()
        kurtosis = col_data.kurtosis()
        
        summary_data.append({
            'Column': col,
            'Count': len(col_data),
            'Mean': round(mean_val, 3),
            'Median': round(median_val, 3),
            'Std': round(std_val, 3),
            'Skewness': round(skewness, 3),
            'IQR_Outliers': iqr_outliers,
            'ZScore_Outliers': zscore_outliers,
            'IQR_Lower': round(iqr_lower, 3),
            'IQR_Upper': round(iqr_upper, 3),
            'Min': round(col_data.min(), 3),
            'Max': round(col_data.max(), 3)
        })
    
    if summary_data:
        summary_df = pd.DataFrame(summary_data)
        print("Column-wise Outlier Summary:")
        print(summary_df.to_string(index=False))
        
        # Overall recommendations
        print("\\n=== RECOMMENDATIONS ===")
        for _, row in summary_df.iterrows():
            recommendations = []
            
            if abs(row['Skewness']) > 1:
                recommendations.append("Consider log transformation (highly skewed)")
            elif abs(row['Skewness']) > 0.5:
                recommendations.append("Moderate skewness detected")
            
            if row['IQR_Outliers'] > row['ZScore_Outliers']:
                recommendations.append("IQR method more sensitive for this data")
            elif row['ZScore_Outliers'] > row['IQR_Outliers']:
                recommendations.append("Z-score method more sensitive (check normality)")
            
            if row['IQR_Outliers'] > len(df) * 0.05:  # More than 5% outliers
                recommendations.append("High outlier percentage - investigate data quality")
            
            if recommendations:
                print(f"{row['Column']}: {'; '.join(recommendations)}")
        
        print("\\n=== SUGGESTED DETECTION METHOD ===")
        for _, row in summary_data:
            if abs(row['Skewness']) <= 1:  # Roughly normal
                print(f"{row['Column']}: Z-score method (data appears roughly normal)")
            else:  # Skewed data
                print(f"{row['Column']}: IQR method (data is skewed)")
    else:
        print("No numeric columns found for outlier analysis")
    
    return df

In [8]:
helper_docs = """ Helper functions available:
- detect_outliers_iqr(df, columns=None, multiplier=1.5): Detect outliers using IQR method. Returns DataFrame with outlier information.
- detect_outliers_zscore(df, columns=None, threshold=3): Detect outliers using Z-score method for normally distributed data. Returns DataFrame with outlier information.
- detect_outliers_isolation_forest(df, columns=None, contamination=0.1): ML-based multivariate outlier detection using Isolation Forest. Returns DataFrame with outlier information.
- handle_outliers(df, columns=None, method='cap', detection_method='iqr', **kwargs): Handle outliers using cap, remove, or log_transform methods. Returns modified DataFrame.
- outlier_summary(df, columns=None): Comprehensive outlier analysis with recommendations. Returns unchanged DataFrame but prints detailed analysis.

Examples:
- "Find outliers in price column" -> detect_outliers_iqr(df, columns=['price'])
- "Remove outliers using IQR method" -> df = handle_outliers(df, method='remove', detection_method='iqr')
- "Cap extreme values" -> df = handle_outliers(df, method='cap')
- "Show outlier patterns" -> outlier_summary(df)
"""

# **MAIN FEATURE FUNCTION**

In [9]:
def outlier_detection(df, user_query):
    """
    Main function that gets called by the main router.
    MUST take (df, user_query) and return df
    """
    
    # Get basic dataset info for context
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    
    # Create message chain
    messages = []
    messages.append(SystemMessage(content=helper_docs))
    messages.append(SystemMessage(content=f"""
    You are a data cleaning agent focused on outlier detection and handling.
    
    Dataset info: Shape: {df.shape}, Sample: {df.head(3).to_string()}
    Numeric columns: {numeric_cols}

    Libraries available:
    - pd (pandas), np (numpy)
    - math, re, datetime
    - stats (from scipy)
    - IsolationForest (from sklearn.ensemble)
    - All helper functions listed above
    
    Rules:
    - Return only executable Python code, no explanations, no markdown blocks
    - Use helper functions when appropriate for outlier detection tasks
    - ASSUME "df" IS ALREADY DEFINED
    - For analysis queries, use helper functions that print results (detect_outliers_*, outlier_summary)
    - For data cleaning, use helper functions that modify DataFrame (handle_outliers)
    - ALWAYS assign the result back to df when modifying: df = handle_outliers(df, method='cap')
    - In order to generate a response/message to the user use print statements
    print("message")
    - Write a detailed print message to summarise actions taken and reasons
    
    Common query patterns:
    - "Find outliers" or "Detect outliers" -> use detect_outliers_iqr(df) or detect_outliers_zscore(df)
    - "Remove outliers" -> df = handle_outliers(df, method='remove')
    - "Cap extreme values" -> df = handle_outliers(df, method='cap')
    - "Show outlier analysis" -> outlier_summary(df)
    - "Find outliers in specific column" -> detect_outliers_iqr(df, columns=['column_name'])
    """))
    messages.append(HumanMessage(content=f"User request: {user_query}"))
    
    # Call LLM with message chain
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
    response = llm.invoke(messages)
    generated_code = response.content.strip()
    
    # Execute code
    try:
        original_df = df.copy()
        # Create local namespace with our variables
        local_vars = {
            'df': df.copy(),
            'original_df': original_df,
            'numeric_cols': numeric_cols,
            'pd': pd,
            'np': np,
            'stats': stats,
            'IsolationForest': IsolationForest,
            'detect_outliers_iqr': detect_outliers_iqr,
            'detect_outliers_zscore': detect_outliers_zscore,
            'detect_outliers_isolation_forest': detect_outliers_isolation_forest,
            'handle_outliers': handle_outliers,
            'outlier_summary': outlier_summary,
            'print': print
        }
        
        exec(generated_code, globals(), local_vars)
        return local_vars['df']
    except Exception as e:
        print(f"Error: {e}")
        print(f"Generated Code:{generated_code}")
        return original_df

# **Testing**

In [None]:
# # Enter CSV filename from "datasets" folder
# dataset_name = "Life Expectancy Data.csv"

# # Build CSV path (to avoid import errors)
# load_dotenv()
# PROJECT_ROOT = Path(os.environ["PROJECT_ROOT"])
# path = PROJECT_ROOT / "datasets" / dataset_name

# df = pd.read_csv(path)
# test_df = df.copy()

In [None]:
# query = "cap extreme value"
# result = outlier_detection(test_df, query)

=== OUTLIER HANDLING SUMMARY ===
Life expectancy : Capped 10 outliers to bounds [44.200, 94.600]
Adult Mortality: Capped 82 outliers to bounds [-157.000, 459.000]
infant deaths: Capped 315 outliers to bounds [-33.000, 55.000]
percentage expenditure: Capped 389 outliers to bounds [-650.588, 1096.807]
Hepatitis B: Capped 254 outliers to bounds [47.000, 127.000]
Measles : Capped 542 outliers to bounds [-540.375, 900.625]
under-five deaths : Capped 394 outliers to bounds [-42.000, 70.000]
Polio: Capped 279 outliers to bounds [49.500, 125.500]
Total expenditure: Capped 32 outliers to bounds [-0.589, 12.341]
Diphtheria : Capped 298 outliers to bounds [49.500, 125.500]
 HIV/AIDS: Capped 542 outliers to bounds [-0.950, 1.850]
GDP: Capped 365 outliers to bounds [-7706.370, 14081.112]
Population: Capped 294 outliers to bounds [-10641055.375, 18257207.625]
 thinness  1-19 years: Capped 89 outliers to bounds [-6.800, 15.600]
 thinness 5-9 years: Capped 96 outliers to bounds [-7.050, 15.750]
Income

  result_df.loc[result_df[col] < lower_bound, col] = lower_bound
