## **Feature:** Missing Vals

**Names:** Tanat

### **What it does**
Intelligently handles missing values in datasets using data cleaning best practices. The feature analyzes your data's characteristics (data types, distribution, missing percentages) and automatically suggests or applies the most appropriate imputation method.

Users can either get imputation recommendations or have the system automatically clean their data based on best practices.

In [1]:
# Load dotenv
import os
from dotenv import load_dotenv
load_dotenv()

# Look up the OpenAI credential in the process environment and warn when it is absent
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    print("OpenAI API Key not found")

# Import libraries
from pathlib import Path
import pandas as pd
import numpy as np

# Langchain imports
from langchain_openai import ChatOpenAI  
from langchain.schema import HumanMessage, SystemMessage


### **Helper Functions**

- `analyze_missing_values(df, drop_threshold=0.5)` = Suggests missing value imputation strategies based on best practices/heuristics
- `auto_impute(df, drop_threshold=0.5)` = Automatically imputes missing values in a DataFrame based on simple best-practice heuristics.

In [2]:
def analyze_missing_values(df, drop_threshold=0.5):
    """
    Suggest a missing-value imputation strategy for every column that has gaps.

    Args:
        df: DataFrame to inspect. It is only read, never modified.
        drop_threshold: Fraction of missing values above which the suggestion
            is to drop the column instead of imputing it.

    Returns:
        DataFrame indexed by column name with the columns "dtype",
        "missing_pct" (rounded to 3 decimals) and "suggestion". Columns
        without missing values are omitted, so the result is empty when
        nothing is missing.
    """
    types = pd.api.types
    suggestions = {}

    for col in df.columns:
        series = df[col]
        missing = series.isna()
        # `any()` also skips zero-row frames, where the mean would be NaN.
        if not missing.any():
            continue

        missing_pct = missing.mean()
        dtype = series.dtype

        if missing_pct > drop_threshold:
            suggestion = "Drop column (too many missing values)"

        # Boolean features. Must be tested before the numeric branch because
        # pandas reports bool / nullable boolean dtypes as numeric.
        elif types.is_bool_dtype(dtype):
            suggestion = "Mode imputation (True/False)"

        # Numerical features
        elif types.is_numeric_dtype(dtype):
            non_null = series.dropna()
            if non_null.nunique() < 15:  # numeric but categorical (like codes)
                suggestion = "Mode imputation (numeric categorical)"
            elif len(non_null) < 10:
                suggestion = "Median imputation (small sample)"
            elif abs(non_null.skew()) < 1:
                suggestion = "Mean imputation"
            else:
                suggestion = "Median imputation (skewed)"

        # Categorical / text features. The isinstance check replaces the
        # deprecated pd.api.types.is_categorical_dtype.
        elif (
            isinstance(dtype, pd.CategoricalDtype)
            or types.is_object_dtype(dtype)
            or types.is_string_dtype(dtype)
        ):
            if series.nunique(dropna=True) <= 10:
                suggestion = "Mode imputation (most frequent)"
            else:
                suggestion = "Impute with 'Unknown' or predictive model"

        # Datetime features
        elif types.is_datetime64_any_dtype(dtype):
            suggestion = "Forward/Backward fill or interpolation (time series)"

        else:
            suggestion = "Custom handling needed"

        suggestions[col] = {
            "dtype": str(dtype),
            "missing_pct": round(float(missing_pct), 3),
            "suggestion": suggestion,
        }

    return pd.DataFrame.from_dict(suggestions, orient="index")

In [3]:
def _fill_unknown(series):
    """Fill gaps with the "Unknown" placeholder, registering it as a category when needed."""
    if isinstance(series.dtype, pd.CategoricalDtype) and "Unknown" not in series.cat.categories:
        # A categorical refuses values that are not declared categories.
        series = series.cat.add_categories("Unknown")
    return series.fillna("Unknown")


def _fill_mode(series):
    """Fill gaps with the most frequent value, falling back to "Unknown" when no mode exists."""
    modes = series.mode(dropna=True)
    if modes.empty:
        return _fill_unknown(series)
    return series.fillna(modes.iloc[0])


def auto_impute(df, drop_threshold=0.5):
    """
    Automatically impute missing values in a DataFrame based on simple
    best-practice heuristics.

    Per column:
      * more than `drop_threshold` missing -> the column is dropped
      * boolean                            -> most frequent value
      * numeric                            -> mean, or median when |skew| >= 1
        (or when the skew cannot be computed)
      * categorical / object / string      -> mode when cardinality is low,
        otherwise the literal "Unknown"
      * datetime                           -> forward fill, then backward fill
      * anything else                      -> the literal "Unknown"

    Columns without missing values are left untouched. A short message is
    printed for the numeric and mode-based imputations.

    Args:
        df: Input DataFrame. It is not modified; a copy is imputed.
        drop_threshold: Fraction of missing values above which a column is
            dropped.

    Returns:
        A new DataFrame with the imputed (and possibly fewer) columns.
    """
    types = pd.api.types
    # Work on a copy so the caller's frame is never partially mutated.
    df = df.copy()

    for col in list(df.columns):
        series = df[col]
        missing = series.isna()
        missing_pct = missing.mean()
        dtype = series.dtype

        # Drop if too many missing
        if missing_pct > drop_threshold:
            df = df.drop(columns=[col])
            continue

        # Nothing to impute (also covers zero-row frames).
        if not missing.any():
            continue

        # Boolean features. Checked before numeric because pandas treats
        # bool / nullable boolean dtypes as numeric.
        if types.is_bool_dtype(dtype):
            df[col] = _fill_mode(series)
            print(f"{col} has low Cardinality, imputed with mode")

        # Numerical features (mean or median). pandas' dtype predicates are
        # used because np.issubdtype raises TypeError on extension dtypes.
        elif types.is_numeric_dtype(dtype):
            if types.is_integer_dtype(dtype):
                # Only nullable integer columns can hold missing values, and
                # they cannot store a fractional mean/median: widen first.
                series = series.astype("Float64")
            skewness = series.dropna().skew()
            # The skew is NaN/NA for very small samples; treat that like the
            # skewed case (median), matching a failed `abs(nan) < 1` test.
            if pd.notna(skewness) and abs(skewness) < 1:
                fill_value = series.mean()
                print(f"Imputed '{col}' with mean")
            else:
                fill_value = series.median()
                print(f"'{col}' is skewed, imputed with median")
            df[col] = series.fillna(fill_value)

        # Categorical features
        elif (
            isinstance(dtype, pd.CategoricalDtype)
            or types.is_object_dtype(dtype)
            or types.is_string_dtype(dtype)
        ):
            n_unique = series.nunique(dropna=True)
            unique_ratio = n_unique / df.shape[0]
            # Low cardinality: fill with the mode, else impute with "Unknown"
            if n_unique <= 20 or unique_ratio < 0.05:
                df[col] = _fill_mode(series)
                print(f"{col} has low Cardinality, imputed with mode")
            else:
                df[col] = _fill_unknown(series)

        # Datetime features (ffill()/bfill() replace the deprecated
        # fillna(method=...) spelling)
        elif types.is_datetime64_any_dtype(dtype):
            df[col] = series.ffill().bfill()

        # Fallback
        else:
            df[col] = _fill_unknown(series)

    return df

In [4]:
def impute_col(series, strategy):
    """
    Impute missing values in a pandas Series using the specified strategy.

    Supported strategies: 'mean', 'median', 'mode', 'unknown', 'ffill', 'bfill'

    Args:
        series: The pandas Series to impute. It is not modified.
        strategy: Name of the imputation strategy (see above).

    Returns:
        A new Series with missing values filled. For an unsupported strategy
        a message is printed and the input Series is returned unchanged.
    """
    if strategy == "mean":
        return series.fillna(series.mean())

    if strategy == "median":
        return series.fillna(series.median())

    if strategy in ("mode", "unknown"):
        fill_value = "Unknown"
        if strategy == "mode":
            modes = series.mode(dropna=True)
            # No mode exists when every value is missing; keep "Unknown" then.
            if not modes.empty:
                fill_value = modes.iloc[0]
        if (
            isinstance(series.dtype, pd.CategoricalDtype)
            and fill_value not in series.cat.categories
        ):
            # A categorical refuses values that are not declared categories.
            series = series.cat.add_categories(fill_value)
        return series.fillna(fill_value)

    # ffill()/bfill() replace the deprecated fillna(method=...) spelling.
    if strategy == "ffill":
        return series.ffill()

    if strategy == "bfill":
        return series.bfill()

    print(f"Unknown strategy '{strategy}', no imputation performed.")
    return series

In [5]:
# Plain-text description of the helper functions defined in this notebook.
# missing_vals() sends this string to the LLM as a system message, so the text
# is part of the prompt: editing it changes what code the model generates.
helper_docs = """ Helper functions available: 
- auto_impute(df, drop_threshold=0.5): Automatically imputes missing values in a DataFrame based on simple best-practice heuristics. Returns modified DataFrame.
- impute_col(series, strategy): Impute missing values in a pandas Series using the specified strategy. Returns modified series.
    - Supported Strategies: 'mean', 'median', 'mode', 'unknown', 'ffill', 'bfill'
- analyze_missing_values(df, drop_threshold=0.5): Analyze missing values and provide imputation suggestions. Returns DataFrame with analysis.

Examples:
- df = auto_impute(df)  # automatic imputation
- df['column_name'] = impute_col(df['column_name'], 'mean')  # For specific column imputation
- analysis = analyze_missing_values(df)  # For analysis only
"""

# **MAIN FEATURE FUNCTION**

In [6]:
def _strip_code_fences(code):
    """Remove a surrounding Markdown code fence (``` or ```python) if the LLM added one."""
    lines = code.strip().splitlines()
    if lines and lines[0].lstrip().startswith("```"):
        lines = lines[1:]
        if lines and lines[-1].strip().startswith("```"):
            lines = lines[:-1]
    return "\n".join(lines)


def missing_vals(df, user_query):
    """
    Main function that gets called by the main router.
    MUST take (df, user_query) and return df

    Builds a prompt describing the missing values in `df`, asks the LLM for
    Python code that fulfils `user_query`, executes that code against a copy
    of `df` and returns the resulting DataFrame.

    Args:
        df: DataFrame to clean. It is never modified; the generated code
            operates on a copy.
        user_query: Free-text request from the user.

    Returns:
        The DataFrame bound to `df` after the generated code ran, or an
        untouched copy of the input when the code fails or leaves `df`
        bound to something that is not a DataFrame.
    """
    import datetime
    import math
    import re

    suggestions = analyze_missing_values(df)

    # Get columns with missing values for the LLM. Values are converted to
    # plain Python numbers so the prompt does not contain numpy scalar reprs.
    columns_with_missing = df.columns[df.isnull().any()].tolist()
    missing_info = {
        col: {
            'missing_count': int(df[col].isnull().sum()),
            'missing_pct': float(df[col].isnull().mean()),
        }
        for col in columns_with_missing
    }

    # Create message chain
    messages = []
    messages.append(SystemMessage(content=helper_docs))
    messages.append(SystemMessage(content=f"""
    You are a data cleaning agent trying to handle missing values.
    
    Dataset info: Shape: {df.shape}, Sample: {df.head(3).to_string()}
    
    Columns with missing values: {len(columns_with_missing)} columns
    Missing value summary: {missing_info}
    
    imputation suggestions: {suggestions if not suggestions.empty else "No Missing Values!"}

    Libraries available:
    - pd (pandas), np (numpy)
    - math, re, datetime
    - preprocessing, impute (from sklearn)
    
    Rules:
    - Return only executable Python code, no explanations, no markdown blocks
    - Use helper functions if needed
    - ASSUME "df" IS ALREADY DEFINED
    - ALWAYS assign the result back to df when modifying: df = auto_impute(df)
    - RECOMMENDED: Use auto_impute(df) for automatic imputation as it handles all column names correctly
    - Only use impute_col for specific single-column imputation requests
    - In order to generate a response/message to the user use print statements
    print("message")
    - Write a detailed print message to summarise actions taken and reasons
    
    Common query patterns:
    - "automatic imputation": df = auto_impute(df)
    - "analysis only": analysis = analyze_missing_values(df); print(analysis)
    - "impute missing values using suggested strategy": df['column_name'] = impute_col(df['column_name'], 'mean')
    """))
    messages.append(HumanMessage(content=f"User request: {user_query}"))

    # Call LLM with message chain
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
    response = llm.invoke(messages)
    # Models frequently wrap code in Markdown fences despite the rules above;
    # executing the fence lines would raise a SyntaxError.
    generated_code = _strip_code_fences(response.content)

    # Bound before the try block so the except path can always return it.
    original_df = df.copy()

    # Execute code
    try:
        # A single namespace is used for both globals and locals. With two
        # separate dicts, functions/lambdas (and, before Python 3.12,
        # comprehensions) inside the generated code cannot see `df` or the
        # helpers, because names in the locals dict are not visible from
        # nested scopes. Copying globals() keeps the module itself unpolluted.
        namespace = dict(globals())
        namespace.update({
            'df': df.copy(),
            'original_df': original_df,
            'suggestions': suggestions,
            'columns_with_missing': columns_with_missing,
            'missing_info': missing_info,
            'pd': pd,
            'np': np,
            'auto_impute': auto_impute,
            'impute_col': impute_col,
            'analyze_missing_values': analyze_missing_values,
            'print': print
        })
        # The prompt advertises these stdlib modules; provide them unless the
        # module globals already define the names.
        for module in (math, re, datetime):
            namespace.setdefault(module.__name__, module)
        # NOTE(review): the prompt also advertises sklearn's `preprocessing`
        # and `impute`; they are only available here if the module globals
        # already import them - confirm or remove them from the prompt.

        # SECURITY: this executes LLM-generated code with the full privileges
        # of this process. Only use with trusted queries/data, or sandbox it.
        exec(generated_code, namespace)

        result = namespace['df']
        if not isinstance(result, pd.DataFrame):
            # The router expects a DataFrame back, whatever the code did.
            print(f"Error: generated code left 'df' bound to {type(result).__name__}, not a DataFrame")
            print(f"Generated Code:{generated_code}")
            return original_df
        return result
    except Exception as e:
        # Generated code can fail in arbitrary ways; report and fall back.
        print(f"Error: {e}")
        print(f"Generated Code:{generated_code}")
        return original_df

# **Testing**

In [None]:
# # Enter CSV filename from "datasets" folder
# dataset_name = "Life Expectancy Data.csv"

# # Build CSV path (to avoid import errors)
# load_dotenv()
# PROJECT_ROOT = Path(os.environ["PROJECT_ROOT"])
# path = PROJECT_ROOT / "datasets" / dataset_name

# df = pd.read_csv(path)
# test_df = df.copy()

In [10]:
# query = "impute missing values using suggested strategy"
# result = missing_vals(test_df, query)

Missing values have been imputed using the suggested strategies: mean for 'Life expectancy', 'Alcohol', ' BMI ', 'Total expenditure', and 'Schooling'; median for 'Adult Mortality', 'Hepatitis B', 'Polio', 'Diphtheria', 'GDP', 'Population', ' thinness  1-19 years', ' thinness 5-9 years', and 'Income composition of resources'.


In [9]:
# result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2938 non-null   float64
 4   Adult Mortality                  2938 non-null   float64
 5   infant deaths                    2938 non-null   int64  
 6   Alcohol                          2938 non-null   float64
 7   percentage expenditure           2938 non-null   float64
 8   Hepatitis B                      2938 non-null   float64
 9   Measles                          2938 non-null   int64  
 10   BMI                             2938 non-null   float64
 11  under-five deaths                2938 non-null   int64  
 12  Polio               