In [1]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # 启用 IterativeImputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import ParameterGrid

In [2]:
def preprocess_data(df, missing_threshold):
    """
    Removes rows with more than `missing_threshold`% missing values.
    
    Parameters:
    df (pd.DataFrame): Original dataset
    missing_threshold (float): Threshold for row removal (0.0 - 1.0)
    
    Returns:
    (pd.DataFrame, int): Cleaned dataset, number of removed rows
    """
    row_missing_percentage = df.isnull().mean(axis=1)  # 计算每行缺失值的比例
    cleaned_data = df[row_missing_percentage <= missing_threshold].copy()
    removed_rows = df.shape[0] - cleaned_data.shape[0]  # 计算删除的行数
    return cleaned_data, removed_rows

In [3]:
def create_masked_dataframe(df, mask_ratio=0.1, random_state=42):
    """
    Randomly masks a fraction of numeric values in df.
    
    Parameters:
    df (pd.DataFrame): The dataset with missing values
    mask_ratio (float): Proportion of values to mask (0 - 1)
    random_state (int): Random seed for reproducibility
    
    Returns:
    (pd.DataFrame, dict): Masked dataframe, dictionary of masked locations
    """
    np.random.seed(random_state)
    df_masked = df.copy()
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    
    mask_info = {}  # 记录被遮盖的数值位置
    
    for col in numeric_cols:
        non_missing_indices = df_masked.index[df_masked[col].notna()]
        n_to_mask = int(len(non_missing_indices) * mask_ratio)
        masked_indices = np.random.choice(non_missing_indices, size=n_to_mask, replace=False)
        
        mask_col = np.zeros(df_masked.shape[0], dtype=bool)
        mask_col[df_masked.index.get_indexer(masked_indices)] = True
        mask_info[col] = mask_col
        
        df_masked.loc[masked_indices, col] = np.nan
    
    return df_masked, mask_info

In [4]:
def grid_search_imputation(df, param_grid, mask_ratio=0.1, missing_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4], random_state=42):
    """
    Performs a grid search over imputation methods and missing data thresholds.
    
    Parameters:
    df (pd.DataFrame): The original dataframe
    param_grid (list of dict): List of imputation strategies and hyperparameters
    mask_ratio (float): Fraction of numeric values to mask
    missing_threshold_list (list): List of missing thresholds (proportion of missing values allowed per row)
    random_state (int): Random seed
    
    Returns:
    pd.DataFrame: Results sorted by MSE
    """
    all_results = []

    for missing_threshold in missing_threshold_list:
        # Step 1: Remove rows with excessive missing values
        cleaned_df, removed_rows = preprocess_data(df, missing_threshold)
        print(f"\nMissing Threshold: {missing_threshold*100}% - Removed Rows: {removed_rows}")

        # Step 2: Mask part of the data for evaluation
        df_original = cleaned_df.copy()
        df_masked, mask_info = create_masked_dataframe(df_original, mask_ratio, random_state)
        
        numeric_cols = df_original.select_dtypes(include=[np.number]).columns.tolist()
        if len(numeric_cols) == 0:
            continue

        all_params = list(ParameterGrid(param_grid))
        
        for param_dict in all_params:
            imputer_class = param_dict.get("imputer", None)
            if imputer_class is None:
                raise ValueError("Each param_dict must include 'imputer' key with an imputer object.")

            imputer = imputer_class
            for k, v in param_dict.items():
                if k.startswith("imputer__"):
                    param_name = k.split("__", 1)[1]
                    setattr(imputer, param_name, v)

            # Apply imputation
            imputed_array = imputer.fit_transform(df_masked[numeric_cols])
            imputed_df = pd.DataFrame(imputed_array, columns=numeric_cols, index=df_masked.index)

            method_mse_list = []
            method_mae_list = []

            # Step 4: Evaluate using MSE and MAE
            for col in numeric_cols:
                col_mask = mask_info[col]
                true_values = df_original.loc[col_mask, col]
                pred_values = imputed_df.loc[col_mask, col]

                if len(true_values) > 0:
                    mse = mean_squared_error(true_values, pred_values)
                    mae = mean_absolute_error(true_values, pred_values)
                    method_mse_list.append(mse)
                    method_mae_list.append(mae)

            avg_mse = np.mean(method_mse_list) if method_mse_list else None
            avg_mae = np.mean(method_mae_list) if method_mae_list else None

            all_results.append({
                "Missing_Threshold": missing_threshold,
                "Removed_Rows": removed_rows,
                "Params": param_dict,
                "MSE": avg_mse,
                "MAE": avg_mae
            })

    results_df = pd.DataFrame(all_results).sort_values(by=["Missing_Threshold", "MSE"], ascending=[True, True], na_position="last")
    return results_df

In [5]:
# Example Usage
data_path = "../data/final/nicu_120.csv"
df = pd.read_csv(data_path)

# Define imputation methods and hyperparameters
param_grid = [
    {
        "imputer": [SimpleImputer()],
        "imputer__strategy": ["mean", "median", "most_frequent"]
    },
    {
        "imputer": [KNNImputer()],
        "imputer__n_neighbors": [2, 5, 10]
    },
    {
        "imputer": [IterativeImputer(random_state=42)],
        "imputer__max_iter": [8, 10, 12, 14, 16, 18, 20, 22],
    }
]

# Run the grid search for imputation
results_df = grid_search_imputation(
    df,
    param_grid=param_grid,
    mask_ratio=0.1,  # 随机遮盖 10% 的数值用于评估插补效果
    missing_threshold_list=[0.95, 0.9, 0.85, 0.8, 0.75, 0.7, 0.65, 0.6],  # 缺失阈值从 5% 到 40%，每 5% 一档
    random_state=42
)



Missing Threshold: 95.0% - Removed Rows: 8





Missing Threshold: 90.0% - Removed Rows: 78

Missing Threshold: 85.0% - Removed Rows: 277





Missing Threshold: 80.0% - Removed Rows: 713





Missing Threshold: 75.0% - Removed Rows: 1071





Missing Threshold: 70.0% - Removed Rows: 1362

Missing Threshold: 65.0% - Removed Rows: 2285

Missing Threshold: 60.0% - Removed Rows: 3090




ModuleNotFoundError: No module named 'ace_tools'

In [6]:
# Print results in a readable table format
print("\n===== Final Results =====")
print(results_df.to_string(index=False))

# Print best result
best_result = results_df.loc[results_df["MSE"].idxmin()]
print("\n===== Best Imputation Strategy =====")
print(f"Missing Threshold: {best_result['Missing_Threshold']*100}%")
print(f"Removed Rows: {best_result['Removed_Rows']}")
print(f"Best Params: {best_result['Params']}")
print(f"Best MSE: {best_result['MSE']:.6f}")
print(f"Best MAE: {best_result['MAE']:.6f}")


===== Final Results =====
 Missing_Threshold  Removed_Rows                                                                                     Params          MSE        MAE
              0.60          3090       {'imputer': IterativeImputer(max_iter=20, random_state=42), 'imputer__max_iter': 10} 1.019090e+06  93.758805
              0.60          3090       {'imputer': IterativeImputer(max_iter=20, random_state=42), 'imputer__max_iter': 20} 1.019090e+06  93.758805
              0.60          3090        {'imputer': IterativeImputer(max_iter=20, random_state=42), 'imputer__max_iter': 5} 1.019122e+06  93.765137
              0.60          3090        {'imputer': SimpleImputer(strategy='most_frequent'), 'imputer__strategy': 'median'} 1.040429e+06 100.703428
              0.60          3090          {'imputer': SimpleImputer(strategy='most_frequent'), 'imputer__strategy': 'mean'} 1.044356e+06 101.073268
              0.60          3090                        {'imputer': KNNImputer(n_neig