In [13]:
import sys
import warnings
import time
from tqdm import tqdm
from math import sqrt

import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from hyperimpute.plugins.utils.metrics import RMSE
from hyperimpute.plugins.utils.simulate import simulate_nan

import xgboost as xgb

from IPython.display import HTML, display
import tabulate

if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [None]:
from hyperimpute.plugins.imputers import Imputers, ImputerPlugin

imputers = Imputers()

imputers.list()

In [None]:
from sklearn.datasets import load_diabetes
from hyperimpute.plugins.imputers import Imputers
from hyperimpute.utils.benchmarks import compare_models

imputer = Imputers().get(
    "hyperimpute",  # the name of the imputation method.
    # The rest of the kwargs are specific to the method
    # optimizer: str. The optimizer to use: simple, hyperband, bayesian
    optimizer="hyperband",
    # classifier_seed: list. Model search pool for categorical columns.
    classifier_seed=["logistic_regression", "catboost", "xgboost", "random_forest"],
    # regression_seed: list. Model search pool for continuous columns.
    regression_seed=[
        "linear_regression",
        "catboost_regressor",
        "xgboost_regressor",
        "random_forest_regressor",
    ],
    # class_threshold: int. how many max unique items must be in the column to be is associated with categorical
    class_threshold=5,
    # imputation_order: int. 0 - ascending, 1 - descending, 2 - random
    imputation_order=2,
    # n_inner_iter: int. number of imputation iterations
    n_inner_iter=10,
    # select_model_by_column: bool. If true, select a different model for each column. Else, it reuses the model chosen for the first column.
    select_model_by_column=True,
    # select_model_by_iteration: bool. If true, selects new models for each iteration. Else, it reuses the models chosen in the first iteration.
    select_model_by_iteration=True,
    # select_lazy: bool. If false, starts the optimizer on every column unless other restrictions apply. Else, if for the current iteration there is a trend(at least to columns of the same type got the same model from the optimizer), it reuses the same model class for all the columns without starting the optimizer.
    select_lazy=True,
    # select_patience: int. How many iterations without objective function improvement to wait.
    select_patience=5,
)

# Load baseline dataset
X, _ = load_diabetes(as_frame=True, return_X_y=True)





print("Dataset shape:", X)

# Run benchmarks
_ = compare_models(
    name="example",
    evaluated_model=imputer,
    X_raw=X,
    ref_methods=["gain", "miracle"],
    scenarios=["MAR"],
    miss_pct=[0.3, 0.5],
    n_iter=2,
    n_jobs=1,
)

In [None]:
from hyperimpute.plugins.imputers import Imputers
from hyperimpute.utils.benchmarks import compare_models
from dataset import Preprocessor

# Load the gesture dataset
prepper = Preprocessor('gesture')
X = prepper.encodeDf('OHE', prepper.df_train)  # One-hot encode categorical columns
X = prepper.decodeNp('OHE', X)  # If you want to get back a DataFrame with numeric columns

print("Dataset shape:", X)


In [None]:
import os
import sys
import numpy as np
import argparse
import warnings
from tqdm import tqdm
from dataset import Preprocessor
from hyperimpute.plugins.imputers import Imputers

warnings.filterwarnings('ignore')

def parse_args():
    parser = argparse.ArgumentParser(description='Train HyperImpute on tabular datasets')
    parser.add_argument('--dataname', type=str, default='gesture', help='Name of dataset.')
    parser.add_argument('--mask', type=str, default='MCAR', help='Masking mechanism: MCAR, MAR, MNAR_logistic_T2')
    parser.add_argument('--ratio', type=float, default=0.3, help='Missing ratio')
    parser.add_argument('--num_trials', type=int, default=5, help='Number of mask trials')
    if any('ipykernel' in arg or 'jupyter' in arg for arg in sys.argv):
        return parser.parse_args(args=[])
    else:
        return parser.parse_args()

def main():
    args = parse_args()
    dataname = args.dataname
    mask_type = args.mask
    ratio = args.ratio
    num_trials = args.num_trials

    print(f"Dataset: {dataname}, Mask: {mask_type}, Ratio: {ratio}, Trials: {num_trials}")

    prepper = Preprocessor(dataname)
    train_X = prepper.encodeDf('OHE', prepper.df_train)
    num_numeric = prepper.numerical_indices_np_end

    np.random.seed(42)
    masks = [(np.random.rand(*train_X.shape) < ratio) for _ in range(num_trials)]

    MSEs = []
    models_dir = f'saved_models/{dataname}/'
    os.makedirs(models_dir, exist_ok=True)

    imputer = Imputers().get(
        "hyperimpute",
        optimizer="hyperband",
        classifier_seed=["logistic_regression", "catboost", "xgboost", "random_forest"],
        regression_seed=[
            "linear_regression",
            "catboost_regressor",
            "xgboost_regressor",
            "random_forest_regressor",
        ],
        class_threshold=5,
        imputation_order=2,
        n_inner_iter=10,
        select_model_by_column=True,
        select_model_by_iteration=True,
        select_lazy=True,
        select_patience=5,
    )

    for trial in tqdm(range(num_trials), desc='HyperImpute Training'):
        X_miss = train_X.copy()
        X_miss[masks[trial]] = np.nan

        imputer.fit(X_miss)
        X_imputed = imputer.transform(X_miss)

        mse = np.nanmean((X_imputed[masks[trial]] - train_X[masks[trial]]) ** 2)
        MSEs.append(mse)

        imputer.save(os.path.join(models_dir, f"hyperimpute_trial{trial}.pkl"))
        print(f"Trial {trial}: MSE={mse:.6f}")

    print(f"Avg MSE: {np.mean(MSEs):.6f} ± {np.std(MSEs):.6f}")

if __name__ == '__main__':
    main()

In [None]:
import numpy as np
import pandas as pd

def masked_mse(
    X_pred: pd.DataFrame,
    X_true: pd.DataFrame,
    mask: pd.DataFrame,
    mask_marks_missing: bool = True,
    by_column: bool = False,
) -> float | tuple[float, pd.Series]:
    """
    计算只在 mask 指定的单元格上的 MSE（仅数值列）。
    参数
    ----
    X_pred: 预测/填补后的表（DataFrame）
    X_true: 原始完整表（DataFrame）
    mask:   与表同形状的布尔/0-1 DataFrame。True 表示“评估这些位置”。
    mask_marks_missing: 若你的 mask 的 True 表示“未遮蔽/可见”，设为 False 以取反。
    by_column: True 时同时返回按列的 MSE（Series）
    """
    # 对齐索引与列，确保同形状
    X_pred = X_pred.reindex_like(X_true)
    mask = mask.reindex_like(X_true)

    # 只保留数值列
    num_cols = X_true.select_dtypes(include=[np.number]).columns
    Xp = X_pred[num_cols]
    Xt = X_true[num_cols]
    M  = mask[num_cols].astype(bool)

    # 如果 mask 的 True 不是“被遮蔽/需要评估”的含义，则取反
    if not mask_marks_missing:
        M = ~M

    # 计算误差（只在 M==True 的位置）
    diff = (Xp.to_numpy() - Xt.to_numpy())
    M_np = M.to_numpy()
    mse_all = float(np.nanmean((diff**2)[M_np]))  # 全局 MSE

    if not by_column:
        return mse_all

    # 按列 MSE
    per_col = {}
    for j, col in enumerate(num_cols):
        mj = M_np[:, j]
        per_col[col] = float(np.nanmean((diff[:, j]**2)[mj])) if mj.any() else np.nan
    return mse_all, pd.Series(per_col, name="mse")


In [10]:
import numpy as np
import pandas as pd

def ampute(x, mechanism: str, p_miss: float):
    """
    返回：
      X_true: 原始 DataFrame
      X_miss: 注入缺失后的 DataFrame
      M:      布尔掩码，True 表示该单元格被遮蔽/缺失
    """
    # 你已有的 simulate_nan
    x_sim = simulate_nan(np.asarray(x), p_miss, mechanism)

    X_true = pd.DataFrame(x)                          # 原始
    X_miss = pd.DataFrame(x_sim["X_incomp"])          # 注入缺失
    M_raw  = pd.DataFrame(x_sim["mask"]).astype(bool) # 原始掩码 -> bool

    # 自动判定 True 的语义是否与缺失一致（与 X_miss 的 NaN 对齐）
    miss_pos = X_miss.isna().to_numpy()
    same = (M_raw.to_numpy() == miss_pos).mean()
    inv  = ((~M_raw).to_numpy() == miss_pos).mean()
    M = M_raw if same >= inv else ~M_raw

    # 对齐索引与列名
    M.index, M.columns = X_true.index, X_true.columns
    X_miss.columns = X_true.columns
    X_miss.index   = X_true.index
    return X_true, X_miss, M


def masked_mse(
    X_pred: pd.DataFrame,
    X_true: pd.DataFrame,
    mask: pd.DataFrame,
    true_means_missing: str = "auto",   # "auto" | "missing" | "observed"
    X_miss: pd.DataFrame | None = None, # 若提供，可用于自动判定
    by_column: bool = False
):
    """
    只在 mask==True 的位置计算数值列 MSE。
    - true_means_missing="auto": 若提供 X_miss，则用 X_miss.isna() 判定；否则默认为 True=missing
    - "missing": 直接认为 True=missing
    - "observed": 认为 True=observed，会自动取反
    """
    # 对齐形状
    X_true = X_true.copy()
    X_pred = X_pred.reindex_like(X_true)
    M = mask.reindex_like(X_true).astype(bool)

    # 处理 True 的语义
    if true_means_missing == "auto":
        if X_miss is not None:
            miss_pos = X_miss.reindex_like(X_true).isna().to_numpy()
            same = (M.to_numpy() == miss_pos).mean()
            inv  = ((~M).to_numpy() == miss_pos).mean()
            if inv > same:
                M = ~M
        # 若无 X_miss，默认 True=missing（不处理）
    elif true_means_missing == "observed":
        M = ~M
    # else "missing": 不处理

    # 仅数值列
    num_cols = X_true.select_dtypes(include=[np.number]).columns
    if len(num_cols) == 0:
        return (np.nan, pd.Series(dtype=float, name="mse")) if by_column else np.nan

    Xp = X_pred[num_cols].to_numpy()
    Xt = X_true[num_cols].to_numpy()
    Mm = M[num_cols].to_numpy()

    if Mm.sum() == 0:
        return (np.nan, pd.Series({c: np.nan for c in num_cols}, name="mse")) if by_column else np.nan

    diff2 = (Xp - Xt) ** 2
    overall = float(np.nanmean(diff2[ Mm ]))

    if not by_column:
        return overall

    per_col = {}
    for j, c in enumerate(num_cols):
        mj = Mm[:, j]
        per_col[c] = float(np.nanmean(diff2[:, j][mj])) if mj.any() else np.nan
    return overall, pd.Series(per_col, name="mse")


In [19]:
# 生成被遮蔽数据与布尔 mask（True=缺失，若 simulate_nan 语义相反会自动修正）
# —— 原始真值表（X_true）——
df = pd.DataFrame(
    [[1.0, 10.0],
     [2.0, 20.0],
     [3.0, 30.0]],
    columns=["a","b"]
)
X_true, X_miss, M = ampute(df, mechanism="MCAR", p_miss=0.3)
print("Original DataFrame (X_true):", X_true, X_miss, M)

# 用你的 imputer 复原
model = Imputers().get("gain", random_state=42)
X_pred = model.fit_transform(X_miss)  # 或已训练好的 imputer.transform(X_miss)

# 计算只在被遮蔽位置上的 MSE（自动识别 True 的语义）
mse_all, mse_by_col = masked_mse(X_pred, X_true, M, true_means_missing="auto", X_miss=X_miss, by_column=True)
print("MSE:", mse_all)
print(mse_by_col.sort_values(ascending=False).head())


Original DataFrame (X_true):      a     b
0  1.0  10.0
1  2.0  20.0
2  3.0  30.0      a     b
0  1.0  10.0
1  2.0  20.0
2  3.0  30.0        a      b
0  False  False
1  False  False
2  False  False


TypeError: masked_mse() got an unexpected keyword argument 'true_means_missing'

In [17]:
import numpy as np
import pandas as pd

# —— 原始真值表（X_true）——
X_true = pd.DataFrame(
    [[1.0, 10.0],
     [2.0, 20.0],
     [3.0, 30.0]],
    columns=["a","b"]
)

# —— 预测/填补后的表（X_pred）——
X_pred = pd.DataFrame(
    [[1.0, 12.0],   # b: 12 vs 10 → diff=+2
     [1.7, 20.0],   # a: 1.7 vs 2.0 → diff=-0.3
     [2.6, 29.0]],  # a: 2.6 vs 3.0 → diff=-0.4;  b: 29 vs 30 → diff=-1
    columns=["a","b"]
)

# —— 评估用的 mask（True 表示该单元格被遮蔽过 → 要计入 MSE）——
mask = pd.DataFrame(
    [[False, True ],
     [ True, False],
     [ True, True ]],
    columns=["a","b"]
)

def masked_mse(X_pred, X_true, mask, mask_marks_missing=True):
    # 只算数值列；mask==True 的位置才计入
    num_cols = X_true.select_dtypes(include=[np.number]).columns
    Xp = X_pred[num_cols].to_numpy()
    Xt = X_true[num_cols].to_numpy()
    M  = mask[num_cols].astype(bool).to_numpy()
    if not mask_marks_missing:
        M = ~M
    return float(np.nanmean(((Xp - Xt) ** 2)[M]))

print(X_pred, X_true, mask)
mse = masked_mse(X_pred, X_true, mask, mask_marks_missing=True)
print("MSE =", mse)   # 期望：1.3125


     a     b
0  1.0  12.0
1  1.7  20.0
2  2.6  29.0      a     b
0  1.0  10.0
1  2.0  20.0
2  3.0  30.0        a      b
0  False   True
1   True  False
2   True   True
MSE = 1.3125


In [34]:
import pandas as pd
import numpy as np
from hyperimpute.plugins.imputers import Imputers
from hyperimpute.plugins.utils.simulate import simulate_nan

def ampute(x, mechanism, p_miss):
    x_simulated = simulate_nan(np.asarray(x), p_miss, mechanism)

    mask = x_simulated["mask"]
    x_miss = x_simulated["X_incomp"]
    print(x_miss, mask)
    # return pd.DataFrame(x), pd.DataFrame(x_miss, columns = x.columns), pd.DataFrame(mask, columns = x.columns)
    return pd.DataFrame(x), pd.DataFrame(x_miss), pd.DataFrame(mask)



X_miss = pd.DataFrame([[1, 1, 1, 1], [4, 5, np.nan, np.nan], [3, 3, 9, 9], [2, 2, 2, 2]])



X_true = pd.DataFrame(
    [[1, 1, 1, 1],
     [4, 5, 3, 8],
     [3, 3, 9, 9],
     [2, 2, 2, 2]],
     # 注意：这里的列名与后续 ampute 的一致
    # columns=["a","b"]
)
print("Original DataFrame (X_true):", X_true)
print("Missing data:", X_miss)

# 用你的 imputer 复原
model = Imputers().get("miracle", random_state=42)
X_pred = model.fit_transform(X_miss.copy())  # 或已训练好的 imputer.transform(X_miss)

plugin1 = Imputers().get(
    "hyperimpute",
    # optimizer="hyperband",
    # classifier_seed=["logistic_regression"],
    # regression_seed=["linear_regression"],
)
out = plugin1.fit_transform(X_miss.copy())

plugin2 = Imputers().get(
    "gain",
    # optimizer="hyperband",
    # classifier_seed=["logistic_regression"],
    # regression_seed=["linear_regression"],
)
out2 = plugin2.fit_transform(X_miss.copy())

print("miracle outpt", X_pred)
print("hyperimpute output:", out)
print("gain output:", out2)


Original DataFrame (X_true):    0  1  2  3
0  1  1  1  1
1  4  5  3  8
2  3  3  9  9
3  2  2  2  2
Missing data:    0  1    2    3
0  1  1  1.0  1.0
1  4  5  NaN  NaN
2  3  3  9.0  9.0
3  2  2  2.0  2.0
miracle outpt      0    1         2         3
0  1.0  1.0  1.000000  1.000000
1  4.0  5.0 -1.340026 -1.900554
2  3.0  3.0  9.000000  9.000000
3  2.0  2.0  2.000000  2.000000
hyperimpute output:    0  1    2    3
0  1  1  1.0  1.0
1  4  5  9.0  9.0
2  3  3  9.0  9.0
3  2  2  2.0  2.0
gain output:      0    1         2         3
0  1.0  1.0  1.000000  1.000000
1  4.0  5.0  7.284791  7.642774
2  3.0  3.0  9.000000  9.000000
3  2.0  2.0  2.000000  2.000000
