In [1]:
import sys
import warnings
import time
from tqdm import tqdm
from math import sqrt

import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from hyperimpute.plugins.utils.metrics import RMSE
from hyperimpute.plugins.utils.simulate import simulate_nan

import xgboost as xgb

from IPython.display import HTML, display
import tabulate

if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [2]:
from hyperimpute.plugins.imputers import Imputers, ImputerPlugin

imputers = Imputers()

imputers.list()

['median',
 'sklearn_ice',
 'missforest',
 'ice',
 'gain',
 'mice',
 'miracle',
 'hyperimpute',
 'miwae',
 'softimpute',
 'sinkhorn',
 'nop',
 'EM',
 'most_frequent',
 'mean',
 'sklearn_missforest']

In [3]:
from sklearn.datasets import load_diabetes
from hyperimpute.plugins.imputers import Imputers
from hyperimpute.utils.benchmarks import compare_models

imputer = Imputers().get(
    "hyperimpute",  # the name of the imputation method.
    # The rest of the kwargs are specific to the method
    # optimizer: str. The optimizer to use: simple, hyperband, bayesian
    optimizer="hyperband",
    # classifier_seed: list. Model search pool for categorical columns.
    classifier_seed=["logistic_regression", "catboost", "xgboost", "random_forest"],
    # regression_seed: list. Model search pool for continuous columns.
    regression_seed=[
        "linear_regression",
        "catboost_regressor",
        "xgboost_regressor",
        "random_forest_regressor",
    ],
    # class_threshold: int. how many max unique items must be in the column to be is associated with categorical
    class_threshold=5,
    # imputation_order: int. 0 - ascending, 1 - descending, 2 - random
    imputation_order=2,
    # n_inner_iter: int. number of imputation iterations
    n_inner_iter=10,
    # select_model_by_column: bool. If true, select a different model for each column. Else, it reuses the model chosen for the first column.
    select_model_by_column=True,
    # select_model_by_iteration: bool. If true, selects new models for each iteration. Else, it reuses the models chosen in the first iteration.
    select_model_by_iteration=True,
    # select_lazy: bool. If false, starts the optimizer on every column unless other restrictions apply. Else, if for the current iteration there is a trend(at least to columns of the same type got the same model from the optimizer), it reuses the same model class for all the columns without starting the optimizer.
    select_lazy=True,
    # select_patience: int. How many iterations without objective function improvement to wait.
    select_patience=5,
)

# Load baseline dataset
X, _ = load_diabetes(as_frame=True, return_X_y=True)





print("Dataset shape:", X)

# Run benchmarks
_ = compare_models(
    name="example",
    evaluated_model=imputer,
    X_raw=X,
    ref_methods=["gain", "miracle"],
    scenarios=["MAR"],
    miss_pct=[0.3, 0.5],
    n_iter=2,
    n_jobs=1,
)

Dataset shape:           age       sex       bmi        bp        s1        s2        s3  \
0    0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1   -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2    0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3   -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4    0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   
..        ...       ...       ...       ...       ...       ...       ...   
437  0.041708  0.050680  0.019662  0.059744 -0.005697 -0.002566 -0.028674   
438 -0.005515  0.050680 -0.015906 -0.067642  0.049341  0.079165 -0.028674   
439  0.041708  0.050680 -0.015906  0.017293 -0.037344 -0.013840 -0.024993   
440 -0.045472 -0.044642  0.039062  0.001215  0.016318  0.015283 -0.028674   
441 -0.045472 -0.044642 -0.073030 -0.081413  0.083740  0.027809  0.173816   

           s4        s5        s6  
0   -0.002592  0.019907 

I0000 00:00:1755505280.764436 23521409 mlir_graph_optimization_pass.cc:437] MLIR V1 optimization pass is not enabled


RMSE score


Unnamed: 0,Scenario,"miss_pct [0, 1]",Evaluated: hyperimpute,gain,miracle
0,MAR,0.3,0.1299 +/- 0.005,0.1488 +/- 0.0053,4.2099 +/- 0.3155
1,MAR,0.5,0.2303 +/- 0.1315,0.2262 +/- 0.0843,4.6731 +/- 0.6089




Wasserstein score


Unnamed: 0,Scenario,"miss_pct [0, 1]",Evaluated: hyperimpute,gain,miracle
0,MAR,0.3,0.0663 +/- 0.0032,0.079 +/- 0.0014,4.9567 +/- 0.3283
1,MAR,0.5,0.115 +/- 0.0034,0.2231 +/- 0.0779,9.0983 +/- 1.4954


In [2]:
from hyperimpute.plugins.imputers import Imputers
from hyperimpute.utils.benchmarks import compare_models
from dataset import Preprocessor

# Load the gesture dataset
prepper = Preprocessor('gesture')
X = prepper.encodeDf('OHE', prepper.df_train)  # One-hot encode categorical columns
X = prepper.decodeNp('OHE', X)  # If you want to get back a DataFrame with numeric columns

print("Dataset shape:", X)


Dataset shape: [[ 3.542000e-05  6.820860e-03  7.064900e-04 ...  1.237900e-03
   7.116800e-04  3.780600e-04]
 [ 2.512460e-03  2.246600e-04 -4.145400e-04 ...  2.498000e-04
   1.759500e-04  1.803300e-04]
 [-1.333400e-04 -1.014410e-02  6.468200e-04 ...  1.356980e-03
   1.398190e-03  1.000380e-03]
 ...
 [-1.005159e-02 -9.317200e-03 -4.466700e-03 ...  1.248690e-03
   4.860510e-03  1.096630e-03]
 [-1.908400e-04 -9.545400e-04  4.951000e-05 ...  2.169000e-04
   4.130000e-05  1.337100e-04]
 [-5.777000e-05  3.842300e-04  1.311000e-05 ...  9.653000e-05
   2.932100e-04  9.357000e-05]]


In [1]:
import os
import sys
import numpy as np
import argparse
import warnings
from tqdm import tqdm
from dataset import Preprocessor
from hyperimpute.plugins.imputers import Imputers

warnings.filterwarnings('ignore')

def parse_args():
    parser = argparse.ArgumentParser(description='Train HyperImpute on tabular datasets')
    parser.add_argument('--dataname', type=str, default='gesture', help='Name of dataset.')
    parser.add_argument('--mask', type=str, default='MCAR', help='Masking mechanism: MCAR, MAR, MNAR_logistic_T2')
    parser.add_argument('--ratio', type=float, default=0.3, help='Missing ratio')
    parser.add_argument('--num_trials', type=int, default=5, help='Number of mask trials')
    if any('ipykernel' in arg or 'jupyter' in arg for arg in sys.argv):
        return parser.parse_args(args=[])
    else:
        return parser.parse_args()

def main():
    args = parse_args()
    dataname = args.dataname
    mask_type = args.mask
    ratio = args.ratio
    num_trials = args.num_trials

    print(f"Dataset: {dataname}, Mask: {mask_type}, Ratio: {ratio}, Trials: {num_trials}")

    prepper = Preprocessor(dataname)
    train_X = prepper.encodeDf('OHE', prepper.df_train)
    num_numeric = prepper.numerical_indices_np_end

    np.random.seed(42)
    masks = [(np.random.rand(*train_X.shape) < ratio) for _ in range(num_trials)]

    MSEs = []
    models_dir = f'saved_models/{dataname}/'
    os.makedirs(models_dir, exist_ok=True)

    imputer = Imputers().get(
        "hyperimpute",
        optimizer="hyperband",
        classifier_seed=["logistic_regression", "catboost", "xgboost", "random_forest"],
        regression_seed=[
            "linear_regression",
            "catboost_regressor",
            "xgboost_regressor",
            "random_forest_regressor",
        ],
        class_threshold=5,
        imputation_order=2,
        n_inner_iter=10,
        select_model_by_column=True,
        select_model_by_iteration=True,
        select_lazy=True,
        select_patience=5,
    )

    for trial in tqdm(range(num_trials), desc='HyperImpute Training'):
        X_miss = train_X.copy()
        X_miss[masks[trial]] = np.nan

        imputer.fit(X_miss)
        X_imputed = imputer.transform(X_miss)

        mse = np.nanmean((X_imputed[masks[trial]] - train_X[masks[trial]]) ** 2)
        MSEs.append(mse)

        imputer.save(os.path.join(models_dir, f"hyperimpute_trial{trial}.pkl"))
        print(f"Trial {trial}: MSE={mse:.6f}")

    print(f"Avg MSE: {np.mean(MSEs):.6f} Â± {np.std(MSEs):.6f}")

if __name__ == '__main__':
    main()

Dataset: gesture, Mask: MCAR, Ratio: 0.3, Trials: 5


HyperImpute Training:   0%|          | 0/5 [00:00<?, ?it/s]

: 