In [13]:
# Shell out to report GPU status. In the recorded run the command was not
# found on this machine (see the cell output), so no GPU information was shown.
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [14]:
!pip install missingpy



In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
import math

In [16]:
# `sys` must be imported here: the notebook's own `import sys` only appears in
# the following cell, so on a fresh kernel this cell raised NameError.
import sys

import sklearn.neighbors._base

# Register the private module under the name `sklearn.neighbors.base` so that
# `import sklearn.neighbors.base` resolves to it.
# NOTE(review): presumably required by missingpy, imported in the next cell - confirm.
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

In [17]:
import sys
import numpy as np
import pandas as pd

from missingpy import MissForest
from dsan import Imputer as DSANImputer
from utils import *

import warnings
# Install a blanket 'ignore' filter: every warning category is suppressed for
# the rest of the session, including deprecation and convergence warnings.
warnings.filterwarnings('ignore')

In [37]:
# Fixed seed value used for this run.
seed = 32
# NOTE(review): set_seed is not defined in this notebook; presumably it comes
# from `from utils import *` and seeds the random generators used below - confirm.
set_seed(seed)

In [38]:
def get_imputer(config):
    """Build the imputer selected by ``config['model']``.

    Args:
        config: mapping with a ``'model'`` key. For ``'DSAN'`` it must also
            provide ``rep_dim``, ``num_heads``, ``n_hidden``, ``lr``,
            ``weight_decay``, ``batch_size``, ``epochs``, ``noise_percent``
            and ``stopped_epoch``, which are forwarded to ``DSANImputer``.

    Returns:
        A ``DSANImputer`` for ``'DSAN'``, a ``MissForest(random_state=0)`` for
        ``'MissForest'``, or ``None`` for ``'Statistics'`` (no imputer object
        is built for that model here).

    Raises:
        ValueError: if ``config['model']`` is not one of the names above.
            Previously this case fell through to ``return imputer`` with the
            name unbound and failed with an UnboundLocalError.
    """
    model = config['model']

    if model == 'DSAN':
        return DSANImputer(rep_dim=config['rep_dim'],
                           num_heads=config['num_heads'],
                           n_hidden=config['n_hidden'],
                           lr=config['lr'],
                           weight_decay=config['weight_decay'],
                           batch_size=config['batch_size'],
                           epochs=config['epochs'],
                           noise_percent=config['noise_percent'],
                           stopped_epoch=config['stopped_epoch'])

    if model == 'MissForest':
        return MissForest(random_state=0)

    if model == 'Statistics':
        # No imputer object exists for this model; callers must handle None.
        return None

    raise ValueError(
        "Unknown model {!r}; expected 'DSAN', 'MissForest' or 'Statistics'".format(model)
    )

In [44]:
def experiment_run(data_name, config, test=False):
    """Run the k-fold imputation benchmark on one dataset.

    For every missing percentage in 5, 10, 15, 20 and every one of k=5 folds:
    mask part of the training rows, impute them with the imputer chosen by
    ``config``, score the imputation with ``cal_metric_numpy`` and score a
    downstream classifier with ``ex_classify``.

    Args:
        data_name: dataset name, passed to ``get_data`` and ``ex_classify``.
        config: imputer configuration, passed to ``get_imputer``.
        test: when true, only the first 1000 rows of the data array are used.

    Returns:
        dict mapping each missing percentage (int) to a dict holding the
        per-metric mean over the folds.

    Raises:
        ValueError: if the dataset has fewer rows than folds, or if
            ``get_imputer`` yields no imputer object for ``config``.
    """
    # Retrieve dataset
    dataset = get_data(data_name)
    print(f"Retrieved dataset for {data_name}")

    # Limit to 1000 samples if test flag is true
    if test:
        dataset = (dataset[0][:1000], dataset[1], dataset[2], dataset[3])
        print("Limited dataset to 1000 samples for testing")

    k = 5
    percent_missing_lst = list(range(5, 25, 5))
    # n_col is part of the dataset tuple but is not needed below.
    X_origin, n_col, num_vars, cat_vars = dataset

    # Apply category mapping. Keep the result in a separate name so that an
    # in-place helper (returning None) does not wipe out the data array.
    mapped = category_mapping(X_origin, cat_vars)
    print("Applied category mapping")

    # Check if X_origin is modified in-place or replaced
    if mapped is not None:
        X_origin = mapped
        print("X_origin is replaced with a new version where categorical variables have been mapped to numerical values.")
    else:
        print("X_origin is modified in-place, with categorical variables mapped to numerical values.")

    # Print data size
    print("Data Size: {}".format(X_origin.shape))

    n_sample = X_origin.shape[0]
    idx_lst = list(range(n_sample))
    np.random.shuffle(idx_lst)

    # k-fold validation setup
    n_valid = len(idx_lst) // k
    if n_valid == 0:
        # A zero step would otherwise surface as an opaque range() error.
        raise ValueError(
            "Need at least {} samples for {}-fold validation, got {}".format(k, k, n_sample)
        )
    print(f"Starting k-fold validation with k={k}")

    results = {}
    for percent_missing in percent_missing_lst:
        result = []
        print(f"\nProcessing {percent_missing}% missing data")

        for it, n in enumerate(range(0, n_sample, n_valid)):
            print(f"\nFold {it + 1}")
            valid_idx = idx_lst[n: n+n_valid]

            # When n_sample is not a multiple of k the last chunk is short; skip it.
            if len(valid_idx) < n_valid:
                print("Insufficient data for validation, breaking loop")
                break

            # Set difference: the training rows do not keep the shuffled order.
            train_idx = list(set(idx_lst) - set(valid_idx))
            X = X_origin[train_idx, :]
            X_test = X_origin[valid_idx, :]

            # Generating incomplete data
            missing_mask = generate_missing_mask(X, percent_missing=percent_missing)
            X_incomplete = X.copy()
            X_incomplete[missing_mask] = np.nan
            print("Generated incomplete data with missing values for fold {}:".format(it+1))
            print("Original data with missing values (subset):")
            print(X_incomplete[:5])  # Adjust the number of rows as needed

            # Imputation: a fresh imputer per fold so no state leaks between folds.
            imputer = get_imputer(config)
            if imputer is None:
                raise ValueError(
                    "No imputer object available for model {!r}".format(config.get('model'))
                )
            X_imputed = imputer.fit_transform(X_incomplete, cat_vars=cat_vars)
            print("Completed imputation for fold {}:".format(it+1))

            # Only the masked cells take imputed values; observed cells keep the truth.
            X_res = X.copy()
            X_res[missing_mask] = X_imputed[missing_mask]
            print("X_res with imputed values (subset):")
            print(X_res[:5])  # Adjust the number of rows as needed

            # Calculate metrics
            metric = cal_metric_numpy(X_res, X, missing_mask, num_vars, cat_vars)
            score = ex_classify(data_name=data_name, train_array=X_res, test_array=X_test, num_vars=num_vars, cat_vars=cat_vars)

            metric['clf_aucroc'] = round(score, 4)
            result.append(metric)
            print("Metrics for this fold:", metric)

        # Average results over the folds, using the first fold's metric names.
        avg_result = dict()
        for key in result[0].keys():
            value = np.mean([m[key] for m in result])
            avg_result[key] = value
            print(f"Average {key} for {percent_missing}% missing data: {value}")

        results[percent_missing] = avg_result

    print("Completed all iterations")
    return results


In [45]:
# Settings for the run: 'model' is read by get_imputer to choose the imputer,
# and the remaining entries are the keyword arguments it forwards to
# DSANImputer when the model is 'DSAN'.
config = dict(
    model='DSAN',
    rep_dim=32,
    num_heads=4,
    n_hidden=2,
    lr=3e-2,
    weight_decay=1e-5,
    batch_size=32,
    epochs=34,
    noise_percent=15,
    stopped_epoch=15,
)

In [46]:
# Imported here explicitly: the notebook never imports `os` itself, so the
# cell otherwise depends on the name arriving through another import.
import os

data_name_lst = ['iris']

result_dir = './results_iris_'
# exist_ok=True keeps the cell re-runnable; without it a second run fails
# with FileExistsError because the directory is already there.
os.makedirs(result_dir, exist_ok=True)

# Run the benchmark for each dataset and write one JSON file per dataset,
# storing the configuration and dataset name alongside the metrics.
for data_name in data_name_lst:
    results = experiment_run(data_name, config, test=True)
    results['config'] = config
    results['dataset'] = data_name
    result_path = os.path.join(result_dir, '{}.json'.format(data_name))
    write_json(results, result_path)

Retrieved dataset for iris
Limited dataset to 1000 samples for testing
Applied category mapping
X_origin is replaced with a new version where categorical variables have been mapped to numerical values.
Data Size: (150, 5)
Starting k-fold validation with k=5

Processing 5% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:03<00:00,  8.51it/s]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]]
Metrics for this fold: {'nrmse': 0.3729, 'col_4_error_rate': 0.0, 'total_error_rate': 0.0, 'clf_aucroc': 0.9969}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 nan]
 [4.6 3.4 1.4 0.3 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:02<00:00, 12.74it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [4.6 3.4 1.4 0.3 0. ]]
Metrics for this fold: {'nrmse': 0.234, 'col_4_error_rate': 0.2, 'total_error_rate': 0.2, 'clf_aucroc': 1.0}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [nan 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 nan]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:03<00:00, 11.30it/s]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [5.20267582 3.         1.4        0.2        0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.3371, 'col_4_error_rate': 0.125, 'total_error_rate': 0.125, 'clf_aucroc': 1.0}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [4.6 nan 1.4 0.3 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:03<00:00, 10.65it/s]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[4.9        3.         1.4        0.2        0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]
 [5.4        3.9        1.7        0.4        0.        ]
 [4.6        2.81255436 1.4        0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.209, 'col_4_error_rate': 0.0, 'total_error_rate': 0.0, 'clf_aucroc': 1.0}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 nan 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [4.6 nan 1.4 0.3 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:03<00:00, 10.07it/s]


Completed imputation for fold 5:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.         1.4        0.00892146 0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [4.6        2.9321053  1.4        0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.2334, 'col_4_error_rate': 0.3, 'total_error_rate': 0.3, 'clf_aucroc': 1.0}
Average nrmse for 5% missing data: 0.27728
Average col_4_error_rate for 5% missing data: 0.125
Average total_error_rate for 5% missing data: 0.125
Average clf_aucroc for 5% missing data: 0.99938

Processing 10% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[nan 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 nan]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:03<00:00,  9.47it/s]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[5.40544176 3.5        1.4        0.2        0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.267, 'col_4_error_rate': 0.0909, 'total_error_rate': 0.0909, 'clf_aucroc': 0.9907}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.6 3.1 1.5 nan 0. ]
 [5.  3.6 nan 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [4.6 3.4 1.4 0.3 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:02<00:00, 11.77it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.6        3.1        1.5        0.34106478 0.        ]
 [5.         3.6        2.28507829 0.2        0.        ]
 [5.4        3.9        1.7        0.4        0.        ]
 [4.6        3.4        1.4        0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.3329, 'col_4_error_rate': 0.2, 'total_error_rate': 0.2, 'clf_aucroc': 0.9944}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [nan 3.2 1.3 0.2 nan]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:04<00:00,  8.20it/s]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [5.85925531 3.2        1.3        0.2        1.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.2768, 'col_4_error_rate': 0.2353, 'total_error_rate': 0.2353, 'clf_aucroc': 0.9897}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[4.9 3.  1.4 0.2 0. ]
 [4.7 nan 1.3 0.2 nan]
 [5.  3.6 1.4 0.2 nan]
 [5.4 3.9 1.7 0.4 0. ]
 [4.6 3.4 1.4 0.3 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:03<00:00,  9.59it/s]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[4.9        3.         1.4        0.2        0.        ]
 [4.7        3.12165308 1.3        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]
 [5.4        3.9        1.7        0.4        0.        ]
 [4.6        3.4        1.4        0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.1943, 'col_4_error_rate': 0.0909, 'total_error_rate': 0.0909, 'clf_aucroc': 0.997}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[5.1 nan nan 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 nan 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [nan 3.4 1.4 0.3 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:03<00:00,  9.41it/s]


Completed imputation for fold 5:
X_res with imputed values (subset):
[[5.1        2.91372132 2.0571053  0.2        0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [4.7        3.2        2.10325599 0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.60389042 3.4        1.4        0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.3452, 'col_4_error_rate': 0.1429, 'total_error_rate': 0.1429, 'clf_aucroc': 0.9985}
Average nrmse for 10% missing data: 0.28324
Average col_4_error_rate for 10% missing data: 0.152
Average total_error_rate for 10% missing data: 0.152
Average clf_aucroc for 10% missing data: 0.9940599999999999

Processing 15% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [nan 3.2 1.3 nan 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  nan 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:03<00:00,  9.43it/s]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [4.96710825 3.2        1.3        0.30862203 0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         3.35102272 1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.3339, 'col_4_error_rate': 0.1818, 'total_error_rate': 0.1818, 'clf_aucroc': 0.9969}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [nan 3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [4.6 3.4 1.4 nan 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:03<00:00, 10.77it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.34998035 3.6        1.4        0.2        0.        ]
 [5.4        3.9        1.7        0.4        0.        ]
 [4.6        3.4        1.4        0.19533242 0.        ]]
Metrics for this fold: {'nrmse': 0.2747, 'col_4_error_rate': 0.5333, 'total_error_rate': 0.5333, 'clf_aucroc': 0.9957}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 nan 1.4 0.2 0. ]
 [4.7 nan 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 nan]
 [5.  3.6 1.4 0.2 nan]]


100%|███████████████████████████████████████████| 34/34 [00:04<00:00,  8.48it/s]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.01348567 1.4        0.2        0.        ]
 [4.7        3.09403157 1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.2777, 'col_4_error_rate': 0.15, 'total_error_rate': 0.15, 'clf_aucroc': 0.9897}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[4.9 3.  1.4 0.2 0. ]
 [4.7 nan 1.3 0.2 nan]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 nan 1.7 0.4 0. ]
 [4.6 3.4 1.4 0.3 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:03<00:00,  9.81it/s]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[4.9        3.         1.4        0.2        0.        ]
 [4.7        3.19508028 1.3        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]
 [5.4        3.22833371 1.7        0.4        0.        ]
 [4.6        3.4        1.4        0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.3893, 'col_4_error_rate': 0.0667, 'total_error_rate': 0.0667, 'clf_aucroc': 1.0}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 nan]
 [4.7 3.2 1.3 nan 0. ]
 [4.6 3.1 1.5 0.2 nan]
 [4.6 3.4 1.4 0.3 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:04<00:00,  6.81it/s]


Completed imputation for fold 5:
X_res with imputed values (subset):
[[ 5.1         3.5         1.4         0.2         0.        ]
 [ 4.9         3.          1.4         0.2         0.        ]
 [ 4.7         3.2         1.3        -0.19014129  0.        ]
 [ 4.6         3.1         1.5         0.2         0.        ]
 [ 4.6         3.4         1.4         0.3         0.        ]]
Metrics for this fold: {'nrmse': 0.4038, 'col_4_error_rate': 0.1053, 'total_error_rate': 0.1053, 'clf_aucroc': 1.0}
Average nrmse for 15% missing data: 0.33588
Average col_4_error_rate for 15% missing data: 0.20742
Average total_error_rate for 15% missing data: 0.20742
Average clf_aucroc for 15% missing data: 0.9964600000000001

Processing 20% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 nan 0. ]
 [4.7 3.2 1.3 0.2 nan]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  nan 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:03<00:00, 10.12it/s]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.         1.4        0.39865521 0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         3.31161857 1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.516, 'col_4_error_rate': 0.16, 'total_error_rate': 0.16, 'clf_aucroc': 1.0}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[5.1 3.5 1.4 nan 0. ]
 [4.6 3.1 1.5 nan 0. ]
 [nan nan 1.4 0.2 0. ]
 [5.4 nan nan 0.4 0. ]
 [4.6 nan 1.4 0.3 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:03<00:00,  9.25it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.36513245 0.        ]
 [4.6        3.1        1.5        0.09019958 0.        ]
 [5.56800795 3.26151705 1.4        0.2        0.        ]
 [5.4        3.16853523 1.56374443 0.4        0.        ]
 [4.6        3.18802977 1.4        0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.3655, 'col_4_error_rate': 0.12, 'total_error_rate': 0.12, 'clf_aucroc': 0.9957}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[nan 3.5 1.4 0.2 nan]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 nan]
 [4.6 3.1 1.5 nan 0. ]
 [5.  3.6 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:03<00:00,  9.47it/s]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[5.58920479 3.5        1.4        0.2        0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.03810065 0.        ]
 [5.         3.6        1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.2709, 'col_4_error_rate': 0.12, 'total_error_rate': 0.12, 'clf_aucroc': 0.9897}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [5.  3.6 nan 0.2 0. ]
 [5.4 3.9 1.7 0.4 nan]
 [4.6 3.4 nan 0.3 nan]]


100%|███████████████████████████████████████████| 34/34 [00:03<00:00,  9.70it/s]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[4.9        3.         1.4        0.2        0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [5.         3.6        1.84961939 0.2        0.        ]
 [5.4        3.9        1.7        0.4        0.        ]
 [4.6        3.4        2.21190858 0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.3211, 'col_4_error_rate': 0.3333, 'total_error_rate': 0.3333, 'clf_aucroc': 1.0}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 nan]
 [4.9 nan nan 0.2 nan]
 [4.7 3.2 nan 0.2 0. ]
 [4.6 3.1 1.5 0.2 nan]
 [4.6 nan nan 0.3 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:03<00:00,  9.75it/s]

Completed imputation for fold 5:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.03260398 4.11964464 0.2        0.        ]
 [4.7        3.2        3.85266042 0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [4.6        3.13040543 3.33251834 0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.5309, 'col_4_error_rate': 0.3636, 'total_error_rate': 0.3636, 'clf_aucroc': 1.0}
Average nrmse for 20% missing data: 0.40088
Average col_4_error_rate for 20% missing data: 0.21938
Average total_error_rate for 20% missing data: 0.21938
Average clf_aucroc for 20% missing data: 0.9970800000000001
Completed all iterations





In [None]:
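# Pasted array output kept for reference (first 10 rows of the iris array).
# It is commented out because the raw text is not valid Python.
# [[5.1 3.5 1.4 0.2 0. ]
#  [4.9 3.  1.4 0.2 0. ]
#  [4.7 3.2 1.3 0.2 0. ]
#  [4.6 3.1 1.5 0.2 0. ]
#  [5.  3.6 1.4 0.2 0. ]
#  [5.4 3.9 1.7 0.4 0. ]
#  [4.6 3.4 1.4 0.3 0. ]
#  [5.  3.4 1.5 0.2 0. ]
#  [4.4 2.9 1.4 0.2 0. ]
#  [4.9 3.1 1.5 0.1 0. ]]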

In [12]:
from sklearn.datasets import load_iris
import numpy as np

# Fetch the Iris bunch and pull out the feature matrix and the class labels.
iris = load_iris()
X = iris.data
y = iris.target

# Append the labels as one extra trailing column next to the features.
X_origin = np.hstack((X, y[:, np.newaxis]))

# Show the first 10 rows of the combined array.
print(X_origin[:10])


[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [4.6 3.4 1.4 0.3 0. ]
 [5.  3.4 1.5 0.2 0. ]
 [4.4 2.9 1.4 0.2 0. ]
 [4.9 3.1 1.5 0.1 0. ]]
