In [13]:
# Environment probe: shells out to the NVIDIA `nvidia-smi` tool to report GPU status.
# The saved output below shows the command was not found on the machine that ran this.
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [14]:
!pip install missingpy



In [15]:
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import roc_auc_score

In [16]:
# `sys` must be imported in this cell: on a fresh kernel nothing above has
# imported it yet, so the assignment below would raise NameError.
import sys

import sklearn.neighbors._base

# Register the private `_base` module under the legacy dotted name so that a
# later `import sklearn.neighbors.base` resolves to it. Presumably needed by
# missingpy, which is imported in a following cell -- confirm.
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

In [17]:
import sys
import numpy as np
import pandas as pd

from missingpy import MissForest
from dsan import Imputer as DSANImputer
from utils import *

import warnings
warnings.filterwarnings('ignore')

In [25]:
# Fixed seed for this run. `set_seed` is not defined in this notebook; it comes
# from the star import of `utils` (presumably seeds the random number
# generators used by the experiment -- confirm in utils).
seed = 16
set_seed(seed)

In [26]:
def get_imputer(config):
    """Build the imputer selected by ``config['model']``.

    Parameters
    ----------
    config : dict
        Must contain the key ``'model'``. When the model is ``'DSAN'`` it must
        also contain ``rep_dim``, ``num_heads``, ``n_hidden``, ``lr``,
        ``weight_decay``, ``batch_size``, ``epochs``, ``noise_percent`` and
        ``stopped_epoch``; these are forwarded unchanged to ``DSANImputer``
        as keyword arguments.

    Returns
    -------
    object or None
        A new ``DSANImputer`` for ``'DSAN'``, a new
        ``MissForest(random_state=0)`` for ``'MissForest'``, or ``None`` for
        ``'Statistics'`` (no imputer object is built for that model here).

    Raises
    ------
    KeyError
        If ``'model'`` or a required DSAN hyper-parameter is missing.
    ValueError
        If ``config['model']`` is not one of the supported names. Previously
        this case failed with an UnboundLocalError at the return statement.
    """
    model = config['model']

    if model == 'DSAN':
        dsan_keys = (
            'rep_dim',
            'num_heads',
            'n_hidden',
            'lr',
            'weight_decay',
            'batch_size',
            'epochs',
            'noise_percent',
            'stopped_epoch',
        )
        return DSANImputer(**{key: config[key] for key in dsan_keys})

    if model == 'MissForest':
        return MissForest(random_state=0)

    if model == 'Statistics':
        return None

    raise ValueError(
        "Unknown model {!r}; expected one of 'DSAN', 'MissForest', 'Statistics'".format(model)
    )

In [27]:
def experiment_run(data_name, config, test=False):
    """Run the k-fold imputation experiment for one dataset.

    For every missing-data percentage in 5, 10, 15, 20 the samples are split
    into k=5 folds. In each fold a missing mask is generated on the training
    rows, the masked entries are imputed with the imputer built by
    ``get_imputer(config)``, and the result is scored with
    ``cal_metric_numpy`` plus a downstream classification score from
    ``ex_classify`` (trained on the imputed rows, evaluated on the untouched
    held-out fold). Progress and sample rows are printed along the way.

    Parameters
    ----------
    data_name : str
        Dataset identifier passed to ``get_data`` and ``ex_classify``.
    config : dict
        Imputer configuration, see ``get_imputer``.
    test : bool, default False
        When true, only the first 1000 samples of the dataset are used.

    Returns
    -------
    dict
        Maps each missing percentage to a dict of metrics averaged over the
        folds (the keys returned by ``cal_metric_numpy`` plus
        ``'clf_aucroc'``).

    Raises
    ------
    ValueError
        If the dataset has fewer samples than folds, or if ``get_imputer``
        yields no imputer object for the configured model.
    """
    # Retrieve dataset
    dataset = get_data(data_name)
    print(f"Retrieved dataset for {data_name}")

    # Limit to 1000 samples if test flag is true
    if test:
        dataset = (dataset[0][:1000], dataset[1], dataset[2], dataset[3])
        print("Limited dataset to 1000 samples for testing")

    k = 5
    percent_missing_lst = list(range(5, 25, 5))
    X_origin, _n_col, num_vars, cat_vars = dataset

    # Apply category mapping. Keep the original array when the helper works
    # in place and returns None, so the code below never operates on None.
    mapped = category_mapping(X_origin, cat_vars)
    print("Applied category mapping")

    if mapped is not None:
        X_origin = mapped
        print("X_origin is replaced with a new version where categorical variables have been mapped to numerical values.")
    else:
        print("X_origin is modified in-place, with categorical variables mapped to numerical values.")

    # Print data size
    print("Data Size: {}".format(X_origin.shape))

    n_sample = X_origin.shape[0]
    idx_lst = list(range(n_sample))
    np.random.shuffle(idx_lst)

    # k-fold validation setup
    n_valid = len(idx_lst) // k
    if n_valid == 0:
        # A zero fold size would otherwise surface as an opaque
        # "range() arg 3 must not be zero" error inside the fold loop.
        raise ValueError(
            "Need at least {} samples for {}-fold validation, got {}".format(k, k, n_sample)
        )
    print(f"Starting k-fold validation with k={k}")

    results = {}
    for percent_missing in percent_missing_lst:
        result = []
        print(f"\nProcessing {percent_missing}% missing data")

        for it, n in enumerate(range(0, n_sample, n_valid)):
            print(f"\nFold {it + 1}")
            valid_idx = idx_lst[n: n+n_valid]

            # The trailing remainder (n_sample not divisible by k) is not a full fold.
            if len(valid_idx) < n_valid:
                print("Insufficient data for validation, breaking loop")
                break

            train_idx = list(set(idx_lst) - set(valid_idx))
            X = X_origin[train_idx, :]
            X_test = X_origin[valid_idx, :]

            # Generating incomplete data
            missing_mask = generate_missing_mask(X, percent_missing=percent_missing)
            X_incomplete = X.copy()
            X_incomplete[missing_mask] = np.nan
            print("Generated incomplete data with missing values for fold {}:".format(it+1))
            print("Original data with missing values (subset):")
            print(X_incomplete[:5])  # Adjust the number of rows as needed

            # Imputation (a fresh imputer is built for every fold)
            imputer = get_imputer(config)
            if imputer is None:
                raise ValueError(
                    "Model {!r} provides no imputer object to fit".format(config.get('model'))
                )
            X_imputed = imputer.fit_transform(X_incomplete, cat_vars=cat_vars)
            print("Completed imputation for fold {}:".format(it+1))

            # Only the masked cells take imputed values; observed cells keep the truth.
            X_res = X.copy()
            X_res[missing_mask] = X_imputed[missing_mask]
            print("X_res with imputed values (subset):")
            print(X_res[:5])  # Adjust the number of rows as needed

            # Calculate metrics
            metric = cal_metric_numpy(X_res, X, missing_mask, num_vars, cat_vars)
            score = ex_classify(data_name=data_name, train_array=X_res, test_array=X_test, num_vars=num_vars, cat_vars=cat_vars)

            metric['clf_aucroc'] = round(score, 4)
            result.append(metric)
            print("Metrics for this fold:", metric)

        # Average results
        avg_result = dict()
        for key in result[0].keys():
            value = np.mean([m[key] for m in result])
            avg_result[key] = value
            print(f"Average {key} for {percent_missing}% missing data: {value}")

        results[percent_missing] = avg_result

    print("Completed all iterations")
    return results


In [28]:
# Experiment settings consumed by get_imputer: `model` selects the imputer and
# every other entry is forwarded to DSANImputer as a keyword argument.
config = dict(
    model='DSAN',
    rep_dim=32,
    num_heads=4,
    n_hidden=2,
    lr=3e-3,
    weight_decay=1e-5,
    batch_size=16,
    epochs=24,
    noise_percent=10,
    stopped_epoch=15,
)

In [29]:
# Datasets to evaluate; one JSON result file is written per dataset.
data_name_lst = ['iris']

result_dir = './results_iris_'
# exist_ok=True keeps this cell re-runnable: without it a second execution
# fails with FileExistsError because the directory was created the first time.
os.makedirs(result_dir, exist_ok=True)

for data_name in data_name_lst:
    results = experiment_run(data_name, config, test=True)
    # Store the settings and dataset name next to the metrics.
    results['config'] = config
    results['dataset'] = data_name
    result_path = os.path.join(result_dir, '{}.json'.format(data_name))
    write_json(results, result_path)

Retrieved dataset for iris
Limited dataset to 1000 samples for testing
Applied category mapping
X_origin is replaced with a new version where categorical variables have been mapped to numerical values.
Data Size: (150, 5)
Starting k-fold validation with k=5

Processing 5% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 nan]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]]


100%|███████████████████████████████████████████| 24/24 [00:07<00:00,  3.30it/s]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]]
Metrics for this fold: {'nrmse': 0.2509, 'col_4_error_rate': 0.3, 'total_error_rate': 0.3, 'clf_aucroc': 0.9583}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 nan 0.2 0. ]
 [4.6 3.1 nan 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 24/24 [00:05<00:00,  4.29it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [4.7        3.2        0.7664696  0.2        0.        ]
 [4.6        3.1        0.78110659 0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.3524, 'col_4_error_rate': 0.0, 'total_error_rate': 0.0, 'clf_aucroc': 1.0}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [nan 3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [4.6 3.4 1.4 0.3 0. ]
 [5.  3.4 1.5 0.2 0. ]]


100%|███████████████████████████████████████████| 24/24 [00:04<00:00,  5.19it/s]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [5.85604286 3.6        1.4        0.2        0.        ]
 [5.4        3.9        1.7        0.4        0.        ]
 [4.6        3.4        1.4        0.3        0.        ]
 [5.         3.4        1.5        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.4532, 'col_4_error_rate': 0.25, 'total_error_rate': 0.25, 'clf_aucroc': 0.9949}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 nan]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 24/24 [00:05<00:00,  4.21it/s]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]]
Metrics for this fold: {'nrmse': 0.3338, 'col_4_error_rate': 0.0, 'total_error_rate': 0.0, 'clf_aucroc': 1.0}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 nan 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]]


100%|███████████████████████████████████████████| 24/24 [00:05<00:00,  4.11it/s]


Completed imputation for fold 5:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [4.7        3.2        2.59686303 0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.4        3.9        1.7        0.4        0.        ]]
Metrics for this fold: {'nrmse': 0.3312, 'col_4_error_rate': 0.0, 'total_error_rate': 0.0, 'clf_aucroc': 1.0}
Average nrmse for 5% missing data: 0.34429999999999994
Average col_4_error_rate for 5% missing data: 0.11000000000000001
Average total_error_rate for 5% missing data: 0.11000000000000001
Average clf_aucroc for 5% missing data: 0.99064

Processing 10% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 nan 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [nan 3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]]


100%|███████████████████████████████████████████| 24/24 [00:06<00:00,  3.51it/s]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[4.9        3.         1.4        0.2        0.        ]
 [4.7        3.2        2.46942759 0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.32539272 3.6        1.4        0.2        0.        ]
 [5.4        3.9        1.7        0.4        0.        ]]
Metrics for this fold: {'nrmse': 0.3713, 'col_4_error_rate': 0.0833, 'total_error_rate': 0.0833, 'clf_aucroc': 0.9792}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 nan]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  nan 1.4 0.2 nan]]


100%|███████████████████████████████████████████| 24/24 [00:05<00:00,  4.59it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         3.19761109 1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.3691, 'col_4_error_rate': 0.3, 'total_error_rate': 0.3, 'clf_aucroc': 0.9943}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[nan 3.5 1.4 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 nan]
 [4.6 3.4 1.4 0.3 0. ]
 [5.  nan 1.5 0.2 nan]]


100%|███████████████████████████████████████████| 24/24 [00:04<00:00,  5.09it/s]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[6.173985   3.5        1.4        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]
 [5.4        3.9        1.7        0.4        0.        ]
 [4.6        3.4        1.4        0.3        0.        ]
 [5.         3.88234782 1.5        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.388, 'col_4_error_rate': 0.2, 'total_error_rate': 0.2, 'clf_aucroc': 0.9949}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 nan]
 [4.9 3.  1.4 0.2 0. ]
 [nan 3.2 1.3 0.2 nan]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 24/24 [00:06<00:00,  3.58it/s]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [6.19455338 3.2        1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.4587, 'col_4_error_rate': 0.2143, 'total_error_rate': 0.2143, 'clf_aucroc': 1.0}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 nan 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.4 nan 1.7 0.4 0. ]]


100%|███████████████████████████████████████████| 24/24 [00:06<00:00,  3.72it/s]


Completed imputation for fold 5:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [4.7        3.12901568 1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.4        3.11431241 1.7        0.4        0.        ]]
Metrics for this fold: {'nrmse': 0.3751, 'col_4_error_rate': 0.375, 'total_error_rate': 0.375, 'clf_aucroc': 1.0}
Average nrmse for 10% missing data: 0.39244
Average col_4_error_rate for 10% missing data: 0.23451999999999998
Average total_error_rate for 10% missing data: 0.23451999999999998
Average clf_aucroc for 10% missing data: 0.99368

Processing 15% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[4.9 3.  1.4 0.2 nan]
 [4.7 3.2 1.3 nan 0. ]
 [nan 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 nan 1.7 0.4 0. ]]


100%|███████████████████████████████████████████| 24/24 [00:07<00:00,  3.08it/s]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[4.9        3.         1.4        0.2        0.        ]
 [4.7        3.2        1.3        0.38348803 0.        ]
 [4.99124384 3.1        1.5        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]
 [5.4        3.30709147 1.7        0.4        0.        ]]
Metrics for this fold: {'nrmse': 0.4439, 'col_4_error_rate': 0.0588, 'total_error_rate': 0.0588, 'clf_aucroc': 0.9792}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 24/24 [00:03<00:00,  6.24it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]]
Metrics for this fold: {'nrmse': 0.4271, 'col_4_error_rate': 0.1765, 'total_error_rate': 0.1765, 'clf_aucroc': 1.0}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [nan 3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [4.6 3.4 1.4 0.3 0. ]
 [5.  3.4 1.5 0.2 0. ]]


100%|███████████████████████████████████████████| 24/24 [00:05<00:00,  4.59it/s]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [5.89756012 3.6        1.4        0.2        0.        ]
 [5.4        3.9        1.7        0.4        0.        ]
 [4.6        3.4        1.4        0.3        0.        ]
 [5.         3.4        1.5        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.4178, 'col_4_error_rate': 0.4118, 'total_error_rate': 0.4118, 'clf_aucroc': 0.9924}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 nan]
 [4.9 3.  nan 0.2 nan]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 nan 0.2 0. ]
 [5.  nan 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 24/24 [00:06<00:00,  3.80it/s]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.         3.21643472 0.2        0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [4.6        3.1        2.47179365 0.2        0.        ]
 [5.         2.93504977 1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.3664, 'col_4_error_rate': 0.1739, 'total_error_rate': 0.1739, 'clf_aucroc': 0.9921}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[5.1 nan 1.4 0.2 nan]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]]


100%|███████████████████████████████████████████| 24/24 [00:06<00:00,  3.55it/s]


Completed imputation for fold 5:
X_res with imputed values (subset):
[[5.1        3.33168864 1.4        0.2        0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.4        3.9        1.7        0.4        0.        ]]
Metrics for this fold: {'nrmse': 0.5, 'col_4_error_rate': 0.2143, 'total_error_rate': 0.2143, 'clf_aucroc': 0.9968}
Average nrmse for 15% missing data: 0.43104
Average col_4_error_rate for 15% missing data: 0.20705999999999997
Average total_error_rate for 15% missing data: 0.20705999999999997
Average clf_aucroc for 15% missing data: 0.9921000000000001

Processing 20% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 nan nan 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 nan]
 [nan 3.9 nan 0.4 0. ]]


100%|███████████████████████████████████████████| 24/24 [00:07<00:00,  3.02it/s]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[ 4.9         3.          1.4         0.2         0.        ]
 [ 4.7         3.2         3.63999605 -0.25879648  0.        ]
 [ 4.6         3.1         1.5         0.2         0.        ]
 [ 5.          3.6         1.4         0.2         0.        ]
 [ 5.82001066  3.9         4.56660175  0.4         0.        ]]
Metrics for this fold: {'nrmse': 0.5196, 'col_4_error_rate': 0.1154, 'total_error_rate': 0.1154, 'clf_aucroc': 0.9531}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[5.1 3.5 nan 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 24/24 [00:07<00:00,  3.36it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[5.1        3.5        3.42822433 0.2        0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.4324, 'col_4_error_rate': 0.45, 'total_error_rate': 0.45, 'clf_aucroc': 0.9943}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 nan]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [nan 3.4 1.4 0.3 nan]
 [5.  3.4 1.5 0.2 nan]]


100%|███████████████████████████████████████████| 24/24 [00:05<00:00,  4.15it/s]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]
 [5.4        3.9        1.7        0.4        0.        ]
 [6.03004932 3.4        1.4        0.3        0.        ]
 [5.         3.4        1.5        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.3991, 'col_4_error_rate': 0.0588, 'total_error_rate': 0.0588, 'clf_aucroc': 0.9949}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 nan]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 nan]
 [4.6 3.1 1.5 nan 0. ]
 [nan 3.6 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 24/24 [00:06<00:00,  3.90it/s]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.4604862  0.        ]
 [5.43366766 3.6        1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.3048, 'col_4_error_rate': 0.16, 'total_error_rate': 0.16, 'clf_aucroc': 1.0}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[5.1 nan 1.4 nan 0. ]
 [nan 3.  nan 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.4 nan nan 0.4 0. ]]


100%|███████████████████████████████████████████| 24/24 [00:05<00:00,  4.29it/s]

Completed imputation for fold 5:
X_res with imputed values (subset):
[[5.1        3.40554214 1.4        0.3570053  0.        ]
 [6.19895506 3.         3.89596438 0.2        0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.4        3.17141724 3.66284943 0.4        0.        ]]
Metrics for this fold: {'nrmse': 0.4604, 'col_4_error_rate': 0.1579, 'total_error_rate': 0.1579, 'clf_aucroc': 1.0}
Average nrmse for 20% missing data: 0.42325999999999997
Average col_4_error_rate for 20% missing data: 0.18842
Average total_error_rate for 20% missing data: 0.18842
Average clf_aucroc for 20% missing data: 0.9884599999999999
Completed all iterations





In [None]:
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [4.6 3.4 1.4 0.3 0. ]
 [5.  3.4 1.5 0.2 0. ]
 [4.4 2.9 1.4 0.2 0. ]
 [4.9 3.1 1.5 0.1 0. ]]

In [12]:
from sklearn.datasets import load_iris
import numpy as np

# Load the Iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Append the target labels as one extra trailing column after the features
X_origin = np.concatenate([X, y.reshape(-1, 1)], axis=1)

# Display the first 10 rows of the combined dataset
print(X_origin[:10, :])


[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [4.6 3.4 1.4 0.3 0. ]
 [5.  3.4 1.5 0.2 0. ]
 [4.4 2.9 1.4 0.2 0. ]
 [4.9 3.1 1.5 0.1 0. ]]
