In [1]:
# Environment probe: print NVIDIA GPU status via the shell. The command is
# simply not found on machines without the NVIDIA tools installed.
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [2]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the running kernel rather than whatever `pip` is on PATH.
%pip install missingpy



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
import math

In [4]:
import sys

import sklearn.neighbors._base

# Register the private module under the old public path `sklearn.neighbors.base`.
# Presumably needed so that the later `missingpy` import can resolve that path;
# confirm against the installed scikit-learn / missingpy versions.
# `sys` is imported in this cell so it runs on a fresh kernel.
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

In [5]:
import sys
import numpy as np
import pandas as pd

from missingpy import MissForest
from dsan import Imputer as DSANImputer
from utils import *

import warnings
warnings.filterwarnings('ignore')

In [6]:
# Experiment seed, handed to set_seed (brought in by the `utils` star import).
# Presumably this seeds the random generators used later (e.g. the
# np.random.shuffle of fold indices) — confirm in utils.
seed = 128
set_seed(seed)

In [7]:
def get_imputer(config):
    """Build the imputer selected by ``config['model']``.

    Parameters
    ----------
    config : dict
        Must contain the key ``'model'``. For ``'DSAN'`` it must also contain
        ``rep_dim``, ``num_heads``, ``n_hidden``, ``lr``, ``weight_decay``,
        ``batch_size``, ``epochs``, ``noise_percent`` and ``stopped_epoch``,
        which are forwarded by name to ``DSANImputer``.

    Returns
    -------
    object or None
        A ``DSANImputer`` for ``'DSAN'``, a ``MissForest(random_state=0)`` for
        ``'MissForest'``, and ``None`` for ``'Statistics'`` (no imputer object
        is built for that model here).

    Raises
    ------
    KeyError
        If ``'model'`` or a required DSAN hyper-parameter is missing.
    ValueError
        If ``config['model']`` is not one of the supported names. Previously
        this fell through to ``return imputer`` with the local unbound.
    """
    model = config['model']

    if model == 'DSAN':
        # Keyword names expected by DSANImputer, read from config in this order.
        dsan_keys = (
            'rep_dim',
            'num_heads',
            'n_hidden',
            'lr',
            'weight_decay',
            'batch_size',
            'epochs',
            'noise_percent',
            'stopped_epoch',
        )
        return DSANImputer(**{key: config[key] for key in dsan_keys})

    if model == 'MissForest':
        return MissForest(random_state=0)

    if model == 'Statistics':
        return None

    raise ValueError(
        "Unknown imputer model {!r}; expected 'DSAN', 'MissForest' or "
        "'Statistics'".format(model)
    )

In [8]:
def experiment_run(data_name, config, test=False, k=5):
    """Run a k-fold imputation benchmark on one dataset and average the metrics.

    For every missing rate in 5, 10, 15 and 20 percent, and for each of the
    ``k`` folds, values are masked out of the training split, the split is
    imputed with the model chosen by ``config`` (see ``get_imputer``), and
    the result is scored with ``cal_metric_numpy`` plus the score returned by
    ``ex_classify`` on the held-out split (stored under ``'clf_aucroc'``).

    Parameters
    ----------
    data_name : str
        Dataset identifier passed to ``get_data`` and ``ex_classify``
        (e.g. ``'iris'``).
    config : dict
        Imputer configuration forwarded to ``get_imputer``.
    test : bool, optional
        When true, only the first 1000 rows of the data are used.
    k : int, optional
        Number of folds. Defaults to 5, the value that used to be hard-coded.

    Returns
    -------
    dict
        Maps each missing percentage (int) to a dict of
        ``metric name -> mean over the folds``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 2 or the dataset has fewer than ``k`` rows.
    NotImplementedError
        If ``get_imputer`` returns ``None`` for the configured model, since
        there is then no object to call ``fit_transform`` on.
    """
    if k < 2:
        raise ValueError("k must be at least 2 for k-fold validation, got {}".format(k))

    # Retrieve dataset
    dataset = get_data(data_name)
    print(f"Retrieved dataset for {data_name}")

    # Limit to 1000 samples if test flag is true
    if test:
        dataset = (dataset[0][:1000], dataset[1], dataset[2], dataset[3])
        print("Limited dataset to 1000 samples for testing")

    percent_missing_lst = list(range(5, 25, 5))
    # The second tuple element (column count) is not used by this function.
    X_origin, _, num_vars, cat_vars = dataset

    # Apply category mapping. The result is captured separately so that a
    # helper that works in place (and returns None) does not wipe out X_origin.
    mapped = category_mapping(X_origin, cat_vars)
    print("Applied category mapping")

    if mapped is not None:
        X_origin = mapped
        print("X_origin is replaced with a new version where categorical variables have been mapped to numerical values.")
    else:
        print("X_origin is modified in-place, with categorical variables mapped to numerical values.")

    # Print data size
    print("Data Size: {}".format(X_origin.shape))

    n_sample = X_origin.shape[0]
    n_valid = n_sample // k
    if n_valid == 0:
        # Previously surfaced as "range() arg 3 must not be zero".
        raise ValueError(
            "Dataset has {} rows, which is fewer than k={} folds".format(n_sample, k)
        )

    idx_lst = list(range(n_sample))
    np.random.shuffle(idx_lst)
    all_idx = set(idx_lst)

    # k-fold validation setup
    print(f"Starting k-fold validation with k={k}")

    results = {}
    for percent_missing in percent_missing_lst:
        result = []
        print(f"\nProcessing {percent_missing}% missing data")

        # Exactly k folds of n_valid rows each. Any remainder rows
        # (n_sample % k) are never held out and stay in every training split.
        for it in range(k):
            print(f"\nFold {it + 1}")
            start = it * n_valid
            valid_idx = idx_lst[start: start + n_valid]

            # Row order of the training split follows set iteration order,
            # not the shuffled order of idx_lst.
            train_idx = list(all_idx - set(valid_idx))
            X = X_origin[train_idx, :]
            X_test = X_origin[valid_idx, :]

            # Generating incomplete data
            missing_mask = generate_missing_mask(X, percent_missing=percent_missing)
            X_incomplete = X.copy()
            X_incomplete[missing_mask] = np.nan
            print("Generated incomplete data with missing values for fold {}:".format(it+1))
            print("Original data with missing values (subset):")
            print(X_incomplete[:5])  # Adjust the number of rows as needed

            # Imputation: a fresh imputer is built for every fold.
            imputer = get_imputer(config)
            if imputer is None:
                raise NotImplementedError(
                    "get_imputer returned None for model {!r}; experiment_run "
                    "needs an object with fit_transform".format(config.get('model'))
                )
            X_imputed = imputer.fit_transform(X_incomplete, cat_vars=cat_vars)
            print("Completed imputation for fold {}:".format(it+1))

            # Keep the observed values and take only the masked cells from the imputer.
            X_res = X.copy()
            X_res[missing_mask] = X_imputed[missing_mask]
            print("X_res with imputed values (subset):")
            print(X_res[:5])  # Adjust the number of rows as needed

            # Calculate metrics
            metric = cal_metric_numpy(X_res, X, missing_mask, num_vars, cat_vars)
            score = ex_classify(data_name=data_name, train_array=X_res, test_array=X_test, num_vars=num_vars, cat_vars=cat_vars)

            metric['clf_aucroc'] = round(score, 4)
            result.append(metric)
            print("Metrics for this fold:", metric)

        # Average results (metric names are taken from the first fold)
        avg_result = dict()
        for key in result[0].keys():
            value = np.mean([m[key] for m in result])
            avg_result[key] = value
            print(f"Average {key} for {percent_missing}% missing data: {value}")

        results[percent_missing] = avg_result

    print("Completed all iterations")
    return results


In [9]:
# Run configuration. 'model' selects the imputer in get_imputer; the other
# entries are the hyper-parameters it forwards by name to DSANImputer.
config = dict(
    model='DSAN',
    rep_dim=32,
    num_heads=8,
    n_hidden=2,
    lr=3e-3,
    weight_decay=1e-5,
    batch_size=128,
    epochs=25,
    noise_percent=10,
    stopped_epoch=25,
)

In [10]:
# Imported here so the cell does not depend on `os` leaking in through a star import.
import os

data_name_lst = ['iris']

result_dir = './results_iris'
# exist_ok=True keeps the cell re-runnable: without it a second run raises
# FileExistsError because the directory was created by the first run.
os.makedirs(result_dir, exist_ok=True)

# Run the benchmark for each dataset and write one JSON file per dataset
# (an existing file with the same name is written again).
for data_name in data_name_lst:
    results = experiment_run(data_name, config, test=True)
    # Store the configuration and dataset name next to the metrics.
    results['config'] = config
    results['dataset'] = data_name
    result_path = os.path.join(result_dir, '{}.json'.format(data_name))
    write_json(results, result_path)

Retrieved dataset for iris
Limited dataset to 1000 samples for testing
Applied category mapping
X_origin is replaced with a new version where categorical variables have been mapped to numerical values.
Data Size: (150, 5)
Starting k-fold validation with k=5

Processing 5% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[5.1 nan 1.4 0.2 0. ]
 [4.9 3.  nan 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]]


100%|███████████████████████████████████████████| 25/25 [00:01<00:00, 21.65it/s]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[5.1        3.75170827 1.4        0.2        0.        ]
 [4.9        3.         1.43910265 0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]
 [5.4        3.9        1.7        0.4        0.        ]]
Metrics for this fold: {'nrmse': 0.4058, 'col_4_error_rate': 0.0, 'total_error_rate': 0.0, 'clf_aucroc': 1.0}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[5.1 3.5 1.4 nan 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 nan 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 25/25 [00:01<00:00, 23.13it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.58063442 0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [4.6        3.1        1.39080715 0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.3009, 'col_4_error_rate': 0.125, 'total_error_rate': 0.125, 'clf_aucroc': 1.0}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[5.1 nan 1.4 0.2 0. ]
 [4.9 3.  1.4 nan 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [4.6 3.4 1.4 0.3 0. ]]


100%|███████████████████████████████████████████| 25/25 [00:01<00:00, 23.13it/s]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[5.1        3.46124196 1.4        0.2        0.        ]
 [4.9        3.         1.4        0.26106587 0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]
 [4.6        3.4        1.4        0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.3595, 'col_4_error_rate': 0.5, 'total_error_rate': 0.5, 'clf_aucroc': 1.0}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 nan 1.3 0.2 0. ]
 [4.6 nan 1.5 0.2 0. ]
 [nan 3.9 1.7 0.4 0. ]]


100%|███████████████████████████████████████████| 25/25 [00:00<00:00, 27.08it/s]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [4.7        2.91856265 1.3        0.2        0.        ]
 [4.6        2.90951705 1.5        0.2        0.        ]
 [5.13755751 3.9        1.7        0.4        0.        ]]
Metrics for this fold: {'nrmse': 0.475, 'col_4_error_rate': 0.0, 'total_error_rate': 0.0, 'clf_aucroc': 0.9962}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[4.7 nan 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [4.6 3.4 1.4 0.3 0. ]]


100%|███████████████████████████████████████████| 25/25 [00:00<00:00, 31.77it/s]


Completed imputation for fold 5:
X_res with imputed values (subset):
[[4.7        3.61538339 1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]
 [5.4        3.9        1.7        0.4        0.        ]
 [4.6        3.4        1.4        0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.4154, 'col_4_error_rate': 0.0, 'total_error_rate': 0.0, 'clf_aucroc': 0.9944}
Average nrmse for 5% missing data: 0.39132
Average col_4_error_rate for 5% missing data: 0.125
Average total_error_rate for 5% missing data: 0.125
Average clf_aucroc for 5% missing data: 0.9981199999999999

Processing 10% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.6 nan 1.5 0.2 nan]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 nan 1.7 0.4 0. ]]


100%|███████████████████████████████████████████| 25/25 [00:00<00:00, 31.84it/s]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [4.6        3.17579889 1.5        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]
 [5.4        3.14000869 1.7        0.4        0.        ]]
Metrics for this fold: {'nrmse': 0.3288, 'col_4_error_rate': 0.25, 'total_error_rate': 0.25, 'clf_aucroc': 1.0}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 nan 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 25/25 [00:00<00:00, 30.86it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [4.7        3.2        1.3        0.71018195 0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.4467, 'col_4_error_rate': 0.2727, 'total_error_rate': 0.2727, 'clf_aucroc': 0.9877}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [4.6 3.4 1.4 nan 0. ]]


100%|███████████████████████████████████████████| 25/25 [00:00<00:00, 25.32it/s]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[ 5.1         3.5         1.4         0.2         0.        ]
 [ 4.9         3.          1.4         0.2         0.        ]
 [ 4.7         3.2         1.3         0.2         0.        ]
 [ 5.          3.6         1.4         0.2         0.        ]
 [ 4.6         3.4         1.4        -0.18403861  0.        ]]
Metrics for this fold: {'nrmse': 0.4587, 'col_4_error_rate': 0.0909, 'total_error_rate': 0.0909, 'clf_aucroc': 1.0}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[5.1 3.5 nan 0.2 0. ]
 [nan 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.4 nan 1.7 0.4 0. ]]


100%|███████████████████████████████████████████| 25/25 [00:00<00:00, 32.64it/s]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[5.1        3.5        2.39724374 0.2        0.        ]
 [4.98875856 3.         1.4        0.2        0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.4        3.47465658 1.7        0.4        0.        ]]
Metrics for this fold: {'nrmse': 0.4452, 'col_4_error_rate': 0.0, 'total_error_rate': 0.0, 'clf_aucroc': 0.9962}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[4.7 3.2 1.3 0.2 nan]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [4.6 3.4 1.4 0.3 0. ]]


100%|███████████████████████████████████████████| 25/25 [00:00<00:00, 30.59it/s]


Completed imputation for fold 5:
X_res with imputed values (subset):
[[4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [4.6 3.4 1.4 0.3 0. ]]
Metrics for this fold: {'nrmse': 0.4717, 'col_4_error_rate': 0.0, 'total_error_rate': 0.0, 'clf_aucroc': 0.9806}
Average nrmse for 10% missing data: 0.43022
Average col_4_error_rate for 10% missing data: 0.12271999999999998
Average total_error_rate for 10% missing data: 0.12271999999999998
Average clf_aucroc for 10% missing data: 0.9929

Processing 15% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[nan 3.5 1.4 0.2 0. ]
 [nan 3.  1.4 0.2 0. ]
 [4.6 nan 1.5 0.2 0. ]
 [5.  nan 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]]


100%|███████████████████████████████████████████| 25/25 [00:00<00:00, 31.16it/s]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[4.89763737 3.5        1.4        0.2        0.        ]
 [4.56990671 3.         1.4        0.2        0.        ]
 [4.6        3.02295446 1.5        0.2        0.        ]
 [5.         3.22872686 1.4        0.2        0.        ]
 [5.4        3.9        1.7        0.4        0.        ]]
Metrics for this fold: {'nrmse': 0.2947, 'col_4_error_rate': 0.2667, 'total_error_rate': 0.2667, 'clf_aucroc': 1.0}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 nan nan 0.2 0. ]
 [4.7 3.2 1.3 0.2 nan]
 [4.6 nan 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 25/25 [00:00<00:00, 28.78it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.64620447 1.69280863 0.2        0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [4.6        3.69085932 1.5        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.3769, 'col_4_error_rate': 0.125, 'total_error_rate': 0.125, 'clf_aucroc': 1.0}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[5.1 nan nan 0.2 nan]
 [nan 3.  nan nan 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [4.6 3.4 1.4 0.3 0. ]]


100%|███████████████████████████████████████████| 25/25 [00:00<00:00, 27.52it/s]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[5.1        3.44331503 4.01731014 0.2        0.        ]
 [6.31239319 3.         3.13163805 0.03874445 0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]
 [4.6        3.4        1.4        0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.5437, 'col_4_error_rate': 0.0833, 'total_error_rate': 0.0833, 'clf_aucroc': 0.9948}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[5.1 nan nan 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 nan 1.3 0.2 0. ]
 [4.6 3.1 1.5 nan 0. ]
 [5.4 3.9 1.7 0.4 0. ]]


100%|███████████████████████████████████████████| 25/25 [00:00<00:00, 29.90it/s]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[ 5.1         3.33763075  2.56024337  0.2         0.        ]
 [ 4.9         3.          1.4         0.2         0.        ]
 [ 4.7         3.26025343  1.3         0.2         0.        ]
 [ 4.6         3.1         1.5        -0.09623729  0.        ]
 [ 5.4         3.9         1.7         0.4         0.        ]]
Metrics for this fold: {'nrmse': 0.4043, 'col_4_error_rate': 0.0, 'total_error_rate': 0.0, 'clf_aucroc': 0.9924}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[4.7 3.2 1.3 nan 0. ]
 [4.6 3.1 nan 0.2 nan]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 nan]
 [4.6 3.4 1.4 0.3 0. ]]


100%|███████████████████████████████████████████| 25/25 [00:01<00:00, 24.77it/s]


Completed imputation for fold 5:
X_res with imputed values (subset):
[[4.7        3.2        1.3        1.18041515 0.        ]
 [4.6        3.1        2.36659956 0.2        1.        ]
 [5.         3.6        1.4        0.2        0.        ]
 [5.4        3.9        1.7        0.4        0.        ]
 [4.6        3.4        1.4        0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.3857, 'col_4_error_rate': 0.2, 'total_error_rate': 0.2, 'clf_aucroc': 0.9889}
Average nrmse for 15% missing data: 0.40106
Average col_4_error_rate for 15% missing data: 0.135
Average total_error_rate for 15% missing data: 0.135
Average clf_aucroc for 15% missing data: 0.99522

Processing 20% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [nan 3.  1.4 0.2 0. ]
 [nan 3.1 1.5 nan 0. ]
 [nan 3.6 1.4 nan nan]
 [5.4 3.9 nan 0.4 0. ]]


100%|███████████████████████████████████████████| 25/25 [00:00<00:00, 25.28it/s]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [5.66219234 3.         1.4        0.2        0.        ]
 [5.47725773 3.1        1.5        0.15902828 0.        ]
 [5.72588539 3.6        1.4        0.17855726 0.        ]
 [5.4        3.9        2.67292333 0.4        0.        ]]
Metrics for this fold: {'nrmse': 0.3683, 'col_4_error_rate': 0.28, 'total_error_rate': 0.28, 'clf_aucroc': 0.9924}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[nan nan nan 0.2 0. ]
 [4.9 nan 1.4 0.2 nan]
 [4.7 nan 1.3 nan 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 nan 0. ]]


100%|███████████████████████████████████████████| 25/25 [00:01<00:00, 23.34it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[5.67516947 3.6136992  1.36309814 0.2        0.        ]
 [4.9        3.73414469 1.4        0.2        0.        ]
 [4.7        3.5907011  1.3        0.12332743 0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         3.6        1.4        0.27012599 0.        ]]
Metrics for this fold: {'nrmse': 0.4741, 'col_4_error_rate': 0.2308, 'total_error_rate': 0.2308, 'clf_aucroc': 0.9728}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[5.1 nan nan 0.2 0. ]
 [4.9 3.  nan 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [4.6 3.4 1.4 nan nan]]


100%|███████████████████████████████████████████| 25/25 [00:00<00:00, 27.63it/s]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[5.1        2.87859058 3.50134683 0.2        0.        ]
 [4.9        3.         2.75997519 0.2        0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]
 [4.6        3.4        1.4        0.2999137  0.        ]]
Metrics for this fold: {'nrmse': 0.4014, 'col_4_error_rate': 0.0909, 'total_error_rate': 0.0909, 'clf_aucroc': 1.0}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 nan 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 nan]
 [5.4 3.9 1.7 0.4 nan]]


100%|███████████████████████████████████████████| 25/25 [00:01<00:00, 23.90it/s]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[ 5.1         3.5         1.4         0.2         0.        ]
 [ 4.9         3.          1.4        -0.52324694  0.        ]
 [ 4.7         3.2         1.3         0.2         0.        ]
 [ 4.6         3.1         1.5         0.2         0.        ]
 [ 5.4         3.9         1.7         0.4         0.        ]]
Metrics for this fold: {'nrmse': 0.4506, 'col_4_error_rate': 0.0882, 'total_error_rate': 0.0882, 'clf_aucroc': 0.9962}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  nan 1.4 0.2 0. ]
 [5.4 3.9 nan 0.4 0. ]
 [4.6 nan 1.4 0.3 nan]]


100%|███████████████████████████████████████████| 25/25 [00:01<00:00, 23.45it/s]


Completed imputation for fold 5:
X_res with imputed values (subset):
[[4.7        3.2        1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         3.46504998 1.4        0.2        0.        ]
 [5.4        3.9        1.6803509  0.4        0.        ]
 [4.6        3.0136838  1.4        0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.4621, 'col_4_error_rate': 0.375, 'total_error_rate': 0.375, 'clf_aucroc': 0.9778}
Average nrmse for 20% missing data: 0.43129999999999996
Average col_4_error_rate for 20% missing data: 0.21298
Average total_error_rate for 20% missing data: 0.21298
Average clf_aucroc for 20% missing data: 0.9878399999999999
Completed all iterations


In [None]:
# Pasted array printout kept for reference; it matches the rows printed by the
# iris cell below. Stored as comments because the raw printout is not valid
# Python and would stop a "Run All".
# [[5.1 3.5 1.4 0.2 0. ]
#  [4.9 3.  1.4 0.2 0. ]
#  [4.7 3.2 1.3 0.2 0. ]
#  [4.6 3.1 1.5 0.2 0. ]
#  [5.  3.6 1.4 0.2 0. ]
#  [5.4 3.9 1.7 0.4 0. ]
#  [4.6 3.4 1.4 0.3 0. ]
#  [5.  3.4 1.5 0.2 0. ]
#  [4.4 2.9 1.4 0.2 0. ]
#  [4.9 3.1 1.5 0.1 0. ]]

In [12]:
from sklearn.datasets import load_iris
import numpy as np

# Fetch the Iris data: feature matrix and class labels.
iris = load_iris()
X, y = iris.data, iris.target

# Append the labels as a final column next to the features.
X_origin = np.column_stack((X, y))

# Show the first 10 rows of the combined array.
print(X_origin[:10])


[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [4.6 3.4 1.4 0.3 0. ]
 [5.  3.4 1.5 0.2 0. ]
 [4.4 2.9 1.4 0.2 0. ]
 [4.9 3.1 1.5 0.1 0. ]]
