In [1]:
!nvidia-smi
!pip install missingpy

zsh:1: command not found: nvidia-smi


In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
import math

In [13]:
import sys

import sklearn.neighbors._base

# Register the private module under the name 'sklearn.neighbors.base' so that a
# later `import sklearn.neighbors.base` resolves to it.
# NOTE(review): presumably needed by missingpy, which is imported in the next
# cell — confirm. `sys` is imported here because this cell runs before the cell
# that originally imported it.
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

In [14]:
import sys
import numpy as np
import pandas as pd

from missingpy import MissForest
from dsan import Imputer as DSANImputer
from utils import *

import warnings
# Installs a blanket 'ignore' filter: every warning category is silenced for the
# remainder of the kernel session, including deprecation and convergence warnings.
warnings.filterwarnings('ignore')

In [15]:
# Seed value used for the whole notebook run.
# NOTE(review): set_seed is not defined in this notebook; presumably it is
# provided by `from utils import *` — confirm which RNGs it actually seeds.
seed = 32
set_seed(seed)

In [16]:
def get_imputer(config):
    """Build the imputer selected by ``config['model']``.

    Args:
        config: Mapping with a ``'model'`` key. For ``'DSAN'`` it must also
            contain ``rep_dim``, ``num_heads``, ``n_hidden``, ``lr``,
            ``weight_decay``, ``batch_size``, ``epochs``, ``noise_percent``
            and ``stopped_epoch``, which are forwarded to ``DSANImputer``.

    Returns:
        A ``DSANImputer`` for ``'DSAN'``, a ``MissForest(random_state=0)`` for
        ``'MissForest'``, or ``None`` for ``'Statistics'`` (no imputer object
        is created for that model here).

    Raises:
        KeyError: If ``'model'`` or a required DSAN hyper-parameter is missing.
        ValueError: If ``config['model']`` is not one of the supported names.
    """
    model = config['model']

    if model == 'DSAN':
        dsan_keys = (
            'rep_dim',
            'num_heads',
            'n_hidden',
            'lr',
            'weight_decay',
            'batch_size',
            'epochs',
            'noise_percent',
            'stopped_epoch',
        )
        return DSANImputer(**{key: config[key] for key in dsan_keys})

    if model == 'MissForest':
        return MissForest(random_state=0)

    if model == 'Statistics':
        return None

    # Previously an unknown name fell through every branch and failed on
    # `return imputer` with an UnboundLocalError that hid the real cause.
    raise ValueError(
        "Unknown model {!r}; expected 'DSAN', 'MissForest' or 'Statistics'".format(model)
    )

In [17]:
def experiment_run(data_name, config, test=False, k=5, percent_missing_lst=None):
    """Run k-fold imputation experiments on one dataset for several missing rates.

    For every missing percentage and every fold, entries of the training split
    are masked, imputed with the imputer built by ``get_imputer(config)``,
    scored with ``cal_metric_numpy`` and ``ex_regress``, and the per-fold
    metrics are averaged.

    Args:
        data_name: Dataset identifier passed to ``get_data`` and ``ex_regress``.
        config: Imputer configuration consumed by ``get_imputer``.
        test: When true, only the first 1000 samples of the dataset are used.
        k: Number of folds (default 5, the previously hard-coded value).
        percent_missing_lst: Iterable of missing percentages to evaluate;
            defaults to ``[5, 10, 15, 20]``.

    Returns:
        dict mapping each missing percentage to a dict of fold-averaged metrics.

    Raises:
        ValueError: If ``k`` is not positive, the dataset has fewer than ``k``
            samples, or the configured model yields no imputer object.

    Note:
        Folds are contiguous slices of size ``n_sample // k`` of one shuffled
        index list; samples left over when ``n_sample`` is not divisible by
        ``k`` are never used for validation and stay in every training split.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {!r}".format(k))
    if percent_missing_lst is None:
        percent_missing_lst = list(range(5, 25, 5))

    # Retrieve dataset
    dataset = get_data(data_name)
    print(f"Retrieved dataset for {data_name}")

    # Limit to 1000 samples if test flag is true
    if test:
        dataset = (dataset[0][:1000], dataset[1], dataset[2], dataset[3])
        print("Limited dataset to 1000 samples for testing")

    X_origin, _, num_vars, cat_vars = dataset

    # Apply category mapping. The return value is inspected *before* it
    # replaces X_origin, so an in-place implementation (returning None) no
    # longer wipes out the data and crashes on `.shape` below.
    mapped = category_mapping(X_origin, cat_vars)
    print("Applied category mapping")
    if mapped is not None:
        X_origin = mapped
        print("X_origin is replaced with a new version where categorical variables have been mapped to numerical values.")
    else:
        print("X_origin is modified in-place, with categorical variables mapped to numerical values.")

    # Print data size
    print("Data Size: {}".format(X_origin.shape))

    n_sample = X_origin.shape[0]
    n_valid = n_sample // k
    if n_valid == 0:
        raise ValueError(
            "Need at least k={} samples for k-fold validation, got {}".format(k, n_sample)
        )

    idx_lst = list(range(n_sample))
    np.random.shuffle(idx_lst)
    all_idx = set(idx_lst)

    # k-fold validation setup
    print(f"Starting k-fold validation with k={k}")

    results = {}
    for percent_missing in percent_missing_lst:
        fold_metrics = []
        print(f"\nProcessing {percent_missing}% missing data")

        # Iterate over exactly k folds; stepping through range(0, n_sample, n_valid)
        # could yield more than k full-size folds when the remainder exceeds n_valid.
        for fold in range(k):
            print(f"\nFold {fold + 1}")
            start = fold * n_valid
            valid_idx = idx_lst[start:start + n_valid]

            # Sorted so the training rows keep their original order regardless
            # of how the set difference happens to iterate.
            train_idx = sorted(all_idx - set(valid_idx))
            X = X_origin[train_idx, :]
            X_test = X_origin[valid_idx, :]

            # Generating incomplete data
            missing_mask = generate_missing_mask(X, percent_missing=percent_missing)
            X_incomplete = X.copy()
            X_incomplete[missing_mask] = np.nan
            print("Generated incomplete data with missing values for fold {}:".format(fold + 1))
            print("Original data with missing values (subset):")
            print(X_incomplete[:5])  # Adjust the number of rows as needed

            # Imputation: a fresh imputer is built for every fold.
            imputer = get_imputer(config)
            if imputer is None:
                raise ValueError(
                    "Model {!r} does not provide an imputer object to fit".format(config['model'])
                )
            X_imputed = imputer.fit_transform(X_incomplete, cat_vars=cat_vars)
            print("Completed imputation for fold {}:".format(fold + 1))

            # Only the masked cells take imputed values; observed cells keep the truth.
            X_res = X.copy()
            X_res[missing_mask] = X_imputed[missing_mask]
            print("X_res with imputed values (subset):")
            print(X_res[:5])  # Adjust the number of rows as needed

            # Calculate metrics
            metric = cal_metric_numpy(X_res, X, missing_mask, num_vars, cat_vars)
            score = ex_regress(data_name=data_name, train_array=X_res, test_array=X_test, num_vars=num_vars, cat_vars=cat_vars)

            # NOTE(review): the key name is kept so existing result files stay
            # comparable, but recorded runs show values in the tens of millions,
            # so this is not an AUC — presumably a regression error; confirm
            # against ex_regress.
            metric['clf_aucroc'] = round(score, 4)
            fold_metrics.append(metric)
            print("Metrics for this fold:", metric)

        # Average results
        avg_result = {}
        for key in fold_metrics[0]:
            value = np.mean([m[key] for m in fold_metrics])
            avg_result[key] = value
            print(f"Average {key} for {percent_missing}% missing data: {value}")

        results[percent_missing] = avg_result

    print("Completed all iterations")
    return results


In [18]:
# Experiment settings. 'model' selects the imputer in get_imputer(); for 'DSAN'
# the remaining entries are forwarded to the DSANImputer constructor.
config = dict(
    model='DSAN',
    rep_dim=32,
    num_heads=8,
    n_hidden=2,
    lr=3e-3,
    weight_decay=1e-5,
    batch_size=32,
    epochs=34,
    noise_percent=15,
    stopped_epoch=15,
)

In [19]:
import os

data_name_lst = ['insurance']

result_dir = './results'
# exist_ok=True keeps this cell re-runnable: without it the second execution
# fails with FileExistsError because the directory is already there.
os.makedirs(result_dir, exist_ok=True)

# One JSON file per dataset, holding the averaged metrics per missing rate plus
# the configuration and dataset name that produced them.
for data_name in data_name_lst:
    results = experiment_run(data_name, config, test=True)
    results['config'] = config
    results['dataset'] = data_name
    result_path = os.path.join(result_dir, '{}.json'.format(data_name))
    write_json(results, result_path)

Retrieved dataset for insurance
Limited dataset to 1000 samples for testing
Applied category mapping
X_origin is replaced with a new version where categorical variables have been mapped to numerical values.
Data Size: (1000, 7)
Starting k-fold validation with k=5

Processing 5% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[19 0 27.9 0 0 nan 16884.924]
 [18 1 33.77 1 1 nan 1725.5523]
 [28 1 33.0 3 1 1 4449.462]
 [32 1 28.88 0 1 2 nan]
 [31 0 25.74 0 1 1 3756.6216]]


100%|███████████████████████████████████████████| 34/34 [00:17<00:00,  1.89it/s]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 33.77 1 1 3 1725.5523]
 [28 1 33.0 3 1 1 4449.462]
 [32 1 28.88 0 1 2 4809.94775390625]
 [31 0 25.74 0 1 1 3756.6216]]
Metrics for this fold: {'nrmse': 0.3635, 'col_1_error_rate': 0.6111, 'col_4_error_rate': 0.0857, 'col_5_error_rate': 0.7442, 'total_error_rate': 0.5, 'clf_aucroc': 33141851.5382}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[nan 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 21984.47061]
 [32 1 28.88 0 1 2 3866.8552]
 [31 0 25.74 0 1 1 3756.6216]
 [46 0 33.44 1 1 1 8240.5896]]


100%|███████████████████████████████████████████| 34/34 [00:21<00:00,  1.55it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[30.002639770507812 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 21984.47061]
 [32 1 28.88 0 1 2 3866.8552]
 [31 0 25.74 0 1 1 3756.6216]
 [46 0 33.44 1 1 1 8240.5896]]
Metrics for this fold: {'nrmse': 0.2447, 'col_1_error_rate': 0.4865, 'col_4_error_rate': 0.1304, 'col_5_error_rate': 0.8205, 'total_error_rate': 0.459, 'clf_aucroc': 31693369.7378}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [33 1 22.705 0 1 2 21984.47061]
 [32 1 28.88 0 1 2 nan]
 [46 0 33.44 1 1 nan 8240.5896]]


100%|███████████████████████████████████████████| 34/34 [00:18<00:00,  1.83it/s]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [33 1 22.705 0 1 2 21984.47061]
 [32 1 28.88 0 1 2 5955.8232421875]
 [46 0 33.44 1 1 3 8240.5896]]
Metrics for this fold: {'nrmse': 0.3471, 'col_1_error_rate': 0.5, 'col_4_error_rate': 0.0476, 'col_5_error_rate': 0.6818, 'total_error_rate': 0.4167, 'clf_aucroc': 38001725.1058}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 3 1 1 4449.462]
 [33 1 nan 0 1 2 nan]
 [32 1 28.88 0 1 2 3866.8552]]


100%|███████████████████████████████████████████| 34/34 [00:26<00:00,  1.30it/s]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 3 1 1 4449.462]
 [33 1 30.083847045898438 0 1 2 6948.88427734375]
 [32 1 28.88 0 1 2 3866.8552]]
Metrics for this fold: {'nrmse': 0.3821, 'col_1_error_rate': 0.4062, 'col_4_error_rate': 0.0952, 'col_5_error_rate': 0.6939, 'total_error_rate': 0.4146, 'clf_aucroc': 39985738.3235}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[19 0 27.9 0 0 nan 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 21984.47061]
 [31 0 25.74 0 1 1 3756.6216]]


100%|███████████████████████████████████████████| 34/34 [00:21<00:00,  1.61it/s]


Completed imputation for fold 5:
X_res with imputed values (subset):
[[19 0 27.9 0 0 2 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 21984.47061]
 [31 0 25.74 0 1 1 3756.6216]]
Metrics for this fold: {'nrmse': 0.3727, 'col_1_error_rate': 0.475, 'col_4_error_rate': 0.0513, 'col_5_error_rate': 0.6739, 'total_error_rate': 0.416, 'clf_aucroc': 34430890.354}
Average nrmse for 5% missing data: 0.34202000000000005
Average col_1_error_rate for 5% missing data: 0.49576000000000003
Average col_4_error_rate for 5% missing data: 0.08204
Average col_5_error_rate for 5% missing data: 0.7228600000000001
Average total_error_rate for 5% missing data: 0.44126000000000004
Average clf_aucroc for 5% missing data: 35450715.01186

Processing 10% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 nan 1 1 1 1725.5523]
 [28 1 33.0 3 1 nan 4449.462]
 [nan 1 28.88 

100%|███████████████████████████████████████████| 34/34 [00:20<00:00,  1.70it/s]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 32.77574920654297 1 1 1 1725.5523]
 [28 1 33.0 3 1 0 4449.462]
 [35.336647033691406 1 28.88 0 1 2 3866.8552]
 [31 0 25.74 0 1 1 3756.6216]]
Metrics for this fold: {'nrmse': 0.4439, 'col_1_error_rate': 0.4321, 'col_4_error_rate': 0.0947, 'col_5_error_rate': 0.7083, 'total_error_rate': 0.3831, 'clf_aucroc': 32897908.2554}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[28 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 nan]
 [32 nan 28.88 0 1 2 3866.8552]
 [31 0 25.74 0 1 nan 3756.6216]
 [46 0 33.44 1 1 1 nan]]


100%|███████████████████████████████████████████| 34/34 [00:20<00:00,  1.69it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[28 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 4070.623779296875]
 [32 1 28.88 0 1 2 3866.8552]
 [31 0 25.74 0 1 3 3756.6216]
 [46 0 33.44 1 1 1 9735.7822265625]]
Metrics for this fold: {'nrmse': 0.3902, 'col_1_error_rate': 0.481, 'col_4_error_rate': 0.1071, 'col_5_error_rate': 0.6374, 'total_error_rate': 0.4134, 'clf_aucroc': 31248971.7181}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[19 0 27.9 nan 0 0 16884.924]
 [18 1 33.77 1 nan 1 1725.5523]
 [33 1 22.705 0 1 2 21984.47061]
 [32 1 28.88 0 1 2 3866.8552]
 [46 0 33.44 1 1 1 8240.5896]]


100%|███████████████████████████████████████████| 34/34 [00:20<00:00,  1.65it/s]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[19 0 27.9 0.7772734761238098 0 0 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [33 1 22.705 0 1 2 21984.47061]
 [32 1 28.88 0 1 2 3866.8552]
 [46 0 33.44 1 1 1 8240.5896]]
Metrics for this fold: {'nrmse': 0.3668, 'col_1_error_rate': 0.5132, 'col_4_error_rate': 0.1579, 'col_5_error_rate': 0.7283, 'total_error_rate': 0.4836, 'clf_aucroc': 38291152.1237}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 nan 2 21984.47061]
 [32 1 28.88 0 1 2 3866.8552]]


100%|███████████████████████████████████████████| 34/34 [00:20<00:00,  1.63it/s]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 0 2 21984.47061]
 [32 1 28.88 0 1 2 3866.8552]]
Metrics for this fold: {'nrmse': 0.5106, 'col_1_error_rate': 0.4156, 'col_4_error_rate': 0.1084, 'col_5_error_rate': 0.7179, 'total_error_rate': 0.4076, 'clf_aucroc': 39916987.5532}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[19 0 27.9 0 0 0 nan]
 [18 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 21984.47061]
 [31 0 25.74 0 1 1 3756.6216]]


100%|███████████████████████████████████████████| 34/34 [00:20<00:00,  1.70it/s]


Completed imputation for fold 5:
X_res with imputed values (subset):
[[19 0 27.9 0 0 0 20331.23828125]
 [18 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 21984.47061]
 [31 0 25.74 0 1 1 3756.6216]]
Metrics for this fold: {'nrmse': 0.4207, 'col_1_error_rate': 0.439, 'col_4_error_rate': 0.0741, 'col_5_error_rate': 0.8088, 'total_error_rate': 0.4199, 'clf_aucroc': 34736225.3175}
Average nrmse for 10% missing data: 0.42644000000000004
Average col_1_error_rate for 10% missing data: 0.45618
Average col_4_error_rate for 10% missing data: 0.10844000000000001
Average col_5_error_rate for 10% missing data: 0.72014
Average total_error_rate for 10% missing data: 0.42152
Average clf_aucroc for 10% missing data: 35418248.99358

Processing 15% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 3 nan nan 4449.462]
 [32 nan 28.88 0 1 2

100%|███████████████████████████████████████████| 34/34 [00:19<00:00,  1.73it/s]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 3 1 3 4449.462]
 [32 0 28.88 0 1 2 3866.8552]
 [31 0 31.109346389770508 0 1 1 9705.54296875]]
Metrics for this fold: {'nrmse': 0.5452, 'col_1_error_rate': 0.4407, 'col_4_error_rate': 0.113, 'col_5_error_rate': 0.688, 'total_error_rate': 0.4218, 'clf_aucroc': 33027086.8299}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[28 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 nan 2 21984.47061]
 [32 1 28.88 0 1 2 nan]
 [31 0 nan 0 1 1 3756.6216]
 [nan 0 33.44 1 1 1 8240.5896]]


100%|███████████████████████████████████████████| 34/34 [00:19<00:00,  1.77it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[28 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 0 2 21984.47061]
 [32 1 28.88 0 1 2 7857.87548828125]
 [31 0 32.631004333496094 0 1 1 3756.6216]
 [47.408355712890625 0 33.44 1 1 1 8240.5896]]
Metrics for this fold: {'nrmse': 0.4865, 'col_1_error_rate': 0.5, 'col_4_error_rate': 0.1151, 'col_5_error_rate': 0.7339, 'total_error_rate': 0.4235, 'clf_aucroc': 31595241.5884}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[19 0 27.9 nan 0 0 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [33 1 22.705 0 1 2 nan]
 [32 1 28.88 0 1 2 3866.8552]
 [46 0 33.44 nan 1 1 8240.5896]]


100%|███████████████████████████████████████████| 34/34 [00:19<00:00,  1.71it/s]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[19 0 27.9 0.6793169975280762 0 0 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [33 1 22.705 0 1 2 6497.12353515625]
 [32 1 28.88 0 1 2 3866.8552]
 [46 0 33.44 1.4192754030227661 1 1 8240.5896]]
Metrics for this fold: {'nrmse': 0.3916, 'col_1_error_rate': 0.4327, 'col_4_error_rate': 0.0565, 'col_5_error_rate': 0.6731, 'total_error_rate': 0.4089, 'clf_aucroc': 38045362.6811}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 nan 1 1 1 1725.5523]
 [28 1 33.0 3 1 1 nan]
 [33 1 22.705 0 nan nan 21984.47061]
 [32 1 28.88 0 1 2 nan]]


100%|███████████████████████████████████████████| 34/34 [00:19<00:00,  1.75it/s]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 32.291343688964844 1 1 1 1725.5523]
 [28 1 33.0 3 1 1 8544.4580078125]
 [33 1 22.705 0 0 3 21984.47061]
 [32 1 28.88 0 1 2 6329.6640625]]
Metrics for this fold: {'nrmse': 0.3854, 'col_1_error_rate': 0.5086, 'col_4_error_rate': 0.1083, 'col_5_error_rate': 0.6696, 'total_error_rate': 0.4224, 'clf_aucroc': 39300302.524}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 nan 1 1 1 1725.5523]
 [28 nan 33.0 3 1 nan 4449.462]
 [33 1 22.705 0 1 2 21984.47061]
 [31 0 25.74 0 nan 1 3756.6216]]


100%|███████████████████████████████████████████| 34/34 [00:18<00:00,  1.84it/s]


Completed imputation for fold 5:
X_res with imputed values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 30.06769371032715 1 1 1 1725.5523]
 [28 1 33.0 3 1 3 4449.462]
 [33 1 22.705 0 1 2 21984.47061]
 [31 0 25.74 0 1 1 3756.6216]]
Metrics for this fold: {'nrmse': 0.418, 'col_1_error_rate': 0.4436, 'col_4_error_rate': 0.1083, 'col_5_error_rate': 0.7083, 'total_error_rate': 0.4209, 'clf_aucroc': 34848048.1892}
Average nrmse for 15% missing data: 0.44534
Average col_1_error_rate for 15% missing data: 0.46512000000000003
Average col_4_error_rate for 15% missing data: 0.10024
Average col_5_error_rate for 15% missing data: 0.69458
Average total_error_rate for 15% missing data: 0.41950000000000004
Average clf_aucroc for 15% missing data: 35363208.36252

Processing 20% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 33.77 1 nan 1 1725.5523]
 [28 1 33.0 3 1 1 4449.462]
 [nan 1 28.88 n

100%|███████████████████████████████████████████| 34/34 [00:19<00:00,  1.75it/s]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 3 1 1 4449.462]
 [32.66828155517578 1 28.88 1.358992099761963 1 2 3866.8552]
 [31.960697174072266 1 31.410459518432617 0 1 1 3756.6216]]
Metrics for this fold: {'nrmse': 0.3502, 'col_1_error_rate': 0.4724, 'col_4_error_rate': 0.1088, 'col_5_error_rate': 0.6753, 'total_error_rate': 0.4246, 'clf_aucroc': 33142642.4313}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[nan 1 33.0 3 1 1 4449.462]
 [33 nan 22.705 0 1 2 21984.47061]
 [32 1 28.88 nan 1 2 3866.8552]
 [31 nan 25.74 0 1 1 3756.6216]
 [46 0 33.44 1 1 1 8240.5896]]


100%|███████████████████████████████████████████| 34/34 [00:18<00:00,  1.84it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[38.62839889526367 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 21984.47061]
 [32 1 28.88 1.191323161125183 1 2 3866.8552]
 [31 1 25.74 0 1 1 3756.6216]
 [46 0 33.44 1 1 1 8240.5896]]
Metrics for this fold: {'nrmse': 0.3956, 'col_1_error_rate': 0.4902, 'col_4_error_rate': 0.0865, 'col_5_error_rate': 0.7015, 'total_error_rate': 0.3919, 'clf_aucroc': 32383443.7746}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[19 0 27.9 nan 0 0 16884.924]
 [18 1 33.77 1 nan 1 1725.5523]
 [33 1 22.705 nan 1 nan 21984.47061]
 [32 1 28.88 0 nan 2 3866.8552]
 [nan 0 nan 1 1 1 8240.5896]]


100%|███████████████████████████████████████████| 34/34 [00:18<00:00,  1.81it/s]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[19 0 27.9 0.9147516489028931 0 0 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [33 1 22.705 1.5129168033599854 1 3 21984.47061]
 [32 1 28.88 0 1 2 3866.8552]
 [41.892906188964844 0 32.51597213745117 1 1 1 8240.5896]]
Metrics for this fold: {'nrmse': 0.5529, 'col_1_error_rate': 0.5031, 'col_4_error_rate': 0.1412, 'col_5_error_rate': 0.7532, 'total_error_rate': 0.4581, 'clf_aucroc': 38108338.3482}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[19 0 27.9 0 0 nan 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 nan 1 nan 4449.462]
 [33 1 22.705 0 nan nan nan]
 [32 1 28.88 0 1 2 nan]]


100%|███████████████████████████████████████████| 34/34 [00:19<00:00,  1.75it/s]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[19 0 27.9 0 0 2 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 1.7457783222198486 1 1 4449.462]
 [33 1 22.705 0 1 2 10411.27734375]
 [32 1 28.88 0 1 2 10776.1298828125]]
Metrics for this fold: {'nrmse': 0.5393, 'col_1_error_rate': 0.4803, 'col_4_error_rate': 0.1214, 'col_5_error_rate': 0.7289, 'total_error_rate': 0.4379, 'clf_aucroc': 39370343.6245}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[19 0 27.9 0 nan 0 nan]
 [nan nan 33.77 1 1 nan 1725.5523]
 [nan 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 nan]
 [31 0 25.74 0 nan 1 nan]]


100%|███████████████████████████████████████████| 34/34 [00:19<00:00,  1.74it/s]


Completed imputation for fold 5:
X_res with imputed values (subset):
[[19 0 27.9 0 1 0 10027.443359375]
 [29.508689880371094 0 33.77 1 1 0 1725.5523]
 [35.16301345825195 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 8311.388671875]
 [31 0 25.74 0 1 1 10788.5517578125]]
Metrics for this fold: {'nrmse': 0.491, 'col_1_error_rate': 0.5127, 'col_4_error_rate': 0.0549, 'col_5_error_rate': 0.7793, 'total_error_rate': 0.4347, 'clf_aucroc': 35643480.3432}
Average nrmse for 20% missing data: 0.46580000000000005
Average col_1_error_rate for 20% missing data: 0.49173999999999995
Average col_4_error_rate for 20% missing data: 0.10255999999999998
Average col_5_error_rate for 20% missing data: 0.72764
Average total_error_rate for 20% missing data: 0.42943999999999993
Average clf_aucroc for 20% missing data: 35729649.70436
Completed all iterations
