In [1]:
!nvidia-smi
!pip install missingpy

zsh:1: command not found: nvidia-smi


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
import math

In [5]:
# `sys` must be imported here: no earlier cell imports it, so on a fresh
# kernel the assignment below would otherwise raise NameError.
import sys

import sklearn.neighbors._base

# Register the private module under the old public path so that code doing
# `import sklearn.neighbors.base` keeps working.
# NOTE(review): presumably needed by missingpy, which is imported in a later cell - confirm.
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

In [6]:
import sys
import numpy as np
import pandas as pd

from missingpy import MissForest
from dsan import Imputer as DSANImputer
from utils import *

import warnings
warnings.filterwarnings('ignore')

In [7]:
# Fixed seed for the whole notebook. `set_seed` is not defined in any visible
# cell (presumably it comes from `from utils import *`) - confirm which RNGs it seeds.
seed = 128
set_seed(seed)

In [8]:
def get_imputer(config):
    """Build the imputer selected by ``config['model']``.

    Parameters
    ----------
    config : dict
        Must contain the key ``'model'``. For ``'DSAN'`` it must also contain
        ``rep_dim``, ``num_heads``, ``n_hidden``, ``lr``, ``weight_decay``,
        ``batch_size``, ``epochs``, ``noise_percent`` and ``stopped_epoch``,
        which are forwarded unchanged to ``DSANImputer``.

    Returns
    -------
    A ``DSANImputer`` for ``'DSAN'``, a ``MissForest(random_state=0)`` for
    ``'MissForest'``, or ``None`` for ``'Statistics'`` (no imputer object is
    built for that model here, so callers must handle ``None``).

    Raises
    ------
    KeyError
        If ``'model'`` or a required DSAN hyper-parameter is missing.
    ValueError
        If ``config['model']`` is not one of the supported names.
    """
    model = config['model']

    if model == 'DSAN':
        return DSANImputer(rep_dim=config['rep_dim'],
                           num_heads=config['num_heads'],
                           n_hidden=config['n_hidden'],
                           lr=config['lr'],
                           weight_decay=config['weight_decay'],
                           batch_size=config['batch_size'],
                           epochs=config['epochs'],
                           noise_percent=config['noise_percent'],
                           stopped_epoch=config['stopped_epoch'])

    if model == 'MissForest':
        return MissForest(random_state=0)

    if model == 'Statistics':
        return None

    # Previously an unknown name fell through every branch and failed with an
    # UnboundLocalError on `return imputer`; fail with an explicit message instead.
    raise ValueError(
        "Unknown model {!r}; expected 'DSAN', 'MissForest' or 'Statistics'".format(model)
    )

In [11]:
def experiment_run(data_name, config, test=False):
    """Run the k-fold imputation benchmark for one dataset.

    The dataset returned by ``get_data(data_name)`` is unpacked as
    ``(X, n_col, num_vars, cat_vars)``. Rows are shuffled once, then for each
    missing-rate in 5, 10, 15, 20 percent and for each fold:

    1. the fold's rows are held out as ``X_test`` and the rest form ``X``;
    2. a mask from ``generate_missing_mask`` is set to NaN in a copy of ``X``;
    3. the imputer from ``get_imputer(config)`` fills the copy;
    4. the imputed values are written into the masked positions of ``X`` and
       scored with ``cal_metric_numpy`` and ``ex_regress``.

    Parameters
    ----------
    data_name : str
        Dataset identifier passed to ``get_data`` and ``ex_regress``.
    config : dict
        Imputer configuration passed to ``get_imputer``.
    test : bool, default False
        When true, only the first 1000 rows of the data are used.

    Returns
    -------
    dict
        Maps each missing percentage to a dict of per-metric means over folds.

    Raises
    ------
    ValueError
        If there are fewer rows than folds, or if ``get_imputer`` returns
        ``None`` for the configured model.
    """
    # Retrieve dataset
    dataset = get_data(data_name)
    print(f"Retrieved dataset for {data_name}")

    # Limit to 1000 samples if test flag is true
    if test:
        dataset = (dataset[0][:1000], dataset[1], dataset[2], dataset[3])
        print("Limited dataset to 1000 samples for testing")

    k = 5
    percent_missing_lst = list(range(5, 25, 5))
    X_origin, _n_col, num_vars, cat_vars = dataset

    # Apply category mapping
    mapped = category_mapping(X_origin, cat_vars)
    print("Applied category mapping")

    # Check if X_origin is modified in-place or replaced. When the helper
    # returns None we must keep the original reference; rebinding X_origin to
    # None would crash on the `.shape` access below.
    if mapped is not None:
        print("X_origin is replaced with a new version where categorical variables have been mapped to numerical values.")
        X_origin = mapped
    else:
        print("X_origin is modified in-place, with categorical variables mapped to numerical values.")

    # Print data size
    print("Data Size: {}".format(X_origin.shape))

    n_sample = X_origin.shape[0]
    idx_lst = list(range(n_sample))
    np.random.shuffle(idx_lst)

    # k-fold validation setup
    n_valid = len(idx_lst) // k
    if n_valid == 0:
        # A zero fold size would otherwise surface as an opaque
        # "range() arg 3 must not be zero" error inside the fold loop.
        raise ValueError(
            "Need at least {} samples for {}-fold validation, got {}".format(k, k, n_sample)
        )
    print(f"Starting k-fold validation with k={k}")

    # Loop-invariant: the full index set is the same for every fold.
    all_idx = set(idx_lst)

    results = {}
    for percent_missing in percent_missing_lst:
        result = []
        print(f"\nProcessing {percent_missing}% missing data")

        for it, n in enumerate(range(0, n_sample, n_valid)):
            print(f"\nFold {it + 1}")
            valid_idx = idx_lst[n: n + n_valid]

            # The trailing remainder (n_sample not divisible by the fold size)
            # is too short to be a fold of its own.
            if len(valid_idx) < n_valid:
                print("Insufficient data for validation, breaking loop")
                break

            train_idx = list(all_idx - set(valid_idx))
            X = X_origin[train_idx, :]
            X_test = X_origin[valid_idx, :]

            # Generating incomplete data
            missing_mask = generate_missing_mask(X, percent_missing=percent_missing)
            X_incomplete = X.copy()
            X_incomplete[missing_mask] = np.nan
            print("Generated incomplete data with missing values for fold {}:".format(it + 1))
            print("Original data with missing values (subset):")
            print(X_incomplete[:5])  # Adjust the number of rows as needed

            # Imputation
            imputer = get_imputer(config)
            if imputer is None:
                # get_imputer yields None for models without an imputer object
                # (e.g. 'Statistics'); calling fit_transform on it would fail
                # with an unhelpful AttributeError.
                raise ValueError(
                    "No imputer available for model {!r}".format(config.get('model'))
                )
            X_imputed = imputer.fit_transform(X_incomplete, cat_vars=cat_vars)
            print("Completed imputation for fold {}:".format(it + 1))

            # Only the masked cells take imputed values; observed cells keep
            # their original content.
            X_res = X.copy()
            X_res[missing_mask] = X_imputed[missing_mask]
            print("X_res with imputed values (subset):")
            print(X_res[:5])  # Adjust the number of rows as needed

            # Calculate metrics
            metric = cal_metric_numpy(X_res, X, missing_mask, num_vars, cat_vars)
            score = ex_regress(data_name=data_name, train_array=X_res, test_array=X_test, num_vars=num_vars, cat_vars=cat_vars)

            # NOTE(review): the key says AUROC, but the score comes from
            # `ex_regress` and the logged values are in the tens of millions,
            # so this looks like a regression error. Key kept unchanged so
            # existing result files and readers stay compatible - confirm and rename.
            metric['clf_aucroc'] = round(score, 4)
            result.append(metric)
            print("Metrics for this fold:", metric)

        # Average results (n_valid >= 1 guarantees at least one completed fold)
        avg_result = dict()
        for key in result[0].keys():
            value = np.mean([m[key] for m in result])
            avg_result[key] = value
            print(f"Average {key} for {percent_missing}% missing data: {value}")

        results[percent_missing] = avg_result

    print("Completed all iterations")
    return results


In [12]:
# Hyper-parameters for the DSAN run; `get_imputer` forwards every entry except
# 'model' to DSANImputer as a keyword argument of the same name.
config = dict(
    model='DSAN',
    rep_dim=32,
    num_heads=16,
    n_hidden=2,
    lr=3e-3,
    weight_decay=1e-5,
    batch_size=32,
    epochs=34,
    noise_percent=15,
    stopped_epoch=15,
)

In [13]:
# No visible cell imports `os` explicitly (it may only be reachable through
# `from utils import *`), so import it here to keep this cell self-sufficient.
import os

data_name_lst = ['insurance']

result_dir = './results1'
# exist_ok=True keeps the cell re-runnable: without it a second execution
# fails with FileExistsError because the directory is already there.
os.makedirs(result_dir, exist_ok=True)

for data_name in data_name_lst:
    results = experiment_run(data_name, config, test=True)
    # Store the settings next to the metrics so each JSON file is self-describing.
    results['config'] = config
    results['dataset'] = data_name
    result_path = os.path.join(result_dir, '{}.json'.format(data_name))
    write_json(results, result_path)

Retrieved dataset for insurance
Limited dataset to 1000 samples for testing
Applied category mapping
X_origin is replaced with a new version where categorical variables have been mapped to numerical values.
Data Size: (1000, 7)
Starting k-fold validation with k=5

Processing 5% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[28 1 33.0 3 1 nan nan]
 [32 1 28.88 nan 1 2 3866.8552]
 [31 0 25.74 0 1 1 3756.6216]
 [46 0 33.44 1 1 nan 8240.5896]
 [37 1 29.83 2 1 3 6406.4107]]


100%|███████████████████████████████████████████| 34/34 [00:23<00:00,  1.43it/s]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[28 1 33.0 3 1 3 8232.080078125]
 [32 1 28.88 1.4683914184570312 1 2 3866.8552]
 [31 0 25.74 0 1 1 3756.6216]
 [46 0 33.44 1 1 3 8240.5896]
 [37 1 29.83 2 1 3 6406.4107]]
Metrics for this fold: {'nrmse': 0.3271, 'col_1_error_rate': 0.5263, 'col_4_error_rate': 0.0455, 'col_5_error_rate': 0.8462, 'total_error_rate': 0.4545, 'clf_aucroc': 37805139.9444}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[19 0 27.9 0 nan 0 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 21984.47061]
 [32 1 28.88 0 1 2 3866.8552]]


100%|███████████████████████████████████████████| 34/34 [00:21<00:00,  1.59it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 21984.47061]
 [32 1 28.88 0 1 2 3866.8552]]
Metrics for this fold: {'nrmse': 0.3888, 'col_1_error_rate': 0.4444, 'col_4_error_rate': 0.1026, 'col_5_error_rate': 0.617, 'total_error_rate': 0.4046, 'clf_aucroc': 42461949.5522}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[19 0 27.9 nan 0 0 16884.924]
 [nan 1 33.77 1 nan 1 1725.5523]
 [28 nan 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 21984.47061]
 [32 1 28.88 0 1 2 3866.8552]]


100%|███████████████████████████████████████████| 34/34 [00:28<00:00,  1.20it/s]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[19 0 27.9 1.1014131307601929 0 0 16884.924]
 [28.762935638427734 1 33.77 1 1 1 1725.5523]
 [28 0 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 21984.47061]
 [32 1 28.88 0 1 2 3866.8552]]
Metrics for this fold: {'nrmse': 0.331, 'col_1_error_rate': 0.5102, 'col_4_error_rate': 0.1944, 'col_5_error_rate': 0.7436, 'total_error_rate': 0.4919, 'clf_aucroc': 39733910.3517}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 33.77 1 1 1 nan]
 [nan 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 nan]
 [32 1 28.88 0 1 2 3866.8552]]


100%|███████████████████████████████████████████| 34/34 [00:39<00:00,  1.15s/it]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 33.77 1 1 1 6989.7998046875]
 [35.88955307006836 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 9468.4853515625]
 [32 1 28.88 0 1 2 3866.8552]]
Metrics for this fold: {'nrmse': 0.4372, 'col_1_error_rate': 0.5833, 'col_4_error_rate': 0.0526, 'col_5_error_rate': 0.6216, 'total_error_rate': 0.4144, 'clf_aucroc': 28132814.251}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [33 1 22.705 0 1 2 21984.47061]
 [31 0 25.74 0 1 1 3756.6216]
 [46 0 33.44 1 1 nan 8240.5896]]


100%|███████████████████████████████████████████| 34/34 [00:37<00:00,  1.09s/it]


Completed imputation for fold 5:
X_res with imputed values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [33 1 22.705 0 1 2 21984.47061]
 [31 0 25.74 0 1 1 3756.6216]
 [46 0 33.44 1 1 3 8240.5896]]
Metrics for this fold: {'nrmse': 0.4628, 'col_1_error_rate': 0.5455, 'col_4_error_rate': 0.0256, 'col_5_error_rate': 0.8, 'total_error_rate': 0.4554, 'clf_aucroc': 31100824.102}
Average nrmse for 5% missing data: 0.38937999999999995
Average col_1_error_rate for 5% missing data: 0.5219400000000001
Average col_4_error_rate for 5% missing data: 0.08414
Average col_5_error_rate for 5% missing data: 0.72568
Average total_error_rate for 5% missing data: 0.44416
Average clf_aucroc for 5% missing data: 35846927.640259996

Processing 10% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[28 1 33.0 3 1 1 4449.462]
 [32 nan 28.88 0 1 2 3866.8552]
 [31 0 25.74 0 1 1 3756.6216]
 [46 0 33.44 1 1 1 8240.5896]


100%|███████████████████████████████████████████| 34/34 [00:43<00:00,  1.29s/it]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[28 1 33.0 3 1 1 4449.462]
 [32 1 28.88 0 1 2 3866.8552]
 [31 0 25.74 0 1 1 3756.6216]
 [46 0 33.44 1 1 1 8240.5896]
 [37 1 29.83 2 1 3 6406.4107]]
Metrics for this fold: {'nrmse': 0.4011, 'col_1_error_rate': 0.5294, 'col_4_error_rate': 0.1375, 'col_5_error_rate': 0.7067, 'total_error_rate': 0.4542, 'clf_aucroc': 37354522.1251}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [nan 1 33.77 1 1 1 nan]
 [28 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 21984.47061]
 [32 1 28.88 0 1 2 3866.8552]]


100%|███████████████████████████████████████████| 34/34 [00:33<00:00,  1.02it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [39.50954818725586 1 33.77 1 1 1 9462.662109375]
 [28 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 21984.47061]
 [32 1 28.88 0 1 2 3866.8552]]
Metrics for this fold: {'nrmse': 0.427, 'col_1_error_rate': 0.4444, 'col_4_error_rate': 0.0833, 'col_5_error_rate': 0.6292, 'total_error_rate': 0.3878, 'clf_aucroc': 43245198.0473}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 33.77 nan 1 1 1725.5523]
 [28 nan 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 21984.47061]
 [32 1 28.88 0 1 2 3866.8552]]


100%|███████████████████████████████████████████| 34/34 [00:34<00:00,  1.02s/it]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 33.77 0.6172112822532654 1 1 1725.5523]
 [28 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 21984.47061]
 [32 1 28.88 0 1 2 3866.8552]]
Metrics for this fold: {'nrmse': 0.4009, 'col_1_error_rate': 0.4941, 'col_4_error_rate': 0.1071, 'col_5_error_rate': 0.6882, 'total_error_rate': 0.4389, 'clf_aucroc': 39953175.2903}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[19 0 27.9 nan 0 0 16884.924]
 [18 1 33.77 nan 1 1 1725.5523]
 [28 nan 33.0 3 1 nan 4449.462]
 [33 1 22.705 0 1 2 21984.47061]
 [32 1 nan 0 1 2 nan]]


100%|███████████████████████████████████████████| 34/34 [00:40<00:00,  1.18s/it]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[19 0 27.9 1.0616837739944458 0 0 16884.924]
 [18 1 33.77 0.8119320869445801 1 1 1725.5523]
 [28 1 33.0 3 1 3 4449.462]
 [33 1 22.705 0 1 2 21984.47061]
 [32 1 28.961198806762695 0 1 2 6828.1708984375]]
Metrics for this fold: {'nrmse': 0.4512, 'col_1_error_rate': 0.4521, 'col_4_error_rate': 0.1341, 'col_5_error_rate': 0.6353, 'total_error_rate': 0.4083, 'clf_aucroc': 28877329.8996}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[nan 0 27.9 0 0 0 16884.924]
 [18 1 nan 1 1 1 1725.5523]
 [nan nan 22.705 0 1 2 nan]
 [31 0 25.74 0 1 1 3756.6216]
 [46 0 33.44 1 1 1 8240.5896]]


100%|███████████████████████████████████████████| 34/34 [00:32<00:00,  1.05it/s]


Completed imputation for fold 5:
X_res with imputed values (subset):
[[30.783367156982422 0 27.9 0 0 0 16884.924]
 [18 1 29.995508193969727 1 1 1 1725.5523]
 [41.21665573120117 1 22.705 0 1 2 8898.302734375]
 [31 0 25.74 0 1 1 3756.6216]
 [46 0 33.44 1 1 1 8240.5896]]
Metrics for this fold: {'nrmse': 0.4209, 'col_1_error_rate': 0.5309, 'col_4_error_rate': 0.08, 'col_5_error_rate': 0.6667, 'total_error_rate': 0.4346, 'clf_aucroc': 31450269.0657}
Average nrmse for 10% missing data: 0.42022000000000004
Average col_1_error_rate for 10% missing data: 0.49017999999999995
Average col_4_error_rate for 10% missing data: 0.10839999999999998
Average col_5_error_rate for 10% missing data: 0.66522
Average total_error_rate for 10% missing data: 0.42476
Average clf_aucroc for 10% missing data: 36176098.88559999

Processing 15% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[28 1 33.0 3 nan 1 4449.462]
 [32 1 28.88 0 1 2 nan]

100%|███████████████████████████████████████████| 34/34 [00:34<00:00,  1.02s/it]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[28 1 33.0 3 1 1 4449.462]
 [32 1 28.88 0 1 2 6137.2373046875]
 [31 0 25.74 0 1 1 3756.6216]
 [46 0 33.44 1 1 1 8240.5896]
 [37 1 29.83 2 1 3 6406.4107]]
Metrics for this fold: {'nrmse': 0.4614, 'col_1_error_rate': 0.4194, 'col_4_error_rate': 0.1034, 'col_5_error_rate': 0.7407, 'total_error_rate': 0.3899, 'clf_aucroc': 38423112.6178}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[19 0 nan 0 0 0 nan]
 [18 1 33.77 nan 1 1 1725.5523]
 [28 1 33.0 3 1 1 4449.462]
 [33 1 22.705 nan 1 2 21984.47061]
 [32 1 28.88 0 1 nan 3866.8552]]


100%|███████████████████████████████████████████| 34/34 [00:30<00:00,  1.11it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[19 0 28.64301300048828 0 0 0 16196.083984375]
 [18 1 33.77 0.25501736998558044 1 1 1725.5523]
 [28 1 33.0 3 1 1 4449.462]
 [33 1 22.705 1.0819120407104492 1 2 21984.47061]
 [32 1 28.88 0 1 1 3866.8552]]
Metrics for this fold: {'nrmse': 0.3947, 'col_1_error_rate': 0.4545, 'col_4_error_rate': 0.0619, 'col_5_error_rate': 0.708, 'total_error_rate': 0.4092, 'clf_aucroc': 43109773.6342}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 33.77 nan 1 nan 1725.5523]
 [28 1 33.0 3 1 nan 4449.462]
 [33 1 22.705 nan 1 2 21984.47061]
 [32 1 28.88 0 1 2 3866.8552]]


100%|███████████████████████████████████████████| 34/34 [00:34<00:00,  1.02s/it]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 33.77 0.5395395755767822 1 3 1725.5523]
 [28 1 33.0 3 1 3 4449.462]
 [33 1 22.705 1.0916324853897095 1 2 21984.47061]
 [32 1 28.88 0 1 2 3866.8552]]
Metrics for this fold: {'nrmse': 0.5812, 'col_1_error_rate': 0.4797, 'col_4_error_rate': 0.1217, 'col_5_error_rate': 0.7018, 'total_error_rate': 0.4347, 'clf_aucroc': 39785451.5514}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[19 0 27.9 0 nan 0 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 nan 2 21984.47061]
 [32 1 28.88 0 1 2 3866.8552]]


100%|███████████████████████████████████████████| 34/34 [00:32<00:00,  1.05it/s]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[19 0 27.9 0 0 0 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 21984.47061]
 [32 1 28.88 0 1 2 3866.8552]]
Metrics for this fold: {'nrmse': 0.4649, 'col_1_error_rate': 0.5455, 'col_4_error_rate': 0.0862, 'col_5_error_rate': 0.7218, 'total_error_rate': 0.4649, 'clf_aucroc': 27549304.8187}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[19 0 27.9 0 0 nan 16884.924]
 [nan 1 33.77 nan 1 1 1725.5523]
 [33 1 22.705 0 1 nan nan]
 [31 0 25.74 0 nan nan nan]
 [46 nan 33.44 1 1 nan 8240.5896]]


100%|███████████████████████████████████████████| 34/34 [00:30<00:00,  1.13it/s]


Completed imputation for fold 5:
X_res with imputed values (subset):
[[19 0 27.9 0 0 2 16884.924]
 [31.965576171875 1 33.77 1.503210425376892 1 1 1725.5523]
 [33 1 22.705 0 1 3 9751.486328125]
 [31 0 25.74 0 1 2 8246.6787109375]
 [46 0 33.44 1 1 0 8240.5896]]
Metrics for this fold: {'nrmse': 0.529, 'col_1_error_rate': 0.4776, 'col_4_error_rate': 0.121, 'col_5_error_rate': 0.6829, 'total_error_rate': 0.4278, 'clf_aucroc': 31525501.8641}
Average nrmse for 15% missing data: 0.48624
Average col_1_error_rate for 15% missing data: 0.4753400000000001
Average col_4_error_rate for 15% missing data: 0.09884000000000001
Average col_5_error_rate for 15% missing data: 0.71104
Average total_error_rate for 15% missing data: 0.4253
Average clf_aucroc for 15% missing data: 36078628.89724

Processing 20% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[28 1 33.0 3 1 1 4449.462]
 [32 nan 28.88 nan 1 2 3866.8552]
 [31 nan nan 0 1 

100%|███████████████████████████████████████████| 34/34 [00:35<00:00,  1.03s/it]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[28 1 33.0 3 1 1 4449.462]
 [32 1 28.88 1.0560623407363892 1 2 3866.8552]
 [31 1 33.49957275390625 0 1 1 8986.37890625]
 [46 0 33.44 1 1 1 8240.5896]
 [37 0 29.83 2 1 0 12671.5400390625]]
Metrics for this fold: {'nrmse': 0.4967, 'col_1_error_rate': 0.4481, 'col_4_error_rate': 0.1515, 'col_5_error_rate': 0.6687, 'total_error_rate': 0.441, 'clf_aucroc': 37056216.9813}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[19 0 27.9 0 0 0 nan]
 [nan 1 33.77 1 1 1 1725.5523]
 [28 nan 33.0 3 1 1 4449.462]
 [33 1 22.705 0 nan 2 21984.47061]
 [32 nan nan 0 1 2 3866.8552]]


100%|███████████████████████████████████████████| 34/34 [00:29<00:00,  1.15it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[19 0 27.9 0 0 0 18296.53515625]
 [31.36916732788086 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 0 2 21984.47061]
 [32 0 30.935569763183594 0 1 2 3866.8552]]
Metrics for this fold: {'nrmse': 0.5585, 'col_1_error_rate': 0.4691, 'col_4_error_rate': 0.0929, 'col_5_error_rate': 0.7079, 'total_error_rate': 0.4187, 'clf_aucroc': 43185404.1277}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[19 0 27.9 nan 0 0 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [28 nan 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 nan]
 [nan nan 28.88 nan 1 2 3866.8552]]


100%|███████████████████████████████████████████| 34/34 [00:33<00:00,  1.01it/s]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[19 0 27.9 0.8390485048294067 0 0 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 3 1 1 4449.462]
 [33 1 22.705 0 1 2 5285.45263671875]
 [32.931114196777344 1 28.88 1.6381959915161133 1 2 3866.8552]]
Metrics for this fold: {'nrmse': 0.4813, 'col_1_error_rate': 0.426, 'col_4_error_rate': 0.1128, 'col_5_error_rate': 0.7416, 'total_error_rate': 0.4562, 'clf_aucroc': 39185847.3237}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[19 nan nan 0 0 0 nan]
 [18 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 3 nan 1 4449.462]
 [33 1 nan 0 1 nan 21984.47061]
 [32 1 28.88 0 1 2 3866.8552]]


100%|███████████████████████████████████████████| 34/34 [00:28<00:00,  1.18it/s]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[19 1 30.40291404724121 0 0 0 20440.734375]
 [18 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 3 1 1 4449.462]
 [33 1 30.450634002685547 0 1 1 21984.47061]
 [32 1 28.88 0 1 2 3866.8552]]
Metrics for this fold: {'nrmse': 0.5377, 'col_1_error_rate': 0.481, 'col_4_error_rate': 0.1304, 'col_5_error_rate': 0.7467, 'total_error_rate': 0.4456, 'clf_aucroc': 31200505.9189}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[19 0 27.9 0 0 nan 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [33 1 22.705 0 1 2 21984.47061]
 [31 nan nan 0 nan 1 3756.6216]
 [46 0 nan 1 1 nan 8240.5896]]


100%|███████████████████████████████████████████| 34/34 [00:26<00:00,  1.30it/s]


Completed imputation for fold 5:
X_res with imputed values (subset):
[[19 0 27.9 0 0 1 16884.924]
 [18 1 33.77 1 1 1 1725.5523]
 [33 1 22.705 0 1 2 21984.47061]
 [31 0 31.51054573059082 0 1 1 3756.6216]
 [46 0 30.23199462890625 1 1 1 8240.5896]]
Metrics for this fold: {'nrmse': 0.4253, 'col_1_error_rate': 0.4356, 'col_4_error_rate': 0.1528, 'col_5_error_rate': 0.6667, 'total_error_rate': 0.4207, 'clf_aucroc': 31389789.2269}
Average nrmse for 20% missing data: 0.49989999999999996
Average col_1_error_rate for 20% missing data: 0.45196
Average col_4_error_rate for 20% missing data: 0.12808000000000003
Average col_5_error_rate for 20% missing data: 0.7063200000000001
Average total_error_rate for 20% missing data: 0.43644
Average clf_aucroc for 20% missing data: 36403552.71570001
Completed all iterations


In [15]:
# Switch the benchmark to MissForest; `get_imputer` reads only 'model' for it.
config = dict(model='MissForest')

In [16]:
# Run the benchmark with the current `config` on the 1000-row test subset; the
# returned dict of per-percentage averages is this cell's displayed output.
experiment_run('insurance', config, test=True)

Retrieved dataset for insurance
Limited dataset to 1000 samples for testing
Applied category mapping
X_origin is replaced with a new version where categorical variables have been mapped to numerical values.
Data Size: (1000, 7)
Starting k-fold validation with k=5

Processing 5% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[19 0 27.9 0 0 0 nan]
 [18 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 3 nan 1 4449.462]
 [31 0 25.74 0 1 1 3756.6216]
 [46 0 33.44 1 1 1 8240.5896]]
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Completed imputation for fold 1:
X_res with imputed values (subset):
[[19 0 27.9 0 0 0 17107.402605999992]
 [18 1 33.77 1 1 1 1725.5523]
 [28 1 33.0 3 1.0 1 4449.462]
 [31 0 25.74 0 1 1 3756.6216]
 [46 0 33.44 1 1 1 8240.5896]]
Metrics for this fold: {'nrmse': 0.4377, 'col_1_error_rate': 0.4865, 'col_4_error_rate': 0.1071, 'col_5_error_rate': 0.5263, 'total_error_rate': 0.3981, 'clf_aucroc': 4026816

{5: {'nrmse': 0.36196,
  'col_1_error_rate': 0.42517999999999995,
  'col_4_error_rate': 0.06649999999999999,
  'col_5_error_rate': 0.68566,
  'total_error_rate': 0.38612,
  'clf_aucroc': 35610693.37522},
 10: {'nrmse': 0.41546000000000005,
  'col_1_error_rate': 0.4549,
  'col_4_error_rate': 0.07698,
  'col_5_error_rate': 0.6965,
  'total_error_rate': 0.39996,
  'clf_aucroc': 35609725.57057999},
 15: {'nrmse': 0.45152000000000003,
  'col_1_error_rate': 0.44764,
  'col_4_error_rate': 0.061380000000000004,
  'col_5_error_rate': 0.6248400000000001,
  'total_error_rate': 0.38106,
  'clf_aucroc': 35335115.39186},
 20: {'nrmse': 0.48793999999999993,
  'col_1_error_rate': 0.46747999999999995,
  'col_4_error_rate': 0.10154,
  'col_5_error_rate': 0.65954,
  'total_error_rate': 0.4123799999999999,
  'clf_aucroc': 35674720.311079994}}