In [1]:
# Shell escape: ask the NVIDIA driver tool which GPUs are visible to this machine.
# The saved output of this cell shows the command was not found on the host it ran on.
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [2]:
!pip install missingpy



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
import math

In [6]:
# `sys` must be imported in this cell: the alias below uses it, and the notebook's
# only other `import sys` sits in a later cell, so a fresh Restart & Run All would
# otherwise fail here with NameError.
import sys

import sklearn.neighbors._base

# Register the private module under the name `sklearn.neighbors.base` so that an
# `import sklearn.neighbors.base` issued by other code resolves to it.
# NOTE(review): presumably required by missingpy, which is imported in the next
# cell -- confirm against the installed missingpy / scikit-learn versions.
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

In [102]:
import sys
import numpy as np
import pandas as pd

from missingpy import MissForest
from dsan import Imputer as DSANImputer
from utils import *

import warnings
# Install a global 'ignore' filter: every warning raised after this point is
# suppressed for the rest of the session, including deprecation notices from the
# libraries imported above.
warnings.filterwarnings('ignore')

In [107]:
# Fixed seed for the whole notebook.
seed = 128
# `set_seed` is not defined in this notebook; it presumably comes from
# `from utils import *` and seeds the random number generators -- TODO confirm
# which libraries it covers (experiment_run relies on np.random.shuffle).
set_seed(seed)

In [108]:
def get_imputer(config):
    """Build the imputer selected by ``config['model']``.

    Parameters
    ----------
    config : dict
        Must contain the key ``'model'``. Recognised values:

        * ``'DSAN'`` -- returns a ``DSANImputer``. The config must then also
          provide ``rep_dim``, ``num_heads``, ``n_hidden``, ``lr``,
          ``weight_decay``, ``batch_size``, ``epochs``, ``noise_percent`` and
          ``stopped_epoch``; each is forwarded as the keyword argument of the
          same name.
        * ``'MissForest'`` -- returns ``MissForest(random_state=0)``.
        * ``'Statistics'`` -- returns ``None``; no imputer object is built
          for this option.

    Returns
    -------
    object or None
        The freshly constructed imputer, or ``None`` for ``'Statistics'``.

    Raises
    ------
    KeyError
        If ``'model'`` or one of the DSAN hyper-parameters is missing.
    ValueError
        If ``config['model']`` is not one of the recognised names. Previously
        this case fell through to ``return imputer`` and surfaced as an
        UnboundLocalError.
    """
    model = config['model']

    if model == 'DSAN':
        # Hyper-parameters that are passed through to DSANImputer unchanged.
        dsan_keys = (
            'rep_dim',
            'num_heads',
            'n_hidden',
            'lr',
            'weight_decay',
            'batch_size',
            'epochs',
            'noise_percent',
            'stopped_epoch',
        )
        return DSANImputer(**{key: config[key] for key in dsan_keys})

    if model == 'MissForest':
        return MissForest(random_state=0)

    if model == 'Statistics':
        return None

    raise ValueError(
        "Unknown imputation model {!r}; expected 'DSAN', 'MissForest' or "
        "'Statistics'".format(model)
    )

In [109]:
def experiment_run(data_name, config, test=False):
    """Run the k-fold imputation benchmark for one dataset.

    For every missing rate in 5, 10, 15 and 20 percent and for each of the
    k=5 folds, the function:

    1. masks entries of the training rows with ``generate_missing_mask``,
    2. imputes them with a fresh imputer from ``get_imputer(config)``,
    3. scores the imputed entries with ``cal_metric_numpy``,
    4. scores a downstream classifier with ``ex_classify`` on the held-out
       fold, stored (rounded to 4 decimals) under ``'clf_aucroc'``.

    Fold metrics are then averaged per missing rate. Progress and sample
    rows are printed along the way.

    Parameters
    ----------
    data_name : str
        Dataset identifier passed to ``get_data`` and ``ex_classify``.
    config : dict
        Imputer configuration, forwarded to ``get_imputer``.
    test : bool, optional
        If true, only the first 1000 rows of the data array are used.

    Returns
    -------
    dict
        Maps each missing percentage to a dict of metric name -> mean value
        over the folds.

    Raises
    ------
    ValueError
        If the dataset has fewer rows than folds, or if ``get_imputer``
        returns ``None`` (the ``'Statistics'`` option), since there is then
        no object to call ``fit_transform`` on.
    """
    # Retrieve dataset
    dataset = get_data(data_name)
    print(f"Retrieved dataset for {data_name}")

    # Limit to 1000 samples if test flag is true
    if test:
        dataset = (dataset[0][:1000], dataset[1], dataset[2], dataset[3])
        print("Limited dataset to 1000 samples for testing")

    k = 5
    percent_missing_lst = list(range(5, 25, 5))
    # The second element (column count) is not needed here.
    X_origin, _n_col, num_vars, cat_vars = dataset

    # Apply category mapping. Keep the return value in its own name: if the
    # helper mutates its argument and returns None, overwriting X_origin
    # directly would throw the mapped data away.
    mapped = category_mapping(X_origin, cat_vars)
    print("Applied category mapping")

    # Check if X_origin is modified in-place or replaced
    if mapped is not None:
        X_origin = mapped
        print("X_origin is replaced with a new version where categorical variables have been mapped to numerical values.")
    else:
        print("X_origin is modified in-place, with categorical variables mapped to numerical values.")

    # Print data size
    print("Data Size: {}".format(X_origin.shape))

    n_sample = X_origin.shape[0]
    idx_lst = list(range(n_sample))
    np.random.shuffle(idx_lst)

    # k-fold validation setup
    n_valid = len(idx_lst) // k
    if n_valid == 0:
        # A zero fold size would otherwise surface as an opaque
        # "range() arg 3 must not be zero" further down.
        raise ValueError(
            "Need at least {} samples for {}-fold validation, got {}".format(k, k, n_sample)
        )
    print(f"Starting k-fold validation with k={k}")

    # Built once; every fold subtracts its validation indices from this set.
    all_idx = set(idx_lst)

    results = {}
    for percent_missing in percent_missing_lst:
        result = []
        print(f"\nProcessing {percent_missing}% missing data")

        for it, n in enumerate(range(0, n_sample, n_valid)):
            print(f"\nFold {it + 1}")
            valid_idx = idx_lst[n: n+n_valid]

            # A trailing remainder chunk (n_sample not divisible by k) is skipped.
            if len(valid_idx) < n_valid:
                print("Insufficient data for validation, breaking loop")
                break

            # Sorted so the training-row order is explicit rather than
            # dependent on set iteration order.
            train_idx = sorted(all_idx - set(valid_idx))
            X = X_origin[train_idx, :]
            X_test = X_origin[valid_idx, :]

            # Generating incomplete data
            missing_mask = generate_missing_mask(X, percent_missing=percent_missing)
            X_incomplete = X.copy()
            X_incomplete[missing_mask] = np.nan
            print("Generated incomplete data with missing values for fold {}:".format(it+1))
            print("Original data with missing values (subset):")
            print(X_incomplete[:5])  # Adjust the number of rows as needed

            # Imputation (a fresh imputer per fold)
            imputer = get_imputer(config)
            if imputer is None:
                raise ValueError(
                    "get_imputer returned None for model {!r}; experiment_run "
                    "needs an object with fit_transform".format(config.get('model'))
                )
            X_imputed = imputer.fit_transform(X_incomplete, cat_vars=cat_vars)
            print("Completed imputation for fold {}:".format(it+1))

            # Only the masked entries are taken from the imputer; observed
            # entries keep their original values.
            X_res = X.copy()
            X_res[missing_mask] = X_imputed[missing_mask]
            print("X_res with imputed values (subset):")
            print(X_res[:5])  # Adjust the number of rows as needed

            # Calculate metrics
            metric = cal_metric_numpy(X_res, X, missing_mask, num_vars, cat_vars)
            score = ex_classify(data_name=data_name, train_array=X_res, test_array=X_test, num_vars=num_vars, cat_vars=cat_vars)

            metric['clf_aucroc'] = round(score, 4)
            result.append(metric)
            print("Metrics for this fold:", metric)

        # Average results over the folds of this missing rate
        avg_result = {}
        for key in result[0]:
            value = np.mean([m[key] for m in result])
            avg_result[key] = value
            print(f"Average {key} for {percent_missing}% missing data: {value}")

        results[percent_missing] = avg_result

    print("Completed all iterations")
    return results


In [113]:
# DSAN run configuration. `model` selects the imputer in get_imputer; every other
# entry is forwarded to DSANImputer as the keyword argument of the same name.
config = dict(
    model='DSAN',
    rep_dim=32,
    num_heads=4,
    n_hidden=2,
    lr=3e-2,
    weight_decay=1e-5,
    batch_size=32,
    epochs=34,
    noise_percent=15,
    stopped_epoch=15,
)

In [None]:
# MissForest run configuration: get_imputer needs only the `model` key for it.
# NOTE(review): this cell has no execution count in the saved notebook. Running it
# rebinds `config`, replacing the DSAN settings from the previous cell, so a
# top-to-bottom run would execute the next experiment with MissForest.
config = dict(model='MissForest')

In [95]:
# Imported here because no cell in this notebook imports `os` explicitly; it
# was at best reaching the namespace through `from utils import *`.
import os

data_name_lst = ['iris']

result_dir = './results_iris_2'
# exist_ok=True keeps the cell re-runnable: without it the second execution
# fails with FileExistsError because the directory is already there.
os.makedirs(result_dir, exist_ok=True)

for data_name in data_name_lst:
    results = experiment_run(data_name, config, test=True)
    # Store the settings next to the metrics so each JSON file is self-describing.
    results['config'] = config
    results['dataset'] = data_name
    result_path = os.path.join(result_dir, '{}.json'.format(data_name))
    write_json(results, result_path)

Retrieved dataset for iris
Limited dataset to 1000 samples for testing
Applied category mapping
X_origin is replaced with a new version where categorical variables have been mapped to numerical values.
Data Size: (150, 5)
Starting k-fold validation with k=5

Processing 5% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 nan]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [nan 3.6 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:08<00:00,  4.10it/s]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.58147192 3.6        1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.5043, 'col_4_error_rate': 0.1429, 'total_error_rate': 0.1429, 'clf_aucroc': 1.0}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  nan nan 0.2 0. ]
 [4.6 3.4 1.4 nan 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:10<00:00,  3.36it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         3.10150981 1.81462574 0.2        0.        ]
 [4.6        3.4        1.4        0.40085322 0.        ]]
Metrics for this fold: {'nrmse': 0.3931, 'col_4_error_rate': 0.4286, 'total_error_rate': 0.4286, 'clf_aucroc': 1.0}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 nan 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [4.6 nan 1.4 0.3 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:06<00:00,  5.01it/s]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[4.7        3.2        1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         3.6        1.4        0.56778109 0.        ]
 [5.4        3.9        1.7        0.4        0.        ]
 [4.6        3.42574596 1.4        0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.3322, 'col_4_error_rate': 0.0, 'total_error_rate': 0.0, 'clf_aucroc': 0.9926}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[5.1 nan 1.4 0.2 0. ]
 [4.9 nan 1.4 0.2 0. ]
 [nan 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:09<00:00,  3.68it/s]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[5.1        3.0322597  1.4        0.2        0.        ]
 [4.9        3.03168631 1.4        0.2        0.        ]
 [4.91024208 3.2        1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.3606, 'col_4_error_rate': 0.375, 'total_error_rate': 0.375, 'clf_aucroc': 1.0}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [nan 3.  1.4 0.2 nan]
 [4.7 3.2 1.3 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [4.6 3.4 1.4 0.3 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:09<00:00,  3.64it/s]


Completed imputation for fold 5:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [6.00344658 3.         1.4        0.2        1.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [5.4        3.9        1.7        0.4        0.        ]
 [4.6        3.4        1.4        0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.2977, 'col_4_error_rate': 0.25, 'total_error_rate': 0.25, 'clf_aucroc': 0.997}
Average nrmse for 5% missing data: 0.37758
Average col_4_error_rate for 5% missing data: 0.23929999999999998
Average total_error_rate for 5% missing data: 0.23929999999999998
Average clf_aucroc for 5% missing data: 0.99792

Processing 10% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  nan 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:12<00:00,  2.71it/s]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         2.62094164 1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.4831, 'col_4_error_rate': 0.5, 'total_error_rate': 0.5, 'clf_aucroc': 1.0}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [nan 3.6 1.4 0.2 0. ]
 [4.6 3.4 1.4 0.3 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:10<00:00,  3.27it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.88708448 3.6        1.4        0.2        0.        ]
 [4.6        3.4        1.4        0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.3643, 'col_4_error_rate': 0.2, 'total_error_rate': 0.2, 'clf_aucroc': 0.9714}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[nan nan 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 nan 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [nan 3.4 1.4 0.3 nan]]


100%|███████████████████████████████████████████| 34/34 [00:06<00:00,  5.26it/s]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[5.66755152 3.43336487 1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         3.6        3.80282259 0.2        0.        ]
 [5.4        3.9        1.7        0.4        0.        ]
 [5.85656834 3.4        1.4        0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.4603, 'col_4_error_rate': 0.1429, 'total_error_rate': 0.1429, 'clf_aucroc': 0.9963}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[5.1 nan 1.4 0.2 nan]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:08<00:00,  3.94it/s]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[5.1        3.27182508 1.4        0.2        0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.3291, 'col_4_error_rate': 0.2308, 'total_error_rate': 0.2308, 'clf_aucroc': 1.0}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 nan 1.4 0.2 nan]
 [4.7 3.2 1.3 0.2 0. ]
 [5.4 3.9 1.7 nan 0. ]
 [nan 3.4 1.4 0.3 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:09<00:00,  3.58it/s]


Completed imputation for fold 5:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        2.93467236 1.4        0.2        0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [5.4        3.9        1.7        0.60198063 0.        ]
 [5.54307842 3.4        1.4        0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.3485, 'col_4_error_rate': 0.1818, 'total_error_rate': 0.1818, 'clf_aucroc': 0.997}
Average nrmse for 10% missing data: 0.39705999999999997
Average col_4_error_rate for 10% missing data: 0.2511
Average total_error_rate for 10% missing data: 0.2511
Average clf_aucroc for 10% missing data: 0.9929399999999999

Processing 15% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 nan]
 [nan 3.  nan 0.2 0. ]
 [4.7 3.2 nan nan 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:09<00:00,  3.62it/s]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [5.10178709 3.         3.06333041 0.2        0.        ]
 [4.7        3.2        3.04047322 0.95491672 0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.3644, 'col_4_error_rate': 0.125, 'total_error_rate': 0.125, 'clf_aucroc': 1.0}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[5.1 3.5 1.4 nan 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [4.6 3.4 nan 0.3 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:06<00:00,  5.25it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.12106174 0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]
 [4.6        3.4        2.69474053 0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.3534, 'col_4_error_rate': 0.4118, 'total_error_rate': 0.4118, 'clf_aucroc': 0.9976}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[nan 3.2 1.3 0.2 0. ]
 [4.6 3.1 nan 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [4.6 nan 1.4 0.3 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:07<00:00,  4.71it/s]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[5.07195902 3.2        1.3        0.2        0.        ]
 [4.6        3.1        3.16310573 0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]
 [5.4        3.9        1.7        0.4        0.        ]
 [4.6        3.24327421 1.4        0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.3603, 'col_4_error_rate': 0.1111, 'total_error_rate': 0.1111, 'clf_aucroc': 0.9944}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [nan 3.  1.4 0.2 0. ]
 [4.7 nan 1.3 nan nan]
 [4.6 nan 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:08<00:00,  4.11it/s]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [5.33884764 3.         1.4        0.2        0.        ]
 [4.7        3.06237173 1.3        0.51623321 0.        ]
 [4.6        3.0900135  1.5        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.3595, 'col_4_error_rate': 0.0556, 'total_error_rate': 0.0556, 'clf_aucroc': 1.0}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[5.1 3.5 1.4 nan 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [nan 3.2 1.3 nan 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [4.6 3.4 nan 0.3 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:07<00:00,  4.40it/s]


Completed imputation for fold 5:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.38257504 0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [5.40587711 3.2        1.3        0.37859631 0.        ]
 [5.4        3.9        1.7        0.4        0.        ]
 [4.6        3.4        3.45535922 0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.403, 'col_4_error_rate': 0.0952, 'total_error_rate': 0.0952, 'clf_aucroc': 0.997}
Average nrmse for 15% missing data: 0.36812
Average col_4_error_rate for 15% missing data: 0.15974
Average total_error_rate for 15% missing data: 0.15974
Average clf_aucroc for 15% missing data: 0.9978

Processing 20% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 nan 0.2 nan]
 [4.6 3.1 1.5 nan 0. ]
 [5.  3.6 1.4 0.2 nan]]


100%|███████████████████████████████████████████| 34/34 [00:08<00:00,  3.95it/s]


Completed imputation for fold 1:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.         1.4        0.2        0.        ]
 [4.7        3.2        3.39559054 0.2        0.        ]
 [4.6        3.1        1.5        0.29466701 0.        ]
 [5.         3.6        1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.3979, 'col_4_error_rate': 0.1538, 'total_error_rate': 0.1538, 'clf_aucroc': 0.9794}

Fold 2
Generated incomplete data with missing values for fold 2:
Original data with missing values (subset):
[[5.1 3.5 1.4 nan nan]
 [4.9 3.  nan 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [4.6 3.4 1.4 0.3 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:07<00:00,  4.40it/s]


Completed imputation for fold 2:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.47269547 0.        ]
 [4.9        3.         3.09919167 0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]
 [4.6        3.4        1.4        0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.4125, 'col_4_error_rate': 0.4, 'total_error_rate': 0.4, 'clf_aucroc': 1.0}

Fold 3
Generated incomplete data with missing values for fold 3:
Original data with missing values (subset):
[[nan 3.2 1.3 0.2 0. ]
 [4.6 nan 1.5 0.2 nan]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 nan]
 [4.6 3.4 1.4 0.3 nan]]


100%|███████████████████████████████████████████| 34/34 [00:05<00:00,  5.70it/s]


Completed imputation for fold 3:
X_res with imputed values (subset):
[[5.33300686 3.2        1.3        0.2        0.        ]
 [4.6        3.08027959 1.5        0.2        0.        ]
 [5.         3.6        1.4        0.2        0.        ]
 [5.4        3.9        1.7        0.4        0.        ]
 [4.6        3.4        1.4        0.3        0.        ]]
Metrics for this fold: {'nrmse': 0.3508, 'col_4_error_rate': 0.2692, 'total_error_rate': 0.2692, 'clf_aucroc': 0.9926}

Fold 4
Generated incomplete data with missing values for fold 4:
Original data with missing values (subset):
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 nan 1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [nan 3.6 1.4 0.2 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:09<00:00,  3.69it/s]


Completed imputation for fold 4:
X_res with imputed values (subset):
[[5.1        3.5        1.4        0.2        0.        ]
 [4.9        3.27496552 1.4        0.2        0.        ]
 [4.7        3.2        1.3        0.2        0.        ]
 [4.6        3.1        1.5        0.2        0.        ]
 [5.21546412 3.6        1.4        0.2        0.        ]]
Metrics for this fold: {'nrmse': 0.3961, 'col_4_error_rate': 0.3182, 'total_error_rate': 0.3182, 'clf_aucroc': 1.0}

Fold 5
Generated incomplete data with missing values for fold 5:
Original data with missing values (subset):
[[5.1 3.5 nan 0.2 0. ]
 [4.9 3.  nan nan 0. ]
 [4.7 3.2 1.3 nan 0. ]
 [5.4 3.9 1.7 nan 0. ]
 [4.6 3.4 1.4 nan 0. ]]


100%|███████████████████████████████████████████| 34/34 [00:10<00:00,  3.40it/s]

Completed imputation for fold 5:
X_res with imputed values (subset):
[[5.1        3.5        3.03043818 0.2        0.        ]
 [4.9        3.         3.32786131 0.90566272 0.        ]
 [4.7        3.2        1.3        0.48348728 0.        ]
 [5.4        3.9        1.7        0.52140206 0.        ]
 [4.6        3.4        1.4        0.46712074 0.        ]]
Metrics for this fold: {'nrmse': 0.4951, 'col_4_error_rate': 0.24, 'total_error_rate': 0.24, 'clf_aucroc': 0.9939}
Average nrmse for 20% missing data: 0.41048
Average col_4_error_rate for 20% missing data: 0.27624
Average total_error_rate for 20% missing data: 0.27624
Average clf_aucroc for 20% missing data: 0.99318
Completed all iterations





In [86]:
# A bare {'model': 'DSAN'} cannot be used: get_imputer reads rep_dim, num_heads
# and the other DSAN hyper-parameters from the config and would raise KeyError.
# The saved output of the run below shows "Iteration: N" lines and no 34-epoch
# progress bar, so that run was not a DSAN run.
# NOTE(review): MissForest is inferred from that output -- confirm.
config = {
    'model': 'MissForest'
    }

In [115]:
# Run the benchmark on iris with whatever `config` is currently bound; as the last
# expression of the cell, the returned averages are displayed as its output.
experiment_run(data_name='iris', config=config, test=True)

Retrieved dataset for iris
Limited dataset to 1000 samples for testing
Applied category mapping
X_origin is replaced with a new version where categorical variables have been mapped to numerical values.
Data Size: (150, 5)
Starting k-fold validation with k=5

Processing 5% missing data

Fold 1
Generated incomplete data with missing values for fold 1:
Original data with missing values (subset):
[[5.1 3.5 nan 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 nan 0. ]
 [4.6 3.4 1.4 0.3 0. ]]
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Completed imputation for fold 1:
X_res with imputed values (subset):
[[5.1   3.5   1.469 0.2   0.   ]
 [4.9   3.    1.4   0.2   0.   ]
 [5.    3.6   1.4   0.2   0.   ]
 [5.4   3.9   1.7   0.286 0.   ]
 [4.6   3.4   1.4   0.3   0.   ]]
Metrics for this fold: {'nrmse': 0.165, 'col_4_error_rate': 0.0, 'total_error_rate': 0.0, 'clf_aucroc': 0.9926}

Fold 2
Generated incomplete data with missing values for fold 

{5: {'nrmse': 0.13082000000000002,
  'col_4_error_rate': 0.09,
  'total_error_rate': 0.09,
  'clf_aucroc': 0.99796},
 10: {'nrmse': 0.16240000000000002,
  'col_4_error_rate': 0.03356,
  'total_error_rate': 0.03356,
  'clf_aucroc': 0.99824},
 15: {'nrmse': 0.1815,
  'col_4_error_rate': 0.0646,
  'total_error_rate': 0.0646,
  'clf_aucroc': 0.99778},
 20: {'nrmse': 0.19172,
  'col_4_error_rate': 0.1583,
  'total_error_rate': 0.1583,
  'clf_aucroc': 0.9959199999999999}}