In [1]:
import pandas as pd
import warnings
from hyperimpute.utils.serialization import load_model_from_file, save_model_to_file
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler

workspace = Path("workspace")
results_dir = Path("results")
data_dir = Path("data")

workspace.mkdir(parents=True, exist_ok=True)

warnings.filterwarnings("ignore")

cat_limit = 10
n_seeds = 5

version = "take6_v1"
changelog = f"multiple_imputation{n_seeds}_last_try"


In [2]:
def augment_base_dataset(df):
    for rid in df["RID_HASH"].unique():
        visits = len(df[df["RID_HASH"] == rid])
        last_visit = df[df["RID_HASH"] == rid]["VISCODE"].max()

        df.loc[df["RID_HASH"] == rid, "total_visits"] = visits
        df.loc[df["RID_HASH"] == rid, "last_visit"] = last_visit

    return df

def dataframe_hash(df: pd.DataFrame) -> str:
    cols = sorted(list(df.columns))
    return str(abs(pd.util.hash_pandas_object(df[cols].fillna(0)).sum()))


In [3]:
dev_set = pd.read_csv(data_dir / "dev_set.csv")
dev_set = dev_set.sort_values(["RID_HASH", "VISCODE"])
dev_set = augment_base_dataset(dev_set)

scaled_cols = [
    "MMSE",
    "ADAS13",
    "Ventricles",
    "Hippocampus",
    "WholeBrain",
    "Entorhinal",
    "Fusiform",
    "MidTemp",
]

scaler = MinMaxScaler().fit(dev_set[scaled_cols])
dev_set[scaled_cols] = scaler.transform(dev_set[scaled_cols])

dev_set

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit
2163,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,0,79.1,0,20,1.0,1.0,0.5,0.923077,0.164384,0.071871,0.548646,0.376516,0.464021,0.194906,0.400709,2.0,6.0
154,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,6,79.6,0,20,1.0,1.0,1.5,0.923077,0.237397,0.071956,0.548307,0.366398,0.403880,0.193367,0.397291,2.0,6.0
1385,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,0,72.9,1,12,1.0,1.0,1.0,1.000000,0.123288,0.142655,0.525169,0.235599,0.513404,0.356253,0.294774,6.0,60.0
2698,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,6,73.4,1,12,1.0,1.0,1.0,1.000000,0.164384,0.144729,0.549210,0.230361,0.435097,0.322395,0.294175,6.0,60.0
2291,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,12,73.9,1,12,1.0,1.0,1.0,0.961538,0.109589,0.155550,0.527878,0.215944,0.487831,0.342600,0.277552,6.0,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,60,79.8,1,19,1.0,0.0,3.0,0.923077,0.223699,0.170895,0.357020,0.321346,0.310935,0.399047,0.461476,7.0,102.0
2646,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,102,83.3,1,19,1.0,0.0,3.0,0.846154,0.168904,0.178231,0.352043,0.309095,0.256790,0.372685,0.416478,7.0,102.0
1962,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,0,72.1,0,12,1.0,0.0,0.5,0.884615,0.150685,0.416382,0.602438,0.636654,0.610229,0.743037,0.624631,3.0,24.0
122,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,12,73.1,0,12,1.0,0.0,1.0,0.961538,0.155205,0.398451,0.608521,0.634650,0.617108,0.729087,0.638477,3.0,24.0


In [4]:
dev_1 = pd.read_csv(data_dir / "dev_1.csv")
dev_1 = dev_1.sort_values(["RID_HASH", "VISCODE"])
dev_1 = augment_base_dataset(dev_1)
dev_1[scaled_cols] = scaler.transform(dev_1[scaled_cols])

dev_1

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit
2163,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,0,,0.0,20.0,1.0,1.0,0.5,0.923077,0.164384,,,0.376516,,,,2.0,6.0
154,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,6,79.6,0.0,20.0,1.0,1.0,1.5,0.923077,0.237397,0.071956,0.548307,0.366398,0.403880,0.193367,0.397291,2.0,6.0
1385,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,0,,1.0,12.0,,1.0,,,,,0.525169,0.235599,0.513404,0.356253,0.294774,6.0,60.0
2698,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,6,,1.0,12.0,,1.0,,,,,0.549210,0.230361,0.435097,0.322395,0.294175,6.0,60.0
2291,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,12,,1.0,12.0,,1.0,,,,,0.527878,0.215944,0.487831,0.342600,0.277552,6.0,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,60,79.8,1.0,19.0,,0.0,,,,0.170895,,0.321346,,,,7.0,102.0
2646,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,102,83.3,1.0,19.0,,0.0,,,,0.178231,,0.309095,,,,7.0,102.0
1962,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,0,72.1,,12.0,1.0,0.0,0.5,0.884615,0.150685,0.416382,0.602438,,0.610229,0.743037,0.624631,3.0,24.0
122,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,12,73.1,,12.0,1.0,0.0,1.0,0.961538,0.155205,0.398451,0.608521,,0.617108,0.729087,0.638477,3.0,24.0


In [5]:
dev_2 = pd.read_csv(data_dir / "dev_2.csv")
dev_2 = dev_2.sort_values(["RID_HASH", "VISCODE"])
dev_2 = augment_base_dataset(dev_2)
dev_2[scaled_cols] = scaler.transform(dev_2[scaled_cols])

dev_2

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit
2163,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,0,79.1,0.0,20.0,1.0,1.0,0.5,0.923077,0.164384,0.071871,0.548646,0.376516,0.464021,0.194906,0.400709,2.0,6.0
154,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,6,79.6,,,,1.0,,,,0.071956,0.548307,,0.403880,0.193367,0.397291,2.0,6.0
1385,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,0,72.9,,12.0,1.0,1.0,1.0,1.000000,0.123288,0.142655,0.525169,,0.513404,0.356253,0.294774,6.0,60.0
2698,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,6,,,12.0,1.0,1.0,1.0,1.000000,0.164384,,0.549210,,0.435097,0.322395,0.294175,6.0,60.0
2291,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,12,,,12.0,1.0,1.0,1.0,0.961538,0.109589,,,,,,,6.0,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,60,,,19.0,1.0,0.0,3.0,0.923077,0.223699,,0.357020,,0.310935,0.399047,0.461476,7.0,102.0
2646,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,102,,,19.0,1.0,0.0,3.0,0.846154,0.168904,,0.352043,,0.256790,0.372685,0.416478,7.0,102.0
1962,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,0,72.1,,12.0,,0.0,,,,0.416382,0.602438,,0.610229,0.743037,0.624631,3.0,24.0
122,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,12,,,12.0,,0.0,,,,,,,,,,3.0,24.0


In [6]:
submission = pd.read_csv(data_dir / "sample_submission.csv")

submission.values[1]

array(['6b6a7136f42a8dbd469a201b88e2abb54a93667822761357db2f6d620da6af8a_0_Ventricles_test_A',
       40613.0818580834], dtype=object)

In [7]:
test_A = pd.read_csv(data_dir / "test_A.csv")
test_A = augment_base_dataset(test_A)
test_A[scaled_cols] = scaler.transform(test_A[scaled_cols])

test_A

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit
0,988b6137f4352c01e4b52790505caa0c3ec438f117000a...,24,,,18.0,,0.0,,,,,,,,,,6.0,36.0
1,fb640cef87a6af00053e632140ce18f5722431bb92576b...,12,66.4,1.0,18.0,1.0,1.0,1.5,0.961538,0.077671,0.145063,,0.542904,,,,5.0,24.0
2,f24f78d62c90319b575dfb48a482159c4d0df14cb71530...,66,74.5,0.0,14.0,0.0,0.0,0.0,0.961538,0.050274,0.559104,0.565102,0.753302,0.641093,0.911086,0.866886,5.0,96.0
3,da4cbd3f09e8ddc87cc72e542d43f072e7df288face65e...,0,,,16.0,0.0,0.0,0.0,1.000000,0.191781,,,,,,,2.0,36.0
4,f665c6ee86356bdd135be03c61348607cabd64ed8433ba...,12,82.7,,13.0,,1.0,,,,0.206872,0.353047,,0.208289,0.188006,0.363489,7.0,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1323,51923c5d7573ef46aa9197cae78c3305abea5b3479331f...,6,83.5,,18.0,1.0,0.0,2.5,0.846154,0.150685,0.252103,0.541648,,0.454850,0.480663,0.479467,5.0,60.0
1324,06407d9ec85d62cd38189108ddffec23822f421b3db357...,0,,,20.0,,0.0,,,,,,,,,,1.0,0.0
1325,e5015703a58ccd5582a46d9f4a779edf062d683f3ae873...,132,83.0,0.0,20.0,0.0,1.0,0.0,0.961538,0.205479,0.193712,0.533115,0.491958,0.513933,0.508464,0.573437,9.0,132.0
1326,cf6ea2601bb119113371df79931cc3734b77218f734ad0...,12,82.4,,18.0,1.0,0.0,0.5,0.884615,0.205479,0.272259,0.572799,,0.597707,0.380480,0.361608,3.0,12.0


In [8]:
test_B = pd.read_csv(data_dir / "test_B.csv")
test_B = augment_base_dataset(test_B)
test_B[scaled_cols] = scaler.transform(test_B[scaled_cols])

test_B

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit
0,90a4f1869cf459af5fe39e53f1c328540f1dcf5a1908f7...,60,67.9,1.0,20.0,0.0,1.0,0.0,1.000000,0.123288,0.069404,,0.330562,,,,6.0,60.0
1,fad8ca8f903cf3ddf566926eabdb8718e8568962675519...,30,69.1,,16.0,0.0,0.0,0.0,0.961538,0.059315,0.162418,0.749086,,0.899471,0.724619,0.481817,3.0,30.0
2,d342fb7689e49c754709870c77e1aa3ed770dd193e9f9c...,12,,,12.0,,1.0,,,,,,,,,,1.0,12.0
3,5319e7ba149f0f81715b5e7f854036fc937141840bbd52...,6,,,18.0,,0.0,,,,,0.476637,,0.430159,0.520727,0.415281,5.0,126.0
4,6eef135d8c4eca67b0e130b8f4aedbc37a99938224d661...,0,,0.0,16.0,0.0,0.0,0.0,0.961538,0.095890,,0.605756,0.508509,0.629277,0.604379,0.449383,5.0,72.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,fbf6267bf7d92b507feb4957d7aa90ea5bb50893bb79d4...,0,,,12.0,,2.0,,,,,0.492325,,0.497354,0.374770,0.460151,7.0,60.0
1466,03e8ddc654f8e27332c5b09618b355d7f9529d614adb0f...,12,81.5,0.0,15.0,,0.0,,,,0.264134,,0.208488,,,,3.0,12.0
1467,1156748dfd6e69e1f364c31584e957d3b1ef656b898942...,0,,0.0,18.0,0.0,0.0,0.0,0.961538,0.136986,,,0.686284,,,,4.0,84.0
1468,0c7e17c442e715e067bd472c1e472b4937914d7fb8d492...,12,,,16.0,,0.0,,,,,0.553883,,0.753086,0.606563,0.458784,4.0,42.0


In [9]:
test_A.isna().sum()

RID_HASH          0
VISCODE           0
AGE             612
PTGENDER_num    626
PTEDUCAT         65
DX_num          428
APOE4            49
CDRSB           428
MMSE            428
ADAS13          428
Ventricles      612
Hippocampus     668
WholeBrain      626
Entorhinal      668
Fusiform        668
MidTemp         668
total_visits      0
last_visit        0
dtype: int64

In [10]:
test_A.columns

Index(['RID_HASH', 'VISCODE', 'AGE', 'PTGENDER_num', 'PTEDUCAT', 'DX_num',
       'APOE4', 'CDRSB', 'MMSE', 'ADAS13', 'Ventricles', 'Hippocampus',
       'WholeBrain', 'Entorhinal', 'Fusiform', 'MidTemp', 'total_visits',
       'last_visit'],
      dtype='object')

In [11]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
from hyperimpute.plugins.prediction import Classifiers, Regression
from hyperimpute.utils.tester import evaluate_regression, evaluate_estimator
from hyperimpute.utils.benchmarks import RMSE
from hyperimpute.utils.optimizer import EarlyStoppingExceeded, create_study
import optuna

train_cols = list(dev_set.drop(columns=["RID_HASH"]).columns)

eval_cols = [
    "DX_num",
    "CDRSB",
    "MMSE",
    "ADAS13",
    "Ventricles",
    "Hippocampus",
    "WholeBrain",
    "Entorhinal",
    "Fusiform",
    "MidTemp",
]



def prepare_temporal_data(data, target_col: str, direction: str):
    target_train_data = []
    target_train_labels = []

    for item in data.groupby("RID_HASH"):
        # print(item[0])
        local = item[1]
        local = local.sort_values(["RID_HASH", "VISCODE"])

        rid = local["RID_HASH"]

        prev_cols = [f"prev_{col}" for col in train_cols]
        prev_row = np.zeros(len(prev_cols))

        if direction == "forward":
            rows = local.iterrows()
        else:
            rows = local.iloc[::-1].iterrows()

        for idx, row in rows:
            target_val = row[target_col]
            tmp_row = row[train_cols].copy()
            src_data = tmp_row.to_frame().T.drop(columns=[target_col])

            src_data[prev_cols] = prev_row

            prev_row = tmp_row

            target_train_data.append(src_data)
            target_train_labels.append(target_val)

    target_train_data = pd.concat(target_train_data, ignore_index=True).astype(float)

    return target_train_data, target_train_labels


def evaluate_target(data, target_col: str, direction: str):
    train_data, labels = prepare_temporal_data(data, target_col, direction)
    assert target_col not in train_data.columns

    
    def evaluate_clf(plugin, args = {}):
        model = plugin(**args)
        encoded_labels = LabelEncoder().fit_transform(labels)

        return evaluate_estimator(
            model, train_data, pd.Series(encoded_labels)
        )["clf"]["aucroc"][0]
    
    def evaluate_reg(plugin, args = {}):
        model = plugin(**args)
        return evaluate_regression(model, train_data, labels)["clf"]["r2"][0]

    best_score = -99
    best_target_plugin = None
    for (clf_type, reg_type) in [
        ("xgboost", "xgboost_regressor"),
        ("logistic_regression", "linear_regression"),
        ("catboost", "catboost_regressor"),
        ("random_forest", "random_forest_regressor"),
        ("kneighbors", "kneighbors_regressor"),
    ]:
        if len(np.unique(labels)) < cat_limit:
            if clf_type is None:
                continue
            plugin = Classifiers().get_type(clf_type)
            cbk = evaluate_clf
        else:
            if reg_type is None:
                continue
            plugin = Regression().get_type(reg_type)
            cbk = evaluate_reg

        study, pruner = create_study(
            study_name=f"long_imputation_{plugin.name()}_{target_col}_{direction}",
            direction="maximize",
            load_if_exists = True,
        )
        
        def objective(trial: optuna.Trial) -> float:
            args = plugin.sample_hyperparameters(trial)
            pruner.check_trial(trial)

            try:
                score = cbk(plugin, args)
            except BaseException:
                print("      failed evaluation", plugin.name(), args)
                return -5

            #print(f"    >>  {plugin.name()} {args} -> {score}")
            pruner.report_score(score)

            return score

        try:
            study.optimize(objective, n_trials=100, timeout=60 * 10)
        except EarlyStoppingExceeded:
            pass

        baseline_score = cbk(plugin)

        if study.best_value > baseline_score:
            score = study.best_value
            args = study.best_trial.params
        else:
            score = baseline_score
            args = {}
            
        if score > best_score:
            best_score = score
            best_target_plugin, best_target_plugin_args = plugin, args
            
    print(f"     >> Selected {target_col} --> {best_target_plugin.name()} -- {best_target_plugin_args}", best_score)

    model = best_target_plugin(**best_target_plugin_args)
        
    if len(np.unique(labels)) < cat_limit:
        labels = LabelEncoder().fit_transform(labels)
    
    model.fit(train_data, labels)
    
    return model


def prepare_longitudinal_imputers(data, columns):
    imputers = {}

    for direction in ["forward", "reverse"]:
        imputers[direction] = {}
        for target_col in columns:
            train_data, labels = prepare_temporal_data(data, target_col, direction)
            print("train", target_col, direction, len(np.unique(labels)))

            if len(np.unique(labels)) > cat_limit:
                model = Regression().get("catboost_regressor")
            else:
                model = Classifiers().get("catboost")

            model.fit(train_data, labels)

            imputers[direction][target_col] = model

    return imputers


def prepare_longitudinal_imputers_automl(data, columns):
    imputers = {}

    for direction in ["forward", "reverse"]:
        imputers[direction] = {}
        for target_col in columns:
            model = evaluate_target(data, target_col, direction=direction)

            imputers[direction][target_col] = model

    return imputers

In [12]:
# for col in eval_cols:
#     evaluate_target(dev_set, col, direction = "forward")

In [13]:
# for col in eval_cols:
#     evaluate_target(dev_set, col, direction = "reverse")

In [14]:
dev_set_id = dataframe_hash(dev_set)

imputers_bkp_file = workspace / f"longitudinal_imputers_scaled_cat{cat_limit}_{dev_set_id}_automl_100.bkp"

if imputers_bkp_file.exists():
    longitudinal_imputers = load_model_from_file(imputers_bkp_file)
else:
    longitudinal_imputers = prepare_longitudinal_imputers_automl(dev_set, eval_cols)
    save_model_to_file(imputers_bkp_file, longitudinal_imputers)

In [15]:
for direction in longitudinal_imputers:
    for col in longitudinal_imputers[direction]:
        print(direction, col,  longitudinal_imputers[direction][col].name())

forward DX_num xgboost
forward CDRSB linear_regression
forward MMSE xgboost_regressor
forward ADAS13 linear_regression
forward Ventricles linear_regression
forward Hippocampus xgboost_regressor
forward WholeBrain linear_regression
forward Entorhinal linear_regression
forward Fusiform linear_regression
forward MidTemp linear_regression
reverse DX_num xgboost
reverse CDRSB xgboost_regressor
reverse MMSE xgboost_regressor
reverse ADAS13 xgboost_regressor
reverse Ventricles linear_regression
reverse Hippocampus xgboost_regressor
reverse WholeBrain linear_regression
reverse Entorhinal linear_regression
reverse Fusiform linear_regression
reverse MidTemp linear_regression


## Preprocess data

In [16]:
from hyperimpute.plugins.imputers import Imputers

# VISCODE 6 * x -> AGE 0.5 * x

const_by_patient = ["PTGENDER_num", "PTEDUCAT", "APOE4"]


def dataframe_hash(df: pd.DataFrame) -> str:
    return str(abs(pd.util.hash_pandas_object(df).sum()))


def normalize(test_data):
    return test_data

def prepare_consts(train_data, test_data):
    test_data = test_data.copy()
    train_data = train_data.copy()

    train_data = train_data.sort_values(["RID_HASH", "VISCODE"])
    test_data = test_data.sort_values(["RID_HASH", "VISCODE"])

    for item in test_data.groupby("RID_HASH"):
        local = item[1]

        # fill consts
        for col in const_by_patient:
            if len(local[col].unique()) == 1:
                continue
            rid = local["RID_HASH"].unique()[0]

            val = local[col][~local[col].isna()].unique()[0]
            local[col] = local[col].fillna(val)
            test_data.loc[test_data["RID_HASH"] == rid, col] = test_data[
                test_data["RID_HASH"] == rid
            ][col].fillna(val)
            assert len(local[col].unique()) == 1, col

    return test_data


def prepare_age(train_data, test_data):
    test_data = test_data.copy()
    train_data = train_data.copy()

    train_data = train_data.sort_values(["RID_HASH", "VISCODE"])
    test_data = test_data.sort_values(["RID_HASH", "VISCODE"])

    col = "AGE"

    for rid in test_data["RID_HASH"].unique():
        local = test_data[test_data["RID_HASH"] == rid]

        # fill age
        ages = local["AGE"]
        if ages.isna().sum() == 0:
            continue

        if ages.isna().sum() == len(ages):
            continue

        # forward impute age
        prev_viscode = 0
        prev_age = 0
        for idx, row in local.iterrows():
            current_viscode = row["VISCODE"]
            local_idx = (test_data["VISCODE"] == current_viscode) & (
                test_data["RID_HASH"] == rid
            )
            if prev_age > 0 and prev_age == prev_age:
                pred_age = (current_viscode - prev_viscode) / 6 * 0.5 + prev_age
            else:
                pred_age = row[col]

            if pred_age == pred_age:
                # print("forward imputed", pred_age, current_viscode)
                test_data.loc[local_idx, col] = test_data.loc[local_idx][col].fillna(
                    pred_age
                )

            prev_viscode = row["VISCODE"]
            prev_age = pred_age

        # reverse impute age
        prev_viscode = 0
        prev_age = 0
        for idx, row in local.iloc[::-1].iterrows():
            current_viscode = row["VISCODE"]
            local_idx = (test_data["VISCODE"] == current_viscode) & (
                test_data["RID_HASH"] == rid
            )

            if prev_age > 0 and prev_age == prev_age:
                pred_age = prev_age - (prev_viscode - current_viscode) / 6 * 0.5
            else:
                pred_age = row[col]

            if pred_age == pred_age:
                # print("reversed imputed", pred_age, current_viscode)
                test_data.loc[local_idx, col] = test_data.loc[local_idx][col].fillna(
                    pred_age
                )

            prev_viscode = row["VISCODE"]
            prev_age = pred_age

        # print(test_data[(test_data["RID_HASH"] == rid)][["VISCODE", "AGE"]])
    return test_data


def impute_longitudinal(
    train_data,
    test_data,
    n_iter=5,
    eval_cols=[
        "DX_num",
        "CDRSB",
        "MMSE",
        "ADAS13",
        "Ventricles",
        "Hippocampus",
        "WholeBrain",
        "Entorhinal",
        "Fusiform",
        "MidTemp",
    ],
    imputed_test_data=None,
):
    test_data = test_data.copy()
    train_data = train_data.copy()

    if imputed_test_data is None:
        imputed_test_data = interm_imputation(
            train_data, test_data
        )

    train_data = train_data.sort_values(["RID_HASH", "VISCODE"])
    test_data = test_data.sort_values(["RID_HASH", "VISCODE"])
    imputed_test_data = imputed_test_data.sort_values(["RID_HASH", "VISCODE"])

    prev_cols = [f"prev_{col}" for col in train_cols]

    for rid in test_data["RID_HASH"].unique():
        patient = test_data[test_data["RID_HASH"] == rid]
        patient_imputed = imputed_test_data[imputed_test_data["RID_HASH"] == rid]

        prediction_rows = [pd.Series(np.zeros(len(prev_cols)), index=train_cols)]
        for ridx, row in patient.iterrows():
            prediction_rows.append(row[train_cols])
        prediction_rows.append(pd.Series(np.zeros(len(prev_cols)), index=train_cols))

        for col in eval_cols:
            if patient[col].isna().sum() == 0:
                continue

            for ridx, row in enumerate(prediction_rows[1:-1]):
                real_idx = ridx + 1
                if row[col] == row[col]:
                    continue
                current_viscode = row["VISCODE"]
                local_idx = (test_data["VISCODE"] == current_viscode) & (
                    test_data["RID_HASH"] == rid
                )

                prev_col_val = prediction_rows[real_idx - 1][col]
                next_col_val = prediction_rows[real_idx + 1][col]

                if next_col_val == next_col_val and ridx + 1 < len(patient_imputed):
                    eval_data = (
                        patient_imputed.iloc[ridx].to_frame().T[train_cols]
                    ).drop(columns = [col]) #row.to_frame().T[train_cols]
                    eval_data[prev_cols] = (
                        patient_imputed.iloc[ridx + 1].to_frame().T[train_cols].values
                    )
                    eval_data = eval_data.astype(float)

                    assert eval_data.isna().sum().sum() == 0
                    assert eval_data[f"prev_{col}"].values[0] == next_col_val

                    imputer = longitudinal_imputers["reverse"][col]
                    imputed_val = imputer.predict(eval_data).values.squeeze()

                    test_data.loc[local_idx, col] = imputed_val

                if prev_col_val == prev_col_val and ridx > 0:
                    # print("Imputing using the prev value", prev_col_val)
                    eval_data = (
                        patient_imputed.iloc[ridx].to_frame().T[train_cols]
                    ).drop(columns = [col])
                    eval_data[prev_cols] = (
                        patient_imputed.iloc[ridx - 1].to_frame().T[train_cols].values
                    )
                    eval_data = eval_data.astype(float)

                    assert eval_data.isna().sum().sum() == 0
                    assert eval_data[f"prev_{col}"].values[0] == prev_col_val

                    imputer = longitudinal_imputers["forward"][col]
                    imputed_val = imputer.predict(eval_data).values.squeeze()
                    
                    existing_value = test_data.loc[local_idx, col].values[0]
                    if existing_value == existing_value:
                        imputed_val = (imputed_val + existing_value) / 2
                    test_data.loc[local_idx, col] = imputed_val


    return normalize(test_data)

def interm_imputation(train_data, test_data, forward_first = True):
    test_data = test_data.copy()

    for rid in test_data["RID_HASH"].unique():
        local = test_data[test_data["RID_HASH"] == rid]

        if forward_first:
            local = local.ffill().bfill()
        else:
            local = local.bfill().ffill()

        test_data.loc[test_data["RID_HASH"] == rid] = local
        
    test_data = prepare_consts(train_data, test_data)
    test_data = prepare_age(train_data, test_data)
    return full_imputation(train_data, test_data)

def full_imputation(train_data, test_data):
    imputed_test_data = test_data.copy()
    imputer_kwargs = {
        "optimizer": "bayesian",
        "classifier_seed": ["xgboost",],
        "regression_seed": ["xgboost_regressor"],
        "class_threshold": cat_limit,
    }

    imputer = Imputers().get(
        "hyperimpute",
        **imputer_kwargs,
    )
    imputation_input = pd.concat([train_data, test_data], ignore_index=True)
    imputation_ids = imputation_input["RID_HASH"]
    imputation_input = imputation_input.drop(columns = ["RID_HASH"])
    
    imputed_test_data = imputer.fit_transform(imputation_input)
    imputed_test_data = imputed_test_data.tail(len(test_data))

    out_cols = ["RID_HASH"] + list(imputed_test_data.columns)
    imputed_test_data["RID_HASH"] = test_data["RID_HASH"].values
    
    return imputed_test_data[out_cols]

def evaluate_static_imputation(train_data, test_data, static_imputation):
    test_data = test_data.copy()
    train_data = train_data.copy()

    train_data = train_data.sort_values(["RID_HASH", "VISCODE"])
    test_data = test_data.sort_values(["RID_HASH", "VISCODE"])

    for rid in test_data["RID_HASH"].unique():
        patient = test_data[test_data["RID_HASH"] == rid]
        misses = []
        viscodes = []
        for idx, row in patient.iterrows():
            misses.append(row.isna().sum())
            viscodes.append(row["VISCODE"])
        cidx = np.argmin(misses)

        current_viscode = viscodes[cidx]
        local_idx = (test_data["VISCODE"] == current_viscode) & (
            test_data["RID_HASH"] == rid
        )
        imputed_idx = (static_imputation["VISCODE"] == current_viscode) & (
            static_imputation["RID_HASH"] == rid
        )

        if len(test_data[local_idx]) == 0:
            continue

        for col in test_data.columns:
            val = test_data.loc[local_idx][col].values[0]
            if val == val:
                continue
            imputed_val = static_imputation.loc[imputed_idx][col].values[0]
            test_data.loc[local_idx, col] = imputed_val

            # print("imputed", test_data.loc[local_idx, col])

    return normalize(test_data)


def impute_data(
    train_data, test_data, use_longitudinal=True, static_strategy="missmin"
):
    test_id = dataframe_hash(test_data)
    train_id = dataframe_hash(train_data)

    print("Evaluate constants", test_id, test_data.isna().sum().sum())
    test_data = prepare_consts(train_data, test_data)
    test_data = prepare_age(train_data, test_data)

    while use_longitudinal:
        print("Evaluate longitudinals", test_id, test_data.isna().sum().sum())
        new_test_data = impute_longitudinal(train_data, test_data)
        if new_test_data.isna().sum().sum() == test_data.isna().sum().sum():
            break

        test_data = new_test_data

    print(
        "Evaluate static imputation",
        test_id,
        test_data.isna().sum().sum(),
        static_strategy,
    )
    static_imputation = full_imputation(train_data, test_data)

    test_data = evaluate_static_imputation(train_data, test_data, static_imputation)

    print("Evaluate constants take 2", test_id, test_data.isna().sum().sum())
    test_data = prepare_consts(train_data, test_data)
    test_data = prepare_age(train_data, test_data)

    while use_longitudinal:
        print("Evaluate longitudinals take 2", test_id, test_data.isna().sum().sum())
        new_test_data = impute_longitudinal(train_data, test_data)
        if new_test_data.isna().sum().sum() == test_data.isna().sum().sum():
            break

        test_data = new_test_data

    print("Normalize data", test_id, test_data.isna().sum().sum())
    return normalize(test_data)

In [17]:
# dev_1_eval = impute_data(dev_set, dev_1)
# dev_2_eval = impute_data(dev_set, dev_2)

In [18]:
# # use static missmin visit strategy
# from hyperimpute.plugins.imputers import Imputers
# from hyperimpute.utils.benchmarks import benchmark_model
# from sklearn.preprocessing import LabelEncoder

# train_eval_data = pd.concat([dev_set, dev_1_eval, dev_2_eval], ignore_index=True)

# train_eval_data_raw = pd.concat([dev_set, dev_1, dev_2], ignore_index=True)

# train_gt = pd.concat([dev_set, dev_set, dev_set], ignore_index=True)

# train_mask = train_eval_data_raw.isna().astype(int)
# train_mask_bool = train_eval_data_raw.isna()

# le = LabelEncoder().fit(train_gt["RID_HASH"])
# train_gt["RID_HASH"] = le.transform(train_gt["RID_HASH"])
# train_eval_data["RID_HASH"] = le.transform(train_eval_data["RID_HASH"])

# plugin = Imputers().get(
#     "hyperimpute",
#     optimizer="simple",
# )

# benchmark_model("missforest", plugin, train_gt, train_eval_data, train_mask)

In [19]:
# Current best (1.2941082455707285, 0.6173876160675963)


## Submission data

In [20]:
test_AB = pd.concat([dev_set, test_A, test_B], ignore_index=True)

eval_data = impute_data(dev_set, test_AB)

Evaluate constants 609826745966306637 15026
Evaluate longitudinals 609826745966306637 13086


[2022-11-10T14:49:18.510472+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T14:49:18.540833+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T14:49:18.541510+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T14:49:18.877984+0200][913463][INFO] [Evaluate PTGENDER_num] previous config new score = 0.9347794135406013. old score = -9999999
[2022-11-10T14:49:44.807534+0200][913463][INFO]      >>> Column PTGENDER_num <-- score 0.9581489608173681 <-- Model xgboost({'reg_lambda': 5.047146750787419, 'reg_alpha': 6.76511248668679, 'colsample_bytree': 0.8104392038078263, 'colsample_bynode': 0.8362908590082906, 'colsample_bylevel': 0.8415650566586329, 'subsample': 0.899881421386532, 'lr': 0.001, 'max_depth': 5, 'n_estimators': 282, 'min_child_weight': 6, 'max_bin': 384, 'grow_policy': 1})
[2022-11-10T14:49:45.898468+0200][913463][INFO] [Evaluate MidTemp] previous config new score = 0.7260647184345996. ol

[2022-11-10T15:11:21.333191+0200][913463][INFO]   > Imputation iter 1
[2022-11-10T15:11:33.027337+0200][913463][INFO]   > Imputation iter 2
[2022-11-10T15:11:44.748050+0200][913463][INFO]   > Imputation iter 3
[2022-11-10T15:11:56.432426+0200][913463][INFO]   > Imputation iter 4
[2022-11-10T15:12:08.154293+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T15:12:19.853606+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T15:12:31.547404+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T15:12:43.243480+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T15:12:54.934268+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T15:13:06.604116+0200][913463][INFO]   > Imputation iter 10
[2022-11-10T15:13:18.291886+0200][913463][INFO]   > Imputation iter 11
[2022-11-10T15:13:29.954654+0200][913463][INFO]   > Imputation iter 12
[2022-11-10T15:13:41.635307+0200][913463][INFO]   > Imputation iter 13
[2022-11-10T15:13:53.322968+0200][913463][INFO]   > Imputation iter 14
[2022-11-10T15:

Evaluate longitudinals 609826745966306637 9882


[2022-11-10T15:18:08.093851+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T15:18:08.120275+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T15:18:08.120930+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T15:18:08.458619+0200][913463][INFO] [Evaluate PTGENDER_num] previous config new score = 0.9369902300018494. old score = -9999999
[2022-11-10T15:18:28.397270+0200][913463][INFO]      >>> Column PTGENDER_num <-- score 0.9583182170011176 <-- Model xgboost({'reg_lambda': 5.047146750787419, 'reg_alpha': 6.76511248668679, 'colsample_bytree': 0.8104392038078263, 'colsample_bynode': 0.8362908590082906, 'colsample_bylevel': 0.8415650566586329, 'subsample': 0.899881421386532, 'lr': 0.001, 'max_depth': 5, 'n_estimators': 282, 'min_child_weight': 6, 'max_bin': 384, 'grow_policy': 1})
[2022-11-10T15:18:29.494038+0200][913463][INFO] [Evaluate MidTemp] previous config new score = 0.7270394059414285. ol

[2022-11-10T15:37:28.453922+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T15:37:38.462810+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T15:37:48.499009+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T15:37:58.517238+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T15:38:08.553482+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T15:38:18.605080+0200][913463][INFO]   > Imputation iter 10
[2022-11-10T15:38:28.626893+0200][913463][INFO]   > Imputation iter 11
[2022-11-10T15:38:38.674929+0200][913463][INFO]   > Imputation iter 12
[2022-11-10T15:38:48.750074+0200][913463][INFO]   > Imputation iter 13
[2022-11-10T15:38:58.784677+0200][913463][INFO]   > Imputation iter 14
[2022-11-10T15:39:08.832414+0200][913463][INFO]   > Imputation iter 15
[2022-11-10T15:39:18.902448+0200][913463][INFO]   > Imputation iter 16
[2022-11-10T15:39:28.949528+0200][913463][INFO]   > Imputation iter 17
[2022-11-10T15:39:38.978698+0200][913463][INFO]   > Imputation iter 18
[2022-11-10

Evaluate longitudinals 609826745966306637 7636


[2022-11-10T15:41:14.422386+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T15:41:14.450704+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T15:41:14.451232+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T15:41:14.809829+0200][913463][INFO] [Evaluate PTGENDER_num] previous config new score = 0.9354699618850366. old score = -9999999
[2022-11-10T15:41:37.274147+0200][913463][INFO]      >>> Column PTGENDER_num <-- score 0.9559625180525276 <-- Model xgboost({'reg_lambda': 5.047146750787419, 'reg_alpha': 6.76511248668679, 'colsample_bytree': 0.8104392038078263, 'colsample_bynode': 0.8362908590082906, 'colsample_bylevel': 0.8415650566586329, 'subsample': 0.899881421386532, 'lr': 0.001, 'max_depth': 5, 'n_estimators': 282, 'min_child_weight': 6, 'max_bin': 384, 'grow_policy': 1})
[2022-11-10T15:41:38.384436+0200][913463][INFO] [Evaluate MidTemp] previous config new score = 0.7283932172490013. ol

[2022-11-10T16:02:35.165885+0200][913463][INFO]   > Imputation iter 1
[2022-11-10T16:02:46.274081+0200][913463][INFO]   > Imputation iter 2
[2022-11-10T16:02:57.359044+0200][913463][INFO]   > Imputation iter 3
[2022-11-10T16:03:08.460229+0200][913463][INFO]   > Imputation iter 4
[2022-11-10T16:03:19.603928+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T16:03:30.749014+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T16:03:41.876156+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T16:03:52.990627+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T16:04:04.136255+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T16:04:15.278962+0200][913463][INFO]   > Imputation iter 10
[2022-11-10T16:04:26.382247+0200][913463][INFO]   > Imputation iter 11
[2022-11-10T16:04:37.498463+0200][913463][INFO]   > Imputation iter 12
[2022-11-10T16:04:48.622063+0200][913463][INFO]   > Imputation iter 13
[2022-11-10T16:04:59.756247+0200][913463][INFO]   > Imputation iter 14
[2022-11-10T16:

Evaluate longitudinals 609826745966306637 6542


[2022-11-10T16:09:03.668350+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T16:09:03.694830+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T16:09:03.695398+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T16:09:04.041057+0200][913463][INFO] [Evaluate PTGENDER_num] previous config new score = 0.935599056819095. old score = -9999999
[2022-11-10T16:09:31.617432+0200][913463][INFO]      >>> Column PTGENDER_num <-- score 0.9556328184738669 <-- Model xgboost({'reg_lambda': 5.047146750787419, 'reg_alpha': 6.76511248668679, 'colsample_bytree': 0.8104392038078263, 'colsample_bynode': 0.8362908590082906, 'colsample_bylevel': 0.8415650566586329, 'subsample': 0.899881421386532, 'lr': 0.001, 'max_depth': 5, 'n_estimators': 282, 'min_child_weight': 6, 'max_bin': 384, 'grow_policy': 1})
[2022-11-10T16:09:32.734542+0200][913463][INFO] [Evaluate MidTemp] previous config new score = 0.7307166260223137. old

[2022-11-10T16:28:37.106678+0200][913463][INFO]   > Imputation iter 1
[2022-11-10T16:28:47.044911+0200][913463][INFO]   > Imputation iter 2
[2022-11-10T16:28:56.982205+0200][913463][INFO]   > Imputation iter 3
[2022-11-10T16:29:06.904456+0200][913463][INFO]   > Imputation iter 4
[2022-11-10T16:29:16.874360+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T16:29:26.837517+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T16:29:36.767192+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T16:29:46.686283+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T16:29:56.622301+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T16:30:06.551722+0200][913463][INFO]   > Imputation iter 10
[2022-11-10T16:30:16.502590+0200][913463][INFO]   > Imputation iter 11
[2022-11-10T16:30:26.456794+0200][913463][INFO]   > Imputation iter 12
[2022-11-10T16:30:36.408514+0200][913463][INFO]   > Imputation iter 13
[2022-11-10T16:30:46.376710+0200][913463][INFO]   > Imputation iter 14
[2022-11-10T16:

Evaluate longitudinals 609826745966306637 6246


[2022-11-10T16:32:49.813138+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T16:32:49.840425+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T16:32:49.841140+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T16:32:50.184919+0200][913463][INFO] [Evaluate PTGENDER_num] previous config new score = 0.9358502519093908. old score = -9999999
[2022-11-10T16:33:12.425621+0200][913463][INFO]      >>> Column PTGENDER_num <-- score 0.9561097632107203 <-- Model xgboost({'reg_lambda': 5.047146750787419, 'reg_alpha': 6.76511248668679, 'colsample_bytree': 0.8104392038078263, 'colsample_bynode': 0.8362908590082906, 'colsample_bylevel': 0.8415650566586329, 'subsample': 0.899881421386532, 'lr': 0.001, 'max_depth': 5, 'n_estimators': 282, 'min_child_weight': 6, 'max_bin': 384, 'grow_policy': 1})
[2022-11-10T16:33:13.529191+0200][913463][INFO] [Evaluate MidTemp] previous config new score = 0.7281731638656475. ol

[2022-11-10T16:53:58.290315+0200][913463][INFO]   > Imputation iter 1
[2022-11-10T16:54:10.481278+0200][913463][INFO]   > Imputation iter 2
[2022-11-10T16:54:22.674875+0200][913463][INFO]   > Imputation iter 3
[2022-11-10T16:54:34.852600+0200][913463][INFO]   > Imputation iter 4
[2022-11-10T16:54:47.036530+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T16:54:59.250863+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T16:55:11.429303+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T16:55:23.651131+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T16:55:35.850609+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T16:55:49.876681+0200][913463][INFO]   > Imputation iter 10
[2022-11-10T16:56:02.665981+0200][913463][INFO]   > Imputation iter 11
[2022-11-10T16:56:14.868427+0200][913463][INFO]   > Imputation iter 12
[2022-11-10T16:56:27.069147+0200][913463][INFO]   > Imputation iter 13
[2022-11-10T16:56:39.242983+0200][913463][INFO]   > Imputation iter 14
[2022-11-10T16:

Evaluate longitudinals 609826745966306637 6104


[2022-11-10T17:01:05.843910+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T17:01:05.871862+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T17:01:05.872497+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T17:01:06.221548+0200][913463][INFO] [Evaluate PTGENDER_num] previous config new score = 0.9347893979547854. old score = -9999999
[2022-11-10T17:02:29.184099+0200][913463][INFO]      >>> Column PTGENDER_num <-- score 0.9695521346712936 <-- Model xgboost({'reg_lambda': 1.5549932689920478, 'reg_alpha': 0.24683150245522478, 'colsample_bytree': 0.7010838060310757, 'colsample_bynode': 0.8972462009056147, 'colsample_bylevel': 0.5643692244322612, 'subsample': 0.5384057833735143, 'lr': 0.01, 'max_depth': 5, 'n_estimators': 281, 'min_child_weight': 0, 'max_bin': 365, 'grow_policy': 1})
[2022-11-10T17:02:30.236427+0200][913463][INFO] [Evaluate MidTemp] previous config new score = 0.7298405514935522

[2022-11-10T17:22:59.421564+0200][913463][INFO]   > Imputation iter 1
[2022-11-10T17:23:11.703046+0200][913463][INFO]   > Imputation iter 2
[2022-11-10T17:23:23.964028+0200][913463][INFO]   > Imputation iter 3
[2022-11-10T17:23:36.233613+0200][913463][INFO]   > Imputation iter 4
[2022-11-10T17:23:48.549941+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T17:24:00.812985+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T17:24:13.090158+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T17:24:25.362144+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T17:24:37.669970+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T17:24:49.934897+0200][913463][INFO]   > Imputation iter 10
[2022-11-10T17:25:02.201285+0200][913463][INFO]   > Imputation iter 11
[2022-11-10T17:25:14.434705+0200][913463][INFO]   > Imputation iter 12
[2022-11-10T17:25:26.670408+0200][913463][INFO]   > Imputation iter 13
[2022-11-10T17:25:38.918715+0200][913463][INFO]   > Imputation iter 14
[2022-11-10T17:

Evaluate longitudinals 609826745966306637 6015


[2022-11-10T17:28:03.043859+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T17:28:03.071177+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T17:28:03.071749+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T17:28:03.409595+0200][913463][INFO] [Evaluate PTGENDER_num] previous config new score = 0.9348631109742663. old score = -9999999
[2022-11-10T17:28:29.999013+0200][913463][INFO]      >>> Column PTGENDER_num <-- score 0.957404731082131 <-- Model xgboost({'reg_lambda': 5.047146750787419, 'reg_alpha': 6.76511248668679, 'colsample_bytree': 0.8104392038078263, 'colsample_bynode': 0.8362908590082906, 'colsample_bylevel': 0.8415650566586329, 'subsample': 0.899881421386532, 'lr': 0.001, 'max_depth': 5, 'n_estimators': 282, 'min_child_weight': 6, 'max_bin': 384, 'grow_policy': 1})
[2022-11-10T17:28:31.115159+0200][913463][INFO] [Evaluate MidTemp] previous config new score = 0.7295591386742214. old

[2022-11-10T17:47:04.996396+0200][913463][INFO]   > Imputation iter 10
[2022-11-10T17:47:13.824962+0200][913463][INFO]   > Imputation iter 11
[2022-11-10T17:47:22.652423+0200][913463][INFO]   > Imputation iter 12
[2022-11-10T17:47:31.461587+0200][913463][INFO]   > Imputation iter 13
[2022-11-10T17:47:40.251441+0200][913463][INFO]   > Imputation iter 14
[2022-11-10T17:47:49.042505+0200][913463][INFO]   > Imputation iter 15
[2022-11-10T17:47:57.858892+0200][913463][INFO]   > Imputation iter 16
[2022-11-10T17:48:06.652331+0200][913463][INFO]   > Imputation iter 17
[2022-11-10T17:48:15.463622+0200][913463][INFO]   > Imputation iter 18
[2022-11-10T17:48:24.244579+0200][913463][INFO]   > Imputation iter 19
[2022-11-10T17:48:33.040039+0200][913463][INFO]   > Imputation iter 20
[2022-11-10T17:48:41.830568+0200][913463][INFO]   > Imputation iter 21
[2022-11-10T17:48:50.612411+0200][913463][INFO]   > Imputation iter 22
[2022-11-10T17:48:59.405549+0200][913463][INFO]   > Imputation iter 23
[2022-

Evaluate longitudinals 609826745966306637 5996


[2022-11-10T17:50:13.157860+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T17:50:13.184559+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T17:50:13.185186+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T17:50:13.525016+0200][913463][INFO] [Evaluate PTGENDER_num] previous config new score = 0.9342197304922395. old score = -9999999
[2022-11-10T17:50:41.226412+0200][913463][INFO]      >>> Column PTGENDER_num <-- score 0.9572522377177861 <-- Model xgboost({'reg_lambda': 5.047146750787419, 'reg_alpha': 6.76511248668679, 'colsample_bytree': 0.8104392038078263, 'colsample_bynode': 0.8362908590082906, 'colsample_bylevel': 0.8415650566586329, 'subsample': 0.899881421386532, 'lr': 0.001, 'max_depth': 5, 'n_estimators': 282, 'min_child_weight': 6, 'max_bin': 384, 'grow_policy': 1})
[2022-11-10T17:50:42.342666+0200][913463][INFO] [Evaluate MidTemp] previous config new score = 0.7299807522091242. ol

[2022-11-10T18:10:20.334022+0200][913463][INFO]   > Imputation iter 1
[2022-11-10T18:10:30.383396+0200][913463][INFO]   > Imputation iter 2
[2022-11-10T18:10:40.477968+0200][913463][INFO]   > Imputation iter 3
[2022-11-10T18:10:50.550402+0200][913463][INFO]   > Imputation iter 4
[2022-11-10T18:11:00.621206+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T18:11:10.662693+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T18:11:20.715253+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T18:11:30.779800+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T18:11:40.846504+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T18:11:50.895679+0200][913463][INFO]   > Imputation iter 10
[2022-11-10T18:12:00.983129+0200][913463][INFO]   > Imputation iter 11
[2022-11-10T18:12:11.057928+0200][913463][INFO]   > Imputation iter 12
[2022-11-10T18:12:21.119481+0200][913463][INFO]   > Imputation iter 13
[2022-11-10T18:12:31.182912+0200][913463][INFO]   > Imputation iter 14
[2022-11-10T18:

Evaluate longitudinals 609826745966306637 5987


[2022-11-10T18:14:12.576518+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T18:14:12.603235+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T18:14:12.603869+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T18:14:12.942044+0200][913463][INFO] [Evaluate PTGENDER_num] previous config new score = 0.9333137679143315. old score = -9999999
[2022-11-10T18:14:31.885302+0200][913463][INFO]      >>> Column PTGENDER_num <-- score 0.9567457368092847 <-- Model xgboost({'reg_lambda': 5.047146750787419, 'reg_alpha': 6.76511248668679, 'colsample_bytree': 0.8104392038078263, 'colsample_bynode': 0.8362908590082906, 'colsample_bylevel': 0.8415650566586329, 'subsample': 0.899881421386532, 'lr': 0.001, 'max_depth': 5, 'n_estimators': 282, 'min_child_weight': 6, 'max_bin': 384, 'grow_policy': 1})
[2022-11-10T18:14:32.998891+0200][913463][INFO] [Evaluate MidTemp] previous config new score = 0.7308475528158325. ol

[2022-11-10T18:33:22.906891+0200][913463][INFO]   > Imputation iter 1
[2022-11-10T18:33:33.356536+0200][913463][INFO]   > Imputation iter 2
[2022-11-10T18:33:43.802236+0200][913463][INFO]   > Imputation iter 3
[2022-11-10T18:33:54.271984+0200][913463][INFO]   > Imputation iter 4
[2022-11-10T18:34:04.754122+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T18:34:15.210619+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T18:34:25.668929+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T18:34:36.097714+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T18:34:46.543592+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T18:34:56.986319+0200][913463][INFO]   > Imputation iter 10
[2022-11-10T18:35:07.421671+0200][913463][INFO]   > Imputation iter 11
[2022-11-10T18:35:17.897372+0200][913463][INFO]   > Imputation iter 12
[2022-11-10T18:35:28.341534+0200][913463][INFO]   > Imputation iter 13
[2022-11-10T18:35:38.792574+0200][913463][INFO]   > Imputation iter 14
[2022-11-10T18:

Evaluate static imputation 609826745966306637 5987 missmin


[2022-11-10T18:38:37.519543+0200][913463][INFO] [Evaluate PTGENDER_num] previous config new score = 0.9333137679143315. old score = -9999999
[2022-11-10T18:39:09.082924+0200][913463][INFO]      >>> Column PTGENDER_num <-- score 0.9567457368092847 <-- Model xgboost({'reg_lambda': 5.047146750787419, 'reg_alpha': 6.76511248668679, 'colsample_bytree': 0.8104392038078263, 'colsample_bynode': 0.8362908590082906, 'colsample_bylevel': 0.8415650566586329, 'subsample': 0.899881421386532, 'lr': 0.001, 'max_depth': 5, 'n_estimators': 282, 'min_child_weight': 6, 'max_bin': 384, 'grow_policy': 1})
[2022-11-10T18:39:10.205567+0200][913463][INFO] [Evaluate MidTemp] previous config new score = 0.7308475528158325. old score = -9999999
[2022-11-10T18:40:21.928403+0200][913463][INFO]      >>> Column MidTemp <-- score 0.8219709601177894 <-- Model xgboost_regressor({'reg_lambda': 7.245218715691966, 'reg_alpha': 0.04216785478729102, 'lr': 0.01, 'colsample_bytree': 0.8067296804801838, 'colsample_bynode': 0.63

[2022-11-10T18:58:27.761218+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T18:58:37.636645+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T18:58:47.543095+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T18:58:57.414905+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T18:59:07.310587+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T18:59:17.216986+0200][913463][INFO]   > Imputation iter 10
[2022-11-10T18:59:27.116374+0200][913463][INFO]   > Imputation iter 11
[2022-11-10T18:59:36.993404+0200][913463][INFO]   > Imputation iter 12
[2022-11-10T18:59:46.883775+0200][913463][INFO]   > Imputation iter 13
[2022-11-10T18:59:56.801182+0200][913463][INFO]   > Imputation iter 14
[2022-11-10T19:00:06.716485+0200][913463][INFO]   > Imputation iter 15
[2022-11-10T19:00:16.616775+0200][913463][INFO]   > Imputation iter 16
[2022-11-10T19:00:26.511014+0200][913463][INFO]   > Imputation iter 17
[2022-11-10T19:00:36.394426+0200][913463][INFO]   > Imputation iter 18
[2022-11-10

Evaluate constants take 2 609826745966306637 3556
Evaluate longitudinals take 2 609826745966306637 2818


[2022-11-10T19:03:58.320741+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T19:03:58.347453+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T19:03:58.348061+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T19:03:58.356311+0200][913463][INFO]   > Imputation iter 1
[2022-11-10T19:03:58.363711+0200][913463][INFO]   > Imputation iter 2
[2022-11-10T19:03:58.371218+0200][913463][INFO]   > Imputation iter 3
[2022-11-10T19:03:58.378628+0200][913463][INFO]   > Imputation iter 4
[2022-11-10T19:03:58.386096+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T19:03:58.393658+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T19:03:58.401142+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T19:03:58.408796+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T19:03:58.416474+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T19:03:58.424088+0200][913463][INFO]   > Imputation iter 10
[2022-1

Evaluate longitudinals take 2 609826745966306637 1948


[2022-11-10T19:04:26.176589+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T19:04:26.209857+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T19:04:26.210489+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T19:04:26.219223+0200][913463][INFO]   > Imputation iter 1
[2022-11-10T19:04:26.227187+0200][913463][INFO]   > Imputation iter 2
[2022-11-10T19:04:26.234962+0200][913463][INFO]   > Imputation iter 3
[2022-11-10T19:04:26.242731+0200][913463][INFO]   > Imputation iter 4
[2022-11-10T19:04:26.250513+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T19:04:26.258260+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T19:04:26.265998+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T19:04:26.273699+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T19:04:26.281391+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T19:04:26.288974+0200][913463][INFO]   > Imputation iter 10
[2022-1

Evaluate longitudinals take 2 609826745966306637 1232


[2022-11-10T19:04:51.869305+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T19:04:51.896004+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T19:04:51.896540+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T19:04:51.904911+0200][913463][INFO]   > Imputation iter 1
[2022-11-10T19:04:51.912321+0200][913463][INFO]   > Imputation iter 2
[2022-11-10T19:04:51.919872+0200][913463][INFO]   > Imputation iter 3
[2022-11-10T19:04:51.927506+0200][913463][INFO]   > Imputation iter 4
[2022-11-10T19:04:51.934993+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T19:04:51.942560+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T19:04:51.950125+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T19:04:51.957821+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T19:04:51.965501+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T19:04:51.973201+0200][913463][INFO]   > Imputation iter 10
[2022-1

Evaluate longitudinals take 2 609826745966306637 746


[2022-11-10T19:05:14.984794+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T19:05:15.011708+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T19:05:15.012338+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T19:05:15.020773+0200][913463][INFO]   > Imputation iter 1
[2022-11-10T19:05:15.028142+0200][913463][INFO]   > Imputation iter 2
[2022-11-10T19:05:15.035520+0200][913463][INFO]   > Imputation iter 3
[2022-11-10T19:05:15.043209+0200][913463][INFO]   > Imputation iter 4
[2022-11-10T19:05:15.050638+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T19:05:15.058187+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T19:05:15.065792+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T19:05:15.073473+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T19:05:15.081163+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T19:05:15.088851+0200][913463][INFO]   > Imputation iter 10
[2022-1

Evaluate longitudinals take 2 609826745966306637 424


[2022-11-10T19:05:36.351847+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T19:05:36.378681+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T19:05:36.379311+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T19:05:36.387815+0200][913463][INFO]   > Imputation iter 1
[2022-11-10T19:05:36.395410+0200][913463][INFO]   > Imputation iter 2
[2022-11-10T19:05:36.402834+0200][913463][INFO]   > Imputation iter 3
[2022-11-10T19:05:36.410555+0200][913463][INFO]   > Imputation iter 4
[2022-11-10T19:05:36.418022+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T19:05:36.425643+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T19:05:36.433216+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T19:05:36.440823+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T19:05:36.448450+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T19:05:36.456056+0200][913463][INFO]   > Imputation iter 10
[2022-1

Evaluate longitudinals take 2 609826745966306637 257


[2022-11-10T19:05:56.091915+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T19:05:56.118637+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T19:05:56.119238+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T19:05:56.127673+0200][913463][INFO]   > Imputation iter 1
[2022-11-10T19:05:56.135296+0200][913463][INFO]   > Imputation iter 2
[2022-11-10T19:05:56.142731+0200][913463][INFO]   > Imputation iter 3
[2022-11-10T19:05:56.150318+0200][913463][INFO]   > Imputation iter 4
[2022-11-10T19:05:56.157981+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T19:05:56.165616+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T19:05:56.173136+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T19:05:56.180679+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T19:05:56.188197+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T19:05:56.195705+0200][913463][INFO]   > Imputation iter 10
[2022-1

Evaluate longitudinals take 2 609826745966306637 154


[2022-11-10T19:06:15.188460+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T19:06:15.215309+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T19:06:15.215936+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T19:06:15.224833+0200][913463][INFO]   > Imputation iter 1
[2022-11-10T19:06:15.232199+0200][913463][INFO]   > Imputation iter 2
[2022-11-10T19:06:15.239558+0200][913463][INFO]   > Imputation iter 3
[2022-11-10T19:06:15.246995+0200][913463][INFO]   > Imputation iter 4
[2022-11-10T19:06:15.254436+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T19:06:15.261953+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T19:06:15.269446+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T19:06:15.277055+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T19:06:15.284643+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T19:06:15.292289+0200][913463][INFO]   > Imputation iter 10
[2022-1

Evaluate longitudinals take 2 609826745966306637 85


[2022-11-10T19:06:33.905334+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T19:06:33.931989+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T19:06:33.932599+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T19:06:33.941391+0200][913463][INFO]   > Imputation iter 1
[2022-11-10T19:06:33.948930+0200][913463][INFO]   > Imputation iter 2
[2022-11-10T19:06:33.956188+0200][913463][INFO]   > Imputation iter 3
[2022-11-10T19:06:33.963773+0200][913463][INFO]   > Imputation iter 4
[2022-11-10T19:06:33.971239+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T19:06:33.978715+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T19:06:33.986217+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T19:06:33.993736+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T19:06:34.001290+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T19:06:34.008887+0200][913463][INFO]   > Imputation iter 10
[2022-1

Evaluate longitudinals take 2 609826745966306637 42


[2022-11-10T19:06:52.306275+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T19:06:52.332945+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T19:06:52.333543+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T19:06:52.341979+0200][913463][INFO]   > Imputation iter 1
[2022-11-10T19:06:52.349341+0200][913463][INFO]   > Imputation iter 2
[2022-11-10T19:06:52.356916+0200][913463][INFO]   > Imputation iter 3
[2022-11-10T19:06:52.364490+0200][913463][INFO]   > Imputation iter 4
[2022-11-10T19:06:52.372145+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T19:06:52.379778+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T19:06:52.387361+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T19:06:52.394994+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T19:06:52.402558+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T19:06:52.410101+0200][913463][INFO]   > Imputation iter 10
[2022-1

Evaluate longitudinals take 2 609826745966306637 18


[2022-11-10T19:07:10.505116+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T19:07:10.533228+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T19:07:10.533826+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T19:07:10.541809+0200][913463][INFO]   > Imputation iter 1
[2022-11-10T19:07:10.549212+0200][913463][INFO]   > Imputation iter 2
[2022-11-10T19:07:10.556494+0200][913463][INFO]   > Imputation iter 3
[2022-11-10T19:07:10.563990+0200][913463][INFO]   > Imputation iter 4
[2022-11-10T19:07:10.571380+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T19:07:10.578900+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T19:07:10.586399+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T19:07:10.593938+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T19:07:10.601447+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T19:07:10.608957+0200][913463][INFO]   > Imputation iter 10
[2022-1

Evaluate longitudinals take 2 609826745966306637 8


[2022-11-10T19:07:28.596518+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T19:07:28.623294+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T19:07:28.623916+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T19:07:28.631864+0200][913463][INFO]   > Imputation iter 1
[2022-11-10T19:07:28.639111+0200][913463][INFO]   > Imputation iter 2
[2022-11-10T19:07:28.646473+0200][913463][INFO]   > Imputation iter 3
[2022-11-10T19:07:28.653932+0200][913463][INFO]   > Imputation iter 4
[2022-11-10T19:07:28.661341+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T19:07:28.668904+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T19:07:28.676323+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T19:07:28.683869+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T19:07:28.691360+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T19:07:28.698816+0200][913463][INFO]   > Imputation iter 10
[2022-1

Evaluate longitudinals take 2 609826745966306637 4


[2022-11-10T19:07:46.621389+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T19:07:46.648121+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T19:07:46.648808+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T19:07:46.657263+0200][913463][INFO]   > Imputation iter 1
[2022-11-10T19:07:46.664643+0200][913463][INFO]   > Imputation iter 2
[2022-11-10T19:07:46.672001+0200][913463][INFO]   > Imputation iter 3
[2022-11-10T19:07:46.679414+0200][913463][INFO]   > Imputation iter 4
[2022-11-10T19:07:46.686990+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T19:07:46.694545+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T19:07:46.702100+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T19:07:46.709611+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T19:07:46.717098+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T19:07:46.724586+0200][913463][INFO]   > Imputation iter 10
[2022-1

Evaluate longitudinals take 2 609826745966306637 0


[2022-11-10T19:08:04.608003+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T19:08:04.634897+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T19:08:04.635494+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T19:08:04.643760+0200][913463][INFO]   > Imputation iter 1
[2022-11-10T19:08:04.651318+0200][913463][INFO]   > Imputation iter 2
[2022-11-10T19:08:04.658830+0200][913463][INFO]   > Imputation iter 3
[2022-11-10T19:08:04.666331+0200][913463][INFO]   > Imputation iter 4
[2022-11-10T19:08:04.673827+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T19:08:04.681300+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T19:08:04.688740+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T19:08:04.696255+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T19:08:04.703771+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T19:08:04.711364+0200][913463][INFO]   > Imputation iter 10
[2022-1

Normalize data 609826745966306637 0


In [21]:
eval_data

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit
6610,001854e92967164311f3acd5a58be9790f28ab3968bbbc...,0,71.400000,1.0,15.000000,0.0,2.0,0.0,0.961538,0.077671,0.085164,0.638939,0.467467,0.608113,0.424862,0.523781,2.0,36.0
6855,001854e92967164311f3acd5a58be9790f28ab3968bbbc...,36,74.400000,1.0,15.000000,0.0,2.0,0.0,1.000000,0.027397,0.089750,0.605453,0.446726,0.606472,0.439387,0.528077,2.0,36.0
0,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,0,79.100000,0.0,20.000000,1.0,1.0,0.5,0.923077,0.164384,0.071871,0.548646,0.376516,0.464021,0.194906,0.400709,2.0,6.0
1,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,6,79.600000,0.0,20.000000,1.0,1.0,1.5,0.923077,0.237397,0.071956,0.548307,0.366398,0.403880,0.193367,0.397291,2.0,6.0
6630,0059bc7849aea9522b408fa0ddc60276a36cae00206b87...,0,83.130043,0.0,16.966606,1.0,0.0,0.5,0.846154,0.196301,0.261645,0.345711,0.286043,0.312698,0.276821,0.248579,5.0,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4098,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,0,72.100000,0.0,12.000000,1.0,0.0,0.5,0.884615,0.150685,0.416382,0.602438,0.636654,0.610229,0.743037,0.624631,3.0,24.0
4099,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,12,73.100000,0.0,12.000000,1.0,0.0,1.0,0.961538,0.155205,0.398451,0.608521,0.634650,0.617108,0.729087,0.638477,3.0,24.0
4100,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,24,74.100000,0.0,12.000000,1.0,0.0,1.0,0.846154,0.150685,0.439698,0.598002,0.624603,0.624339,0.719605,0.639374,3.0,24.0
5909,ffa86109ba8684f31325842d0ff26568e105f0f63b366a...,0,66.300000,1.0,13.000000,0.0,0.0,0.0,0.923077,0.118767,0.177669,0.603072,0.335950,0.536760,0.308488,0.445137,2.0,24.0


In [22]:
from hyperimpute.plugins.imputers import Imputers

plugin = Imputers().get(
    "hyperimpute",
    optimizer="simple",
    classifier_seed=["catboost"],
    regression_seed=["xgboost_regressor", "catboost_regressor"],
    class_threshold=cat_limit,
)


imputed_X = plugin.fit_transform(eval_data.copy())
imputed_X[scaled_cols] = scaler.inverse_transform(imputed_X[scaled_cols])

imputed_X

[2022-11-10T19:08:13.023179+0200][913463][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: False
[2022-11-10T19:08:13.059721+0200][913463][INFO]   > HyperImpute using inner optimization
[2022-11-10T19:08:13.060289+0200][913463][INFO]   > Imputation iter 0
[2022-11-10T19:08:13.067952+0200][913463][INFO]   > Imputation iter 1
[2022-11-10T19:08:13.074777+0200][913463][INFO]   > Imputation iter 2
[2022-11-10T19:08:13.081451+0200][913463][INFO]   > Imputation iter 3
[2022-11-10T19:08:13.088031+0200][913463][INFO]   > Imputation iter 4
[2022-11-10T19:08:13.095296+0200][913463][INFO]   > Imputation iter 5
[2022-11-10T19:08:13.102259+0200][913463][INFO]   > Imputation iter 6
[2022-11-10T19:08:13.109510+0200][913463][INFO]   > Imputation iter 7
[2022-11-10T19:08:13.116188+0200][913463][INFO]   > Imputation iter 8
[2022-11-10T19:08:13.122952+0200][913463][INFO]   > Imputation iter 9
[2022-11-10T19:08:13.129687+0200][913463][INFO]   > Imputation iter 10
[2022-1

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit
0,001854e92967164311f3acd5a58be9790f28ab3968bbbc...,0.0,71.400000,1.0,15.000000,0.0,2.0,0.0,29.0,5.67,18668.000000,8008.000000,1.052974e+06,4489.000000,17293.000000,21045.000000,2.0,36.0
1,001854e92967164311f3acd5a58be9790f28ab3968bbbc...,36.0,74.400000,1.0,15.000000,0.0,2.0,0.0,30.0,2.00,19369.000000,7711.312651,1.036106e+06,4479.698111,17585.562739,21145.519578,2.0,36.0
2,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,0.0,79.100000,0.0,20.000000,1.0,1.0,0.5,28.0,12.00,16636.000000,7208.000000,9.790100e+05,3672.000000,12661.000000,18165.000000,2.0,6.0
3,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,6.0,79.600000,0.0,20.000000,1.0,1.0,1.5,28.0,17.33,16649.000000,7205.000000,9.707820e+05,3331.000000,12630.000000,18085.000000,2.0,6.0
4,0059bc7849aea9522b408fa0ddc60276a36cae00206b87...,0.0,83.130043,0.0,16.966606,1.0,0.0,0.5,26.0,14.33,45644.575322,5410.000000,9.054360e+05,2814.000000,14311.000000,14605.000000,5.0,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6894,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,0.0,72.100000,0.0,12.000000,1.0,0.0,0.5,27.0,11.00,69297.300000,7684.600000,1.190560e+06,4501.000000,23702.000000,23405.000000,3.0,24.0
6895,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,12.0,73.100000,0.0,12.000000,1.0,0.0,1.0,29.0,11.33,66556.400000,7738.500000,1.188930e+06,4540.000000,23421.000000,23729.000000,3.0,24.0
6896,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,24.0,74.100000,0.0,12.000000,1.0,0.0,1.0,26.0,11.00,72861.400000,7645.300000,1.180760e+06,4581.000000,23230.000000,23750.000000,3.0,24.0
6897,ffa86109ba8684f31325842d0ff26568e105f0f63b366a...,0.0,66.300000,1.0,13.000000,0.0,0.0,0.0,28.0,8.67,32808.200000,7690.220979,9.460212e+05,4084.430357,14948.865953,19204.643123,2.0,24.0


In [23]:
import numpy as np

results = []


def normalize_output(test_data):
    test_data = test_data.copy()

    factor = test_data["CDRSB"] / 0.5
    factor[factor < 0] = 0
    factor = factor.fillna(-1)
    factor = factor.round(0).astype(int)
    factor = factor.replace(-1, np.nan)
    test_data["CDRSB"] = factor * 0.5
    
    test_data["DX_num"] = test_data["DX_num"].round(0)

    test_data["ADAS13"] = ((test_data["ADAS13"] * 3).round(0) / 3).round(2)
    test_data["MMSE"] = test_data["MMSE"].round(0)

    return test_data

def dump_results(imputed_data: pd.DataFrame, fpath: str):
    for name, data in [
        ("test_A", test_A),
        ("test_B", test_B),
    ]:
        for idx, row in data.iterrows():
            for col in row.index:
                local = row.T
                val = local[col]
                if val == val:
                    continue
                imputed_id = f"{local['RID_HASH']}_{local['VISCODE']}_{col}_{name}"
                imputed_val = imputed_data[
                    (imputed_data["RID_HASH"] == local["RID_HASH"])
                    & (imputed_data["VISCODE"] == local["VISCODE"])
                ][col].values[0]
                
                assert imputed_val == imputed_val
                assert imputed_val != ""
                
                results.append([imputed_id, imputed_val])

    output = pd.DataFrame(results, columns=submission.columns)
    output.to_csv(fpath, index=None)

    return output


version = "v9"
changelog = "automl_AB_comb"
output_normalized = dump_results(
    normalize_output(imputed_X),
    results_dir / f"imputation_results_{version}_{changelog}.csv",
)

output_normalized

Unnamed: 0,Id,Predicted
0,988b6137f4352c01e4b52790505caa0c3ec438f117000a...,58.20
1,988b6137f4352c01e4b52790505caa0c3ec438f117000a...,0.00
2,988b6137f4352c01e4b52790505caa0c3ec438f117000a...,2.00
3,988b6137f4352c01e4b52790505caa0c3ec438f117000a...,3.50
4,988b6137f4352c01e4b52790505caa0c3ec438f117000a...,25.00
...,...,...
15021,8b33cc9dd06fc18f130e185fdf1e6d657dbc80add9ff6e...,1.00
15022,8b33cc9dd06fc18f130e185fdf1e6d657dbc80add9ff6e...,0.00
15023,8b33cc9dd06fc18f130e185fdf1e6d657dbc80add9ff6e...,1.00
15024,8b33cc9dd06fc18f130e185fdf1e6d657dbc80add9ff6e...,29.00


In [24]:
pd.set_option("display.expand_frame_repr", True)

output_normalized.tail(5).values

array([['8b33cc9dd06fc18f130e185fdf1e6d657dbc80add9ff6ea81ced3e328cea9e63_0_DX_num_test_B',
        1.0],
       ['8b33cc9dd06fc18f130e185fdf1e6d657dbc80add9ff6ea81ced3e328cea9e63_0_APOE4_test_B',
        0.0],
       ['8b33cc9dd06fc18f130e185fdf1e6d657dbc80add9ff6ea81ced3e328cea9e63_0_CDRSB_test_B',
        1.0],
       ['8b33cc9dd06fc18f130e185fdf1e6d657dbc80add9ff6ea81ced3e328cea9e63_0_MMSE_test_B',
        29.0],
       ['8b33cc9dd06fc18f130e185fdf1e6d657dbc80add9ff6ea81ced3e328cea9e63_0_ADAS13_test_B',
        18.33]], dtype=object)