In [1]:
import warnings
from pathlib import Path

import pandas as pd
import numpy as np
from hyperimpute.utils.serialization import (load_model_from_file,
                                             save_model_to_file)
from sklearn.preprocessing import MinMaxScaler

# Scratch directory for cached artefacts (the fitted longitudinal imputers are
# stored here further down).
workspace = Path("workspace")
workspace.mkdir(parents=True, exist_ok=True)

# NOTE: this silences *every* warning for the rest of the session, including
# pandas chained-assignment and deprecation warnings.
warnings.filterwarnings("ignore")

# Distinct-value threshold used below to choose between classifiers and
# regressors; also passed to hyperimpute as `class_threshold`.
cat_limit = 10
# Only used to build `changelog` in the visible part of the notebook.
n_seeds = 3

# Run identification strings and input/output locations.
version = "take3_v5"
changelog = f"multiple_imputation{n_seeds}_with_augmentation_catlimit{cat_limit}"
results_dir = Path("results")
data_dir = Path("data")

In [2]:
def augment_base_dataset(df):
    """Sort visits per patient and add per-patient visit statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``RID_HASH`` (patient id) and ``VISCODE``
        (visit code; numeric in the data shown in this notebook).

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified), sorted by
        ``["RID_HASH", "VISCODE"]``, with two float columns added:

        * ``total_visits`` - number of rows of the patient,
        * ``last_visit``   - largest ``VISCODE`` of the patient.

    Notes
    -----
    The statistics are computed with a single groupby/transform pass instead of
    re-filtering the whole frame several times per patient. The previously
    computed but unused per-patient values (previous/next visit, average wait)
    were dropped; they were never written to the frame.
    """
    df = df.sort_values(["RID_HASH", "VISCODE"])
    visits_by_patient = df.groupby("RID_HASH")["VISCODE"]

    # Cast to float to keep the dtype the columns had before (e.g. 2.0, 60.0).
    return df.assign(
        total_visits=visits_by_patient.transform("size").astype(float),
        last_visit=visits_by_patient.transform("max").astype(float),
    )

In [3]:
# Reference development set; the scaler and the longitudinal imputers below
# are fitted on it.
dev_set = pd.read_csv(data_dir / "dev_set.csv")
dev_set = augment_base_dataset(dev_set)

# Continuous measurements that are min-max scaled.
scaled_cols = [
    "MMSE",
    "ADAS13",
    "Ventricles",
    "Hippocampus",
    "WholeBrain",
    "Entorhinal",
    "Fusiform",
    "MidTemp",
]

# The scaler is fitted on dev_set only and re-used unchanged for dev_1, dev_2,
# test_A and test_B in the following cells.
scaler = MinMaxScaler().fit(dev_set[scaled_cols])
dev_set[scaled_cols] = scaler.transform(dev_set[scaled_cols])

dev_set

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit
2163,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,0,79.1,0,20,1.0,1.0,0.5,0.923077,0.164384,0.071871,0.548646,0.376516,0.464021,0.194906,0.400709,2.0,6.0
154,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,6,79.6,0,20,1.0,1.0,1.5,0.923077,0.237397,0.071956,0.548307,0.366398,0.403880,0.193367,0.397291,2.0,6.0
1385,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,0,72.9,1,12,1.0,1.0,1.0,1.000000,0.123288,0.142655,0.525169,0.235599,0.513404,0.356253,0.294774,6.0,60.0
2698,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,6,73.4,1,12,1.0,1.0,1.0,1.000000,0.164384,0.144729,0.549210,0.230361,0.435097,0.322395,0.294175,6.0,60.0
2291,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,12,73.9,1,12,1.0,1.0,1.0,0.961538,0.109589,0.155550,0.527878,0.215944,0.487831,0.342600,0.277552,6.0,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,60,79.8,1,19,1.0,0.0,3.0,0.923077,0.223699,0.170895,0.357020,0.321346,0.310935,0.399047,0.461476,7.0,102.0
2646,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,102,83.3,1,19,1.0,0.0,3.0,0.846154,0.168904,0.178231,0.352043,0.309095,0.256790,0.372685,0.416478,7.0,102.0
1962,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,0,72.1,0,12,1.0,0.0,0.5,0.884615,0.150685,0.416382,0.602438,0.636654,0.610229,0.743037,0.624631,3.0,24.0
122,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,12,73.1,0,12,1.0,0.0,1.0,0.961538,0.155205,0.398451,0.608521,0.634650,0.617108,0.729087,0.638477,3.0,24.0


In [4]:
# dev_1 contains missing values; in the preview its rows share RID_HASH/VISCODE
# with dev_set (presumably a masked copy of it - TODO confirm).
dev_1 = pd.read_csv(data_dir / "dev_1.csv")
dev_1 = augment_base_dataset(dev_1)

# Re-use the scaler fitted on dev_set (no re-fit).
dev_1[scaled_cols] = scaler.transform(dev_1[scaled_cols])

# Disabled: augment_base_dataset does not produce an "avg_wait" column.
#assert (dev_1["avg_wait"] >= 0).all()

dev_1

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit
2163,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,0,,0.0,20.0,1.0,1.0,0.5,0.923077,0.164384,,,0.376516,,,,2.0,6.0
154,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,6,79.6,0.0,20.0,1.0,1.0,1.5,0.923077,0.237397,0.071956,0.548307,0.366398,0.403880,0.193367,0.397291,2.0,6.0
1385,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,0,,1.0,12.0,,1.0,,,,,0.525169,0.235599,0.513404,0.356253,0.294774,6.0,60.0
2698,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,6,,1.0,12.0,,1.0,,,,,0.549210,0.230361,0.435097,0.322395,0.294175,6.0,60.0
2291,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,12,,1.0,12.0,,1.0,,,,,0.527878,0.215944,0.487831,0.342600,0.277552,6.0,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,60,79.8,1.0,19.0,,0.0,,,,0.170895,,0.321346,,,,7.0,102.0
2646,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,102,83.3,1.0,19.0,,0.0,,,,0.178231,,0.309095,,,,7.0,102.0
1962,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,0,72.1,,12.0,1.0,0.0,0.5,0.884615,0.150685,0.416382,0.602438,,0.610229,0.743037,0.624631,3.0,24.0
122,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,12,73.1,,12.0,1.0,0.0,1.0,0.961538,0.155205,0.398451,0.608521,,0.617108,0.729087,0.638477,3.0,24.0


In [5]:
# dev_2 contains missing values; in the preview its rows share RID_HASH/VISCODE
# with dev_set (presumably a second masked copy of it - TODO confirm).
dev_2 = pd.read_csv(data_dir / "dev_2.csv")
dev_2 = augment_base_dataset(dev_2)

# Re-use the scaler fitted on dev_set (no re-fit).
dev_2[scaled_cols] = scaler.transform(dev_2[scaled_cols])

# Disabled: augment_base_dataset does not produce an "avg_wait" column.
#assert (dev_2["avg_wait"] >= 0).all()

dev_2

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit
2163,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,0,79.1,0.0,20.0,1.0,1.0,0.5,0.923077,0.164384,0.071871,0.548646,0.376516,0.464021,0.194906,0.400709,2.0,6.0
154,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,6,79.6,,,,1.0,,,,0.071956,0.548307,,0.403880,0.193367,0.397291,2.0,6.0
1385,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,0,72.9,,12.0,1.0,1.0,1.0,1.000000,0.123288,0.142655,0.525169,,0.513404,0.356253,0.294774,6.0,60.0
2698,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,6,,,12.0,1.0,1.0,1.0,1.000000,0.164384,,0.549210,,0.435097,0.322395,0.294175,6.0,60.0
2291,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,12,,,12.0,1.0,1.0,1.0,0.961538,0.109589,,,,,,,6.0,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,60,,,19.0,1.0,0.0,3.0,0.923077,0.223699,,0.357020,,0.310935,0.399047,0.461476,7.0,102.0
2646,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,102,,,19.0,1.0,0.0,3.0,0.846154,0.168904,,0.352043,,0.256790,0.372685,0.416478,7.0,102.0
1962,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,0,72.1,,12.0,,0.0,,,,0.416382,0.602438,,0.610229,0.743037,0.624631,3.0,24.0
122,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,12,,,12.0,,0.0,,,,,,,,,,3.0,24.0


In [6]:
# Sample submission file. Judging from the row shown below, ids look like
# "<RID_HASH>_<VISCODE>_<column>_<test set>" and values are on the original
# (unscaled) measurement scale - NOTE(review): confirm, since all frames in
# this notebook are min-max scaled.
submission = pd.read_csv(data_dir / "sample_submission.csv")

# Peek at one (id, value) row.
submission.values[1]

array(['6b6a7136f42a8dbd469a201b88e2abb54a93667822761357db2f6d620da6af8a_0_Ventricles_test_A',
       40613.0818580834], dtype=object)

In [7]:
# Test set A: sorted/augmented like dev_set and scaled with the scaler that
# was fitted on dev_set.
test_A = pd.read_csv(data_dir / "test_A.csv")
test_A = augment_base_dataset(test_A)
test_A[scaled_cols] = scaler.transform(test_A[scaled_cols])

# Disabled: augment_base_dataset does not produce an "avg_wait" column.
#assert (test_A["avg_wait"] >= 0).all()

test_A

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit
247,00d5e0050fbd3b6b610f6673347232eb0862df77b5b7a8...,0,,,16.0,1.0,0.0,0.5,0.961538,0.219178,,,,,,,1.0,0.0
819,013c6f92763546c7ad9c0831f023886c15f05e7332aa0c...,0,72.5,1.0,12.0,,1.0,,,,0.057498,0.612302,0.423268,0.291182,0.433004,0.329131,3.0,12.0
276,013c6f92763546c7ad9c0831f023886c15f05e7332aa0c...,6,73.0,1.0,12.0,,1.0,,,,0.067972,,0.399942,,,,3.0,12.0
350,013c6f92763546c7ad9c0831f023886c15f05e7332aa0c...,12,73.5,1.0,12.0,1.0,1.0,2.0,0.769231,0.365342,0.077516,,0.415324,,,,3.0,12.0
1268,024efbff9265302acd00190e57ee08ba1fe1b90f561f79...,0,,0.0,14.0,1.0,1.0,2.0,1.000000,0.164384,,,0.515223,,,,7.0,102.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841,ff2966461950ba81280a0189ed2d504a8bd503d9f6b078...,0,,,18.0,1.0,1.0,1.5,0.807692,0.150685,,,,,,,4.0,48.0
330,ff2966461950ba81280a0189ed2d504a8bd503d9f6b078...,6,,,18.0,1.0,1.0,1.5,0.769231,0.095890,,,,,,,4.0,48.0
939,ff2966461950ba81280a0189ed2d504a8bd503d9f6b078...,24,,,18.0,1.0,1.0,1.5,0.769231,0.150685,,,,,,,4.0,48.0
119,ff2966461950ba81280a0189ed2d504a8bd503d9f6b078...,48,70.9,,18.0,1.0,1.0,2.5,0.807692,0.246575,0.307697,0.420993,,0.392416,0.577719,0.403872,4.0,48.0


In [8]:
# Re-display of test_A (same frame as shown by the previous cell).
test_A

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit
247,00d5e0050fbd3b6b610f6673347232eb0862df77b5b7a8...,0,,,16.0,1.0,0.0,0.5,0.961538,0.219178,,,,,,,1.0,0.0
819,013c6f92763546c7ad9c0831f023886c15f05e7332aa0c...,0,72.5,1.0,12.0,,1.0,,,,0.057498,0.612302,0.423268,0.291182,0.433004,0.329131,3.0,12.0
276,013c6f92763546c7ad9c0831f023886c15f05e7332aa0c...,6,73.0,1.0,12.0,,1.0,,,,0.067972,,0.399942,,,,3.0,12.0
350,013c6f92763546c7ad9c0831f023886c15f05e7332aa0c...,12,73.5,1.0,12.0,1.0,1.0,2.0,0.769231,0.365342,0.077516,,0.415324,,,,3.0,12.0
1268,024efbff9265302acd00190e57ee08ba1fe1b90f561f79...,0,,0.0,14.0,1.0,1.0,2.0,1.000000,0.164384,,,0.515223,,,,7.0,102.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841,ff2966461950ba81280a0189ed2d504a8bd503d9f6b078...,0,,,18.0,1.0,1.0,1.5,0.807692,0.150685,,,,,,,4.0,48.0
330,ff2966461950ba81280a0189ed2d504a8bd503d9f6b078...,6,,,18.0,1.0,1.0,1.5,0.769231,0.095890,,,,,,,4.0,48.0
939,ff2966461950ba81280a0189ed2d504a8bd503d9f6b078...,24,,,18.0,1.0,1.0,1.5,0.769231,0.150685,,,,,,,4.0,48.0
119,ff2966461950ba81280a0189ed2d504a8bd503d9f6b078...,48,70.9,,18.0,1.0,1.0,2.5,0.807692,0.246575,0.307697,0.420993,,0.392416,0.577719,0.403872,4.0,48.0


In [9]:
# Full, untruncated patient id of the first row of test_A.
test_A["RID_HASH"].values[0]



'00d5e0050fbd3b6b610f6673347232eb0862df77b5b7a8f667526b0e4520129b'

In [10]:
# Test set B: sorted/augmented like dev_set and scaled with the scaler that
# was fitted on dev_set.
test_B = pd.read_csv(data_dir / "test_B.csv")
test_B = augment_base_dataset(test_B)
test_B[scaled_cols] = scaler.transform(test_B[scaled_cols])
# Disabled: augment_base_dataset does not produce an "avg_wait" column.
#assert (test_B["avg_wait"] >= 0).all()

test_B

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit
1181,001854e92967164311f3acd5a58be9790f28ab3968bbbc...,0,71.4,,15.0,0.0,2.0,0.0,0.961538,0.077671,0.085164,0.638939,,0.608113,0.424862,0.523781,2.0,36.0
1426,001854e92967164311f3acd5a58be9790f28ab3968bbbc...,36,74.4,,15.0,0.0,2.0,0.0,1.000000,0.027397,0.089750,,,,,,2.0,36.0
1201,0059bc7849aea9522b408fa0ddc60276a36cae00206b87...,0,,0.0,,1.0,0.0,0.5,0.846154,0.196301,,0.345711,0.286043,0.312698,0.276821,0.248579,5.0,24.0
757,0059bc7849aea9522b408fa0ddc60276a36cae00206b87...,6,,0.0,,1.0,0.0,1.0,1.000000,0.283151,,0.345147,0.278219,0.378307,0.289480,0.253793,5.0,24.0
763,0059bc7849aea9522b408fa0ddc60276a36cae00206b87...,12,,0.0,,1.0,0.0,2.5,0.807692,0.168904,,0.329233,0.253372,0.352028,0.259842,0.222042,5.0,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1423,ff4eb5a64e2b89861d5dea81190669893070b227f3a335...,0,,,18.0,1.0,1.0,1.5,0.884615,0.114110,,0.502370,,0.394356,0.397160,0.531003,3.0,84.0
558,ff4eb5a64e2b89861d5dea81190669893070b227f3a335...,12,,,18.0,1.0,1.0,1.5,0.923077,0.242055,,0.519639,,0.294356,0.416522,0.545575,3.0,84.0
70,ff4eb5a64e2b89861d5dea81190669893070b227f3a335...,84,,0.0,18.0,1.0,1.0,1.5,1.000000,0.178082,,0.432054,0.483387,0.363316,0.468451,0.508440,3.0,84.0
480,ffa86109ba8684f31325842d0ff26568e105f0f63b366a...,0,66.3,,13.0,0.0,0.0,0.0,0.923077,0.118767,0.177669,,,,,,2.0,24.0


In [11]:
# Number of missing values per column of test_A.
test_A.isna().sum()

RID_HASH          0
VISCODE           0
AGE             612
PTGENDER_num    626
PTEDUCAT         65
DX_num          428
APOE4            49
CDRSB           428
MMSE            428
ADAS13          428
Ventricles      612
Hippocampus     668
WholeBrain      626
Entorhinal      668
Fusiform        668
MidTemp         668
total_visits      0
last_visit        0
dtype: int64

In [12]:
# Column names of test_A (including the two augmentation columns).
test_A.columns

Index(['RID_HASH', 'VISCODE', 'AGE', 'PTGENDER_num', 'PTEDUCAT', 'DX_num',
       'APOE4', 'CDRSB', 'MMSE', 'ADAS13', 'Ventricles', 'Hippocampus',
       'WholeBrain', 'Entorhinal', 'Fusiform', 'MidTemp', 'total_visits',
       'last_visit'],
      dtype='object')

In [13]:
import numpy as np
from hyperimpute.plugins.prediction import Classifiers, Regression
from hyperimpute.utils.tester import evaluate_estimator, evaluate_regression
from sklearn.preprocessing import LabelEncoder

# Feature columns of the temporal models: every dev_set column except the
# patient id. The functions below read this as a notebook-level global.
train_cols = list(dev_set.drop(columns=["RID_HASH"]).columns)

# Columns for which longitudinal imputers are trained (see the cell that
# builds `longitudinal_imputers`).
eval_cols = [
    "DX_num",
    "CDRSB",
    "MMSE",
    "ADAS13",
    "Ventricles",
    "Hippocampus",
    "WholeBrain",
    "Entorhinal",
    "Fusiform",
    "MidTemp",
]


def prepare_temporal_data(data, target_col: str, direction: str):
    """Build (features, labels) pairs that link each visit to its neighbour.

    For every patient the visits are walked in ``VISCODE`` order
    (``direction == "forward"``) or in reversed order (any other value). Each
    visit yields one feature row made of

    * the visit's own ``train_cols`` values without ``target_col``, and
    * ``prev_<col>`` columns holding the ``train_cols`` values of the visit
      walked just before it (zeros for the first visit of the walk).

    Relies on the notebook-level ``train_cols`` list.

    Returns
    -------
    (pandas.DataFrame, list)
        Float feature frame with a fresh 0..n-1 index, and the list of
        ``target_col`` values in the same row order.
    """
    prev_cols = ["prev_" + col for col in train_cols]
    feature_frames = []
    labels = []

    for _, visits in data.groupby("RID_HASH"):
        visits = visits.sort_values(["RID_HASH", "VISCODE"])
        ordered = visits if direction == "forward" else visits.iloc[::-1]

        # The first visit of every walk has no neighbour: use zeros.
        prev_row = np.zeros(len(prev_cols))
        for _, row in ordered.iterrows():
            current = row[train_cols].copy()

            features = current.to_frame().T.drop(columns=[target_col])
            features[prev_cols] = prev_row

            feature_frames.append(features)
            labels.append(row[target_col])

            # The full row (target included) becomes the next visit's context.
            prev_row = current

    stacked = pd.concat(feature_frames, ignore_index=True).astype(float)
    return stacked, labels


def evaluate_target(data, target_col: str, direction: str):
    """Benchmark and fit candidate models that predict ``target_col``.

    The training set comes from ``prepare_temporal_data``. If the target has
    fewer than ``cat_limit`` distinct values, CatBoost/XGBoost classifiers are
    scored by AUROC on label-encoded targets; otherwise the corresponding
    regressors are scored by R2.

    Each candidate is evaluated exactly once and both the formatted and the raw
    score are read from that single run. Previously every model was evaluated
    twice, which doubled the cost and could pair scores from different runs.

    NOTE(review): ``prepare_longitudinal_imputers`` uses ``> cat_limit`` to
    pick a regressor, so a target with exactly ``cat_limit`` distinct values is
    a regression problem here but a classification problem there.

    Returns
    -------
    dict
        ``{"str": {name: formatted score}, "raw": {name: score},
        "models": {name: model fitted on the full training set}}``.
    """
    train_data, labels = prepare_temporal_data(data, target_col, direction)
    assert target_col not in train_data.columns

    results = {
        "raw": {},
        "str": {},
        "models": {},
    }

    if len(np.unique(labels)) < cat_limit:
        zoo = Classifiers
        candidates = ["catboost", "xgboost"]
        evaluate = evaluate_estimator
        metric = "aucroc"
        # Encode once; the encoding does not depend on the candidate model.
        targets = pd.Series(LabelEncoder().fit_transform(labels))
    else:
        zoo = Regression
        candidates = ["catboost_regressor", "xgboost_regressor"]
        evaluate = evaluate_regression
        metric = "r2"
        targets = labels

    for src_model in candidates:
        scores = evaluate(zoo().get(src_model), train_data, targets)

        results["str"][src_model] = scores["str"][metric]
        results["raw"][src_model] = scores["clf"][metric][0]
        # A fresh instance is fitted on all rows for later use as an imputer.
        results["models"][src_model] = zoo().get(src_model).fit(train_data, targets)

    return results


def prepare_longitudinal_imputers(data, columns):
    """Fit one CatBoost model per (direction, column) on ``data``.

    A target with more than ``cat_limit`` distinct values gets a
    ``catboost_regressor``; any other target gets a ``catboost`` classifier.
    Training pairs come from ``prepare_temporal_data``.

    Returns
    -------
    dict
        ``{"forward": {column: model}, "reverse": {column: model}}``.
    """
    imputers = {"forward": {}, "reverse": {}}

    for direction, fitted_by_column in imputers.items():
        for target_col in columns:
            train_data, labels = prepare_temporal_data(data, target_col, direction)
            n_distinct = len(np.unique(labels))
            print("train", target_col, direction, n_distinct)

            if n_distinct <= cat_limit:
                model = Classifiers().get("catboost")
            else:
                model = Regression().get("catboost_regressor")

            model.fit(train_data, labels)
            fitted_by_column[target_col] = model

    return imputers


def prepare_longitudinal_imputers_v2(data, columns):
    """Pick, per (direction, column), the best model benchmarked by ``evaluate_target``.

    "Best" means the highest raw score; on ties the first candidate wins.

    The running maximum starts at minus infinity. It used to start at ``-1``,
    so a column whose candidates all scored at or below -1 (possible for R2)
    ended up with ``None`` as its imputer.

    Returns
    -------
    dict
        ``{"forward": {column: model}, "reverse": {column: model}}``. A value
        is ``None`` only if no candidate has a comparable (non-NaN) score.
    """
    imputers = {}

    for direction in ["forward", "reverse"]:
        imputers[direction] = {}
        for target_col in columns:
            benchmarks = evaluate_target(data, target_col, direction=direction)

            best_score = float("-inf")
            best_mod = None
            for mod, score in benchmarks["raw"].items():
                # NaN scores never compare greater and are therefore skipped.
                if score > best_score:
                    best_score = score
                    best_mod = benchmarks["models"][mod]

            imputers[direction][target_col] = best_mod

    return imputers

In [14]:
# for col in eval_cols:
#    score = evaluate_target(dev_set, col, direction = "forward")
#    print(col, score)
# DX_num {'catboost': '0.9874 +/- 0.0012', 'xgboost': '0.9867 +/- 0.0006'}
# CDRSB {'catboost_regressor': '0.8143 +/- 0.0132', 'xgboost_regressor': '0.7971 +/- 0.0191'}
# MMSE {'catboost_regressor': '0.7303 +/- 0.014', 'xgboost_regressor': '0.7089 +/- 0.0156'}
# ADAS13 {'catboost_regressor': '0.7975 +/- 0.0018', 'xgboost_regressor': '0.7955 +/- 0.0018'}
# Ventricles {'catboost_regressor': '0.7596 +/- 0.0226', 'xgboost_regressor': '0.764 +/- 0.0218'}
# Hippocampus {'catboost_regressor': '0.8483 +/- 0.0073', 'xgboost_regressor': '0.8571 +/- 0.0058'}
# WholeBrain {'catboost_regressor': '0.8574 +/- 0.0122', 'xgboost_regressor': '0.8691 +/- 0.0155'}
# Entorhinal {'catboost_regressor': '0.6859 +/- 0.0071', 'xgboost_regressor': '0.6648 +/- 0.0059'}
# Fusiform {'catboost_regressor': '0.7779 +/- 0.0174', 'xgboost_regressor': '0.7673 +/- 0.0212'}
# MidTemp {'catboost_regressor': '0.8207 +/- 0.0075', 'xgboost_regressor': '0.8157 +/- 0.0121'}

In [15]:
# for col in eval_cols:
#    score = evaluate_target(dev_set, col, direction = "reverse")
#    print(col, score)

# DX_num {'catboost': '0.9824 +/- 0.0031', 'xgboost': '0.9817 +/- 0.0023'}
# CDRSB {'catboost_regressor': '0.788 +/- 0.007', 'xgboost_regressor': '0.7876 +/- 0.0024'}
# MMSE {'catboost_regressor': '0.7291 +/- 0.0272', 'xgboost_regressor': '0.6992 +/- 0.0299'}
# ADAS13 {'catboost_regressor': '0.8016 +/- 0.0142', 'xgboost_regressor': '0.7944 +/- 0.0149'}
# Ventricles {'catboost_regressor': '0.7043 +/- 0.0233', 'xgboost_regressor': '0.6825 +/- 0.0351'}
# Hippocampus {'catboost_regressor': '0.8511 +/- 0.0087', 'xgboost_regressor': '0.8515 +/- 0.0066'}
# WholeBrain {'catboost_regressor': '0.8595 +/- 0.016', 'xgboost_regressor': '0.8658 +/- 0.0184'}
# Entorhinal {'catboost_regressor': '0.6898 +/- 0.0078', 'xgboost_regressor': '0.6646 +/- 0.0054'}
# Fusiform {'catboost_regressor': '0.7785 +/- 0.0197', 'xgboost_regressor': '0.7802 +/- 0.0145'}
# MidTemp {'catboost_regressor': '0.8199 +/- 0.0094', 'xgboost_regressor': '0.8202 +/- 0.0139'}

In [16]:
# Cache for the fitted longitudinal imputers.
# NOTE(review): the active file name does not encode the feature set. dev_set
# now carries the augmentation columns (total_visits, last_visit), so a cache
# written before augmentation was added would be loaded silently even though
# its models expect different features - confirm, or switch to the
# "_with_augm" name below.
#imputers_bkp_file = workspace / f"longitudinal_imputers_scaled_cat{cat_limit}_with_augm.bkp"
imputers_bkp_file = workspace / f"longitudinal_imputers_scaled_cat{cat_limit}.bkp"

# Load the cached models if present; otherwise train on dev_set and cache.
# NOTE(review): only load backup files you created yourself - model
# deserialisation may execute code (presumably pickle-based; confirm).
if imputers_bkp_file.exists():
    longitudinal_imputers = load_model_from_file(imputers_bkp_file)
else:
    longitudinal_imputers = prepare_longitudinal_imputers(dev_set, eval_cols)
    save_model_to_file(imputers_bkp_file, longitudinal_imputers)

## Preprocess data

In [67]:
from hyperimpute.plugins.imputers import Imputers
from typing import Optional

# Age rule used by prepare_age: a VISCODE difference of 6 * x corresponds to
# an AGE difference of 0.5 * x.

# Columns expected to hold a single value per patient; prepare_consts copies
# the observed value into the patient's missing cells.
const_by_patient = ["PTGENDER_num", "PTEDUCAT", "APOE4"]


def dataframe_hash(df: pd.DataFrame) -> str:
    """Return a content-based identifier for ``df`` as a decimal string.

    The identifier is the absolute value of the sum of the per-row hashes
    produced by ``pd.util.hash_pandas_object``.
    """
    row_hashes = pd.util.hash_pandas_object(df)
    checksum = row_hashes.sum()
    return str(abs(checksum))


def normalize(test_data):
    """Post-processing hook applied to imputation results; currently the identity.

    The same object that was passed in is returned, unchanged.
    """
    # Disabled experiment, kept for reference: snap CDRSB to multiples of 0.5.
    #   factor = test_data["CDRSB"] / 0.5
    #   factor = factor.fillna(-1)
    #   factor = factor.round(0).astype(int)
    #   factor = factor.replace(-1, np.nan)
    #   test_data["CDRSB"] = factor * 0.5
    return test_data


def prepare_consts(train_data, test_data):
    """Propagate per-patient constants into the patient's missing cells.

    For every patient and every column in the notebook-level
    ``const_by_patient`` list, missing values are replaced by the patient's
    first observed value of that column. Patients with no observed value for a
    column are left untouched.

    Parameters
    ----------
    train_data :
        Unused; kept so the signature matches the other preparation steps.
    test_data : pandas.DataFrame
        Needs ``RID_HASH``, ``VISCODE`` and the ``const_by_patient`` columns.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``test_data`` with the constants filled in.

    Raises
    ------
    AssertionError
        If a patient has more than one distinct observed value in a column
        that is supposed to be constant (the message is the column name).

    Notes
    -----
    Values are written with ``.loc`` on the working copy only. The previous
    version also assigned into the groupby slice (chained assignment) and
    copied and sorted ``train_data`` without using it.
    """
    test_data = test_data.copy().sort_values(["RID_HASH", "VISCODE"])

    for rid, local in test_data.groupby("RID_HASH"):
        patient_rows = test_data["RID_HASH"] == rid

        for col in const_by_patient:
            # One distinct value (NaN counts as a value): nothing to fill.
            if local[col].nunique(dropna=False) == 1:
                continue

            val = local[col].dropna().unique()[0]
            assert local[col].fillna(val).nunique(dropna=False) == 1, col

            test_data.loc[patient_rows, col] = test_data.loc[
                patient_rows, col
            ].fillna(val)

    return test_data


def _fill_age_along(test_data, rid, visits, col="AGE"):
    """Walk ``visits`` in the given order and fill the patient's missing ages in place.

    ``visits`` holds the patient's rows as they were before any filling. Once
    an age is known, every later visit of the walk gets
    ``age + (VISCODE difference) / 6 * 0.5``. Only missing cells are written;
    observed ages are never replaced.
    """
    prev_viscode = 0
    prev_age = 0
    for _, row in visits.iterrows():
        current_viscode = row["VISCODE"]

        # `prev_age > 0` is False both for the initial 0 and for NaN.
        if prev_age > 0:
            pred_age = prev_age + (current_viscode - prev_viscode) / 6 * 0.5
        else:
            pred_age = row[col]

        if pred_age == pred_age:  # i.e. not NaN
            local_idx = (test_data["VISCODE"] == current_viscode) & (
                test_data["RID_HASH"] == rid
            )
            test_data.loc[local_idx, col] = test_data.loc[local_idx, col].fillna(
                pred_age
            )

        prev_viscode = current_viscode
        prev_age = pred_age


def prepare_age(train_data, test_data):
    """Fill missing ``AGE`` values from a patient's other visits.

    A ``VISCODE`` difference of 6 corresponds to an age difference of 0.5.
    For each patient with some, but not all, ages missing, the known ages are
    propagated first forwards and then backwards along the visit order.

    Parameters
    ----------
    train_data :
        Unused; kept so the signature matches the other preparation steps.
    test_data : pandas.DataFrame
        Needs the columns ``RID_HASH``, ``VISCODE`` and ``AGE``.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``test_data`` with ages filled where possible.

    Notes
    -----
    The arithmetic is the same as before; the duplicated forward/reverse loops
    now share one helper, and the copy and sort of the unused ``train_data``
    were removed.
    """
    test_data = test_data.copy().sort_values(["RID_HASH", "VISCODE"])

    for rid in test_data["RID_HASH"].unique():
        # Snapshot of the patient's rows; both passes read the original ages.
        local = test_data[test_data["RID_HASH"] == rid]

        n_missing = local["AGE"].isna().sum()
        if n_missing == 0 or n_missing == len(local):
            # Nothing to fill, or no known age to start from.
            continue

        _fill_age_along(test_data, rid, local)             # forward pass
        _fill_age_along(test_data, rid, local.iloc[::-1])  # reverse pass

    return test_data


def impute_longitudinal(
    train_data,
    test_data,
    n_iter=5,
    eval_cols=[
        "DX_num",
        "CDRSB",
        "MMSE",
        "ADAS13",
        "Ventricles",
        "Hippocampus",
        "WholeBrain",
        "Entorhinal",
        "Fusiform",
        "MidTemp",
    ],
    imputed_test_data=None,
    random_state: int = 0,
):
    """Run one pass of neighbour-based longitudinal imputation on ``test_data``.

    A missing cell is predicted only when the same column is observed in the
    patient's directly adjacent visit: the "reverse" model is used when the
    next visit has the value, the "forward" model when the previous visit has
    it. Neighbour values are read from a per-patient snapshot taken before any
    prediction of this call, so values imputed here do not serve as neighbours
    within the same call.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Only forwarded to ``intermediary_imputation``.
    test_data : pandas.DataFrame
        Frame with missing values; a copy is imputed and returned.
    n_iter :
        Not used in this function body.
    eval_cols : list of str
        Columns to impute. The default is a shared list object; it is not
        mutated here.
    imputed_test_data : pandas.DataFrame, optional
        Fully imputed version of ``test_data`` that supplies the model inputs.
        Built with ``intermediary_imputation`` when omitted.
    random_state : int
        Seed forwarded to ``intermediary_imputation``.

    Returns
    -------
    pandas.DataFrame
        ``normalize`` applied to the sorted, partially imputed copy.

    Notes
    -----
    Reads the notebook-level globals ``train_cols`` and
    ``longitudinal_imputers``.
    """
    test_data = test_data.copy()
    train_data = train_data.copy()

    if imputed_test_data is None:
        imputed_test_data = intermediary_imputation(
            train_data, test_data, eval_cols=eval_cols, random_state=random_state
        )

    train_data = train_data.sort_values(["RID_HASH", "VISCODE"])
    test_data = test_data.sort_values(["RID_HASH", "VISCODE"])
    imputed_test_data = imputed_test_data.sort_values(["RID_HASH", "VISCODE"])

    prev_cols = [f"prev_{col}" for col in train_cols]

    for rid in test_data["RID_HASH"].unique():
        # Per-patient snapshots; neither is refreshed after the writes below.
        patient = test_data[test_data["RID_HASH"] == rid]
        patient_imputed = imputed_test_data[imputed_test_data["RID_HASH"] == rid]

        # Original rows framed by an all-zero sentinel row on each side, so
        # index-1 / index+1 lookups are always valid. The sentinels are not NaN,
        # which is why the edge guards (`ridx > 0`, `ridx + 1 < len(...)`) are
        # needed further down.
        prediction_rows = [pd.Series(np.zeros(len(prev_cols)), index=train_cols)]
        for ridx, row in patient.iterrows():
            prediction_rows.append(row[train_cols])
        prediction_rows.append(pd.Series(np.zeros(len(prev_cols)), index=train_cols))

        for col in eval_cols:
            if patient[col].isna().sum() == 0:
                continue

            for ridx, row in enumerate(prediction_rows[1:-1]):
                # ridx: position within the patient; real_idx: position within
                # prediction_rows (shifted by the leading sentinel).
                real_idx = ridx + 1
                if row[col] == row[col]:
                    # Value is observed (not NaN): nothing to impute.
                    continue
                current_viscode = row["VISCODE"]
                local_idx = (test_data["VISCODE"] == current_viscode) & (
                    test_data["RID_HASH"] == rid
                )
                local_ref_idx = (imputed_test_data["VISCODE"] == current_viscode) & (
                    imputed_test_data["RID_HASH"] == rid
                )
                
                prev_col_val = prediction_rows[real_idx - 1][col]
                next_col_val = prediction_rows[real_idx + 1][col]

                # Next visit observed and this is not the patient's last row:
                # predict backwards in time with the "reverse" model.
                if next_col_val == next_col_val and ridx + 1 < len(patient_imputed):
                    eval_data = (
                        patient_imputed.iloc[ridx].to_frame().T[train_cols]
                    ).drop(columns = [col]) #row.to_frame().T[train_cols]
                    eval_data[prev_cols] = (
                        patient_imputed.iloc[ridx + 1].to_frame().T[train_cols].values
                    )
                    eval_data = eval_data.astype(float)

                    # The model input must be complete, and the neighbour value
                    # in it must be the observed one, not an imputed one.
                    assert eval_data.isna().sum().sum() == 0
                    assert eval_data[f"prev_{col}"].values[0] == next_col_val

                    imputer = longitudinal_imputers["reverse"][col]
                    imputed_val = imputer.predict(eval_data).values.squeeze()

                    test_data.loc[local_idx, col] = imputed_val
                    imputed_test_data.loc[local_ref_idx, col] = imputed_val
                # Previous visit observed and this is not the patient's first
                # row: predict with the "forward" model. This is a plain `if`,
                # so when both neighbours are observed the forward prediction
                # overwrites the reverse one written just above.
                if prev_col_val == prev_col_val and ridx > 0:
                    # print("Imputing using the prev value", prev_col_val)
                    eval_data = (
                        patient_imputed.iloc[ridx].to_frame().T[train_cols]
                    ).drop(columns = [col])
                    eval_data[prev_cols] = (
                        patient_imputed.iloc[ridx - 1].to_frame().T[train_cols].values
                    )
                    eval_data = eval_data.astype(float)

                    assert eval_data.isna().sum().sum() == 0
                    assert eval_data[f"prev_{col}"].values[0] == prev_col_val

                    imputer = longitudinal_imputers["forward"][col]
                    imputed_val = imputer.predict(eval_data).values.squeeze()
                    test_data.loc[local_idx, col] = imputed_val
                    imputed_test_data.loc[local_ref_idx, col] = imputed_val

                    # Last statement of the loop body, so this has no effect.
                    continue

    return normalize(test_data)


def intermediary_imputation(train_data, test_data, eval_cols, random_state: int = 0):
    """Impute ``test_data`` with a hyperimpute configuration seeded with linear models.

    ``train_data`` and ``test_data`` are stacked (with a fresh index), imputed
    together, and the trailing ``len(test_data)`` rows are passed through
    ``normalize`` and returned. ``eval_cols`` is accepted but not used.
    """
    imputer = Imputers().get(
        "hyperimpute",
        optimizer="simple",
        classifier_seed=["logistic_regression"],
        regression_seed=["linear_regression"],
        class_threshold=cat_limit,
        random_state=random_state,
    )

    stacked = pd.concat([train_data, test_data], ignore_index=True)
    imputed = imputer.fit_transform(stacked)
    test_part = imputed.tail(len(test_data))

    return normalize(test_part)


def full_imputation(train_data, test_data, eval_cols, random_state: int = 0):
    """Impute ``test_data`` with a hyperimpute configuration seeded with boosted trees.

    ``train_data`` and ``test_data`` are stacked (with a fresh index), imputed
    together, and the trailing ``len(test_data)`` rows are passed through
    ``normalize`` and returned. ``eval_cols`` is accepted but not used.
    """
    imputer = Imputers().get(
        "hyperimpute",
        optimizer="simple",
        classifier_seed=["xgboost"],
        # Alternative that was tried: regression_seed=["xgboost_regressor"]
        regression_seed=["catboost_regressor", "xgboost_regressor"],
        class_threshold=cat_limit,
        random_state=random_state,
    )

    stacked = pd.concat([train_data, test_data], ignore_index=True)
    imputed = imputer.fit_transform(stacked)
    test_part = imputed.tail(len(test_data))

    return normalize(test_part)


def evaluate_first_visit(train_data, test_data, static_imputation):
    """Fill the missing cells of every patient's first visit from ``static_imputation``.

    The first visit of a patient is the row with the smallest ``VISCODE`` in
    ``static_imputation``. That frame is sorted here before the first row per
    patient is taken; previously the result depended on its incoming row order.

    Parameters
    ----------
    train_data :
        Unused; kept so the signature matches the other imputation steps.
    test_data : pandas.DataFrame
        Frame with missing values; it is not modified.
    static_imputation : pandas.DataFrame
        Imputed reference frame with the same columns as ``test_data``.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``test_data`` in which the first-visit rows have their
        missing cells replaced by the reference values. Patients whose first
        reference visit has no matching row in ``test_data`` are skipped.
    """
    test_data = test_data.copy().sort_values(["RID_HASH", "VISCODE"])

    # Sort first, so that "first row per patient" really is the first visit.
    first_visit_imputed = static_imputation.sort_values(
        ["RID_HASH", "VISCODE"]
    ).drop_duplicates("RID_HASH")

    for rid in first_visit_imputed["RID_HASH"].unique():
        # Exactly one reference row per patient after drop_duplicates.
        imputed_idx = first_visit_imputed["RID_HASH"] == rid
        current_viscode = first_visit_imputed.loc[imputed_idx, "VISCODE"].values[0]

        local_idx = (test_data["VISCODE"] == current_viscode) & (
            test_data["RID_HASH"] == rid
        )
        if not local_idx.any():
            continue

        for col in test_data.columns:
            val = test_data.loc[local_idx, col].values[0]
            if val == val:  # observed (not NaN): keep it
                continue
            test_data.loc[local_idx, col] = first_visit_imputed.loc[
                imputed_idx, col
            ].values[0]

    return test_data


def evaluate_static_imputation(train_data, test_data, static_imputation):
    """Complete each patient's best-observed visit from ``static_imputation``.

    For every patient the visit with the fewest missing cells is selected (the
    earliest one on ties) and its missing cells are replaced by the values of
    the matching (``RID_HASH``, ``VISCODE``) row in ``static_imputation``.

    Parameters
    ----------
    train_data :
        Unused; kept so the signature matches the other imputation steps.
    test_data : pandas.DataFrame
        Frame with missing values; it is not modified.
    static_imputation : pandas.DataFrame
        Imputed reference frame with the same columns as ``test_data``.

    Returns
    -------
    pandas.DataFrame
        ``normalize`` applied to the sorted, partially completed copy.

    Notes
    -----
    A patient without a matching reference row is skipped. The guard used to
    test ``test_data``, which can never be empty for a visit code taken from
    it, so a missing reference row raised ``IndexError``.
    """
    test_data = test_data.copy().sort_values(["RID_HASH", "VISCODE"])

    for rid in test_data["RID_HASH"].unique():
        patient = test_data[test_data["RID_HASH"] == rid]

        # Missing cells per visit; argmin returns the first minimum.
        misses = patient.isna().sum(axis=1).values
        current_viscode = patient["VISCODE"].values[np.argmin(misses)]

        local_idx = (test_data["VISCODE"] == current_viscode) & (
            test_data["RID_HASH"] == rid
        )
        imputed_idx = (static_imputation["VISCODE"] == current_viscode) & (
            static_imputation["RID_HASH"] == rid
        )

        if not imputed_idx.any():
            # No reference values for this visit: leave the patient as is.
            continue

        for col in test_data.columns:
            val = test_data.loc[local_idx, col].values[0]
            if val == val:  # observed (not NaN): keep it
                continue
            test_data.loc[local_idx, col] = static_imputation.loc[
                imputed_idx, col
            ].values[0]

    return normalize(test_data)


def merge_miss_dfs(src, target):
    """Return a copy of ``target`` whose missing cells are taken from ``src``.

    Parameters
    ----------
    src : pandas.DataFrame
        Complete frame (no missing values) with the same index as ``target``
        and at least its columns.
    target : pandas.DataFrame
        Frame with at least one missing value; it is not modified.

    Raises
    ------
    AssertionError
        If ``src`` has missing values, if ``target`` has none, or if some cell
        is still missing after the merge.

    Notes
    -----
    Cells are written with a single ``.loc`` assignment. The previous chained
    form ``target[col][mask] = ...`` writes to a temporary object when pandas
    Copy-on-Write is active and then leaves ``target`` unchanged.
    """
    assert src.isna().sum().sum() == 0
    assert target.isna().sum().sum() != 0

    target = target.copy()
    for col in target.columns:
        missing = target[col].isna()
        if missing.any():
            target.loc[missing, col] = src.loc[missing, col]

    assert target.isna().sum().sum() == 0

    return target

def impute_data_step(
    train_data,
    test_data,
    use_longitudinal=True,
    static_strategy="missmin",
    random_state: int = 0,
    ref_static_imputation: Optional[pd.DataFrame] = None
):
    """Run one full imputation pass over ``test_data`` for a single seed.

    The stages run in this order, each printing the remaining NaN count:
      1. ``prepare_consts`` / ``prepare_age``.
      2. ``impute_longitudinal`` repeated until the NaN count stops changing.
      3. Static imputation: a fresh ``full_imputation`` result, or
         ``ref_static_imputation`` merged into the current data when provided,
         applied through ``evaluate_first_visit`` or
         ``evaluate_static_imputation``.
      4. Stages 1 and 2 again, seeding the longitudinal step with the static
         imputation.
      5. ``normalize``.

    Args:
        train_data: Reference frame handed to every helper. Copied first.
        test_data: Frame with missing values. Copied first.
        use_longitudinal: When False both longitudinal loops are skipped.
        static_strategy: ``"first"`` selects ``evaluate_first_visit``; any
            other value selects ``evaluate_static_imputation``.
        random_state: Seed forwarded to ``impute_longitudinal`` and
            ``full_imputation``.
        ref_static_imputation: Optional complete frame used in place of
            ``full_imputation``.

    Returns:
        ``normalize(test_data)`` after all stages.
    """
    print("Imputation step using seed", random_state)
    train_data = train_data.copy()
    test_data = test_data.copy()

    # Used only to label the progress messages below.
    test_id = dataframe_hash(test_data)

    print("Evaluate constants", test_id, test_data.isna().sum().sum())
    test_data = prepare_consts(train_data, test_data)
    test_data = prepare_age(train_data, test_data)

    print("Evaluate longitudinals", test_id, test_data.isna().sum().sum())
    # Repeat until a pass no longer changes the number of missing cells.
    while use_longitudinal:
        imputation_seed = None
        if ref_static_imputation is not None:
            # NOTE(review): merge_miss_dfs asserts that test_data still has NaN
            # values; this would fail if nothing is missing at this point.
            imputation_seed = merge_miss_dfs(ref_static_imputation, test_data)
            
        new_test_data = impute_longitudinal(
            train_data, test_data, 
            random_state=random_state,
            imputed_test_data = imputation_seed,
        )
        if new_test_data.isna().sum().sum() == test_data.isna().sum().sum():
            break

        test_data = new_test_data

    print(
        "Evaluate static imputation",
        test_id,
        test_data.isna().sum().sum(),
        static_strategy,
    )
    
    # Build the complete reference frame that the static strategies read from.
    if ref_static_imputation is None:
        static_imputation = full_imputation(
            train_data, test_data, test_data.columns, random_state=random_state
        )
    else:
        static_imputation = merge_miss_dfs(ref_static_imputation, test_data)

    if static_strategy == "first":
        test_data = evaluate_first_visit(train_data, test_data, static_imputation)
    else:
        test_data = evaluate_static_imputation(train_data, test_data, static_imputation)

    print("Evaluate constants take 2", test_id, test_data.isna().sum().sum())
    test_data = prepare_consts(train_data, test_data)
    test_data = prepare_age(train_data, test_data)

    print("Evaluate longitudinals take 2", test_id, test_data.isna().sum().sum())
    # Second loop: same stopping rule, now always seeded by static_imputation.
    while use_longitudinal:
        # NOTE(review): merge_miss_dfs asserts that test_data has at least one
        # NaN; confirm the static step can never fill everything.
        imputation_seed = merge_miss_dfs(static_imputation, test_data)
       
        new_test_data = impute_longitudinal(
            train_data, test_data, random_state=random_state,
            imputed_test_data = imputation_seed,
        )
        if new_test_data.isna().sum().sum() == test_data.isna().sum().sum():
            break

        test_data = new_test_data

    print("Normalize data", test_id, test_data.isna().sum().sum())
    return normalize(test_data)


            
def merge_imputations(train_data, miss_data, imputed_data):
    """Combine several imputed versions of ``miss_data`` into one frame.

    Only columns of ``train_data`` that have at least one NaN in ``miss_data``
    are touched; those columns are replaced entirely (every row, not only the
    missing cells) by a per-row aggregate over ``imputed_data``:

    * fewer than ``cat_limit`` distinct values in ``train_data`` -> most
      frequent value across versions (first seen wins on ties);
    * otherwise -> arithmetic mean across versions.

    Versions are combined by row position, so every frame in ``imputed_data``
    must follow the same row order as ``miss_data`` sorted by
    ``RID_HASH``/``VISCODE``.

    Returns:
        A sorted copy of ``miss_data`` with the aggregated columns.
    """
    sort_keys = ["RID_HASH", "VISCODE"]
    train_data = train_data.sort_values(sort_keys)
    miss_data = miss_data.sort_values(sort_keys)

    output = miss_data.copy()

    for col in train_data.columns:
        if not miss_data[col].isna().any():
            continue

        # Shape: (number of versions, number of rows).
        stacked = np.asarray([version[col].values for version in imputed_data])

        if len(train_data[col].unique()) >= cat_limit:
            # Continuous column: average the versions.
            output[col] = stacked.mean(axis = 0)
            continue

        # Categorical column: majority vote per row.
        voted = []
        for candidates in stacked.T:
            tally = {}
            for value in candidates:
                tally[value] = tally.get(value, 0) + 1
            voted.append(max(tally, key=tally.get))
        output[col] = voted

    return output

    
def impute_data(
    train_data,
    test_data,
    use_longitudinal=True,
    static_strategy="missmin",
    seeds: int = n_seeds,
    ref_static_imputation = None,
):
    """Run ``impute_data_step`` for several seeds and merge the results.

    Each seed's result is cached as a CSV in ``workspace``. The cache file
    name encodes the hash of ``test_data``, the seed, ``cat_limit`` and the
    hash of ``ref_static_imputation`` (``0`` when none is given).

    Args:
        train_data: Reference frame forwarded to the imputation step.
        test_data: Frame with missing values.
        use_longitudinal: Forwarded to ``impute_data_step``.
        static_strategy: Forwarded to ``impute_data_step``.
        seeds: Number of seeds; seeds ``0 .. seeds - 1`` are evaluated.
        ref_static_imputation: Optional complete frame forwarded to
            ``impute_data_step``.

    Returns:
        ``merge_imputations(train_data, test_data, per_seed_results)``.
    """
    test_id = dataframe_hash(test_data)
    seed_hash_id = 0
    if ref_static_imputation is not None:
        # The hash was previously stored in an unused variable, so the cache
        # file name never reflected the reference imputation and different
        # references collided on the same file.
        seed_hash_id = dataframe_hash(ref_static_imputation)

    output = []
    # Use the `seeds` argument; it was previously ignored in favour of the
    # global n_seeds, which is still its default.
    for seed in range(seeds):
        bkp_file = workspace / f"multi_imputation_{test_id}_{seed}_catlimit{cat_limit}_iterative_{seed_hash_id}.csv"
        print("Evaluate", bkp_file)
        if bkp_file.exists():
            output.append(pd.read_csv(bkp_file))
            continue

        imputed = impute_data_step(
            train_data,
            test_data,
            use_longitudinal=use_longitudinal,
            static_strategy=static_strategy,
            random_state=seed,
            ref_static_imputation = ref_static_imputation,
        )
        imputed.to_csv(bkp_file, index = None)
        output.append(imputed)

    return merge_imputations(train_data, test_data, output)

In [None]:
# Impute both development splits against the fully observed dev_set.
# NOTE(review): dev_1 / dev_2 are defined outside this section; presumably they
# are copies of dev_set with values masked out. Confirm.
dev_1_eval = impute_data(dev_set, dev_1)
dev_2_eval = impute_data(dev_set, dev_2)

Evaluate workspace/multi_imputation_8756985925945686816_0_catlimit10_iterative_0.csv
Imputation step using seed 0
Evaluate constants 8756985925945686816 22466
Evaluate longitudinals 8756985925945686816 19844


In [57]:
def merge_miss_dfs(src, target):
    """Fill every missing cell of ``target`` with the value at the same label in ``src``.

    NOTE(review): this cell redefines a function with the same name that
    appears earlier in the notebook; keep a single definition.

    Args:
        src: Fully observed DataFrame (no NaN) sharing index labels and
            columns with ``target``.
        target: DataFrame containing at least one NaN. Not modified in place.

    Returns:
        A copy of ``target`` in which each NaN has been replaced by the
        corresponding ``src`` value.

    Raises:
        AssertionError: if ``src`` contains NaN, if ``target`` contains no NaN,
            or if NaN values remain after merging.
    """
    assert src.isna().sum().sum() == 0
    assert target.isna().sum().sum() != 0

    target = target.copy()
    for col in target.columns:
        missing = target[col].isna()
        if not missing.any():
            continue
        # One .loc assignment instead of chained indexing (target[col][mask] = ...):
        # the chained form triggers SettingWithCopyWarning and writes to a
        # temporary object under pandas Copy-on-Write, leaving `target` unchanged.
        target.loc[missing, col] = src.loc[missing, col]

    assert target.isna().sum().sum() == 0

    return target

dbg_df_1 = merge_miss_dfs(dev_1_eval, dev_1)

dbg_df_1

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit
2163,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,0,79.1,0.0,20.0,1.0,1.0,0.500000,0.923077,0.164384,0.085407,0.541042,0.376516,0.451348,0.278842,0.413814,2.0,6.0
154,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,6,79.6,0.0,20.0,1.0,1.0,1.500000,0.923077,0.237397,0.071956,0.548307,0.366398,0.403880,0.193367,0.397291,2.0,6.0
1385,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,0,72.9,1.0,12.0,1.0,1.0,1.016206,0.957348,0.171612,0.068270,0.525169,0.235599,0.513404,0.356253,0.294774,6.0,60.0
2698,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,6,73.4,1.0,12.0,1.0,1.0,1.284675,0.904950,0.199754,0.086371,0.549210,0.230361,0.435097,0.322395,0.294175,6.0,60.0
2291,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,12,73.9,1.0,12.0,1.0,1.0,1.479527,0.901940,0.208841,0.136255,0.527878,0.215944,0.487831,0.342600,0.277552,6.0,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,60,79.8,1.0,19.0,0.0,0.0,-0.043303,0.969826,0.106455,0.170895,0.472504,0.321346,0.434365,0.438637,0.439923,7.0,102.0
2646,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,102,83.3,1.0,19.0,0.0,0.0,-0.043303,0.969826,0.103725,0.178231,0.465842,0.309095,0.439181,0.438815,0.439923,7.0,102.0
1962,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,0,72.1,0.0,12.0,1.0,0.0,0.500000,0.884615,0.150685,0.416382,0.602438,0.676696,0.610229,0.743037,0.624631,3.0,24.0
122,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,12,73.1,0.0,12.0,1.0,0.0,1.000000,0.961538,0.155205,0.398451,0.608521,0.660304,0.617108,0.729087,0.638477,3.0,24.0


In [60]:
dev_1_eval

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit
2163,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,0,79.1,0.0,20.0,1.0,1.0,0.500000,0.923077,0.164384,0.085407,0.541042,0.376516,0.451348,0.278842,0.413814,2.0,6.0
154,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,6,79.6,0.0,20.0,1.0,1.0,1.500000,0.923077,0.237397,0.071956,0.548307,0.366398,0.403880,0.193367,0.397291,2.0,6.0
1385,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,0,72.9,1.0,12.0,1.0,1.0,1.016206,0.957348,0.171612,0.068270,0.525169,0.235599,0.513404,0.356253,0.294774,6.0,60.0
2698,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,6,73.4,1.0,12.0,1.0,1.0,1.284675,0.904950,0.199754,0.086371,0.549210,0.230361,0.435097,0.322395,0.294175,6.0,60.0
2291,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,12,73.9,1.0,12.0,1.0,1.0,1.479527,0.901940,0.208841,0.136255,0.527878,0.215944,0.487831,0.342600,0.277552,6.0,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,60,79.8,1.0,19.0,0.0,0.0,-0.043303,0.969826,0.106455,0.170895,0.472504,0.321346,0.434365,0.438637,0.439923,7.0,102.0
2646,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,102,83.3,1.0,19.0,0.0,0.0,-0.043303,0.969826,0.103725,0.178231,0.465842,0.309095,0.439181,0.438815,0.439923,7.0,102.0
1962,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,0,72.1,0.0,12.0,1.0,0.0,0.500000,0.884615,0.150685,0.416382,0.602438,0.676696,0.610229,0.743037,0.624631,3.0,24.0
122,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,12,73.1,0.0,12.0,1.0,0.0,1.000000,0.961538,0.155205,0.398451,0.608521,0.660304,0.617108,0.729087,0.638477,3.0,24.0


In [20]:
dev_set.describe()

Unnamed: 0,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit
count,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0
mean,22.719824,74.699829,0.449159,16.098025,0.817118,0.535479,1.66496,0.890027,0.219214,0.228729,0.499418,0.421892,0.441466,0.422304,0.451887,4.727384,52.039503
std,28.994209,7.154518,0.497469,2.781059,0.726643,0.652217,2.217401,0.130379,0.146025,0.14319,0.136845,0.133995,0.14992,0.135363,0.132796,2.423826,41.490374
min,0.0,54.4,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,0.0,70.2,0.0,14.0,0.0,0.0,0.0,0.846154,0.109589,0.126196,0.406005,0.32865,0.343739,0.333664,0.367292,3.0,24.0
50%,12.0,75.0,0.0,16.0,1.0,0.0,1.0,0.923077,0.178082,0.19819,0.509819,0.418718,0.446561,0.421586,0.454126,4.0,36.0
75%,24.0,79.7,1.0,18.0,1.0,1.0,2.5,1.0,0.30137,0.295614,0.596501,0.510674,0.541093,0.512337,0.541302,6.0,78.0
max,186.0,97.4,1.0,20.0,2.0,2.0,16.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,12.0,186.0


In [21]:
# from hyperimpute.plugins.utils.metrics import RMSE
# from scipy.stats import wasserstein_distance

# for col in train_gt.columns[1:]:
#     print(
#         col,
#         RMSE(train_eval_data[col], train_gt[col], train_mask[col]),
#         wasserstein_distance(train_gt[col], train_eval_data[col]),
#     )

In [22]:
### scaled + normalization + selection imputation src
# VISCODE nan 0.0
# AGE 3.5293405996205034 0.2212305692934138
# PTGENDER_num 0.34203737779552684 0.0099975615703487
# PTEDUCAT 1.067897475082619 0.03275989984231571
# DX_num 0.660742187439313 0.07559131919044132
# APOE4 0.5425608669746597 0.005527107209623572
# CDRSB 1.7661061814479229 0.15207672925302748
# MMSE 0.10642074995195923 0.010027549607631962
# ADAS13 0.1051318279283219 0.010215993775999688
# Ventricles 0.07000662900412913 0.005347858518635116
# Hippocampus 0.07186086087482552 0.006532167124149632
# WholeBrain 0.06710736037726597 0.004621577106687779
# Entorhinal 0.0989326732983036 0.010885303270980275
# Fusiform 0.07554859903589055 0.00708604167882555
# MidTemp 0.07211511404572751 0.006110878052264325

In [23]:
# # single visits
# biggest_err = 0
# biggest_err_rid = None
# cnt_single_visits = 0

# for rid in dev_set["RID_HASH"].unique():
#     patient = dev_set[dev_set["RID_HASH"] == rid]
#     patient_imputed = dev_2_eval[dev_2_eval["RID_HASH"] == rid]
#     patient_orig = dev_2[dev_2["RID_HASH"] == rid]

#     patient_mask = patient_orig.isna().astype(int)

#     patient_err = RMSE(
#         patient_imputed.drop(columns=["RID_HASH"]).values,
#         patient.drop(columns=["RID_HASH"]).values,
#         patient_mask.drop(columns=["RID_HASH"]).values,
#     )

#     if len(patient) == 1:
#         cnt_single_visits += 1

#     if patient_err != patient_err:
#         continue

#     if patient_err > biggest_err and len(patient) == 1:
#         biggest_err = patient_err
#         biggest_err_rid = rid

# # worst err : 6.792
# biggest_err_rid, biggest_err, cnt_single_visits

In [24]:
# biggest_err = 0
# biggest_err_rid = None
# cnt_multiple_visits = 0

# for rid in dev_set["RID_HASH"].unique():
#     patient = dev_set[dev_set["RID_HASH"] == rid]
#     patient_imputed = dev_2_eval[dev_2_eval["RID_HASH"] == rid]
#     patient_orig = dev_2[dev_2["RID_HASH"] == rid]

#     patient_mask = patient_orig.isna().astype(int)

#     patient_err = RMSE(
#         patient_imputed.drop(columns=["RID_HASH"]).values,
#         patient.drop(columns=["RID_HASH"]).values,
#         patient_mask.drop(columns=["RID_HASH"]).values,
#     )
#     if patient_err != patient_err:
#         continue
#     if len(patient) > 1:
#         cnt_multiple_visits += 1

#     if patient_err > biggest_err and len(patient) > 1:
#         biggest_err = patient_err
#         biggest_err_rid = rid

# # worst err : 6.224
# biggest_err_rid, biggest_err, cnt_multiple_visits

In [25]:
from hyperimpute.plugins.utils.metrics import RMSE
from hyperimpute.utils.benchmarks import ws_score

def benchmark(X, X_miss, imputed):
    """Score an imputed frame against the ground truth.

    Args:
        X: Ground-truth frame.
        X_miss: Frame with missing values; its NaN mask is passed to ``RMSE``.
        imputed: Imputed frame.

    Returns:
        A tuple ``(RMSE result, ws_score result)``. ``ws_score`` receives both
        frames with the ``RID_HASH`` column dropped.
    """
    missing_mask = X_miss.isna().values
    rmse = RMSE(imputed.values, X.values, missing_mask)

    imputed_features = imputed.drop(columns = ["RID_HASH"])
    truth_features = X.drop(columns = ["RID_HASH"])
    return rmse, ws_score(imputed_features, truth_features)

benchmark(dev_set, dev_1, dev_1_eval)

(0.9872841001988745, 0.7806748336244881)

In [26]:
benchmark(dev_set, dev_2, dev_2_eval)

(1.1091635460031402, 0.8814278524124514)

In [34]:
### Multiple imputation
# # starting from prev best
# repeat 0 score (1.151455789790849, 0.516480422870422)
# repeat 1 score (1.152654881598934, 0.5338278576193356)
# repeat 2 score (0.9959925001104103, 0.47663288871399406)
# repeat 3 score (0.9769156323753989, 0.42132535304981367)
# repeat 4 score (0.9683792265110069, 0.4541283160204732)



In [35]:
# no preprocessing: (28256.01247357673, 13772.1483031795)
# missforest: (26900.321450057505, 11422.800036863018)
# missforest + catboost: (26818.913956820044, 11172.534584559302)
# impute first visit by consts + AGE : (22801.31764624424, 4530.5431159416)
# long imputing for DX_NUM (20140.313132594343, 4911.72658229064)


# scaled impute by first visit: (1.2100147872665876, 0.6437290692806962)
# scaled impute + mean + long imputation DX_NUM : (1.1785409955690953, 0.5420526563648462)
# scaled impute + mean + long imputation DX_NUM + MMSE: (1.1668009083992619, 0.7318445766107088)
# scaled impute + mean + long imputation DX_NUM + ADAS13: (1.1030186731907161, 0.5537261491649679)
# scaled impute + mean + long imputation DX_NUM + ADAS13 + Ventricles: (1.1205398362774392, 0.5763027452010369)
# scaled impute + mean + long imputation DX_NUM + ADAS13 + Hippocampus: (1.2369516583030933, 0.6736382987041747)

# cat40 simple : (1.4568880717344705, 0.9934616930026703)
# cat10 + scaled impute + hyperimpute(catboost) + long imputation full:  (1.0241908722005213, 0.4927460867941309)
# cat10 + scaled impute + long imputation full + normalization:  (1.065308437510163, 0.4953959370803312)

# cat10 + scaled impute + long imputation full + static imputation + normalization:  (1.1640316504107393, 0.5580105554943448)
# cat10 + scaled impute + long imputation full + static imputation:  (1.06, 0.53)

# cat10 + visit cnt + long imputation + linear interm imputation: (0.9921784212883535, 0.44433424863837917)
# cat10 + visit cnt + long imputation + rf interm imputation: (1.14, 0.77)
# cat10 + visit cnt + long imputation + catboost interm imputation: (1.10, 0.61)

## Submission data

In [28]:
import numpy as np


def normalize_output(test_data):
    """Snap the clinical score columns onto the grids used in the submission.

    * ``CDRSB``: negative values become 0, then values are rounded to the
      nearest multiple of 0.5; NaN stays NaN.
    * ``ADAS13``: rounded to the nearest third, then to two decimals.
    * ``MMSE``: rounded to the nearest integer.

    Args:
        test_data: Frame containing the three columns. Not modified in place.

    Returns:
        A copy of ``test_data`` with the three columns adjusted.
    """
    normalized = test_data.copy()

    # Work in units of 0.5 so that rounding lands on the half-point grid.
    half_steps = normalized["CDRSB"] / 0.5
    half_steps = half_steps.mask(half_steps < 0, 0)
    # -1 is a temporary stand-in for NaN so the integer cast succeeds.
    half_steps = (
        half_steps.fillna(-1)
        .round(0)
        .astype(int)
        .replace(-1, np.nan)
    )
    normalized["CDRSB"] = half_steps * 0.5

    thirds = (normalized["ADAS13"] * 3).round(0)
    normalized["ADAS13"] = (thirds / 3).round(2)

    normalized["MMSE"] = normalized["MMSE"].round(0)

    return normalized


def dump_results(imputed_data: pd.DataFrame, fpath: str):
    """Write one submission row per missing cell of ``test_A`` and ``test_B``.

    For each NaN cell of the two notebook-level test frames, the value is
    looked up in ``imputed_data`` by ``RID_HASH`` and ``VISCODE`` and stored
    under the id ``<RID_HASH>_<VISCODE>_<column>_<test name>``.

    NOTE(review): the lookup takes the first matching row of ``imputed_data``;
    if the same RID_HASH/VISCODE pair occurs more than once, every caller gets
    that first row. Confirm the pairs are unique.

    Args:
        imputed_data: Frame holding imputed values for every requested cell.
        fpath: Destination of the CSV file.

    Returns:
        The DataFrame written to ``fpath``, using ``submission.columns`` as
        header.

    Raises:
        AssertionError: if a looked-up value is NaN or an empty string.
    """
    results = []
    splits = (
        ("test_A", test_A.sort_index()),
        ("test_B", test_B.sort_index()),
    )

    for name, data in splits:
        for _, row in data.iterrows():
            # NaN is the only value that is not equal to itself.
            missing_cols = [col for col in row.index if row[col] != row[col]]
            if not missing_cols:
                continue

            same_visit = imputed_data[
                (imputed_data["RID_HASH"] == row["RID_HASH"])
                & (imputed_data["VISCODE"] == row["VISCODE"])
            ]
            for col in missing_cols:
                imputed_id = f"{row['RID_HASH']}_{row['VISCODE']}_{col}_{name}"
                imputed_val = same_visit[col].values[0]

                assert imputed_val == imputed_val
                assert imputed_val != ""

                results.append([imputed_id, imputed_val])

    output = pd.DataFrame(results, columns=submission.columns)
    output.to_csv(fpath, index=None)

    return output

def impute_data_dbg(
    train_data,
    test_data,
    seeds: list,
    use_longitudinal=True,
    static_strategy="missmin",
    static_imputation = None,
):
    """Run ``impute_data_step`` for an explicit list of seeds and merge the results.

    Each seed's result is cached as a CSV in ``workspace``; an existing cache
    file is loaded instead of recomputing.

    NOTE(review): the cache file name does not depend on ``static_imputation``,
    so runs with different reference imputations share cache files.

    Args:
        train_data: Reference frame forwarded to the imputation step.
        test_data: Frame with missing values.
        seeds: Iterable of seeds to evaluate.
        use_longitudinal: Forwarded to ``impute_data_step``.
        static_strategy: Forwarded to ``impute_data_step``.
        static_imputation: Optional complete frame, forwarded as
            ``ref_static_imputation``.

    Returns:
        ``merge_imputations(train_data, test_data, per_seed_results)``.
    """
    test_id = dataframe_hash(test_data)

    output = []
    for seed in seeds:
        bkp_file = workspace / f"multi_imputation_{test_id}_{seed}_catlimit{cat_limit}_iter2.csv"
        if bkp_file.exists():
            output.append(pd.read_csv(bkp_file))
            continue

        imputed = impute_data_step(
            train_data,
            test_data,
            use_longitudinal=use_longitudinal,
            static_strategy=static_strategy,
            random_state=seed,
            # impute_data_step names this parameter `ref_static_imputation`;
            # the previous `static_imputation=` keyword raised TypeError for
            # every seed that was not already cached.
            ref_static_imputation = static_imputation,
        )
        imputed.to_csv(bkp_file, index = None)
        output.append(imputed)

    return merge_imputations(train_data, test_data, output)

def get_submission_data(seeds):
    """Impute both test sets, undo the scaling and write the submission file.

    Args:
        seeds: Seeds forwarded to ``impute_data_dbg``.

    Returns:
        A tuple ``(path of the written CSV, DataFrame returned by dump_results)``.
    """
    imputed_splits = [
        impute_data_dbg(dev_set, split, seeds).sort_index()
        for split in (test_A, test_B)
    ]

    eval_data = pd.concat([dev_set, *imputed_splits], ignore_index=True)
    # Map the min-max scaled columns back to their original units.
    eval_data[scaled_cols] = scaler.inverse_transform(eval_data[scaled_cols])

    output_fpath = results_dir / f"imputation_results_{version}_{changelog}_normalized.csv"
    output_normalized = dump_results(normalize_output(eval_data), output_fpath)

    return output_fpath, output_normalized

def decode_output(output):
    """Write submission predictions back into copies of ``test_A`` and ``test_B``.

    Every ``Id`` has the form ``<RID_HASH>_<VISCODE>_<feature>_<test name>``,
    where the test name is ``test_A`` or ``test_B``. The RID_HASH is taken as
    the text before the first underscore, so it must not contain one.

    Args:
        output: Frame with ``Id`` and ``Predicted`` columns.

    Returns:
        A tuple ``(decoded test_A, decoded test_B)``.

    Raises:
        ValueError: if an ``Id`` ends with neither ``_test_A`` nor ``_test_B``.
    """
    decoded = {"test_A": test_A.copy(), "test_B": test_B.copy()}

    for _, row in output.iterrows():
        row_id = row["Id"]
        parts = row_id.split("_")
        rid = parts[0]
        viscode = int(parts[1])

        testcase = None
        for candidate in ("test_A", "test_B"):
            if row_id.endswith(f"_{candidate}"):
                testcase = candidate
                break
        if testcase is None:
            raise ValueError("...")

        # Strip the "<rid>_<viscode>_" prefix and the "_<testcase>" suffix.
        after_prefix = row_id.split(f"{rid}_{viscode}_")[1]
        feature = after_prefix.split(f"_{testcase}")[0]
        predicted_value = row["Predicted"]

        frame = decoded[testcase]
        local_idx = (frame["VISCODE"] == viscode) & (frame["RID_HASH"] == rid)
        frame.loc[local_idx, feature] = predicted_value

    return decoded["test_A"], decoded["test_B"]

In [42]:
from scipy.stats import wasserstein_distance
from sklearn.metrics import mean_squared_error
import numpy as np

def benchmark_output(predicted, reference):
    """Compare the ``Predicted`` columns of two submission frames.

    The columns are compared by row position, so both frames must list the
    same ids in the same order for the RMSE to be meaningful.

    Returns:
        A tuple ``(root mean squared error, Wasserstein distance)``.
    """
    pred, ref = (frame["Predicted"].values for frame in (predicted, reference))
    rmse = np.sqrt(mean_squared_error(ref, pred))
    return rmse, wasserstein_distance(pred, ref)


# Earlier submission file used as the reference for comparison.
debug_f_good = (
    results_dir / f"imputation_results_take2_v1_static_imputation_tweaks_cat10_normalized.csv"
)

output_good = pd.read_csv(debug_f_good)

# Build a new submission from a single seed and compare it with the reference.
output_path, output = get_submission_data(seeds = [12])

benchmark_output(output, output_good)

(9126.102659192105, 230.1177186993746)

In [39]:
def decode_output(output):
    """Write submission predictions back into copies of ``test_A`` and ``test_B``.

    NOTE(review): this cell redefines a function with the same name that
    appears earlier in the notebook; keep a single definition.

    Every ``Id`` has the form ``<RID_HASH>_<VISCODE>_<feature>_<test name>``,
    where the test name is ``test_A`` or ``test_B``. The RID_HASH is taken as
    the text before the first underscore, so it must not contain one.

    Args:
        output: Frame with ``Id`` and ``Predicted`` columns.

    Returns:
        A tuple ``(decoded test_A, decoded test_B)``.

    Raises:
        ValueError: if an ``Id`` ends with neither ``_test_A`` nor ``_test_B``.
    """
    decoded = {"test_A": test_A.copy(), "test_B": test_B.copy()}

    for _, row in output.iterrows():
        row_id = row["Id"]
        parts = row_id.split("_")
        rid = parts[0]
        viscode = int(parts[1])

        testcase = None
        for candidate in ("test_A", "test_B"):
            if row_id.endswith(f"_{candidate}"):
                testcase = candidate
                break
        if testcase is None:
            raise ValueError("...")

        # Strip the "<rid>_<viscode>_" prefix and the "_<testcase>" suffix.
        after_prefix = row_id.split(f"{rid}_{viscode}_")[1]
        feature = after_prefix.split(f"_{testcase}")[0]
        predicted_value = row["Predicted"]

        frame = decoded[testcase]
        local_idx = (frame["VISCODE"] == viscode) & (frame["RID_HASH"] == rid)
        frame.loc[local_idx, feature] = predicted_value

    return decoded["test_A"], decoded["test_B"]

ref_test_A, ref_test_B = decode_output(output_good)


ref_test_A

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit
247,00d5e0050fbd3b6b610f6673347232eb0862df77b5b7a8...,0,74.069710,0.0,16.0,1.0,0.0,0.5,0.961538,0.219178,52510.851198,7171.471083,1.103726e+06,4052.409514,20244.638172,21469.796802,1.0,0.0
819,013c6f92763546c7ad9c0831f023886c15f05e7332aa0c...,0,72.500000,1.0,12.0,1.0,1.0,3.5,25.000000,25.000000,0.057498,0.612302,4.232680e-01,0.291182,0.433004,0.329131,3.0,12.0
276,013c6f92763546c7ad9c0831f023886c15f05e7332aa0c...,6,73.000000,1.0,12.0,1.0,1.0,3.5,25.000000,25.330000,0.067972,7516.286553,3.999422e-01,2980.638621,16690.044558,15873.996593,3.0,12.0
350,013c6f92763546c7ad9c0831f023886c15f05e7332aa0c...,12,73.500000,1.0,12.0,1.0,1.0,2.0,0.769231,0.365342,0.077516,6930.744766,4.153243e-01,3090.753425,16333.066494,15592.721715,3.0,12.0
1268,024efbff9265302acd00190e57ee08ba1fe1b90f561f79...,0,60.700000,0.0,14.0,1.0,1.0,2.0,1.000000,0.164384,20100.863201,8134.076420,5.152234e-01,4047.522752,19840.417245,21751.199201,7.0,102.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841,ff2966461950ba81280a0189ed2d504a8bd503d9f6b078...,0,66.900000,0.0,18.0,1.0,1.0,1.5,0.807692,0.150685,38517.017377,7512.716882,1.150658e+06,3682.770722,20749.705173,20014.081992,4.0,48.0
330,ff2966461950ba81280a0189ed2d504a8bd503d9f6b078...,6,67.400000,0.0,18.0,1.0,1.0,1.5,0.769231,0.095890,43577.148875,7202.097943,1.140350e+06,3682.770722,20677.692639,19607.201161,4.0,48.0
939,ff2966461950ba81280a0189ed2d504a8bd503d9f6b078...,24,68.900000,0.0,18.0,1.0,1.0,1.5,0.769231,0.150685,46256.503480,6912.526685,1.107629e+06,3633.844003,20677.692639,18748.511758,4.0,48.0
119,ff2966461950ba81280a0189ed2d504a8bd503d9f6b078...,48,70.900000,0.0,18.0,1.0,1.0,2.5,0.807692,0.246575,0.307697,0.420993,1.119249e+06,0.392416,0.577719,0.403872,4.0,48.0


In [41]:
test_A

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit
247,00d5e0050fbd3b6b610f6673347232eb0862df77b5b7a8...,0,,,16.0,1.0,0.0,0.5,0.961538,0.219178,,,,,,,1.0,0.0
819,013c6f92763546c7ad9c0831f023886c15f05e7332aa0c...,0,72.5,1.0,12.0,,1.0,,,,0.057498,0.612302,0.423268,0.291182,0.433004,0.329131,3.0,12.0
276,013c6f92763546c7ad9c0831f023886c15f05e7332aa0c...,6,73.0,1.0,12.0,,1.0,,,,0.067972,,0.399942,,,,3.0,12.0
350,013c6f92763546c7ad9c0831f023886c15f05e7332aa0c...,12,73.5,1.0,12.0,1.0,1.0,2.0,0.769231,0.365342,0.077516,,0.415324,,,,3.0,12.0
1268,024efbff9265302acd00190e57ee08ba1fe1b90f561f79...,0,,0.0,14.0,1.0,1.0,2.0,1.000000,0.164384,,,0.515223,,,,7.0,102.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841,ff2966461950ba81280a0189ed2d504a8bd503d9f6b078...,0,,,18.0,1.0,1.0,1.5,0.807692,0.150685,,,,,,,4.0,48.0
330,ff2966461950ba81280a0189ed2d504a8bd503d9f6b078...,6,,,18.0,1.0,1.0,1.5,0.769231,0.095890,,,,,,,4.0,48.0
939,ff2966461950ba81280a0189ed2d504a8bd503d9f6b078...,24,,,18.0,1.0,1.0,1.5,0.769231,0.150685,,,,,,,4.0,48.0
119,ff2966461950ba81280a0189ed2d504a8bd503d9f6b078...,48,70.9,,18.0,1.0,1.0,2.5,0.807692,0.246575,0.307697,0.420993,,0.392416,0.577719,0.403872,4.0,48.0


In [None]:
# multiple imputation + full augmentation, 
# 3 seeds: (11044.268584346595, 724.4889170045255)

# multiple imputation + full augmentation
# seed 0:(12195.72028744357, 849.6744932411233)
# seed 1: (12252.98431308476, 839.6951067086126)
# seed 2: (12817.035855967388, 864.2393249925908)

# multiple imputation + simple augmentation, 
# 3 seeds: (7939.815096789333, 410.5754059777448)
# 5 seeds: (7780.216832789914, 378.1805604727539)
# 15 seeds: (7505.431662425454, 418.9018232185804)
# 0 (8516.698134455688, 360.04423131698417)
# 1 (9143.339489207558, 237.5922086244903)
# 2 (10185.790487124097, 544.932563784818)
# 3 (9864.608676225727, 265.1980511801534)
# 4 (9675.987505084768, 398.3931331202419)
# 5 (9448.198395996853, 275.3706331329214)
# 6 (9556.750066355977, 304.65348731749185)
# 7 (11084.274782247903, 652.8893371151566)
# 8 (8449.793120663277, 248.93412663048176)
# 9 (8811.549605682894, 287.5423413769863)
# 10 (10850.517282785027, 585.7398521348043)
# 11 (8945.628672465282, 250.58086492430945)
# 12 (9126.102659192105, 230.1177186993746)
# 13 (10538.935693995254, 387.5523116160385)
# 14 (9656.782069797828, 394.485427855781)
[1, 12]

# 

In [None]:
# Count, per "<feature>_<test name>" key, how many predictions of output_bad lie
# within 1 of the matching output_good prediction (similars) and how many do
# not (diffs).
# NOTE(review): output_bad is not defined in the visible part of the notebook.
diffs = {}
similars = {}
for idx, bad_row in output_bad.iterrows():
    rid = bad_row["Id"]
    good_row = output_good[output_good["Id"] == rid]

    good_value = good_row["Predicted"].values[0]
    bad_value = bad_row["Predicted"]
    # Drop the leading "<RID_HASH>_<VISCODE>" tokens of the id.
    key = "_".join(rid.split("_")[2:])

    bucket = similars if np.abs(good_value - bad_value) < 1 else diffs
    bucket[key] = bucket.get(key, 0) + 1

In [None]:
# prev v bad
[('ADAS13_test_A', 17),
 ('ADAS13_test_B', 15),
 ('AGE_test_A', 382),
 ('AGE_test_B', 469),
 ('APOE4_test_A', 43),
 ('APOE4_test_B', 64),
 ('CDRSB_test_A', 161),
 ('CDRSB_test_B', 184),
 ('DX_num_test_A', 356),
 ('DX_num_test_B', 382),
 ('MMSE_test_A', 216),
 ('MMSE_test_B', 232),
 ('PTEDUCAT_test_A', 34),
 ('PTEDUCAT_test_B', 179),
 ('PTGENDER_num_test_A', 587),
 ('PTGENDER_num_test_B', 665)]

#mice 5-seed - 0.071 catlimit 10
[('ADAS13_test_A', 299),
 ('ADAS13_test_B', 353),
 ('AGE_test_A', 470),
 ('AGE_test_B', 580),
 ('APOE4_test_A', 49),
 ('APOE4_test_B', 63),
 ('CDRSB_test_A', 409),
 ('CDRSB_test_B', 468),
 ('DX_num_test_A', 416),
 ('DX_num_test_B', 472),
 ('Entorhinal_test_A', 301),
 ('Entorhinal_test_B', 322),
 ('Fusiform_test_A', 314),
 ('Fusiform_test_B', 323),
 ('Hippocampus_test_A', 306),
 ('Hippocampus_test_B', 319),
 ('MMSE_test_A', 355),
 ('MMSE_test_B', 410),
 ('MidTemp_test_A', 312),
 ('MidTemp_test_B', 316),
 ('PTEDUCAT_test_A', 65),
 ('PTEDUCAT_test_B', 256),
 ('PTGENDER_num_test_A', 609),
 ('PTGENDER_num_test_B', 682),
 ('Ventricles_test_A', 332),
 ('Ventricles_test_B', 406),
 ('WholeBrain_test_A', 322),
 ('WholeBrain_test_B', 401)]

# mice 15-seed catlimit 10
[('ADAS13_test_A', 300),
 ('ADAS13_test_B', 370),
 ('AGE_test_A', 479),
 ('AGE_test_B', 614),
 ('APOE4_test_A', 49),
 ('APOE4_test_B', 67),
 ('CDRSB_test_A', 410),
 ('CDRSB_test_B', 467),
 ('DX_num_test_A', 417),
 ('DX_num_test_B', 471),
 ('Entorhinal_test_A', 255),
 ('Entorhinal_test_B', 280),
 ('Fusiform_test_A', 290),
 ('Fusiform_test_B', 292),
 ('Hippocampus_test_A', 269),
 ('Hippocampus_test_B', 268),
 ('MMSE_test_A', 361),
 ('MMSE_test_B', 411),
 ('MidTemp_test_A', 276),
 ('MidTemp_test_B', 276),
 ('PTEDUCAT_test_A', 65),
 ('PTEDUCAT_test_B', 264),
 ('PTGENDER_num_test_A', 604),
 ('PTGENDER_num_test_B', 682),
 ('Ventricles_test_A', 310),
 ('Ventricles_test_B', 382),
 ('WholeBrain_test_A', 285),
 ('WholeBrain_test_B', 346)]
sorted(similars.items())

In [None]:
# prev v bad
[('ADAS13_test_A', 411),
 ('ADAS13_test_B', 463),
 ('AGE_test_A', 230),
 ('AGE_test_B', 298),
 ('APOE4_test_A', 6),
 ('APOE4_test_B', 13),
 ('CDRSB_test_A', 267),
 ('CDRSB_test_B', 294),
 ('DX_num_test_A', 72),
 ('DX_num_test_B', 96),
 ('Entorhinal_test_A', 668),
 ('Entorhinal_test_B', 710),
 ('Fusiform_test_A', 668),
 ('Fusiform_test_B', 710),
 ('Hippocampus_test_A', 668),
 ('Hippocampus_test_B', 710),
 ('MMSE_test_A', 212),
 ('MMSE_test_B', 246),
 ('MidTemp_test_A', 668),
 ('MidTemp_test_B', 710),
 ('PTEDUCAT_test_A', 31),
 ('PTEDUCAT_test_B', 94),
 ('PTGENDER_num_test_A', 39),
 ('PTGENDER_num_test_B', 43),
 ('Ventricles_test_A', 612),
 ('Ventricles_test_B', 767),
 ('WholeBrain_test_A', 626),
 ('WholeBrain_test_B', 708)]

# mice 5-seeds catlimit 10
[('ADAS13_test_A', 129),
 ('ADAS13_test_B', 125),
 ('AGE_test_A', 142),
 ('AGE_test_B', 187),
 ('APOE4_test_B', 14),
 ('CDRSB_test_A', 19),
 ('CDRSB_test_B', 10),
 ('DX_num_test_A', 12),
 ('DX_num_test_B', 6),
 ('Entorhinal_test_A', 367),
 ('Entorhinal_test_B', 388),
 ('Fusiform_test_A', 354),
 ('Fusiform_test_B', 387),
 ('Hippocampus_test_A', 362),
 ('Hippocampus_test_B', 391),
 ('MMSE_test_A', 73),
 ('MMSE_test_B', 68),
 ('MidTemp_test_A', 356),
 ('MidTemp_test_B', 394),
 ('PTEDUCAT_test_B', 17),
 ('PTGENDER_num_test_A', 17),
 ('PTGENDER_num_test_B', 26),
 ('Ventricles_test_A', 280),
 ('Ventricles_test_B', 361),
 ('WholeBrain_test_A', 304),
 ('WholeBrain_test_B', 307)]
# mice 15-seed catlimit 10
[('ADAS13_test_A', 128),
 ('ADAS13_test_B', 108),
 ('AGE_test_A', 133),
 ('AGE_test_B', 153),
 ('APOE4_test_B', 10),
 ('CDRSB_test_A', 18),
 ('CDRSB_test_B', 11),
 ('DX_num_test_A', 11),
 ('DX_num_test_B', 7),
 ('Entorhinal_test_A', 413),
 ('Entorhinal_test_B', 430),
 ('Fusiform_test_A', 378),
 ('Fusiform_test_B', 418),
 ('Hippocampus_test_A', 399),
 ('Hippocampus_test_B', 442),
 ('MMSE_test_A', 67),
 ('MMSE_test_B', 67),
 ('MidTemp_test_A', 392),
 ('MidTemp_test_B', 434),
 ('PTEDUCAT_test_B', 9),
 ('PTGENDER_num_test_A', 22),
 ('PTGENDER_num_test_B', 26),
 ('Ventricles_test_A', 302),
 ('Ventricles_test_B', 385),
 ('WholeBrain_test_A', 341),
 ('WholeBrain_test_B', 362)]

sorted(diffs.items())