In [1]:
import pandas as pd
import warnings
from hyperimpute.utils.serialization import load_model_from_file, save_model_to_file
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler

workspace = Path("workspace")
workspace.mkdir(parents=True, exist_ok=True)

warnings.filterwarnings("ignore")

cat_limit = 10

In [2]:
def augment_base_dataset(df):
    df = df.sort_values(["RID_HASH", "VISCODE"])

    for rid in df["RID_HASH"].unique():
        visits_len = len(df[df["RID_HASH"] == rid])
        last_visit = df[df["RID_HASH"] == rid]["VISCODE"].max()
        visits = df[df["RID_HASH"] == rid]["VISCODE"].values.tolist()
        prev_visits = [-1] + visits[:-1]
        next_visits = visits[1:] + [-1]
        avg_wait = df[df["RID_HASH"] == rid]["VISCODE"].diff().mean()
        if avg_wait != avg_wait:
            avg_wait = 0

        df.loc[df["RID_HASH"] == rid, "total_visits"] = visits_len
        df.loc[df["RID_HASH"] == rid, "last_visit"] = last_visit
        df.loc[df["RID_HASH"] == rid, "prev_visit"] = prev_visits
        df.loc[df["RID_HASH"] == rid, "next_visit"] = next_visits
        df.loc[df["RID_HASH"] == rid, "avg_wait"] = avg_wait

    return df


def augment_scaled_dataset(df):
    return df

In [3]:
dev_set = pd.read_csv("dev_set.csv")
dev_set = dev_set.sort_values(["RID_HASH", "VISCODE"])
dev_set = augment_base_dataset(dev_set)

scaled_cols = [
    "MMSE",
    "ADAS13",
    "Ventricles",
    "Hippocampus",
    "WholeBrain",
    "Entorhinal",
    "Fusiform",
    "MidTemp",
]

scaler = MinMaxScaler().fit(dev_set[scaled_cols])
dev_set[scaled_cols] = scaler.transform(dev_set[scaled_cols])
dev_set = augment_scaled_dataset(dev_set)

dev_set

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,...,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit,prev_visit,next_visit,avg_wait
2163,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,0,79.1,0,20,1.0,1.0,0.5,0.923077,0.164384,...,0.548646,0.376516,0.464021,0.194906,0.400709,2.0,6.0,-1.0,6.0,6.0
154,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,6,79.6,0,20,1.0,1.0,1.5,0.923077,0.237397,...,0.548307,0.366398,0.403880,0.193367,0.397291,2.0,6.0,0.0,-1.0,6.0
1385,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,0,72.9,1,12,1.0,1.0,1.0,1.000000,0.123288,...,0.525169,0.235599,0.513404,0.356253,0.294774,6.0,60.0,-1.0,6.0,12.0
2698,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,6,73.4,1,12,1.0,1.0,1.0,1.000000,0.164384,...,0.549210,0.230361,0.435097,0.322395,0.294175,6.0,60.0,0.0,12.0,12.0
2291,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,12,73.9,1,12,1.0,1.0,1.0,0.961538,0.109589,...,0.527878,0.215944,0.487831,0.342600,0.277552,6.0,60.0,6.0,24.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,60,79.8,1,19,1.0,0.0,3.0,0.923077,0.223699,...,0.357020,0.321346,0.310935,0.399047,0.461476,7.0,102.0,36.0,102.0,17.0
2646,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,102,83.3,1,19,1.0,0.0,3.0,0.846154,0.168904,...,0.352043,0.309095,0.256790,0.372685,0.416478,7.0,102.0,60.0,-1.0,17.0
1962,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,0,72.1,0,12,1.0,0.0,0.5,0.884615,0.150685,...,0.602438,0.636654,0.610229,0.743037,0.624631,3.0,24.0,-1.0,12.0,12.0
122,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,12,73.1,0,12,1.0,0.0,1.0,0.961538,0.155205,...,0.608521,0.634650,0.617108,0.729087,0.638477,3.0,24.0,0.0,24.0,12.0


In [4]:
dev_1 = pd.read_csv("dev_1.csv")
dev_1 = dev_1.sort_values(["RID_HASH", "VISCODE"])
dev_1 = augment_base_dataset(dev_1)
dev_1[scaled_cols] = scaler.transform(dev_1[scaled_cols])
dev_1 = augment_scaled_dataset(dev_1)

dev_1

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,...,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit,prev_visit,next_visit,avg_wait
2163,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,0,,0.0,20.0,1.0,1.0,0.5,0.923077,0.164384,...,,0.376516,,,,2.0,6.0,-1.0,6.0,6.0
154,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,6,79.6,0.0,20.0,1.0,1.0,1.5,0.923077,0.237397,...,0.548307,0.366398,0.403880,0.193367,0.397291,2.0,6.0,0.0,-1.0,6.0
1385,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,0,,1.0,12.0,,1.0,,,,...,0.525169,0.235599,0.513404,0.356253,0.294774,6.0,60.0,-1.0,6.0,12.0
2698,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,6,,1.0,12.0,,1.0,,,,...,0.549210,0.230361,0.435097,0.322395,0.294175,6.0,60.0,0.0,12.0,12.0
2291,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,12,,1.0,12.0,,1.0,,,,...,0.527878,0.215944,0.487831,0.342600,0.277552,6.0,60.0,6.0,24.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,60,79.8,1.0,19.0,,0.0,,,,...,,0.321346,,,,7.0,102.0,36.0,102.0,17.0
2646,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,102,83.3,1.0,19.0,,0.0,,,,...,,0.309095,,,,7.0,102.0,60.0,-1.0,17.0
1962,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,0,72.1,,12.0,1.0,0.0,0.5,0.884615,0.150685,...,0.602438,,0.610229,0.743037,0.624631,3.0,24.0,-1.0,12.0,12.0
122,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,12,73.1,,12.0,1.0,0.0,1.0,0.961538,0.155205,...,0.608521,,0.617108,0.729087,0.638477,3.0,24.0,0.0,24.0,12.0


In [5]:
dev_2 = pd.read_csv("dev_2.csv")
dev_2 = dev_2.sort_values(["RID_HASH", "VISCODE"])
dev_2 = augment_base_dataset(dev_2)
dev_2[scaled_cols] = scaler.transform(dev_2[scaled_cols])
dev_2 = augment_scaled_dataset(dev_2)

dev_1

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,...,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit,prev_visit,next_visit,avg_wait
2163,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,0,,0.0,20.0,1.0,1.0,0.5,0.923077,0.164384,...,,0.376516,,,,2.0,6.0,-1.0,6.0,6.0
154,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,6,79.6,0.0,20.0,1.0,1.0,1.5,0.923077,0.237397,...,0.548307,0.366398,0.403880,0.193367,0.397291,2.0,6.0,0.0,-1.0,6.0
1385,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,0,,1.0,12.0,,1.0,,,,...,0.525169,0.235599,0.513404,0.356253,0.294774,6.0,60.0,-1.0,6.0,12.0
2698,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,6,,1.0,12.0,,1.0,,,,...,0.549210,0.230361,0.435097,0.322395,0.294175,6.0,60.0,0.0,12.0,12.0
2291,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,12,,1.0,12.0,,1.0,,,,...,0.527878,0.215944,0.487831,0.342600,0.277552,6.0,60.0,6.0,24.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,60,79.8,1.0,19.0,,0.0,,,,...,,0.321346,,,,7.0,102.0,36.0,102.0,17.0
2646,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,102,83.3,1.0,19.0,,0.0,,,,...,,0.309095,,,,7.0,102.0,60.0,-1.0,17.0
1962,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,0,72.1,,12.0,1.0,0.0,0.5,0.884615,0.150685,...,0.602438,,0.610229,0.743037,0.624631,3.0,24.0,-1.0,12.0,12.0
122,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,12,73.1,,12.0,1.0,0.0,1.0,0.961538,0.155205,...,0.608521,,0.617108,0.729087,0.638477,3.0,24.0,0.0,24.0,12.0


In [6]:
submission = pd.read_csv("sample_submission.csv")

submission.values[1]

array(['6b6a7136f42a8dbd469a201b88e2abb54a93667822761357db2f6d620da6af8a_0_Ventricles_test_A',
       40613.0818580834], dtype=object)

In [7]:
test_A = pd.read_csv("test_A.csv")
test_A = augment_base_dataset(test_A)
test_A[scaled_cols] = scaler.transform(test_A[scaled_cols])
test_A = augment_scaled_dataset(test_A)

test_A

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,...,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit,prev_visit,next_visit,avg_wait
0,988b6137f4352c01e4b52790505caa0c3ec438f117000a...,24,,,18.0,,0.0,,,,...,,,,,,6.0,36.0,-1.0,0.0,2.400000
1,fb640cef87a6af00053e632140ce18f5722431bb92576b...,12,66.4,1.0,18.0,1.0,1.0,1.5,0.961538,0.077671,...,,0.542904,,,,5.0,24.0,-1.0,0.0,1.500000
2,f24f78d62c90319b575dfb48a482159c4d0df14cb71530...,66,74.5,0.0,14.0,0.0,0.0,0.0,0.961538,0.050274,...,0.565102,0.753302,0.641093,0.911086,0.866886,5.0,96.0,-1.0,36.0,3.000000
3,da4cbd3f09e8ddc87cc72e542d43f072e7df288face65e...,0,,,16.0,0.0,0.0,0.0,1.000000,0.191781,...,,,,,,2.0,36.0,-1.0,36.0,36.000000
4,f665c6ee86356bdd135be03c61348607cabd64ed8433ba...,12,82.7,,13.0,,1.0,,,,...,0.353047,,0.208289,0.188006,0.363489,7.0,60.0,-1.0,48.0,8.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1323,51923c5d7573ef46aa9197cae78c3305abea5b3479331f...,6,83.5,,18.0,1.0,0.0,2.5,0.846154,0.150685,...,0.541648,,0.454850,0.480663,0.479467,5.0,60.0,48.0,-1.0,1.500000
1324,06407d9ec85d62cd38189108ddffec23822f421b3db357...,0,,,20.0,,0.0,,,,...,,,,,,1.0,0.0,-1.0,-1.0,0.000000
1325,e5015703a58ccd5582a46d9f4a779edf062d683f3ae873...,132,83.0,0.0,20.0,0.0,1.0,0.0,0.961538,0.205479,...,0.533115,0.491958,0.513933,0.508464,0.573437,9.0,132.0,0.0,-1.0,12.000000
1326,cf6ea2601bb119113371df79931cc3734b77218f734ad0...,12,82.4,,18.0,1.0,0.0,0.5,0.884615,0.205479,...,0.572799,,0.597707,0.380480,0.361608,3.0,12.0,0.0,-1.0,3.000000


In [8]:
test_B = pd.read_csv("test_B.csv")
test_B = augment_base_dataset(test_B)
test_B[scaled_cols] = scaler.transform(test_B[scaled_cols])
test_B = augment_scaled_dataset(test_B)

test_B

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,...,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit,prev_visit,next_visit,avg_wait
0,90a4f1869cf459af5fe39e53f1c328540f1dcf5a1908f7...,60,67.9,1.0,20.0,0.0,1.0,0.0,1.000000,0.123288,...,,0.330562,,,,6.0,60.0,-1.0,12.0,-10.8
1,fad8ca8f903cf3ddf566926eabdb8718e8568962675519...,30,69.1,,16.0,0.0,0.0,0.0,0.961538,0.059315,...,0.749086,,0.899471,0.724619,0.481817,3.0,30.0,-1.0,0.0,-9.0
2,d342fb7689e49c754709870c77e1aa3ed770dd193e9f9c...,12,,,12.0,,1.0,,,,...,,,,,,1.0,12.0,-1.0,-1.0,0.0
3,5319e7ba149f0f81715b5e7f854036fc937141840bbd52...,6,,,18.0,,0.0,,,,...,0.476637,,0.430159,0.520727,0.415281,5.0,126.0,-1.0,24.0,-1.5
4,6eef135d8c4eca67b0e130b8f4aedbc37a99938224d661...,0,,0.0,16.0,0.0,0.0,0.0,0.961538,0.095890,...,0.605756,0.508509,0.629277,0.604379,0.449383,5.0,72.0,-1.0,24.0,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,fbf6267bf7d92b507feb4957d7aa90ea5bb50893bb79d4...,0,,,12.0,,2.0,,,,...,0.492325,,0.497354,0.374770,0.460151,7.0,60.0,18.0,-1.0,-1.0
1466,03e8ddc654f8e27332c5b09618b355d7f9529d614adb0f...,12,81.5,0.0,15.0,,0.0,,,,...,,0.208488,,,,3.0,12.0,6.0,-1.0,6.0
1467,1156748dfd6e69e1f364c31584e957d3b1ef656b898942...,0,,0.0,18.0,0.0,0.0,0.0,0.961538,0.136986,...,,0.686284,,,,4.0,84.0,6.0,-1.0,-28.0
1468,0c7e17c442e715e067bd472c1e472b4937914d7fb8d492...,12,,,16.0,,0.0,,,,...,0.553883,,0.753086,0.606563,0.458784,4.0,42.0,24.0,-1.0,4.0


In [9]:
test_A.isna().sum()

RID_HASH          0
VISCODE           0
AGE             612
PTGENDER_num    626
PTEDUCAT         65
DX_num          428
APOE4            49
CDRSB           428
MMSE            428
ADAS13          428
Ventricles      612
Hippocampus     668
WholeBrain      626
Entorhinal      668
Fusiform        668
MidTemp         668
total_visits      0
last_visit        0
prev_visit        0
next_visit        0
avg_wait          0
dtype: int64

In [10]:
test_A.columns

Index(['RID_HASH', 'VISCODE', 'AGE', 'PTGENDER_num', 'PTEDUCAT', 'DX_num',
       'APOE4', 'CDRSB', 'MMSE', 'ADAS13', 'Ventricles', 'Hippocampus',
       'WholeBrain', 'Entorhinal', 'Fusiform', 'MidTemp', 'total_visits',
       'last_visit', 'prev_visit', 'next_visit', 'avg_wait'],
      dtype='object')

In [11]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
from hyperimpute.plugins.prediction import Classifiers, Regression
from hyperimpute.utils.tester import evaluate_regression, evaluate_estimator


train_cols = list(dev_set.drop(columns=["RID_HASH"]).columns)

eval_cols = [
    "DX_num",
    "CDRSB",
    "MMSE",
    "ADAS13",
    "Ventricles",
    "Hippocampus",
    "WholeBrain",
    "Entorhinal",
    "Fusiform",
    "MidTemp",
]


def prepare_temporal_data(data, target_col: str, direction: str):
    target_train_data = []
    target_train_labels = []

    for item in data.groupby("RID_HASH"):
        # print(item[0])
        local = item[1]
        local = local.sort_values(["RID_HASH", "VISCODE"])

        rid = local["RID_HASH"]

        prev_cols = [f"prev_{col}" for col in train_cols]
        prev_row = np.zeros(len(prev_cols))

        if direction == "forward":
            rows = local.iterrows()
        else:
            rows = local.iloc[::-1].iterrows()

        for idx, row in rows:
            target_val = row[target_col]
            tmp_row = row[train_cols].copy()
            src_data = tmp_row.to_frame().T.drop(columns=[target_col])

            src_data[prev_cols] = prev_row

            prev_row = tmp_row

            target_train_data.append(src_data)
            target_train_labels.append(target_val)

    target_train_data = pd.concat(target_train_data, ignore_index=True).astype(float)

    return target_train_data, target_train_labels


def evaluate_target(data, target_col: str, direction: str):
    train_data, labels = prepare_temporal_data(data, target_col, direction)
    assert target_col not in train_data.columns

    results = {
        "raw": {},
        "str": {},
        "models": {},
    }
    if len(np.unique(labels)) < cat_limit:
        for src_model in ["catboost", "xgboost"]:
            model = Classifiers().get(src_model)
            encoded_labels = LabelEncoder().fit_transform(labels)

            score = evaluate_estimator(model, train_data, pd.Series(encoded_labels))[
                "str"
            ]["aucroc"]
            raw_score = evaluate_estimator(
                model, train_data, pd.Series(encoded_labels)
            )["clf"]["aucroc"][0]

            results["str"][src_model] = score
            results["raw"][src_model] = raw_score
            results["models"][src_model] = (
                Classifiers().get(src_model).fit(train_data, pd.Series(encoded_labels))
            )

    else:
        for src_model in ["catboost_regressor", "xgboost_regressor"]:
            model = Regression().get(src_model)
            score = evaluate_regression(model, train_data, labels)["str"]["r2"]
            raw_score = evaluate_regression(model, train_data, labels)["clf"]["r2"][0]

            results["str"][src_model] = score
            results["raw"][src_model] = raw_score
            results["models"][src_model] = (
                Regression().get(src_model).fit(train_data, labels)
            )

    return results


def prepare_longitudinal_imputers(data, columns, base_model="catboost"):
    imputers = {}

    for direction in ["forward", "reverse"]:
        imputers[direction] = {}
        for target_col in columns:
            train_data, labels = prepare_temporal_data(data, target_col, direction)
            print("train", target_col, direction, len(np.unique(labels)))

            if len(np.unique(labels)) > cat_limit:
                model = Regression().get(f"{base_model}_regressor")
            else:
                model = Classifiers().get(base_model)

            model.fit(train_data, labels)

            imputers[direction][target_col] = model

    return imputers


def prepare_longitudinal_imputers_v2(data, columns):
    imputers = {}

    for direction in ["forward", "reverse"]:
        imputers[direction] = {}
        for target_col in columns:
            benchmarks = evaluate_target(data, target_col, direction=direction)

            best_score = -1
            best_mod = None
            for mod in benchmarks["raw"]:
                if benchmarks["raw"][mod] > best_score:
                    best_score = benchmarks["raw"][mod]
                    best_mod = benchmarks["models"][mod]

            imputers[direction][target_col] = best_mod

    return imputers

In [12]:
# for col in ["CDRSB"]:
#    score = evaluate_target(dev_set, col, direction = "forward")
#    print(col, score["str"])
# DX_num {'catboost': '0.9874 +/- 0.0012', 'xgboost': '0.9867 +/- 0.0006'}
# CDRSB {'catboost_regressor': '0.8143 +/- 0.0132', 'xgboost_regressor': '0.7971 +/- 0.0191'}
# MMSE {'catboost_regressor': '0.7303 +/- 0.014', 'xgboost_regressor': '0.7089 +/- 0.0156'}
# ADAS13 {'catboost_regressor': '0.7975 +/- 0.0018', 'xgboost_regressor': '0.7955 +/- 0.0018'}
# Ventricles {'catboost_regressor': '0.7596 +/- 0.0226', 'xgboost_regressor': '0.764 +/- 0.0218'}
# Hippocampus {'catboost_regressor': '0.8483 +/- 0.0073', 'xgboost_regressor': '0.8571 +/- 0.0058'}
# WholeBrain {'catboost_regressor': '0.8574 +/- 0.0122', 'xgboost_regressor': '0.8691 +/- 0.0155'}
# Entorhinal {'catboost_regressor': '0.6859 +/- 0.0071', 'xgboost_regressor': '0.6648 +/- 0.0059'}
# Fusiform {'catboost_regressor': '0.7779 +/- 0.0174', 'xgboost_regressor': '0.7673 +/- 0.0212'}
# MidTemp {'catboost_regressor': '0.8207 +/- 0.0075', 'xgboost_regressor': '0.8157 +/- 0.0121'}

In [13]:
# for col in eval_cols:
#    score = evaluate_target(dev_set, col, direction = "reverse")
#    print(col, score)

# DX_num {'catboost': '0.9824 +/- 0.0031', 'xgboost': '0.9817 +/- 0.0023'}
# CDRSB {'catboost_regressor': '0.788 +/- 0.007', 'xgboost_regressor': '0.7876 +/- 0.0024'}
# MMSE {'catboost_regressor': '0.7291 +/- 0.0272', 'xgboost_regressor': '0.6992 +/- 0.0299'}
# ADAS13 {'catboost_regressor': '0.8016 +/- 0.0142', 'xgboost_regressor': '0.7944 +/- 0.0149'}
# Ventricles {'catboost_regressor': '0.7043 +/- 0.0233', 'xgboost_regressor': '0.6825 +/- 0.0351'}
# Hippocampus {'catboost_regressor': '0.8511 +/- 0.0087', 'xgboost_regressor': '0.8515 +/- 0.0066'}
# WholeBrain {'catboost_regressor': '0.8595 +/- 0.016', 'xgboost_regressor': '0.8658 +/- 0.0184'}
# Entorhinal {'catboost_regressor': '0.6898 +/- 0.0078', 'xgboost_regressor': '0.6646 +/- 0.0054'}
# Fusiform {'catboost_regressor': '0.7785 +/- 0.0197', 'xgboost_regressor': '0.7802 +/- 0.0145'}
# MidTemp {'catboost_regressor': '0.8199 +/- 0.0094', 'xgboost_regressor': '0.8202 +/- 0.0139'}

In [14]:
imputers_bkp_file = (
    workspace / f"longitudinal_imputers_scaled_cat{cat_limit}_aug_wait_v5.bkp"
)
# imputers_bkp_file = workspace / f"longitudinal_imputers_scaled_cat{cat_limit}.bkp"

if imputers_bkp_file.exists():
    longitudinal_imputers = load_model_from_file(imputers_bkp_file)
else:
    longitudinal_imputers = prepare_longitudinal_imputers(dev_set, eval_cols)
    save_model_to_file(imputers_bkp_file, longitudinal_imputers)

## Preprocess data

In [15]:
from hyperimpute.plugins.imputers import Imputers

# VISCODE 6 * x -> AGE 0.5 * x

const_by_patient = ["PTGENDER_num", "PTEDUCAT", "APOE4"]


def dataframe_hash(df: pd.DataFrame) -> str:
    return str(abs(pd.util.hash_pandas_object(df).sum()))


def normalize(test_data):
    test_data.loc[test_data["CDRSB"] < 0, "CDRSB"] = 0

    return test_data


def prepare_consts(train_data, test_data):
    test_data = test_data.copy()
    train_data = train_data.copy()

    train_data = train_data.sort_values(["RID_HASH", "VISCODE"])
    test_data = test_data.sort_values(["RID_HASH", "VISCODE"])

    for item in test_data.groupby("RID_HASH"):
        local = item[1]

        # fill consts
        for col in const_by_patient:
            if len(local[col].unique()) == 1:
                continue
            rid = local["RID_HASH"].unique()[0]

            val = local[col][~local[col].isna()].unique()[0]
            local[col] = local[col].fillna(val)
            test_data.loc[test_data["RID_HASH"] == rid, col] = test_data[
                test_data["RID_HASH"] == rid
            ][col].fillna(val)
            assert len(local[col].unique()) == 1, col

    return test_data


def prepare_age(train_data, test_data):
    test_data = test_data.copy()
    train_data = train_data.copy()

    train_data = train_data.sort_values(["RID_HASH", "VISCODE"])
    test_data = test_data.sort_values(["RID_HASH", "VISCODE"])

    col = "AGE"

    for rid in test_data["RID_HASH"].unique():
        local = test_data[test_data["RID_HASH"] == rid]

        # fill age
        ages = local["AGE"]
        if ages.isna().sum() == 0:
            continue

        if ages.isna().sum() == len(ages):
            continue

        # forward impute age
        prev_viscode = 0
        prev_age = 0
        for idx, row in local.iterrows():
            current_viscode = row["VISCODE"]
            local_idx = (test_data["VISCODE"] == current_viscode) & (
                test_data["RID_HASH"] == rid
            )
            if prev_age > 0 and prev_age == prev_age:
                pred_age = (current_viscode - prev_viscode) / 6 * 0.5 + prev_age
            else:
                pred_age = row[col]

            if pred_age == pred_age:
                # print("forward imputed", pred_age, current_viscode)
                test_data.loc[local_idx, col] = test_data.loc[local_idx][col].fillna(
                    pred_age
                )

            prev_viscode = row["VISCODE"]
            prev_age = pred_age

        # reverse impute age
        prev_viscode = 0
        prev_age = 0
        for idx, row in local.iloc[::-1].iterrows():
            current_viscode = row["VISCODE"]
            local_idx = (test_data["VISCODE"] == current_viscode) & (
                test_data["RID_HASH"] == rid
            )

            if prev_age > 0 and prev_age == prev_age:
                pred_age = prev_age - (prev_viscode - current_viscode) / 6 * 0.5
            else:
                pred_age = row[col]

            if pred_age == pred_age:
                # print("reversed imputed", pred_age, current_viscode)
                test_data.loc[local_idx, col] = test_data.loc[local_idx][col].fillna(
                    pred_age
                )

            prev_viscode = row["VISCODE"]
            prev_age = pred_age

        # print(test_data[(test_data["RID_HASH"] == rid)][["VISCODE", "AGE"]])
    return test_data


def impute_longitudinal(
    train_data,
    test_data,
    n_iter=5,
    eval_cols=[
        "DX_num",
        "CDRSB",
        "MMSE",
        "ADAS13",
        "Ventricles",
        "Hippocampus",
        "WholeBrain",
        "Entorhinal",
        "Fusiform",
        "MidTemp",
    ],
    imputed_test_data=None,
):
    test_data = test_data.copy()
    train_data = train_data.copy()

    if imputed_test_data is None:
        imputed_test_data = intermediary_imputation(
            train_data, test_data, eval_cols=eval_cols
        )

    train_data = train_data.sort_values(["RID_HASH", "VISCODE"])
    test_data = test_data.sort_values(["RID_HASH", "VISCODE"])
    imputed_test_data = imputed_test_data.sort_values(["RID_HASH", "VISCODE"])

    prev_cols = [f"prev_{col}" for col in train_cols]

    for rid in test_data["RID_HASH"].unique():
        patient = test_data[test_data["RID_HASH"] == rid]
        patient_imputed = imputed_test_data[imputed_test_data["RID_HASH"] == rid]

        prediction_rows = [pd.Series(np.zeros(len(prev_cols)), index=train_cols)]
        for ridx, row in patient.iterrows():
            prediction_rows.append(row[train_cols])
        prediction_rows.append(pd.Series(np.zeros(len(prev_cols)), index=train_cols))

        for col in eval_cols:
            if patient[col].isna().sum() == 0:
                continue

            for ridx, row in enumerate(prediction_rows[1:-1]):
                real_idx = ridx + 1
                if row[col] == row[col]:
                    continue
                current_viscode = row["VISCODE"]
                local_idx = (test_data["VISCODE"] == current_viscode) & (
                    test_data["RID_HASH"] == rid
                )

                prev_col_val = prediction_rows[real_idx - 1][col]
                next_col_val = prediction_rows[real_idx + 1][col]

                if next_col_val == next_col_val and ridx + 1 < len(patient_imputed):
                    eval_data = (
                        patient_imputed.iloc[ridx]
                        .to_frame()
                        .T[train_cols]
                        .drop(columns=[col])
                    )  # row.to_frame().T[train_cols]
                    eval_data[prev_cols] = (
                        patient_imputed.iloc[ridx + 1].to_frame().T[train_cols].values
                    )
                    eval_data = eval_data.astype(float)

                    assert eval_data.isna().sum().sum() == 0
                    assert eval_data[f"prev_{col}"].values[0] == next_col_val

                    imputer = longitudinal_imputers["reverse"][col]
                    imputed_val = imputer.predict(eval_data).values.squeeze()

                    test_data.loc[local_idx, col] = imputed_val

                if prev_col_val == prev_col_val and ridx > 0:
                    # print("Imputing using the prev value", prev_col_val)
                    eval_data = (
                        patient_imputed.iloc[ridx]
                        .to_frame()
                        .T[train_cols]
                        .drop(columns=[col])
                    )
                    eval_data[prev_cols] = (
                        patient_imputed.iloc[ridx - 1].to_frame().T[train_cols].values
                    )
                    eval_data = eval_data.astype(float)

                    assert eval_data.isna().sum().sum() == 0
                    assert eval_data[f"prev_{col}"].values[0] == prev_col_val

                    imputer = longitudinal_imputers["forward"][col]
                    imputed_val = imputer.predict(eval_data).values.squeeze()
                    test_data.loc[local_idx, col] = imputed_val

                    continue

    return normalize(test_data)


def intermediary_imputation(train_data, test_data, eval_cols):
    imputed_test_data = test_data.copy()
    local_kwargs = {
        "optimizer": "simple",
        "classifier_seed": ["logistic_regression"],
        "regression_seed": ["linear_regression"],
        "class_threshold": cat_limit,
    }
    imputer = Imputers().get(
        "hyperimpute",
        **local_kwargs,
    )
    imputation_input = pd.concat([train_data, test_data], ignore_index=True)
    imputed_test_data = imputer.fit_transform(imputation_input)
    imputed_test_data = imputed_test_data.tail(len(test_data))

    return normalize(imputed_test_data)


def full_imputation(train_data, test_data, eval_cols):
    imputed_test_data = test_data.copy()
    imputer_kwargs = {
        "optimizer": "simple",
        "classifier_seed": [
            "catboost",
        ],
        # "classifier_seed" : ["xgboost",],
        # "regression_seed" : ["xgboost_regressor"],
        "regression_seed": ["catboost_regressor", "xgboost_regressor"],  # rmse 0.99
        "class_threshold": cat_limit,
    }

    imputer = Imputers().get(
        "hyperimpute",
        **imputer_kwargs,
    )
    imputation_input = pd.concat([train_data, test_data], ignore_index=True)
    imputed_test_data = imputer.fit_transform(imputation_input)
    imputed_test_data = imputed_test_data.tail(len(test_data))

    return normalize(imputed_test_data)


def evaluate_first_visit(train_data, test_data, static_imputation):
    print("Static impute using first row")
    test_data = test_data.copy()
    train_data = train_data.copy()
    train_data = train_data.sort_values(["RID_HASH", "VISCODE"])
    test_data = test_data.sort_values(["RID_HASH", "VISCODE"])
    # impute  first visit    "
    first_visit_imputed = static_imputation.drop_duplicates("RID_HASH")

    for rid in first_visit_imputed["RID_HASH"].unique():
        row = first_visit_imputed[first_visit_imputed["RID_HASH"] == rid]
        current_viscode = row["VISCODE"].values[0]
        local_idx = (test_data["VISCODE"] == current_viscode) & (
            test_data["RID_HASH"] == rid
        )
        imputed_idx = (first_visit_imputed["VISCODE"] == current_viscode) & (
            first_visit_imputed["RID_HASH"] == rid
        )
        if len(test_data[local_idx]) == 0:
            continue
        for col in test_data.columns:
            val = test_data.loc[local_idx][col].values[0]
            if val == val:
                continue
            imputed_val = first_visit_imputed.loc[imputed_idx][col].values[0]
            test_data.loc[local_idx, col] = imputed_val
            # print(\"imputed\", test_data.loc[local_idx, col]),
    return test_data


def evaluate_static_imputation(train_data, test_data, static_imputation):
    print("Static impute using least miss row")
    test_data = test_data.copy()
    train_data = train_data.copy()

    train_data = train_data.sort_values(["RID_HASH", "VISCODE"])
    test_data = test_data.sort_values(["RID_HASH", "VISCODE"])

    for rid in test_data["RID_HASH"].unique():
        patient = test_data[test_data["RID_HASH"] == rid]
        misses = []
        viscodes = []
        for idx, row in patient.iterrows():
            misses.append(row.isna().sum())
            viscodes.append(row["VISCODE"])
        cidx = np.argmin(misses)

        current_viscode = viscodes[cidx]
        local_idx = (test_data["VISCODE"] == current_viscode) & (
            test_data["RID_HASH"] == rid
        )
        imputed_idx = (static_imputation["VISCODE"] == current_viscode) & (
            static_imputation["RID_HASH"] == rid
        )

        if len(test_data[local_idx]) == 0:
            continue

        for col in test_data.columns:
            val = test_data.loc[local_idx][col].values[0]
            if val == val:
                continue
            imputed_val = static_imputation.loc[imputed_idx][col].values[0]
            test_data.loc[local_idx, col] = imputed_val

            # print("imputed", test_data.loc[local_idx, col])

    return normalize(test_data)


def impute_data(
    train_data, test_data, use_longitudinal=True, static_strategy="missmin"
):
    test_id = dataframe_hash(test_data)

    print("Evaluate constants", test_id, test_data.isna().sum().sum())
    test_data = prepare_consts(train_data, test_data)
    test_data = prepare_age(train_data, test_data)
    assert (test_data["CDRSB"] < 0).sum() == 0

    print("Evaluate longitudinals", test_id, test_data.isna().sum().sum())
    while use_longitudinal:
        new_test_data = impute_longitudinal(train_data, test_data)
        if new_test_data.isna().sum().sum() == test_data.isna().sum().sum():
            break

        test_data = new_test_data
    assert (test_data["CDRSB"] < 0).sum() == 0

    print(
        "Evaluate static imputation",
        test_id,
        test_data.isna().sum().sum(),
        static_strategy,
    )
    test_hash = dataframe_hash(test_data)
    static_imputers_bkp_file = workspace / f"static_imputation_{test_hash}.bkp"

    static_imputation = full_imputation(train_data, test_data, test_data.columns)
    save_model_to_file(static_imputers_bkp_file, static_imputation)

    if static_strategy == "first":
        test_data = evaluate_first_visit(train_data, test_data, static_imputation)
    else:
        test_data = evaluate_static_imputation(train_data, test_data, static_imputation)

    assert (test_data["CDRSB"] < 0).sum() == 0

    print("Evaluate constants take 2", test_id, test_data.isna().sum().sum())
    test_data = prepare_consts(train_data, test_data)
    test_data = prepare_age(train_data, test_data)

    assert (test_data["CDRSB"] < 0).sum() == 0

    print("Evaluate longitudinals take 2", test_id, test_data.isna().sum().sum())
    while use_longitudinal:
        new_test_data = impute_longitudinal(train_data, test_data)
        if new_test_data.isna().sum().sum() == test_data.isna().sum().sum():
            break

        test_data = new_test_data

    assert (test_data["CDRSB"] < 0).sum() == 0

    print("Normalize data", test_id, test_data.isna().sum().sum())
    return normalize(test_data)

In [16]:
dev_1_eval = impute_data(dev_set, dev_1)
dev_2_eval = impute_data(dev_set, dev_2)

Evaluate constants 7372239728249779769 22466
Evaluate longitudinals 7372239728249779769 19844
Evaluate static imputation 7372239728249779769 9126 missmin
Static impute using least miss row
Evaluate constants take 2 7372239728249779769 5492
Evaluate longitudinals take 2 7372239728249779769 4386
Normalize data 7372239728249779769 0
Evaluate constants 1652573492839247800 22991
Evaluate longitudinals 1652573492839247800 20000
Evaluate static imputation 1652573492839247800 9192 missmin
Static impute using least miss row
Evaluate constants take 2 1652573492839247800 5410
Evaluate longitudinals take 2 1652573492839247800 4115
Normalize data 1652573492839247800 0


In [17]:
# dev_1_eval_static_first = impute_data(dev_set, dev_1, static_strategy = "first")
# dev_2_eval_static_first = impute_data(dev_set, dev_2, static_strategy = "first")

In [18]:
dev_2_eval

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,...,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit,prev_visit,next_visit,avg_wait
2163,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,0,79.100000,0.0,20.0,1.0,1.0,0.500000,0.923077,0.164384,...,0.548646,0.376516,0.464021,0.194906,0.400709,2.0,6.0,-1.0,6.0,6.0
154,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,6,79.600000,0.0,20.0,1.0,1.0,1.596528,0.913749,0.255431,...,0.548307,0.349684,0.403880,0.193367,0.397291,2.0,6.0,0.0,-1.0,6.0
1385,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,0,72.900000,1.0,12.0,1.0,1.0,1.000000,1.000000,0.123288,...,0.525169,0.296583,0.513404,0.356253,0.294774,6.0,60.0,-1.0,6.0,12.0
2698,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,6,73.400000,1.0,12.0,1.0,1.0,1.000000,1.000000,0.164384,...,0.549210,0.296583,0.435097,0.322395,0.294175,6.0,60.0,0.0,12.0,12.0
2291,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,12,73.900000,1.0,12.0,1.0,1.0,1.000000,0.961538,0.109589,...,0.556283,0.274814,0.434747,0.344282,0.324948,6.0,60.0,6.0,24.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,60,80.988266,1.0,19.0,1.0,0.0,3.000000,0.923077,0.223699,...,0.357020,0.297969,0.310935,0.399047,0.461476,7.0,102.0,36.0,102.0,17.0
2646,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,102,84.488266,1.0,19.0,1.0,0.0,3.000000,0.846154,0.168904,...,0.352043,0.282995,0.256790,0.372685,0.416478,7.0,102.0,60.0,-1.0,17.0
1962,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,0,72.100000,0.0,12.0,1.0,0.0,1.207149,0.908718,0.210524,...,0.602438,0.661561,0.610229,0.743037,0.624631,3.0,24.0,-1.0,12.0,12.0
122,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,12,73.100000,0.0,12.0,1.0,0.0,0.933420,0.923737,0.243457,...,0.555504,0.631893,0.560849,0.612140,0.638031,3.0,24.0,0.0,24.0,12.0


In [19]:
dev_set.describe()

Unnamed: 0,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit,prev_visit,next_visit,avg_wait
count,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0,4101.0
mean,22.719824,74.699829,0.449159,16.098025,0.817118,0.535479,1.66496,0.890027,0.219214,0.228729,0.499418,0.421892,0.441466,0.422304,0.451887,4.727384,52.039503,12.346257,21.841502,12.802142
std,28.994209,7.154518,0.497469,2.781059,0.726643,0.652217,2.217401,0.130379,0.146025,0.14319,0.136845,0.133995,0.14992,0.135363,0.132796,2.423826,41.490374,22.317893,29.361444,8.605123
min,0.0,54.4,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,-1.0,0.0
25%,0.0,70.2,0.0,14.0,0.0,0.0,0.0,0.846154,0.109589,0.126196,0.406005,0.32865,0.343739,0.333664,0.367292,3.0,24.0,-1.0,-1.0,8.0
50%,12.0,75.0,0.0,16.0,1.0,0.0,1.0,0.923077,0.178082,0.19819,0.509819,0.418718,0.446561,0.421586,0.454126,4.0,36.0,0.0,12.0,12.0
75%,24.0,79.7,1.0,18.0,1.0,1.0,2.5,1.0,0.30137,0.295614,0.596501,0.510674,0.541093,0.512337,0.541302,6.0,78.0,12.0,24.0,16.8
max,186.0,97.4,1.0,20.0,2.0,2.0,16.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,12.0,186.0,162.0,186.0,156.0


In [20]:
train_eval_data = pd.concat([dev_set, dev_1_eval, dev_2_eval], ignore_index=True).drop(
    columns=["RID_HASH"]
)
# train_eval_data_static_first = pd.concat([dev_set, dev_1_eval_static_first, dev_2_eval_static_first], ignore_index = True).drop(columns = ["RID_HASH"])

train_eval_data_raw = pd.concat([dev_set, dev_1, dev_2], ignore_index=True).drop(
    columns=["RID_HASH"]
)

train_gt = pd.concat([dev_set, dev_set, dev_set], ignore_index=True).drop(
    columns=["RID_HASH"]
)

train_mask = train_eval_data_raw.isna().astype(int)
train_mask_bool = train_eval_data_raw.isna()

In [21]:
from hyperimpute.plugins.utils.metrics import RMSE
from scipy.stats import wasserstein_distance

for col in train_gt.columns[1:]:
    print(
        col,
        RMSE(train_eval_data[col], train_gt[col], train_mask[col]),
        wasserstein_distance(train_gt[col], train_eval_data[col]),
    )

AGE 2.940661258706246 0.19741663764979034
PTGENDER_num 0.3335816263534847 0.011054214419247366
PTEDUCAT 0.8969927328515065 0.024540893277491732
DX_num 0.6925876445692277 0.053889295293830686
APOE4 0.5866479729410511 0.004795578314232163
CDRSB 1.7302055046225928 0.10509617772084945
MMSE 0.10413108808272098 0.007254619715139899
ADAS13 0.10432793319981153 0.009580961560025443
Ventricles 0.06821720560702225 0.004679886064861257
Hippocampus 0.0692764235456744 0.005286183096243876
WholeBrain 0.06873711226743885 0.00395599067568829
Entorhinal 0.09675548970235617 0.01135874297607796
Fusiform 0.07499619409842515 0.007429988847485466
MidTemp 0.07039458867653241 0.005765495785030097
total_visits nan 0.0
last_visit nan 0.0
prev_visit nan 0.0
next_visit nan 0.0
avg_wait nan 0.0


In [22]:
### scaled + normalization + selection imputation src
# VISCODE nan 0.0
# AGE 3.5293405996205034 0.2212305692934138
# PTGENDER_num 0.34203737779552684 0.0099975615703487
# PTEDUCAT 1.067897475082619 0.03275989984231571
# DX_num 0.660742187439313 0.07559131919044132
# APOE4 0.5425608669746597 0.005527107209623572
# CDRSB 1.7661061814479229 0.15207672925302748
# MMSE 0.10642074995195923 0.010027549607631962
# ADAS13 0.1051318279283219 0.010215993775999688
# Ventricles 0.07000662900412913 0.005347858518635116
# Hippocampus 0.07186086087482552 0.006532167124149632
# WholeBrain 0.06710736037726597 0.004621577106687779
# Entorhinal 0.0989326732983036 0.010885303270980275
# Fusiform 0.07554859903589055 0.00708604167882555
# MidTemp 0.07211511404572751 0.006110878052264325

In [23]:
# single visits
biggest_err = 0
biggest_err_rid = None
cnt_single_visits = 0

for rid in dev_set["RID_HASH"].unique():
    patient = dev_set[dev_set["RID_HASH"] == rid]
    patient_imputed = dev_2_eval[dev_2_eval["RID_HASH"] == rid]
    patient_orig = dev_2[dev_2["RID_HASH"] == rid]

    patient_mask = patient_orig.isna().astype(int)

    patient_err = RMSE(
        patient_imputed.drop(columns=["RID_HASH"]).values,
        patient.drop(columns=["RID_HASH"]).values,
        patient_mask.drop(columns=["RID_HASH"]).values,
    )
    if len(patient) == 1:
        cnt_single_visits += 1

    if patient_err != patient_err:
        continue

    if patient_err > biggest_err and len(patient) == 1:
        biggest_err = patient_err
        biggest_err_rid = rid

# worst err : 7.729079504009584
biggest_err_rid, biggest_err, cnt_single_visits

('aca26b5afac001acf3936b66115ada177258f275e34c8c65c1dacea8b18b76fd',
 7.015776166208143,
 255)

In [24]:
biggest_err = 0
biggest_err_rid = None
cnt_multiple_visits = 0

for rid in dev_set["RID_HASH"].unique():
    patient = dev_set[dev_set["RID_HASH"] == rid]
    patient_imputed = dev_2_eval[dev_2_eval["RID_HASH"] == rid]
    patient_orig = dev_2[dev_2["RID_HASH"] == rid]

    patient_mask = patient_orig.isna().astype(int)

    patient_err = RMSE(
        patient_imputed.drop(columns=["RID_HASH"]).values,
        patient.drop(columns=["RID_HASH"]).values,
        patient_mask.drop(columns=["RID_HASH"]).values,
    )
    if patient_err != patient_err:
        continue
    if len(patient) > 1:
        cnt_multiple_visits += 1

    if patient_err > biggest_err and len(patient) > 1:
        biggest_err = patient_err
        biggest_err_rid = rid

# worst err : 6.09123256387527
biggest_err_rid, biggest_err, cnt_multiple_visits

('32de9128e1e16f45260b6cec3b0de027c6482d17328c8419324d2a17ed598db8',
 8.102578352470672,
 964)

In [25]:
# use static missmin visit strategy

from hyperimpute.plugins.imputers import Imputers
from hyperimpute.utils.benchmarks import benchmark_model
from sklearn.preprocessing import LabelEncoder

plugin = Imputers().get(
    "hyperimpute",
    optimizer="simple",
    classifier_seed=["catboost"],
    regression_seed=["xgboost_regressor", "catboost_regressor"],
    class_threshold=cat_limit,
)

benchmark_model("missforest", plugin, train_gt, train_eval_data, train_mask)

(1.0046816699333012, 0.45210466539599414)

In [26]:
# use static first visit strategy
from hyperimpute.utils.benchmarks import benchmark_model
from sklearn.preprocessing import LabelEncoder

# benchmark_model("missforest", plugin, train_gt, train_eval_data_static_first, train_mask)

In [27]:
# no preprocessing: (28256.01247357673, 13772.1483031795)
# missforest: (26900.321450057505, 11422.800036863018)
# missforest + catboost: (26818.913956820044, 11172.534584559302)
# impute first visit by consts + AGE : (22801.31764624424, 4530.5431159416)
# long inputing for DX_NUM (20140.313132594343, 4911.72658229064)


# scaled impute by first visit: (1.2100147872665876, 0.6437290692806962)
# scaled impute + mean + long imputation DX_NUM : (1.1785409955690953, 0.5420526563648462)
# scaled impute + mean + long imputation DX_NUM + MMSE: (1.1668009083992619, 0.7318445766107088)
# scaled impute + mean + long imputation DX_NUM + ADAS13: (1.1030186731907161, 0.5537261491649679)
# scaled impute + mean + long imputation DX_NUM + ADAS13 + Ventricles: (1.1205398362774392, 0.5763027452010369)
# scaled impute + mean + long imputation DX_NUM + ADAS13 + Hippocampus: (1.2369516583030933, 0.6736382987041747)

# cat40 simple : (1.4568880717344705, 0.9934616930026703)
# cat10 + scaled impute + hyperimpute(catboost) + long imputation full:  (1.0241908722005213, 0.4927460867941309)
# cat10 + scaled impute + long imputation full + normalization:  (1.065308437510163, 0.4953959370803312)

# cat10 + scaled impute + long imputation full + static imputation + normalization:  (1.1640316504107393, 0.5580105554943448)
# cat10 + scaled impute + long imputation full + static imputation:  (1.06, 0.53)

# cat10 + visit cnt + long imputation + linear interm imputation: (0.9921784212883535, 0.44433424863837917)
# cat10 + visit cnt + long imputation + rf interm imputation: (1.14, 0.77)
# cat10 + visit cnt + long imputation + catboost interm imputation: (1.10, 0.61)
# cat10 + visit cnt + long imputation + xgnoost interm imputation: (1.11, 0.51)
# cat50 + visit cnt + long imputation: (0.989985663637702, 0.42404567112877595)

# cat10 + catboost clf + xgb regressor : (0.9741738538218577, 0.430161877468509)
# cat10 + catboost full + xgb regressor + prev/next visit : (0.9864849558376909, 0.4271620092167154)
# cat10 + catboost clf + xgb regressor + prev/next visit : (0.9889412273219121, 0.414719032464237)

## Submission data

In [None]:
test_A_eval = impute_data(dev_set, test_A)
test_B_eval = impute_data(dev_set, test_B)

Evaluate constants 6703621200038434803 6974
Evaluate longitudinals 6703621200038434803 6161
Evaluate static imputation 6703621200038434803 2766 missmin
Static impute using least miss row
Evaluate constants take 2 6703621200038434803 1532
Evaluate longitudinals take 2 6703621200038434803 1239
Normalize data 6703621200038434803 0
Evaluate constants 8721716048509625602 8052
Evaluate longitudinals 8721716048509625602 6925
Evaluate static imputation 8721716048509625602 3221 missmin


In [None]:
eval_data = pd.concat([dev_set, test_A_eval, test_B_eval], ignore_index=True)


eval_data

In [None]:
from hyperimpute.plugins.imputers import Imputers

plugin = Imputers().get(
    "hyperimpute",
    optimizer="simple",
    classifier_seed=["catboost"],
    regression_seed=["xgboost_regressor"],
)


imputed_X = plugin.fit_transform(eval_data.copy())
imputed_X[scaled_cols] = scaler.inverse_transform(imputed_X[scaled_cols])

imputed_X

In [None]:
imputed_X["CDRSB"].isna().sum()

In [None]:
import numpy as np


def normalize_output(test_data):
    test_data = test_data.copy()
    factor = test_data["CDRSB"] / 0.5
    # factor[factor < 0] = 0
    assert (factor < 0).sum() == 0

    factor = factor.fillna(-1)
    factor = factor.round(0).astype(int)
    factor = factor.replace(-1, np.nan)
    test_data["CDRSB"] = factor * 0.5

    test_data["ADAS13"] = ((test_data["ADAS13"] * 3).round(0) / 3).round(2)
    test_data["MMSE"] = test_data["MMSE"].round(0)

    return test_data


def dump_results(imputed_data: pd.DataFrame, fpath: str):
    results = []

    for name, data in [
        ("test_A", test_A),
        ("test_B", test_B),
    ]:
        for idx, row in data.iterrows():
            for col in row.index:
                local = row.T
                val = local[col]
                if val == val:
                    continue
                imputed_id = f"{local['RID_HASH']}_{local['VISCODE']}_{col}_{name}"
                imputed_val = imputed_data[
                    (imputed_data["RID_HASH"] == local["RID_HASH"])
                    & (imputed_data["VISCODE"] == local["VISCODE"])
                ][col].values[0]
                assert imputed_val == imputed_val
                assert imputed_val != ""
                results.append([imputed_id, imputed_val])

    output = pd.DataFrame(results, columns=submission.columns)
    output.to_csv(fpath, index=None)

    return output

In [None]:
version = "v21"
changelog = f"cb_full_visits_freq"
output = dump_results(imputed_X, f"imputation_results_{version}_{changelog}.csv")
output_normalized = dump_results(
    normalize_output(imputed_X),
    f"imputation_results_{version}_{changelog}_normalized.csv",
)

output

In [None]:
output_normalized

In [None]:
pd.set_option("display.expand_frame_repr", True)

output.tail(5).values

In [None]:
pd.set_option("display.expand_frame_repr", True)

output_normalized.tail(5).values