In [1]:
import pandas as pd
import warnings
from hyperimpute.utils.serialization import load_model_from_file, save_model_to_file
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler

# Directories used by the notebook. Only `workspace` is created here; it holds
# cached imputers and per-seed imputation results written by later cells.
workspace = Path("workspace")
results_dir = Path("results")
data_dir = Path("data")

workspace.mkdir(parents=True, exist_ok=True)

# Silences every Python warning for the rest of the session, including pandas
# chained-assignment warnings raised by the helpers below.
warnings.filterwarnings("ignore")

# Distinct-value threshold used to decide between categorical and continuous
# handling of a column; also forwarded to hyperimpute as `class_threshold`.
cat_limit = 10
# Number of seeds (0..n_seeds-1) whose imputations are merged by `impute_data`.
n_seeds = 5

# Free-text run labels; they are not referenced by any other visible cell.
version = "take6_v1"
changelog = f"multiple_imputation{n_seeds}_with_augmentation_catlimit{cat_limit}"


In [2]:
def augment_base_dataset(df):
    """Return ``df`` sorted by patient/visit with three per-patient visit features.

    The input frame is not modified; ``sort_values`` produces a new frame that
    the extra columns are added to.

    Added columns (stored as float):
        total_visits   -- max over the patient's rows of ``int(VISCODE / 6 + 1)``.
        last_visit     -- the patient's largest ``VISCODE``.
        is_first_visit -- 1.0 on rows where ``VISCODE == 0``, otherwise 0.0.

    Rows whose ``RID_HASH`` is missing receive NaN in all three columns.

    The features are computed with grouped transforms instead of re-filtering
    the whole frame with a boolean mask for every patient, which made the
    previous implementation quadratic in practice.
    """
    df = df.sort_values(["RID_HASH", "VISCODE"])
    patient_ids = df["RID_HASH"]

    # Truncate per row first, then take the per-patient maximum, so the result
    # is exactly the value the per-patient loop used to compute.
    visit_slot = (df["VISCODE"] / 6 + 1).astype(int)

    df["total_visits"] = visit_slot.groupby(patient_ids).transform("max").astype(float)
    df["last_visit"] = df["VISCODE"].groupby(patient_ids).transform("max").astype(float)
    df["is_first_visit"] = (
        (df["VISCODE"] == 0).astype(float).where(patient_ids.notna())
    )

    return df

In [3]:
# Load the development set and add the visit features from `augment_base_dataset`.
dev_set = pd.read_csv(data_dir / "dev_set.csv")
dev_set = augment_base_dataset(dev_set)

# Measurement columns rescaled with a MinMaxScaler fitted on dev_set only.
scaled_cols = [
    "MMSE",
    "ADAS13",
    "Ventricles",
    "Hippocampus",
    "WholeBrain",
    "Entorhinal",
    "Fusiform",
    "MidTemp",
]

# The same fitted `scaler` is reused for every other dataset loaded below, so
# all frames share dev_set's min/max mapping.
scaler = MinMaxScaler().fit(dev_set[scaled_cols])
dev_set[scaled_cols] = scaler.transform(dev_set[scaled_cols])
dev_set

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit,is_first_visit
2163,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,0,79.1,0,20,1.0,1.0,0.5,0.923077,0.164384,0.071871,0.548646,0.376516,0.464021,0.194906,0.400709,2.0,6.0,1.0
154,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,6,79.6,0,20,1.0,1.0,1.5,0.923077,0.237397,0.071956,0.548307,0.366398,0.403880,0.193367,0.397291,2.0,6.0,0.0
1385,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,0,72.9,1,12,1.0,1.0,1.0,1.000000,0.123288,0.142655,0.525169,0.235599,0.513404,0.356253,0.294774,11.0,60.0,1.0
2698,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,6,73.4,1,12,1.0,1.0,1.0,1.000000,0.164384,0.144729,0.549210,0.230361,0.435097,0.322395,0.294175,11.0,60.0,0.0
2291,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,12,73.9,1,12,1.0,1.0,1.0,0.961538,0.109589,0.155550,0.527878,0.215944,0.487831,0.342600,0.277552,11.0,60.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,60,79.8,1,19,1.0,0.0,3.0,0.923077,0.223699,0.170895,0.357020,0.321346,0.310935,0.399047,0.461476,18.0,102.0,0.0
2646,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,102,83.3,1,19,1.0,0.0,3.0,0.846154,0.168904,0.178231,0.352043,0.309095,0.256790,0.372685,0.416478,18.0,102.0,0.0
1962,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,0,72.1,0,12,1.0,0.0,0.5,0.884615,0.150685,0.416382,0.602438,0.636654,0.610229,0.743037,0.624631,5.0,24.0,1.0
122,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,12,73.1,0,12,1.0,0.0,1.0,0.961538,0.155205,0.398451,0.608521,0.634650,0.617108,0.729087,0.638477,5.0,24.0,0.0


In [4]:
# dev_1: load, add visit features, and scale with the scaler fitted on dev_set.
# The rendered output shows this frame contains missing values.
dev_1 = pd.read_csv(data_dir / "dev_1.csv")
dev_1 = augment_base_dataset(dev_1)
dev_1[scaled_cols] = scaler.transform(dev_1[scaled_cols])

dev_1

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit,is_first_visit
2163,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,0,,0.0,20.0,1.0,1.0,0.5,0.923077,0.164384,,,0.376516,,,,2.0,6.0,1.0
154,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,6,79.6,0.0,20.0,1.0,1.0,1.5,0.923077,0.237397,0.071956,0.548307,0.366398,0.403880,0.193367,0.397291,2.0,6.0,0.0
1385,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,0,,1.0,12.0,,1.0,,,,,0.525169,0.235599,0.513404,0.356253,0.294774,11.0,60.0,1.0
2698,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,6,,1.0,12.0,,1.0,,,,,0.549210,0.230361,0.435097,0.322395,0.294175,11.0,60.0,0.0
2291,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,12,,1.0,12.0,,1.0,,,,,0.527878,0.215944,0.487831,0.342600,0.277552,11.0,60.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,60,79.8,1.0,19.0,,0.0,,,,0.170895,,0.321346,,,,18.0,102.0,0.0
2646,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,102,83.3,1.0,19.0,,0.0,,,,0.178231,,0.309095,,,,18.0,102.0,0.0
1962,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,0,72.1,,12.0,1.0,0.0,0.5,0.884615,0.150685,0.416382,0.602438,,0.610229,0.743037,0.624631,5.0,24.0,1.0
122,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,12,73.1,,12.0,1.0,0.0,1.0,0.961538,0.155205,0.398451,0.608521,,0.617108,0.729087,0.638477,5.0,24.0,0.0


In [5]:
# dev_2: load, add visit features, and scale with the scaler fitted on dev_set.
# The rendered output shows this frame contains missing values.
dev_2 = pd.read_csv(data_dir / "dev_2.csv")
dev_2 = augment_base_dataset(dev_2)
dev_2[scaled_cols] = scaler.transform(dev_2[scaled_cols])

dev_2

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit,is_first_visit
2163,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,0,79.1,0.0,20.0,1.0,1.0,0.5,0.923077,0.164384,0.071871,0.548646,0.376516,0.464021,0.194906,0.400709,2.0,6.0,1.0
154,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,6,79.6,,,,1.0,,,,0.071956,0.548307,,0.403880,0.193367,0.397291,2.0,6.0,0.0
1385,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,0,72.9,,12.0,1.0,1.0,1.0,1.000000,0.123288,0.142655,0.525169,,0.513404,0.356253,0.294774,11.0,60.0,1.0
2698,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,6,,,12.0,1.0,1.0,1.0,1.000000,0.164384,,0.549210,,0.435097,0.322395,0.294175,11.0,60.0,0.0
2291,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,12,,,12.0,1.0,1.0,1.0,0.961538,0.109589,,,,,,,11.0,60.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,60,,,19.0,1.0,0.0,3.0,0.923077,0.223699,,0.357020,,0.310935,0.399047,0.461476,18.0,102.0,0.0
2646,ff59785f0d6b12fc51a07f09bb3a02790e54d04bb0803b...,102,,,19.0,1.0,0.0,3.0,0.846154,0.168904,,0.352043,,0.256790,0.372685,0.416478,18.0,102.0,0.0
1962,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,0,72.1,,12.0,,0.0,,,,0.416382,0.602438,,0.610229,0.743037,0.624631,5.0,24.0,1.0
122,ff98c50c3e97b776ab61db883cf1c8fd5a6d304d7165c8...,12,,,12.0,,0.0,,,,,,,,,,5.0,24.0,0.0


In [6]:
# Load the sample submission and show one row to inspect the expected format.
submission = pd.read_csv(data_dir / "sample_submission.csv")

submission.values[1]

array(['6b6a7136f42a8dbd469a201b88e2abb54a93667822761357db2f6d620da6af8a_0_Ventricles_test_A',
       40613.0818580834], dtype=object)

In [7]:
# test_A: load, add visit features, and scale with the scaler fitted on dev_set.
test_A = pd.read_csv(data_dir / "test_A.csv")
test_A = augment_base_dataset(test_A)
test_A[scaled_cols] = scaler.transform(test_A[scaled_cols])

test_A

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit,is_first_visit
247,00d5e0050fbd3b6b610f6673347232eb0862df77b5b7a8...,0,,,16.0,1.0,0.0,0.5,0.961538,0.219178,,,,,,,1.0,0.0,1.0
819,013c6f92763546c7ad9c0831f023886c15f05e7332aa0c...,0,72.5,1.0,12.0,,1.0,,,,0.057498,0.612302,0.423268,0.291182,0.433004,0.329131,3.0,12.0,1.0
276,013c6f92763546c7ad9c0831f023886c15f05e7332aa0c...,6,73.0,1.0,12.0,,1.0,,,,0.067972,,0.399942,,,,3.0,12.0,0.0
350,013c6f92763546c7ad9c0831f023886c15f05e7332aa0c...,12,73.5,1.0,12.0,1.0,1.0,2.0,0.769231,0.365342,0.077516,,0.415324,,,,3.0,12.0,0.0
1268,024efbff9265302acd00190e57ee08ba1fe1b90f561f79...,0,,0.0,14.0,1.0,1.0,2.0,1.000000,0.164384,,,0.515223,,,,18.0,102.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841,ff2966461950ba81280a0189ed2d504a8bd503d9f6b078...,0,,,18.0,1.0,1.0,1.5,0.807692,0.150685,,,,,,,9.0,48.0,1.0
330,ff2966461950ba81280a0189ed2d504a8bd503d9f6b078...,6,,,18.0,1.0,1.0,1.5,0.769231,0.095890,,,,,,,9.0,48.0,0.0
939,ff2966461950ba81280a0189ed2d504a8bd503d9f6b078...,24,,,18.0,1.0,1.0,1.5,0.769231,0.150685,,,,,,,9.0,48.0,0.0
119,ff2966461950ba81280a0189ed2d504a8bd503d9f6b078...,48,70.9,,18.0,1.0,1.0,2.5,0.807692,0.246575,0.307697,0.420993,,0.392416,0.577719,0.403872,9.0,48.0,0.0


In [8]:
# test_B: load, add visit features, and scale with the scaler fitted on dev_set.
test_B = pd.read_csv(data_dir / "test_B.csv")
test_B = augment_base_dataset(test_B)
test_B[scaled_cols] = scaler.transform(test_B[scaled_cols])

test_B

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,total_visits,last_visit,is_first_visit
1181,001854e92967164311f3acd5a58be9790f28ab3968bbbc...,0,71.4,,15.0,0.0,2.0,0.0,0.961538,0.077671,0.085164,0.638939,,0.608113,0.424862,0.523781,7.0,36.0,1.0
1426,001854e92967164311f3acd5a58be9790f28ab3968bbbc...,36,74.4,,15.0,0.0,2.0,0.0,1.000000,0.027397,0.089750,,,,,,7.0,36.0,0.0
1201,0059bc7849aea9522b408fa0ddc60276a36cae00206b87...,0,,0.0,,1.0,0.0,0.5,0.846154,0.196301,,0.345711,0.286043,0.312698,0.276821,0.248579,5.0,24.0,1.0
757,0059bc7849aea9522b408fa0ddc60276a36cae00206b87...,6,,0.0,,1.0,0.0,1.0,1.000000,0.283151,,0.345147,0.278219,0.378307,0.289480,0.253793,5.0,24.0,0.0
763,0059bc7849aea9522b408fa0ddc60276a36cae00206b87...,12,,0.0,,1.0,0.0,2.5,0.807692,0.168904,,0.329233,0.253372,0.352028,0.259842,0.222042,5.0,24.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1423,ff4eb5a64e2b89861d5dea81190669893070b227f3a335...,0,,,18.0,1.0,1.0,1.5,0.884615,0.114110,,0.502370,,0.394356,0.397160,0.531003,15.0,84.0,1.0
558,ff4eb5a64e2b89861d5dea81190669893070b227f3a335...,12,,,18.0,1.0,1.0,1.5,0.923077,0.242055,,0.519639,,0.294356,0.416522,0.545575,15.0,84.0,0.0
70,ff4eb5a64e2b89861d5dea81190669893070b227f3a335...,84,,0.0,18.0,1.0,1.0,1.5,1.000000,0.178082,,0.432054,0.483387,0.363316,0.468451,0.508440,15.0,84.0,0.0
480,ffa86109ba8684f31325842d0ff26568e105f0f63b366a...,0,66.3,,13.0,0.0,0.0,0.0,0.923077,0.118767,0.177669,,,,,,5.0,24.0,1.0


In [9]:
# Number of missing values per column in test_A.
test_A.isna().sum()

RID_HASH            0
VISCODE             0
AGE               612
PTGENDER_num      626
PTEDUCAT           65
DX_num            428
APOE4              49
CDRSB             428
MMSE              428
ADAS13            428
Ventricles        612
Hippocampus       668
WholeBrain        626
Entorhinal        668
Fusiform          668
MidTemp           668
total_visits        0
last_visit          0
is_first_visit      0
dtype: int64

In [10]:
# Column names of test_A after augmentation.
test_A.columns

Index(['RID_HASH', 'VISCODE', 'AGE', 'PTGENDER_num', 'PTEDUCAT', 'DX_num',
       'APOE4', 'CDRSB', 'MMSE', 'ADAS13', 'Ventricles', 'Hippocampus',
       'WholeBrain', 'Entorhinal', 'Fusiform', 'MidTemp', 'total_visits',
       'last_visit', 'is_first_visit'],
      dtype='object')

In [11]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
from hyperimpute.plugins.prediction import Classifiers, Regression
from hyperimpute.utils.tester import evaluate_regression, evaluate_estimator


# Feature columns for the longitudinal models: every dev_set column except the
# patient identifier. The helpers below read this as a module-level global.
train_cols = list(dev_set.drop(columns=["RID_HASH"]).columns)

# Columns for which longitudinal imputers are trained.
eval_cols = [
    "DX_num",
    "CDRSB",
    "MMSE",
    "ADAS13",
    "Ventricles",
    "Hippocampus",
    "WholeBrain",
    "Entorhinal",
    "Fusiform",
    "MidTemp",
]

def dataframe_hash(df: pd.DataFrame) -> str:
    """Return a decimal string fingerprint of ``df`` for use in cache file names.

    The fingerprint is the absolute value of the sum of the per-row hashes
    produced by ``pd.util.hash_pandas_object``.
    """
    row_hashes = pd.util.hash_pandas_object(df)
    fingerprint = abs(row_hashes.sum())
    return str(fingerprint)

def prepare_temporal_data(data, target_col: str, direction: str):
    """Build (features, labels) for predicting ``target_col`` from the current and adjacent visit.

    For every patient (rows grouped by ``RID_HASH`` and ordered by ``VISCODE``)
    the visits are walked in chronological order when ``direction == "forward"``
    and in reversed order for any other value. Each visit yields one sample:

    * features: the visit's ``train_cols`` values without ``target_col``, plus
      one ``prev_<col>`` column per entry of ``train_cols`` holding the values of
      the visit processed just before it (all zeros for the first visit walked);
    * label: the visit's own ``target_col`` value.

    Relies on the module-level ``train_cols`` list.

    Returns:
        tuple: ``(features, labels)`` where ``features`` is a DataFrame cast to
        float and ``labels`` is a plain list in the same row order. Rows are not
        filtered, so a label is NaN wherever ``target_col`` is missing in ``data``.
    """
    target_train_data = []
    target_train_labels = []

    for item in data.groupby("RID_HASH"):
        # print(item[0])
        local = item[1]
        local = local.sort_values(["RID_HASH", "VISCODE"])

        # NOTE(review): `rid` is never used below.
        rid = local["RID_HASH"]

        prev_cols = [f"prev_{col}" for col in train_cols]
        # Placeholder "previous visit" for the first row walked.
        prev_row = np.zeros(len(prev_cols))

        if direction == "forward":
            rows = local.iterrows()
        else:
            rows = local.iloc[::-1].iterrows()

        for idx, row in rows:
            target_val = row[target_col]
            tmp_row = row[train_cols].copy()
            # One-row frame of the current visit; the target itself is removed
            # from the features (its `prev_` counterpart is still added below).
            src_data = tmp_row.to_frame().T.drop(columns=[target_col])

            # NOTE(review): after the first iteration `prev_row` is a Series
            # labelled with `train_cols`, not `prev_cols`; this depends on how
            # pandas assigns a Series to several new columns -- confirm.
            src_data[prev_cols] = prev_row

            prev_row = tmp_row

            target_train_data.append(src_data)
            target_train_labels.append(target_val)

    target_train_data = pd.concat(target_train_data, ignore_index=True).astype(float)

    return target_train_data, target_train_labels


def evaluate_target(data, target_col: str, direction: str):
    """Benchmark candidate models for ``target_col`` and fit each on all samples.

    Samples come from ``prepare_temporal_data``. If the labels have fewer than
    ``cat_limit`` distinct values the target is treated as categorical and
    "catboost"/"xgboost" classifiers are scored by AUROC on label-encoded
    targets; otherwise "catboost_regressor"/"xgboost_regressor" are scored by R2.
    (``prepare_longitudinal_imputers`` uses ``> cat_limit`` for its split, so
    the two disagree when a target has exactly ``cat_limit`` distinct values.)

    Each model is evaluated exactly once and both the formatted and the raw
    score are read from that single result; previously every model was
    evaluated twice, once per key.

    Returns:
        dict with three sub-dicts keyed by model name:
            "str"    -- formatted score taken from the evaluation's "str" entry,
            "raw"    -- first element of the evaluation's "clf" entry,
            "models" -- a fresh model of that type fitted on all samples.
    """
    train_data, labels = prepare_temporal_data(data, target_col, direction)
    assert target_col not in train_data.columns

    results = {
        "raw": {},
        "str": {},
        "models": {},
    }
    if len(np.unique(labels)) < cat_limit:
        # The encoding does not depend on the model, so compute it once.
        encoded_labels = LabelEncoder().fit_transform(labels)

        for src_model in ["catboost", "xgboost"]:
            model = Classifiers().get(src_model)
            evaluation = evaluate_estimator(
                model, train_data, pd.Series(encoded_labels)
            )

            results["str"][src_model] = evaluation["str"]["aucroc"]
            results["raw"][src_model] = evaluation["clf"]["aucroc"][0]
            results["models"][src_model] = (
                Classifiers().get(src_model).fit(train_data, pd.Series(encoded_labels))
            )

    else:
        for src_model in ["catboost_regressor", "xgboost_regressor"]:
            model = Regression().get(src_model)
            evaluation = evaluate_regression(model, train_data, labels)

            results["str"][src_model] = evaluation["str"]["r2"]
            results["raw"][src_model] = evaluation["clf"]["r2"][0]
            results["models"][src_model] = (
                Regression().get(src_model).fit(train_data, labels)
            )

    return results


def prepare_longitudinal_imputers(data, columns):
    """Fit one CatBoost model per (direction, column) on temporal samples.

    For each direction ("forward", then "reverse") and each target column the
    samples from ``prepare_temporal_data`` are used to fit either a
    "catboost_regressor" (more than ``cat_limit`` distinct labels) or a
    "catboost" classifier (otherwise).

    Returns:
        dict: ``{direction: {column: fitted_model}}``.
    """
    imputers = {}

    for direction in ("forward", "reverse"):
        fitted_by_column = {}
        imputers[direction] = fitted_by_column

        for target_col in columns:
            features, targets = prepare_temporal_data(data, target_col, direction)
            n_distinct = len(np.unique(targets))
            print("train", target_col, direction, n_distinct)

            if n_distinct <= cat_limit:
                estimator = Classifiers().get("catboost")
            else:
                estimator = Regression().get("catboost_regressor")

            estimator.fit(features, targets)
            fitted_by_column[target_col] = estimator

    return imputers


def prepare_longitudinal_imputers_v2(data, columns):
    """Pick, per (direction, column), the benchmarked model with the best raw score.

    ``evaluate_target`` is run for every direction ("forward", "reverse") and
    target column; the fitted model whose ``"raw"`` score is highest is kept.
    Ties go to the model listed first.

    The selection is a true arg-max. The earlier version started from a best
    score of -1, so a column whose candidates all scored -1 or lower (possible
    for R2) ended up with ``None`` as its imputer. NaN scores are ranked below
    every real score; ``None`` is stored only when no model was benchmarked.

    Returns:
        dict: ``{direction: {column: fitted_model_or_None}}``.
    """

    def _rank(score):
        # NaN compares unequal to itself; push it below any real score.
        return score if score == score else float("-inf")

    imputers = {}

    for direction in ["forward", "reverse"]:
        imputers[direction] = {}
        for target_col in columns:
            benchmarks = evaluate_target(data, target_col, direction=direction)
            raw_scores = benchmarks["raw"]

            best_name = max(
                raw_scores,
                key=lambda name: _rank(raw_scores[name]),
                default=None,
            )
            best_mod = None if best_name is None else benchmarks["models"][best_name]

            imputers[direction][target_col] = best_mod

    return imputers

In [12]:
# for col in eval_cols:
#    score = evaluate_target(dev_set, col, direction = "forward")
#    print(col, score)
# DX_num {'catboost': '0.9874 +/- 0.0012', 'xgboost': '0.9867 +/- 0.0006'}
# CDRSB {'catboost_regressor': '0.8143 +/- 0.0132', 'xgboost_regressor': '0.7971 +/- 0.0191'}
# MMSE {'catboost_regressor': '0.7303 +/- 0.014', 'xgboost_regressor': '0.7089 +/- 0.0156'}
# ADAS13 {'catboost_regressor': '0.7975 +/- 0.0018', 'xgboost_regressor': '0.7955 +/- 0.0018'}
# Ventricles {'catboost_regressor': '0.7596 +/- 0.0226', 'xgboost_regressor': '0.764 +/- 0.0218'}
# Hippocampus {'catboost_regressor': '0.8483 +/- 0.0073', 'xgboost_regressor': '0.8571 +/- 0.0058'}
# WholeBrain {'catboost_regressor': '0.8574 +/- 0.0122', 'xgboost_regressor': '0.8691 +/- 0.0155'}
# Entorhinal {'catboost_regressor': '0.6859 +/- 0.0071', 'xgboost_regressor': '0.6648 +/- 0.0059'}
# Fusiform {'catboost_regressor': '0.7779 +/- 0.0174', 'xgboost_regressor': '0.7673 +/- 0.0212'}
# MidTemp {'catboost_regressor': '0.8207 +/- 0.0075', 'xgboost_regressor': '0.8157 +/- 0.0121'}

In [13]:
# for col in eval_cols:
#    score = evaluate_target(dev_set, col, direction = "reverse")
#    print(col, score)

# DX_num {'catboost': '0.9824 +/- 0.0031', 'xgboost': '0.9817 +/- 0.0023'}
# CDRSB {'catboost_regressor': '0.788 +/- 0.007', 'xgboost_regressor': '0.7876 +/- 0.0024'}
# MMSE {'catboost_regressor': '0.7291 +/- 0.0272', 'xgboost_regressor': '0.6992 +/- 0.0299'}
# ADAS13 {'catboost_regressor': '0.8016 +/- 0.0142', 'xgboost_regressor': '0.7944 +/- 0.0149'}
# Ventricles {'catboost_regressor': '0.7043 +/- 0.0233', 'xgboost_regressor': '0.6825 +/- 0.0351'}
# Hippocampus {'catboost_regressor': '0.8511 +/- 0.0087', 'xgboost_regressor': '0.8515 +/- 0.0066'}
# WholeBrain {'catboost_regressor': '0.8595 +/- 0.016', 'xgboost_regressor': '0.8658 +/- 0.0184'}
# Entorhinal {'catboost_regressor': '0.6898 +/- 0.0078', 'xgboost_regressor': '0.6646 +/- 0.0054'}
# Fusiform {'catboost_regressor': '0.7785 +/- 0.0197', 'xgboost_regressor': '0.7802 +/- 0.0145'}
# MidTemp {'catboost_regressor': '0.8199 +/- 0.0094', 'xgboost_regressor': '0.8202 +/- 0.0139'}

In [14]:
# Train the longitudinal imputers once and cache them in the workspace.
# The cache file name encodes `cat_limit` and a hash of dev_set only; changing
# `eval_cols` or the training code does not invalidate an existing cache file.
dev_set_id = dataframe_hash(dev_set)

imputers_bkp_file = workspace / f"longitudinal_imputers_scaled_cat{cat_limit}_{dev_set_id}.bkp"

if imputers_bkp_file.exists():
    # Reuse the models saved by a previous run.
    longitudinal_imputers = load_model_from_file(imputers_bkp_file)
else:
    # `longitudinal_imputers` is read as a global by `impute_longitudinal`.
    longitudinal_imputers = prepare_longitudinal_imputers(dev_set, eval_cols)
    save_model_to_file(imputers_bkp_file, longitudinal_imputers)

train DX_num forward 3
train CDRSB forward 25
train MMSE forward 26
train ADAS13 forward 175
train Ventricles forward 4029
train Hippocampus forward 3129
train WholeBrain forward 3986
train Entorhinal forward 2353
train Fusiform forward 3333
train MidTemp forward 3446
train DX_num reverse 3
train CDRSB reverse 25
train MMSE reverse 26
train ADAS13 reverse 175
train Ventricles reverse 4029
train Hippocampus reverse 3129
train WholeBrain reverse 3986
train Entorhinal reverse 2353
train Fusiform reverse 3333
train MidTemp reverse 3446


## Preprocess data

In [17]:
from hyperimpute.plugins.imputers import Imputers

# Rule used by `prepare_age`: a VISCODE difference of 6 * x corresponds to an
# AGE difference of 0.5 * x.

# Columns expected to hold a single value per patient; `prepare_consts` copies a
# patient's observed value into that patient's missing entries.
const_by_patient = ["PTGENDER_num", "PTEDUCAT", "APOE4"]


def dataframe_hash(df: pd.DataFrame) -> str:
    """Return a decimal string fingerprint of ``df`` used to name cache files.

    Computed as the absolute value of the summed per-row hashes from
    ``pd.util.hash_pandas_object``.
    """
    hash_total = pd.util.hash_pandas_object(df).sum()
    return str(abs(hash_total))


def normalize(test_data):
    """Post-processing hook applied to imputed frames; currently the identity.

    A disabled experiment that snapped CDRSB to multiples of 0.5 (keeping NaN)
    is preserved in the comment below for reference.
    """
    # Disabled CDRSB rounding:
    #   factor = test_data["CDRSB"] / 0.5
    #   factor = factor.fillna(-1)
    #   factor = factor.round(0).astype(int)
    #   factor = factor.replace(-1, np.nan)
    #   test_data["CDRSB"] = factor * 0.5
    #   return test_data
    return test_data


def prepare_consts(train_data, test_data):
    """Fill per-patient constant columns from the patient's own observed value.

    For every patient in ``test_data`` and every column in ``const_by_patient``,
    missing entries are replaced with the first observed value the patient has
    in that column. Patients whose column is entirely missing, or already holds
    a single value, are left untouched.

    Args:
        train_data: unused; kept so all preparation steps share one signature.
        test_data: frame with ``RID_HASH``, ``VISCODE`` and the constant columns.

    Returns:
        A sorted (``RID_HASH``, ``VISCODE``) copy of ``test_data`` with the
        constants filled in. The input frame is not modified.

    Raises:
        AssertionError: if a patient has conflicting observed values in a
            constant column (the message is the column name).
    """
    test_data = test_data.copy().sort_values(["RID_HASH", "VISCODE"])

    for rid, local in test_data.groupby("RID_HASH"):
        for col in const_by_patient:
            values = local[col]
            if len(values.unique()) == 1:
                # Either all missing or already constant: nothing to do.
                continue

            val = values.dropna().unique()[0]

            # Validate on a derived Series rather than writing into the
            # groupby slice, which is a copy and triggers chained-assignment
            # warnings.
            filled = values.fillna(val)
            assert len(filled.unique()) == 1, col

            patient_rows = test_data["RID_HASH"] == rid
            test_data.loc[patient_rows, col] = test_data.loc[patient_rows, col].fillna(val)

    return test_data


def prepare_age(train_data, test_data):
    """Fill missing AGE values by extrapolating from a patient's known ages.

    Works on sorted copies; the caller's frames are not modified. Patients with
    no missing AGE, or with no AGE at all, are skipped. For the others two
    passes are made over the patient's visits:

    * forward (ascending VISCODE): once an age is known, each later visit gets
      ``prev_age + (viscode - prev_viscode) / 6 * 0.5``;
    * reverse (descending VISCODE): each earlier visit gets
      ``prev_age - (prev_viscode - viscode) / 6 * 0.5``.

    Only missing entries are written (``fillna``); observed ages are never
    overwritten. ``train_data`` is copied and sorted but otherwise unused.

    Returns:
        The sorted copy of ``test_data`` with AGE filled where it could be derived.
    """
    test_data = test_data.copy()
    train_data = train_data.copy()

    train_data = train_data.sort_values(["RID_HASH", "VISCODE"])
    test_data = test_data.sort_values(["RID_HASH", "VISCODE"])

    col = "AGE"

    for rid in test_data["RID_HASH"].unique():
        # Snapshot of the patient's rows; both passes read the ORIGINAL ages
        # from this snapshot, not the values written into `test_data`.
        local = test_data[test_data["RID_HASH"] == rid]

        # fill age
        ages = local["AGE"]
        if ages.isna().sum() == 0:
            continue

        if ages.isna().sum() == len(ages):
            continue

        # forward impute age
        prev_viscode = 0
        prev_age = 0  # 0 doubles as "no age known yet"
        for idx, row in local.iterrows():
            current_viscode = row["VISCODE"]
            local_idx = (test_data["VISCODE"] == current_viscode) & (
                test_data["RID_HASH"] == rid
            )
            # `x == x` is False only for NaN.
            if prev_age > 0 and prev_age == prev_age:
                pred_age = (current_viscode - prev_viscode) / 6 * 0.5 + prev_age
            else:
                pred_age = row[col]

            if pred_age == pred_age:
                # print("forward imputed", pred_age, current_viscode)
                test_data.loc[local_idx, col] = test_data.loc[local_idx][col].fillna(
                    pred_age
                )

            prev_viscode = row["VISCODE"]
            # The extrapolated value (not the row's own age) is carried forward.
            prev_age = pred_age

        # reverse impute age
        prev_viscode = 0
        prev_age = 0
        for idx, row in local.iloc[::-1].iterrows():
            current_viscode = row["VISCODE"]
            local_idx = (test_data["VISCODE"] == current_viscode) & (
                test_data["RID_HASH"] == rid
            )

            if prev_age > 0 and prev_age == prev_age:
                pred_age = prev_age - (prev_viscode - current_viscode) / 6 * 0.5
            else:
                pred_age = row[col]

            if pred_age == pred_age:
                # print("reversed imputed", pred_age, current_viscode)
                test_data.loc[local_idx, col] = test_data.loc[local_idx][col].fillna(
                    pred_age
                )

            prev_viscode = row["VISCODE"]
            prev_age = pred_age

        # print(test_data[(test_data["RID_HASH"] == rid)][["VISCODE", "AGE"]])
    return test_data


def impute_longitudinal(
    train_data,
    test_data,
    n_iter=5,
    eval_cols=[
        "DX_num",
        "CDRSB",
        "MMSE",
        "ADAS13",
        "Ventricles",
        "Hippocampus",
        "WholeBrain",
        "Entorhinal",
        "Fusiform",
        "MidTemp",
    ],
    random_state: int = 0,
):
    """Fill gaps that sit next to an observed value using the longitudinal models.

    One pass: for every patient and every column in ``eval_cols``, each missing
    entry whose previous or next visit has an observed value (in the frame
    passed in) is predicted with the global ``longitudinal_imputers``. The
    model inputs come from a fully imputed helper frame produced by
    ``intermediary_imputation``. When both neighbours are observed, the reverse
    prediction is written first and then overwritten by the forward one.

    Gaps with no observed neighbour are left as NaN, so callers repeat this
    function until the NaN count stops changing.

    Relies on the module-level ``train_cols`` and ``longitudinal_imputers``.
    ``n_iter`` is not used. ``eval_cols`` has a list default; it is only read.

    Returns:
        ``normalize`` applied to a sorted copy of ``test_data`` with the
        predictable gaps filled.
    """
    test_data = test_data.copy()
    train_data = train_data.copy()

    # Fully imputed version of test_data, used only as model input.
    imputed_test_data = intermediary_imputation(
            train_data, test_data, eval_cols=eval_cols, random_state = random_state,
        )

    # All three frames are put in the same (RID_HASH, VISCODE) order so that
    # positional lookups into the imputed frame line up with the raw rows.
    train_data = train_data.sort_values(["RID_HASH", "VISCODE"])
    test_data = test_data.sort_values(["RID_HASH", "VISCODE"])
    imputed_test_data = imputed_test_data.sort_values(["RID_HASH", "VISCODE"])

    prev_cols = [f"prev_{col}" for col in train_cols]

    for rid in test_data["RID_HASH"].unique():
        patient = test_data[test_data["RID_HASH"] == rid]
        patient_imputed = imputed_test_data[imputed_test_data["RID_HASH"] == rid]

        # Raw visit rows padded with an all-zero row at each end so that
        # index - 1 / index + 1 are always valid. The padding holds zeros, not
        # NaN, hence the explicit position guards further down.
        prediction_rows = [pd.Series(np.zeros(len(prev_cols)), index=train_cols)]
        for ridx, row in patient.iterrows():
            prediction_rows.append(row[train_cols])
        prediction_rows.append(pd.Series(np.zeros(len(prev_cols)), index=train_cols))

        for col in eval_cols:
            if patient[col].isna().sum() == 0:
                continue

            for ridx, row in enumerate(prediction_rows[1:-1]):
                real_idx = ridx + 1
                # `x == x` is False only for NaN: skip observed entries.
                if row[col] == row[col]:
                    continue
                current_viscode = row["VISCODE"]
                local_idx = (test_data["VISCODE"] == current_viscode) & (
                    test_data["RID_HASH"] == rid
                )

                prev_col_val = prediction_rows[real_idx - 1][col]
                next_col_val = prediction_rows[real_idx + 1][col]

                # Reverse model: next visit observed and actually exists.
                if next_col_val == next_col_val and ridx + 1 < len(patient_imputed):
                    # NOTE(review): the features keep `col` itself (the drop is
                    # commented out), whereas prepare_temporal_data removes the
                    # target column before training -- confirm the model
                    # receives the feature layout it was fitted on.
                    eval_data = (
                        patient_imputed.iloc[ridx].to_frame().T[train_cols]
                    )  # .drop(columns = [col]) #row.to_frame().T[train_cols]
                    eval_data[prev_cols] = (
                        patient_imputed.iloc[ridx + 1].to_frame().T[train_cols].values
                    )
                    # eval_data = eval_data.astype(float)

                    assert eval_data.isna().sum().sum() == 0
                    # The helper imputation must have kept the observed value.
                    assert eval_data[f"prev_{col}"].values[0] == next_col_val

                    imputer = longitudinal_imputers["reverse"][col]
                    imputed_val = imputer.predict(eval_data).values.squeeze()

                    test_data.loc[local_idx, col] = imputed_val

                # Forward model: previous visit observed and actually exists.
                # Overwrites a reverse prediction made just above.
                if prev_col_val == prev_col_val and ridx > 0:
                    # print("Imputing using the prev value", prev_col_val)
                    eval_data = (
                        patient_imputed.iloc[ridx].to_frame().T[train_cols]
                    )  # .drop(columns = [col])
                    eval_data[prev_cols] = (
                        patient_imputed.iloc[ridx - 1].to_frame().T[train_cols].values
                    )
                    # eval_data = eval_data.astype(float)

                    assert eval_data.isna().sum().sum() == 0
                    assert eval_data[f"prev_{col}"].values[0] == prev_col_val

                    imputer = longitudinal_imputers["forward"][col]
                    imputed_val = imputer.predict(eval_data).values.squeeze()
                    test_data.loc[local_idx, col] = imputed_val

                    continue

    return normalize(test_data)


def intermediary_imputation(train_data, test_data, eval_cols, random_state: int = 0):
    """Quickly impute ``test_data`` with linear/logistic hyperimpute seeds.

    ``train_data`` and ``test_data`` are stacked (index reset), imputed
    together, and the trailing ``len(test_data)`` rows are returned through
    ``normalize``. ``eval_cols`` is accepted but not used.
    """
    imputer = Imputers().get(
        "hyperimpute",
        optimizer="simple",
        classifier_seed=["logistic_regression"],
        regression_seed=["linear_regression"],
        class_threshold=cat_limit,
        random_state=random_state,
    )

    stacked = pd.concat([train_data, test_data], ignore_index=True)
    completed = imputer.fit_transform(stacked)

    # The test rows were appended last, so they are the tail of the result.
    test_rows = completed.tail(len(test_data))
    return normalize(test_rows)


def full_imputation(train_data, test_data, eval_cols, random_state: int = 0):
    """Impute ``test_data`` with boosted-tree hyperimpute seeds.

    Same procedure as the intermediary imputation but with a "catboost"
    classifier seed and "catboost_regressor"/"xgboost_regressor" regression
    seeds. ``train_data`` and ``test_data`` are stacked (index reset), imputed
    together, and the trailing ``len(test_data)`` rows are returned through
    ``normalize``. ``eval_cols`` is accepted but not used.
    """
    imputer = Imputers().get(
        "hyperimpute",
        optimizer="simple",
        classifier_seed=["catboost"],
        regression_seed=["catboost_regressor", "xgboost_regressor"],
        class_threshold=cat_limit,
        random_state=random_state,
    )

    stacked = pd.concat([train_data, test_data], ignore_index=True)
    completed = imputer.fit_transform(stacked)

    # The test rows were appended last, so they are the tail of the result.
    test_rows = completed.tail(len(test_data))
    return normalize(test_rows)

def evaluate_static_imputation(train_data, test_data, static_imputation):
    """Complete each patient's best-observed visit from a static imputation.

    For every patient the visit with the fewest missing values is selected
    (first one on ties) and its missing entries are copied from the row of
    ``static_imputation`` with the same ``RID_HASH`` and ``VISCODE``. All other
    visits are left as they are.

    Returns:
        ``normalize`` applied to a sorted copy of ``test_data``; the input
        frames are not modified.
    """
    sort_keys = ["RID_HASH", "VISCODE"]
    train_data = train_data.copy().sort_values(sort_keys)
    test_data = test_data.copy().sort_values(sort_keys)

    for rid in test_data["RID_HASH"].unique():
        patient = test_data[test_data["RID_HASH"] == rid]

        # Missing-value count per visit, in visit order.
        visit_stats = [
            (row.isna().sum(), row["VISCODE"]) for _, row in patient.iterrows()
        ]
        best_pos = np.argmin([n_missing for n_missing, _ in visit_stats])
        current_viscode = visit_stats[best_pos][1]

        local_idx = (test_data["VISCODE"] == current_viscode) & (
            test_data["RID_HASH"] == rid
        )
        imputed_idx = (static_imputation["VISCODE"] == current_viscode) & (
            static_imputation["RID_HASH"] == rid
        )

        if len(test_data[local_idx]) == 0:
            continue

        for col in test_data.columns:
            current = test_data.loc[local_idx][col].values[0]
            # Only NaN compares unequal to itself.
            if current == current:
                continue
            test_data.loc[local_idx, col] = static_imputation.loc[imputed_idx][
                col
            ].values[0]

    return normalize(test_data)


def impute_data_step(
    train_data,
    test_data,
    random_state: int = 0,
):
    """Run the full imputation pipeline once for a single seed.

    Stages, each announced with a progress line showing the remaining NaN count:
      1. per-patient constants and AGE extrapolation;
      2. longitudinal imputation, repeated until the NaN count stops changing;
      3. static (hyperimpute) imputation applied to each patient's
         best-observed visit;
      4. constants and AGE again;
      5. longitudinal imputation again, until stable.

    Returns:
        ``normalize`` applied to the resulting frame.
    """

    def missing_total(frame):
        return frame.isna().sum().sum()

    def longitudinal_until_stable(frame):
        # Keep the last frame whose successor did not change the NaN count.
        while True:
            candidate = impute_longitudinal(
                train_data, frame, random_state=random_state
            )
            if missing_total(candidate) == missing_total(frame):
                return frame
            frame = candidate

    print("Imputation step using seed", random_state)

    # The id is taken from the untouched input and reused in every log line.
    test_id = dataframe_hash(test_data)

    print(" >>> Evaluate constants", test_id, missing_total(test_data))
    test_data = prepare_age(train_data, prepare_consts(train_data, test_data))

    print(" >>> Evaluate longitudinals", test_id, missing_total(test_data))
    test_data = longitudinal_until_stable(test_data)

    print(
        " >>> Evaluate static imputation",
        test_id,
        missing_total(test_data),
    )
    static_imputation = full_imputation(
        train_data, test_data, test_data.columns, random_state=random_state
    )
    test_data = evaluate_static_imputation(train_data, test_data, static_imputation)

    print(" >>> Evaluate constants take 2", test_id, missing_total(test_data))
    test_data = prepare_age(train_data, prepare_consts(train_data, test_data))

    print(" >>> Evaluate longitudinals take 2", test_id, missing_total(test_data))
    test_data = longitudinal_until_stable(test_data)

    print(" >>> Normalize data", test_id, missing_total(test_data))
    return normalize(test_data)
                                           
                                        
def merge_imputations(train_data, miss_data, imputed_data):
    """Combine several imputed versions of ``miss_data`` into one frame.

    Only columns of ``train_data`` that have missing values in ``miss_data``
    are touched. For those, the whole column is replaced by a row-wise
    aggregate over the imputed versions: the most frequent value (first seen
    wins ties) when ``train_data`` has fewer than ``cat_limit`` distinct values
    in the column, the mean otherwise.

    Values are combined by row position, so every imputed version must have
    its rows in the same order as ``miss_data`` sorted by (RID_HASH, VISCODE).

    Returns:
        A sorted copy of ``miss_data`` with the merged columns.
    """
    sort_keys = ["RID_HASH", "VISCODE"]
    train_data = train_data.sort_values(sort_keys)
    miss_data = miss_data.sort_values(sort_keys)

    merged = miss_data.copy()

    for col in train_data.columns:
        if miss_data[col].isna().sum() == 0:
            continue

        # Shape: (number of imputed versions, number of rows).
        stacked = np.asarray([version[col].values for version in imputed_data])

        is_categorical = len(train_data[col].unique()) < cat_limit
        if not is_categorical:
            merged[col] = stacked.mean(axis=0)
            continue

        winners = []
        for candidates in stacked.T:
            tally = {}
            for value in candidates:
                tally[value] = tally.get(value, 0) + 1
            winners.append(max(tally, key=tally.get))
        merged[col] = winners

    return merged

    
def impute_data(
    train_data,
    test_data,
    seeds: int = n_seeds,
):
    """Impute ``test_data`` once per seed and merge the results.

    Each seed's result is cached as a CSV in ``workspace``; the file name
    contains the hash of ``test_data``, the seed and ``cat_limit``. A cached
    file is loaded instead of recomputing.

    Returns:
        The frame produced by ``merge_imputations`` over all seeds.
    """
    test_id = dataframe_hash(test_data)

    per_seed = []
    for seed in range(seeds):
        bkp_file = workspace / f"multi_imputation_{test_id}_{seed}_catlimit{cat_limit}.csv"
        print("Evaluate", bkp_file)

        if not bkp_file.exists():
            fresh = impute_data_step(
                train_data,
                test_data,
                random_state=seed,
            )
            per_seed.append(fresh)
            fresh.to_csv(bkp_file, index = None)
        else:
            per_seed.append(pd.read_csv(bkp_file))

    return merge_imputations(train_data, test_data, per_seed)

In [18]:
# Impute both masked development sets, using dev_set as the training frame.
# Per-seed results are cached in `workspace`, so a re-run only recomputes
# missing seeds.
dev_1_eval = impute_data(dev_set, dev_1)
dev_2_eval = impute_data(dev_set, dev_2)

Evaluate workspace/multi_imputation_5359550845739047994_0_catlimit10.csv
Imputation step using seed 0
 >>> Evaluate constants 5359550845739047994 22466
 >>> Evaluate longitudinals 5359550845739047994 19844
 >>> Evaluate static imputation 5359550845739047994 9126
 >>> Evaluate constants take 2 5359550845739047994 5492
 >>> Evaluate longitudinals take 2 5359550845739047994 4386
 >>> Normalize data 5359550845739047994 0
Evaluate workspace/multi_imputation_5359550845739047994_1_catlimit10.csv
Imputation step using seed 1
 >>> Evaluate constants 5359550845739047994 22466
 >>> Evaluate longitudinals 5359550845739047994 19844
 >>> Evaluate static imputation 5359550845739047994 9126
 >>> Evaluate constants take 2 5359550845739047994 5492
 >>> Evaluate longitudinals take 2 5359550845739047994 4386
 >>> Normalize data 5359550845739047994 0
Evaluate workspace/multi_imputation_5359550845739047994_2_catlimit10.csv
Imputation step using seed 2
 >>> Evaluate constants 5359550845739047994 22466
 >>> E

In [19]:
from hyperimpute.plugins.imputers import Imputers
from hyperimpute.utils.benchmarks import benchmark_model
from sklearn.preprocessing import LabelEncoder

# Frame handed to the benchmark: dev_set followed by the imputed dev_1 / dev_2.
train_eval_data = pd.concat([dev_set, dev_1_eval, dev_2_eval], ignore_index=True)

# Same stacking with the un-imputed dev_1 / dev_2; only used to derive the masks.
train_eval_data_raw = pd.concat([dev_set, dev_1, dev_2], ignore_index=True)

# Reference frame: dev_set repeated three times.
# NOTE(review): assumes dev_1 and dev_2 are row-aligned copies of dev_set with
# values removed — confirm against where dev_1 / dev_2 are built.
train_gt = pd.concat([dev_set, dev_set, dev_set], ignore_index=True)

# 1 / True where the raw stacked frame has a missing value.
train_mask = train_eval_data_raw.isna().astype(int)
# Boolean variant of the mask; not used further in this cell.
train_mask_bool = train_eval_data_raw.isna()

# Replace the RID_HASH strings with integer codes, using one encoder fitted
# on the reference frame so both frames share the same codes.
le = LabelEncoder().fit(train_gt["RID_HASH"])
train_gt["RID_HASH"] = le.transform(train_gt["RID_HASH"])
train_eval_data["RID_HASH"] = le.transform(train_eval_data["RID_HASH"])

plugin = Imputers().get(
    "hyperimpute",
    optimizer="simple",
    classifier_seed=["catboost"],
    regression_seed=["xgboost_regressor", "catboost_regressor"],
    class_threshold=cat_limit,
)

# NOTE(review): the run is labelled "missforest" although the plugin requested
# above is "hyperimpute" — confirm the label is only cosmetic.
# The returned pair is presumably (RMSE, Wasserstein distance) — verify against
# the hyperimpute benchmark_model documentation.
benchmark_model("missforest", plugin, train_gt, train_eval_data, train_mask)

(1.0483269589332584, 0.5957524749137005)

In [28]:
# no preprocessing: (28256.01247357673, 13772.1483031795)
# missforest: (26900.321450057505, 11422.800036863018)
# missforest + catboost: (26818.913956820044, 11172.534584559302)
# impute first visit by consts + AGE : (22801.31764624424, 4530.5431159416)
# long imputation for DX_NUM: (20140.313132594343, 4911.72658229064)


# scaled impute by first visit: (1.2100147872665876, 0.6437290692806962)
# scaled impute + mean + long imputation DX_NUM : (1.1785409955690953, 0.5420526563648462)
# scaled impute + mean + long imputation DX_NUM + MMSE: (1.1668009083992619, 0.7318445766107088)
# scaled impute + mean + long imputation DX_NUM + ADAS13: (1.1030186731907161, 0.5537261491649679)
# scaled impute + mean + long imputation DX_NUM + ADAS13 + Ventricles: (1.1205398362774392, 0.5763027452010369)
# scaled impute + mean + long imputation DX_NUM + ADAS13 + Hippocampus: (1.2369516583030933, 0.6736382987041747)

# cat40 simple : (1.4568880717344705, 0.9934616930026703)
# cat10 + scaled impute + hyperimpute(catboost) + long imputation full:  (1.0241908722005213, 0.4927460867941309)
# cat10 + scaled impute + long imputation full + normalization:  (1.065308437510163, 0.4953959370803312)

# cat10 + scaled impute + long imputation full + static imputation + normalization:  (1.1640316504107393, 0.5580105554943448)
# cat10 + scaled impute + long imputation full + static imputation:  (1.06, 0.53)

# cat10 + visit cnt + long imputation + linear interm imputation: (0.9921784212883535, 0.44433424863837917)


## Submission data

In [None]:
import numpy as np


def normalize_output(test_data):
    """Snap the clinical score columns onto discrete grids.

    Operates on a copy; the frame passed in is not modified.

    * ``CDRSB``: negative values are raised to 0, then every value is rounded
      to a multiple of 0.5. Missing entries stay missing.
    * ``ADAS13``: rounded to a multiple of 1/3, then to two decimals.
    * ``MMSE``: rounded to a whole number.

    Args:
        test_data: frame containing ``CDRSB``, ``ADAS13`` and ``MMSE`` columns.

    Returns:
        The normalized copy of ``test_data``.
    """
    normalized = test_data.copy()

    half_steps = (normalized["CDRSB"] / 0.5).clip(lower=0)
    # NaN cannot be cast to int, so missing entries are parked on a -1
    # sentinel for the cast and restored afterwards. The clamp above ensures
    # no real value can round to -1.
    half_steps = (
        half_steps.fillna(-1)
        .round(0)
        .astype(int)
        .replace(-1, np.nan)
    )
    normalized["CDRSB"] = half_steps * 0.5

    thirds = (normalized["ADAS13"] * 3).round(0)
    normalized["ADAS13"] = (thirds / 3).round(2)

    normalized["MMSE"] = normalized["MMSE"].round(0)

    return normalized


def dump_results(imputed_data: pd.DataFrame, fpath: str):
    """Write the submission CSV for every cell that is missing in the test sets.

    Walks the module-level ``test_A`` and ``test_B`` frames in index order.
    For each missing cell, the value at the same ``(RID_HASH, VISCODE)`` and
    column is read from ``imputed_data`` (first matching row) and emitted
    under the id ``"{RID_HASH}_{VISCODE}_{column}_{test set name}"``.

    Args:
        imputed_data: frame holding the imputed values; needs ``RID_HASH``,
            ``VISCODE`` and every column that has gaps in the test sets.
        fpath: destination CSV path. Missing parent directories are created.

    Returns:
        DataFrame with one row per missing test cell, using the columns of
        the module-level ``submission`` frame.

    Raises:
        IndexError: no row of ``imputed_data`` matches a test row that has gaps.
        AssertionError: the looked-up value is itself NaN or an empty string.
    """
    rows = []

    for name, data in [
        ("test_A", test_A.sort_index()),
        ("test_B", test_B.sort_index()),
    ]:
        for _, row in data.iterrows():
            rid, viscode = row["RID_HASH"], row["VISCODE"]
            # Resolved lazily and at most once per row; rows without gaps
            # never trigger the (full-frame) lookup.
            matches = None

            for col, val in row.items():
                if val == val:  # only NaN differs from itself
                    continue

                if matches is None:
                    matches = imputed_data[
                        (imputed_data["RID_HASH"] == rid)
                        & (imputed_data["VISCODE"] == viscode)
                    ]
                    if matches.empty:
                        raise IndexError(
                            f"no imputed row for RID_HASH={rid!r}, "
                            f"VISCODE={viscode!r} ({name})"
                        )

                imputed_val = matches[col].values[0]

                assert imputed_val == imputed_val, f"{col} is still missing for {rid}/{viscode}"
                assert imputed_val != "", f"{col} is empty for {rid}/{viscode}"

                rows.append([f"{rid}_{viscode}_{col}_{name}", imputed_val])

    output = pd.DataFrame(rows, columns=submission.columns)
    # The target directory is not guaranteed to exist yet.
    Path(fpath).parent.mkdir(parents=True, exist_ok=True)
    output.to_csv(fpath, index=None)

    return output

def get_submission_data():
    """Impute both test sets and write the normalized submission file.

    Steps: impute ``test_A`` and ``test_B`` against ``dev_set``, stack them
    below ``dev_set``, map ``scaled_cols`` back through
    ``scaler.inverse_transform``, run ``normalize_output`` and hand the
    result to ``dump_results``.

    Returns:
        ``(output_fpath, output_normalized)`` — the CSV path under
        ``results_dir`` and the frame returned by ``dump_results``.
    """
    imputed_parts = [
        impute_data(dev_set, test_set).sort_index()
        for test_set in (test_A, test_B)
    ]

    eval_data = pd.concat([dev_set, *imputed_parts], ignore_index=True)
    # Bring the scaled columns back to their original units before rounding.
    eval_data[scaled_cols] = scaler.inverse_transform(eval_data[scaled_cols])

    output_fpath = results_dir / f"imputation_results_{version}_{changelog}_normalized.csv"

    return output_fpath, dump_results(normalize_output(eval_data), output_fpath)

In [34]:
# Impute both test sets, passing dev_set as the training frame.
test_A_eval = impute_data(dev_set, test_A)
test_B_eval = impute_data(dev_set, test_B)

longitudinal imputation 4562 6161
longitudinal imputation 3497 4562
longitudinal imputation 3015 3497
longitudinal imputation 2881 3015
longitudinal imputation 2820 2881
longitudinal imputation 2778 2820
longitudinal imputation 2770 2778
longitudinal imputation 2766 2770
longitudinal imputation 2766 2766
longitudinal imputation 827 1239
longitudinal imputation 486 827
longitudinal imputation 272 486
longitudinal imputation 120 272
longitudinal imputation 50 120
longitudinal imputation 23 50
longitudinal imputation 8 23
longitudinal imputation 2 8
longitudinal imputation 1 2
longitudinal imputation 0 1
longitudinal imputation 0 0
longitudinal imputation 5320 6925
longitudinal imputation 4139 5320
longitudinal imputation 3527 4139
longitudinal imputation 3365 3527
longitudinal imputation 3284 3365
longitudinal imputation 3237 3284
longitudinal imputation 3226 3237
longitudinal imputation 3221 3226
longitudinal imputation 3221 3221
longitudinal imputation 1121 1579
longitudinal imputation

In [35]:
# Stack dev_set and both imputed test sets into one frame with a fresh 0..n-1 index.
eval_data = pd.concat([dev_set, test_A_eval, test_B_eval], ignore_index=True)


eval_data

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp
0,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,0,79.100000,0.0,20.0,1.0,1.0,0.5,28.0,12.00,16636.000000,7208.000000,9.790100e+05,3672.00000,12661.000000,18165.000000
1,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,6,79.600000,0.0,20.0,1.0,1.0,1.5,28.0,17.33,16649.000000,7205.000000,9.707820e+05,3331.00000,12630.000000,18085.000000
2,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,0,72.900000,1.0,12.0,1.0,1.0,1.0,30.0,9.00,27456.000000,7000.000000,8.644140e+05,3952.00000,15911.000000,15686.000000
3,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,6,73.400000,1.0,12.0,1.0,1.0,1.0,30.0,12.00,27773.000000,7213.000000,8.601540e+05,3508.00000,15229.000000,15672.000000
4,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,12,73.900000,1.0,12.0,1.0,1.0,1.0,29.0,8.00,29427.000000,7024.000000,8.484300e+05,3807.00000,15636.000000,15283.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6894,ff4eb5a64e2b89861d5dea81190669893070b227f3a335...,0,76.101067,0.0,18.0,1.0,1.0,1.5,27.0,8.33,53084.507860,6798.000000,1.064457e+06,3277.00000,16735.000000,21214.000000
6895,ff4eb5a64e2b89861d5dea81190669893070b227f3a335...,12,77.101067,0.0,18.0,1.0,1.0,1.5,28.0,17.67,52990.262062,6951.000000,1.062454e+06,2710.00000,17125.000000,21555.000000
6896,ff4eb5a64e2b89861d5dea81190669893070b227f3a335...,84,83.101067,0.0,18.0,1.0,1.0,1.5,30.0,13.00,55108.796426,6175.000000,1.065920e+06,3101.00000,18171.000000,20686.000000
6897,ffa86109ba8684f31325842d0ff26568e105f0f63b366a...,0,66.300000,1.0,13.0,0.0,0.0,0.0,28.0,8.67,32808.200000,7835.021953,9.652050e+05,4240.24395,15636.551261,20517.297776


In [36]:
from hyperimpute.plugins.imputers import Imputers

# Same hyperimpute configuration as the benchmark cell.
plugin = Imputers().get(
    "hyperimpute",
    optimizer="simple",
    classifier_seed=["catboost"],
    regression_seed=["xgboost_regressor", "catboost_regressor"],
    class_threshold=cat_limit,
)


# Fit and impute on a copy so eval_data itself keeps its gaps.
imputed_X = plugin.fit_transform(eval_data.copy())
# NOTE(review): this assumes scaled_cols in eval_data are still in scaled
# units; the eval_data output displayed above shows raw-scale values
# (e.g. MMSE 28.0) — confirm the cell execution order.
imputed_X[scaled_cols] = scaler.inverse_transform(imputed_X[scaled_cols])

imputed_X

Unnamed: 0,RID_HASH,VISCODE,AGE,PTGENDER_num,PTEDUCAT,DX_num,APOE4,CDRSB,MMSE,ADAS13,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp
0,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,0.0,79.100000,0.0,20.0,1.0,1.0,0.5,28.0,12.00,16636.000000,7208.000000,9.790100e+05,3672.00000,12661.000000,18165.000000
1,001c7955017f905ccf78d55c94e81070a1cca7b1efb5bd...,6.0,79.600000,0.0,20.0,1.0,1.0,1.5,28.0,17.33,16649.000000,7205.000000,9.707820e+05,3331.00000,12630.000000,18085.000000
2,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,0.0,72.900000,1.0,12.0,1.0,1.0,1.0,30.0,9.00,27456.000000,7000.000000,8.644140e+05,3952.00000,15911.000000,15686.000000
3,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,6.0,73.400000,1.0,12.0,1.0,1.0,1.0,30.0,12.00,27773.000000,7213.000000,8.601540e+05,3508.00000,15229.000000,15672.000000
4,00e6fb56250581a8c8b5133f91443dd8c037e3cd8d0ba8...,12.0,73.900000,1.0,12.0,1.0,1.0,1.0,29.0,8.00,29427.000000,7024.000000,8.484300e+05,3807.00000,15636.000000,15283.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6894,ff4eb5a64e2b89861d5dea81190669893070b227f3a335...,0.0,76.101067,0.0,18.0,1.0,1.0,1.5,27.0,8.33,53084.507860,6798.000000,1.064457e+06,3277.00000,16735.000000,21214.000000
6895,ff4eb5a64e2b89861d5dea81190669893070b227f3a335...,12.0,77.101067,0.0,18.0,1.0,1.0,1.5,28.0,17.67,52990.262062,6951.000000,1.062454e+06,2710.00000,17125.000000,21555.000000
6896,ff4eb5a64e2b89861d5dea81190669893070b227f3a335...,84.0,83.101067,0.0,18.0,1.0,1.0,1.5,30.0,13.00,55108.796426,6175.000000,1.065920e+06,3101.00000,18171.000000,20686.000000
6897,ffa86109ba8684f31325842d0ff26568e105f0f63b366a...,0.0,66.300000,1.0,13.0,0.0,0.0,0.0,28.0,8.67,32808.200000,7835.021953,9.652050e+05,4240.24395,15636.551261,20517.297776


In [39]:
import numpy as np

# Module-level list. NOTE(review): a function that appends to this global
# without clearing it carries rows over from one call to the next.
results = []


def normalize_output(test_data):
    """Snap the clinical score columns onto discrete grids.

    Operates on a copy; the frame passed in is not modified.

    * ``CDRSB``: negative values are raised to 0, then every value is rounded
      to a multiple of 0.5. Missing entries stay missing.
    * ``ADAS13``: rounded to a multiple of 1/3, then to two decimals.
    * ``MMSE``: rounded to a whole number.

    Args:
        test_data: frame containing ``CDRSB``, ``ADAS13`` and ``MMSE`` columns.

    Returns:
        The normalized copy of ``test_data``.
    """
    test_data = test_data.copy()

    # Clamp before the sentinel step below: without it a slightly negative
    # score (roughly -0.75 .. -0.25) rounds to -1, collides with the NaN
    # sentinel and is silently turned into a missing value.
    factor = (test_data["CDRSB"] / 0.5).clip(lower=0)

    # NaN cannot be cast to int, so park missing entries on -1 for the cast
    # and restore them afterwards.
    factor = factor.fillna(-1)
    factor = factor.round(0).astype(int)
    factor = factor.replace(-1, np.nan)
    test_data["CDRSB"] = factor * 0.5

    test_data["ADAS13"] = ((test_data["ADAS13"] * 3).round(0) / 3).round(2)
    test_data["MMSE"] = test_data["MMSE"].round(0)

    return test_data


def dump_results(imputed_data: pd.DataFrame, fpath: str):
    """Write the submission CSV for every cell that is missing in the test sets.

    Walks the module-level ``test_A`` and ``test_B`` frames. For each missing
    cell, the value at the same ``(RID_HASH, VISCODE)`` and column is read
    from ``imputed_data`` (first matching row) and emitted under the id
    ``"{RID_HASH}_{VISCODE}_{column}_{test set name}"``.

    Args:
        imputed_data: frame holding the imputed values; needs ``RID_HASH``,
            ``VISCODE`` and every column that has gaps in the test sets.
        fpath: destination CSV path.

    Returns:
        DataFrame with one row per missing test cell, using the columns of
        the module-level ``submission`` frame. Only rows produced by this
        call are included.

    Raises:
        IndexError: no row of ``imputed_data`` matches a test row that has gaps.
    """
    # Collected per call: accumulating into a list shared between calls makes
    # every call after the first re-emit the rows of all earlier calls.
    rows = []

    for name, data in [
        ("test_A", test_A),
        ("test_B", test_B),
    ]:
        for _, row in data.iterrows():
            rid, viscode = row["RID_HASH"], row["VISCODE"]
            for col, val in row.items():
                if val == val:  # only NaN differs from itself
                    continue
                imputed_val = imputed_data[
                    (imputed_data["RID_HASH"] == rid)
                    & (imputed_data["VISCODE"] == viscode)
                ][col].values[0]
                rows.append([f"{rid}_{viscode}_{col}_{name}", imputed_val])

    output = pd.DataFrame(rows, columns=submission.columns)
    output.to_csv(fpath, index=None)

    return output


# NOTE(review): this rebinds the module-level `version` / `changelog` set in
# the first cell; any code that reads them after this cell has run sees the
# values below.
version = "v15"
changelog = "static_imputation_tweaks_cat10"
# Write two files into the current working directory: the imputed values
# as-is, and the same values after normalize_output.
output = dump_results(imputed_X, f"imputation_results_{version}_{changelog}.csv")
output_normalized = dump_results(
    normalize_output(imputed_X),
    f"imputation_results_{version}_{changelog}_normalized.csv",
)

output

Unnamed: 0,Id,Predicted
0,988b6137f4352c01e4b52790505caa0c3ec438f117000a...,58.2
1,988b6137f4352c01e4b52790505caa0c3ec438f117000a...,0.0
2,988b6137f4352c01e4b52790505caa0c3ec438f117000a...,0.0
3,988b6137f4352c01e4b52790505caa0c3ec438f117000a...,0.5
4,988b6137f4352c01e4b52790505caa0c3ec438f117000a...,29.0
...,...,...
15021,8b33cc9dd06fc18f130e185fdf1e6d657dbc80add9ff6e...,0.0
15022,8b33cc9dd06fc18f130e185fdf1e6d657dbc80add9ff6e...,0.0
15023,8b33cc9dd06fc18f130e185fdf1e6d657dbc80add9ff6e...,0.5
15024,8b33cc9dd06fc18f130e185fdf1e6d657dbc80add9ff6e...,29.0


In [40]:
# Global pandas display option; stays in effect for the rest of the session.
pd.set_option("display.expand_frame_repr", True)

# Last five rows as a raw array so the full, untruncated Id strings are visible.
output.tail(5).values

array([['8b33cc9dd06fc18f130e185fdf1e6d657dbc80add9ff6ea81ced3e328cea9e63_0_DX_num_test_B',
        0.0],
       ['8b33cc9dd06fc18f130e185fdf1e6d657dbc80add9ff6ea81ced3e328cea9e63_0_APOE4_test_B',
        0.0],
       ['8b33cc9dd06fc18f130e185fdf1e6d657dbc80add9ff6ea81ced3e328cea9e63_0_CDRSB_test_B',
        0.5],
       ['8b33cc9dd06fc18f130e185fdf1e6d657dbc80add9ff6ea81ced3e328cea9e63_0_MMSE_test_B',
        29.0],
       ['8b33cc9dd06fc18f130e185fdf1e6d657dbc80add9ff6ea81ced3e328cea9e63_0_ADAS13_test_B',
        17.0]], dtype=object)