In [None]:
import os
import sys

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport

# Add parent folder to syspath to include local util functions
sys.path.insert(0, os.path.abspath('..'))
from utils.plot_utils import plot_hist
from preprocess_utils.preprocessing_utils import apply_yeojohnson

In [None]:
# Locations of the input data and of the generated plots, relative to this notebook.
# Both keep their trailing slash because later cells concatenate file names onto them.
data_dir = "../../data/"
plots_dir = "../../plots/"

# Reload the data set that was already partially processed in an earlier step
investigated_csv = os.path.join(data_dir, "investigated.csv")
df = pd.read_csv(investigated_csv)

In [None]:
# Create a pandas-profiling report of the loaded data.
# minimal=True requests the library's reduced ("minimal") report configuration.
profile = ProfileReport(df, minimal=True)
# Display the report inside the notebook as widgets
profile.to_widgets()

In [None]:
# Save the report to disk as an HTML file inside plots_dir
profile.to_file(plots_dir + "profile_report.html")

In [None]:
# Plot the feature distributions of the data as loaded (before any transformation).
# plot_hist is a project helper from utils.plot_utils; presumably it draws the
# histograms and stores them under plots_dir — verify in the helper.
plot_hist(df, plots_dir=plots_dir)

### Fix IL6 and SORL1: treat zero values as missing (NaN)

In [None]:
# Rows where T1_SORL1 is exactly 0 are overwritten with NaN, i.e. a zero reading is
# handled as a missing value from here on.
df.loc[df["T1_SORL1"] == 0, "T1_SORL1"] = np.nan

In [None]:
# Rows where T1_IL6 is exactly 0 are overwritten with NaN, i.e. a zero reading is
# handled as a missing value from here on.
df.loc[df["T1_IL6"] == 0, "T1_IL6"] = np.nan

## Transform distributions

In [None]:
# Replace df with the output of the project helper apply_yeojohnson
# (preprocess_utils.preprocessing_utils). Presumably this applies a Yeo-Johnson power
# transform to the feature columns — verify in the helper which columns it touches.
df = apply_yeojohnson(df)

In [None]:
# Plot histograms of the transformed data.
# The name argument presumably labels this set of plots — verify in plot_hist.
plot_hist(df, name="Yeojohnsoned", plots_dir=plots_dir)

## Removing outliers

In [None]:
# Select every column whose name contains "MissingRepl", "Unreife" or "Troponin"
# and plot these columns as a separate set of histograms.
is_weirdo = df.columns.str.contains("MissingRepl|Unreife|Troponin")
weirdos = df.loc[:, is_weirdo]
plot_hist(weirdos, name="weirdos", plots_dir=plots_dir)

In [None]:
# Drop the parameters that are excluded from the further analysis.
# The previous filter used the pattern "[^T1_Troponin$T1_NTproBNP$...]", which is a
# negated character class rather than a list of names: it only removed these four
# columns because each of them happens to consist solely of the listed characters,
# and it would equally remove any other column spelled from that character set.
# Naming the columns explicitly removes exactly these four.
dropped_params = ["T1_Troponin", "T1_NTproBNP", "T1_MDA", "T1_Leptin_Lab"]
# errors="ignore": like the old filter, do not fail if a column is already absent
df = df.drop(columns=dropped_params, errors="ignore")
plot_hist(df, name="without dropped params", plots_dir=plots_dir)

In [None]:
def remove_outliers(df, name, quantile=0.99):
    """Clip the upper tail of column `name`, separately for each sex.

    For the rows with ``sex == 0`` and for the rows with ``sex == 1`` the
    `quantile` cut-off of `name` is computed within that group; every value of
    the group that lies above its cut-off is overwritten with the cut-off.
    Rows with any other `sex` value are left untouched. Summary statistics are
    printed before and after clipping.

    Args:
        df: DataFrame with a "sex" column and the column `name`.
        name: Column to clip.
        quantile: Quantile (0..1) used as the upper cut-off within each sex.

    Returns:
        The same `df` object; the clipping is done in place.
    """
    print(name)
    is_male = df["sex"] == 0
    is_female = df["sex"] == 1
    # Both cut-offs are determined before anything is overwritten
    male_cutoff = df.loc[is_male, name].quantile(quantile)
    female_cutoff = df.loc[is_female, name].quantile(quantile)
    print(df[name].describe())
    print(f"Male: {male_cutoff}")
    print(f"Female: {female_cutoff}")
    # Cap every value above the cut-off of its own sex at that cut-off
    for in_group, cutoff in ((is_male, male_cutoff), (is_female, female_cutoff)):
        df.loc[in_group & (df[name] > cutoff), name] = cutoff
    # Report the spread after clipping
    print("Std now: ", df[name].std())
    print()
    return df

In [None]:
# Hand-picked parameters with heavy upper tails.
# NOTE(review): this list is currently not used — the loop below clips *every* column
# of df (including "sex" and the outcome columns). Some entries (e.g. 'T1_NTproBNP')
# were also removed from df in an earlier cell, so iterating over this list directly
# would raise a KeyError. Confirm which behaviour is intended.
fragile_params = [
    'T1_oxLDL',
    'T1_S100A12_plasma',
    'T1_Calprotectinn',
    'T1_KNYAcid',
    'T1_NTproBNP',
    'T1_NTproBNP_MissingRepl',
    'T1_CRP_InclExtrapol',
    'T1_S100A12',
    'T1_ALAT_GPT_U_L',
    'T1_ASAT_GOT_U_L',
    'T1_gammaGTSe',
    "T1_Triglycerides_mmolL",
    'T1_Triglyc_mmolL_Reanalysis',
    'T1_UnreifeGranulozytenabsolut',
    'T1_UnreifeGranulozyten_Percent',
    'T1_IL2_pgml',
    'T1_IL6',
    'T1_Cpeptide_total',
    'T1_Leptin_total',
    'T1_Leptin_SLR_Ratio',
    'T1_SORL1',
    'T1_IL18_pgml',
    'Final_T1_TP42_40',
    "T1_FinalTG_mmolL",
]

# Cap each column at its sex-specific 0.9999 quantile
for param in df.columns:
    df = remove_outliers(df, param, quantile=0.9999)


In [None]:
# Normal range for T1_ASAT_GOT_U_L: <= 35 (male) / <= 30 (female)

## Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler

def normalize(df):
    """Min-max scale every column of `df` with scikit-learn's ``MinMaxScaler``.

    The scaler is fitted on `df` itself and applied to it, using the scaler's
    default settings.

    Args:
        df: DataFrame with numeric columns only.

    Returns:
        A new DataFrame with the scaled values, carrying the same column labels
        and the same index as `df`.
    """
    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(df)
    # The scaler returns a bare array; re-attach both the column labels and the
    # original index so rows keep their labels when `df` is not 0..n-1 indexed.
    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)

# Save means and stds:
#np.save("data/means", df_means.to_numpy())
#np.save("data/stds", df_stds.to_numpy())

In [None]:
# Min-max scale all columns of df with the normalize() helper defined in the previous cell
df_normalized = normalize(df)

In [None]:
# Continue with the normalized data under the name df.
# normalize() already returns a DataFrame, so this constructor call only re-selects
# the same columns from it.
df = pd.DataFrame(df_normalized, columns=df.columns)

## Stratified Train-test-val split and k-fold split

In [None]:
def create_labels(df):
    """Build one stratification label (a string) per row of `df`.

    The label is the concatenation of four integer codes, in this order:
    whether "Alter" is below the column mean (1/0), "sex", "POD" and
    "POCD_dichotomous_T2". Missing values are replaced by -1 first, so a missing
    "Alter" counts as below the mean and the other columns contribute "-1".
    "PreCI_dichotomous_T0" is currently not part of the label.

    The codes are computed column-wise instead of row by row via ``.iloc``; this
    avoids building several row Series per row and also works when `df` holds
    additional non-numeric columns.

    Args:
        df: DataFrame containing the four columns named above.

    Returns:
        List of label strings, one per row, in row order.
    """
    strat_cols = ["Alter", "sex", "POD", "POCD_dichotomous_T2"]
    filled = df[strat_cols].fillna(-1)
    # The mean is taken from the unfilled column so the -1 placeholders do not bias it
    mean_age = df["Alter"].mean()
    labels = (filled["Alter"] < mean_age).astype(int).astype(str)
    for col in strat_cols[1:]:
        labels = labels + filled[col].astype(int).astype(str)
    return labels.tolist()

In [None]:
def create_balanced_split(df, test_size, hard_threshold, soft_threshold, num_allowed):
    """Draw a stratified train/test split of `df` and return positional row indices.

    Rows are stratified by the labels from ``create_labels``. A split is redrawn
    while more than `num_allowed` entries of `diffs` exceed `soft_threshold`, or
    while the largest entry exceeds `hard_threshold`.

    NOTE(review): the per-feature deviation between train and test means is
    currently disabled — `diffs` is hard-coded to ``[0]`` — so with non-negative
    thresholds the first split is always accepted and the three threshold
    arguments have no effect. Confirm whether the check should be re-enabled.

    Args:
        df: Numeric DataFrame containing the columns required by ``create_labels``.
        test_size: Passed to ``train_test_split`` as `test_size`.
        hard_threshold: Upper bound for the largest entry of `diffs`.
        soft_threshold: Entries of `diffs` above this value count as outliers.
        num_allowed: Maximum number of such outliers that is tolerated.

    Returns:
        ``(train_idcs, test_idcs)``: positions (0..len(df)-1) of the rows in each part.

    Raises:
        StopIteration: after 100 drawn splits, if the 100th is reached.
    """
    print("df mean: ", df.mean().mean())
    # Neither the positions nor the labels change between attempts: compute them once
    indices = np.arange(len(df))
    labels = create_labels(df)
    count = 0
    # Start values that force at least one pass through the loop
    outliers = num_allowed + 1
    max_diff = hard_threshold + 1
    while outliers > num_allowed or max_diff > hard_threshold:
        # Create split:
        train_data, test_data, train_idcs, test_idcs = train_test_split(
            df, indices, test_size=test_size, stratify=labels)
        # Test if split is good enough:
        print(test_data.shape)
        # Disabled check, kept for reference:
        # np.abs(test_data.mean(axis=0) - train_data.mean(axis=0)) / np.abs(train_data.mean(axis=0))
        diffs = np.array([0])
        max_diff = max(diffs)
        print(test_data.mean().mean(), train_data.mean().mean(), df.mean().mean())
        print(list(np.round(diffs[diffs > soft_threshold], 2)))
        print("Mean train data: ", train_data.mean(), "Mean test data: ", test_data.mean())
        print("Mean deviation: ", np.mean(diffs), "Max deviation:", max_diff)
        outliers = (diffs > soft_threshold).sum()
        count += 1
        # Give up once 100 splits have been drawn
        if count == 100:
            raise StopIteration("Can't find balanced split")
        print("Num outliers: ", outliers)
        print()
    return train_idcs, test_idcs

In [None]:
from sklearn.model_selection import train_test_split
def split_df(df, test_size, val_size, **kwargs):
    """Split `df` into train/test and then split the train part into train/validation.

    Args:
        df: DataFrame to split.
        test_size: Size of the test part, passed to ``create_balanced_split``.
        val_size: Size of the validation part, passed to ``create_balanced_split``
            for the second split; it therefore refers to the train part, not to
            the whole of `df`.
        **kwargs: Forwarded unchanged to ``create_balanced_split``.

    Returns:
        ``(train_idcs, val_idcs)``. `train_idcs` are the positions in `df` of all
        non-test rows (the validation rows are still among them). `val_idcs` are
        positions within ``df.iloc[train_idcs]``, not within `df`.
    """
    print("Test split:")
    train_idcs, _ = create_balanced_split(df, test_size, **kwargs)
    print("Validation split:")
    # Use val_size here: previously test_size was passed again and val_size was ignored
    _, val_idcs = create_balanced_split(df.iloc[train_idcs], val_size, **kwargs)
    return train_idcs, val_idcs

In [None]:
from sklearn.model_selection import StratifiedKFold
def create_k_fold(train_data, k, random_state=None):
    """Create `k` stratified, shuffled folds over `train_data`.

    Stratification uses the labels from ``create_labels``.

    Args:
        train_data: DataFrame to split into folds.
        k: Number of folds.
        random_state: Optional seed (or RandomState) for the shuffling, passed to
            ``StratifiedKFold``. The default ``None`` keeps the previous unseeded
            behaviour.

    Returns:
        List of `k` index arrays; each holds the held-out positions of one fold,
        relative to `train_data`.
    """
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_state)
    train_labels = create_labels(train_data)
    # split() yields (train_positions, held_out_positions); only the latter are kept
    return [held_out for _, held_out in skf.split(train_data, train_labels)]

In [None]:
# The splits below are drawn at random and written to disk. Seed NumPy's global RNG
# (which scikit-learn uses when no random_state is given) so that re-running the
# notebook reproduces the same index files.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

# Create and save train-test indices:
train_idcs, val_idcs = split_df(df, test_size=0.2, val_size=0.2, hard_threshold=0.15, soft_threshold=0.1, num_allowed=2)
np.save(os.path.join(data_dir, "train_idcs"), train_idcs)
np.save(os.path.join(data_dir, "val_idcs"), val_idcs)

# Create and save k-fold indices:
k = 5
# Pass k itself (previously a literal 5), so the fold count always matches the folder name
splits = create_k_fold(df.iloc[train_idcs], k)
# The trailing "" keeps a trailing separator on split_path, as before
split_path = os.path.join(data_dir, str(k) + "_folds", "")
os.makedirs(split_path, exist_ok=True)
for idx, split in enumerate(splits):
    np.save(os.path.join(split_path, str(idx)), split)

In [None]:
def store_df(df, name):
    """Stores a fully processed df (filled NANs etc.)

    A folder ``data_dir + name`` is created (if needed) and the following NumPy
    files are written into it: the outcome columns as "POD" and "POCD", and the
    remaining columns split into blood parameters (column name contains "T1_")
    and static parameters (all others), each as a "*_names" and a "*_vals" file.

    Args:
        df: DataFrame containing "POD" and "POCD_dichotomous_T2".
        name: Name of the target folder below the global `data_dir`.
    """
    path = data_dir + name + "/"
    os.makedirs(path, exist_ok=True)

    # Outcomes: extract both first, then write them
    outcomes = {
        "POD": df["POD"].to_numpy(),
        "POCD": df["POCD_dichotomous_T2"].to_numpy(),
    }
    for file_stem, values in outcomes.items():
        np.save(path + file_stem, values)

    # Inputs: everything except the outcomes, split by the "T1_" marker in the name
    inputs = df.drop(columns=["POD", "POCD_dichotomous_T2"])
    blood_names, static_names = [], []
    for col in inputs.columns:
        (blood_names if "T1_" in col else static_names).append(col)
    input_arrays = {
        "blood_names": blood_names,
        "blood_vals": inputs[blood_names].to_numpy(),
        "static_names": static_names,
        "static_vals": inputs[static_names].to_numpy(),
    }
    for file_stem, values in input_arrays.items():
        np.save(path + file_stem, values)
                    

In [None]:
# Fill missing target values as minus ones:
# only the two outcome columns are touched here; missing values in all other
# columns stay NaN until the imputation step.
df[["POD", "POCD_dichotomous_T2"]] = df[["POD", "POCD_dichotomous_T2"]].fillna(-1)

In [None]:
# Create and store differently filled dfs:
# Mean imputation:
df_means = df.mean(axis=0)
df_mean_filled = df.copy()
# PreCI_dichotomous_T0 is filled with its most frequent value (mode) instead of the mean.
# The result is assigned back to the column: calling fillna(inplace=True) on the
# selection df_mean_filled[...] acts on an intermediate object and, with pandas
# Copy-on-Write, does not update df_mean_filled.
preci_mode = df["PreCI_dichotomous_T0"].mode()[0]
df_mean_filled["PreCI_dichotomous_T0"] = df_mean_filled["PreCI_dichotomous_T0"].fillna(preci_mode)
print(df_mean_filled["PreCI_dichotomous_T0"].unique())
df_mean_filled = df_mean_filled.fillna(df_means)
store_df(df_mean_filled, "data_mean_filled")
# Median imputation (kept in its own variable so it does not overwrite df_mean_filled):
df_median_filled = df.fillna(df.median())
store_df(df_median_filled, "data_median_filled")
# Minus-one imputation:
df_minuses = df.fillna(-1)
store_df(df_minuses, "data_minus_filled")
# IterativeImputer:
# NOTE(review): the imputer is only instantiated; fitting and storing are commented out.
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
imputer = IterativeImputer()#estimator=None, missing_values=np.nan, sample_posterior=False, 
                           #max_iter=10, tol=0.001, n_nearest_features=None, initial_strategy='mean', 
                           #imputation_order='ascending', skip_complete=False, min_value=None, 
                           #max_value=None, verbose=0, random_state=None, add_indicator=False)
#imputer.fit(df.to_numpy())
#df_imputed = df.transform(df)
#store_df(df_imputed, "IterativeImputed")

In [None]:
# Plot and save cleaned histograms
# When the notebook is run top to bottom, df at this point is transformed, clipped and
# normalized, and the two outcome columns have their gaps filled with -1.
plot_hist(df, name="Cleaned", plots_dir=plots_dir)