In [None]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt

In [None]:
# Blood values and their column names are stored as two separate .npy arrays
# in the same directory; combine them into one labelled frame.
non_yeo = pd.DataFrame(
    np.load("../../data/non-transformed_minmax/iterative-filled/blood_vals.npy"),
    columns=np.load("../../data/non-transformed_minmax/iterative-filled/blood_names.npy"),
)

# !! Although the path says 'minmax', the normalization was actually commented out for this data set, i.e. the values are not min-max scaled

In [None]:
# Static variables: values and column names live in two separate .npy arrays.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, so this
# file must only ever be loaded from a trusted source.
static = pd.DataFrame(
    np.load(
        "../../data/non-transformed_minmax/iterative-filled/static_vals.npy",
        allow_pickle=True,
    ),
    columns=np.load("../../data/non-transformed_minmax/iterative-filled/static_names.npy"),
)
static

In [None]:
# Outcome arrays that are attached to the blood-value frame as the columns
# "POD" and "POCD".
# NOTE(review): both paths use underscores ("non_transformed_minmax/iterative_filled")
# whereas the blood/static loads use hyphens ("non-transformed_minmax/iterative-filled").
# Confirm that this directory exists and that its rows belong to the same
# cases (in the same order) as blood_vals.npy.
pod = np.load("../../data/non_transformed_minmax/iterative_filled/POD.npy")
pocd = np.load("../../data/non_transformed_minmax/iterative_filled/POCD.npy")

In [None]:
# Append both outcome arrays as the columns "POD" and "POCD" (in that order).
non_yeo = non_yeo.assign(POD=pod, POCD=pocd)

# Non-Yeo-transformed data - overview

In [None]:
# Show the blood-value frame including the POD / POCD columns added above.
non_yeo

# Get univariate outliers via quantile

In [None]:
def univariate_outliers(df, quantile):
    """Flag univariate outliers per column using symmetric quantile thresholds.

    For every column that is present in ``df`` on entry (columns that already
    are outlier flags from a previous call are skipped), values strictly below
    the ``1 - quantile`` quantile count as 'low' outliers and values strictly
    above the ``quantile`` quantile count as 'high' outliers.

    ``df`` is modified **in place** and also returned. Columns written:

    * ``any_uni_low_outlier`` / ``any_uni_high_outlier`` -- per-row number of
      columns in which the row is a low / high outlier.
    * ``<col>_uni_low_outlier`` / ``<col>_uni_high_outlier`` -- 0/1 flags; a
      flag column only exists if at least one row is flagged.

    A summary per column and an overall summary are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are analysed. All analysed columns must support
        ``Series.quantile`` (i.e. be numeric).
    quantile : float
        Upper quantile in ``[0, 1]``; the lower threshold is ``1 - quantile``,
        so values above 0.5 are the meaningful choice.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the flag columns added.
    """
    low_suffix = "_uni_low_outlier"
    high_suffix = "_uni_high_outlier"
    any_low_col = "any" + low_suffix
    any_high_col = "any" + high_suffix

    # Take the feature snapshot BEFORE adding any flag column, and skip flag
    # columns left over from an earlier call. Otherwise the counter columns
    # themselves get analysed and distort the per-case counts.
    feature_cols = [
        col for col in df.columns
        if not str(col).endswith((low_suffix, high_suffix))
    ]

    # Scalar assignment works for any index; assigning a RangeIndex-based
    # Series would produce NaN for frames with a non-default index.
    df[any_low_col] = 0
    df[any_high_col] = 0

    upq = quantile
    loq = 1 - quantile
    num_cases = len(df)

    def percent(count):
        # Guard against an empty frame.
        return np.round((count / num_cases) * 100, 2) if num_cases else 0.0

    for name in feature_cols:
        values = df[name]
        # Two separate scalar lookups avoid float-label indexing into the
        # quantile result (which breaks when loq == upq).
        is_low = values < values.quantile(loq)
        is_high = values > values.quantile(upq)

        for flag_col, mask in (
            (f"{name}{low_suffix}", is_low),
            (f"{name}{high_suffix}", is_high),
        ):
            if mask.any():
                df[flag_col] = mask.astype(int)
            elif flag_col in df.columns:
                # Remove a stale flag column from a previous call.
                del df[flag_col]

        df.loc[is_low, any_low_col] += 1
        df.loc[is_high, any_high_col] += 1

        count_low = int(is_low.sum())
        count_high = int(is_high.sum())
        percent_low = percent(count_low)
        percent_high = percent(count_high)
        print(f"{name}\n: {count_low} abs., {percent_low}% 'low' outliers\n{count_high} abs., {percent_high}% 'high' outliers")

    # Count each case once, no matter in how many columns it is an outlier.
    count_total = int(((df[any_low_col] > 0) | (df[any_high_col] > 0)).sum())
    percent_total = percent(count_total)
    print(f"\n{count_total} abs. cases with at least one outlier, {percent_total}% outliers in total")

    return df

In [None]:
# univariate_outliers returns the frame it was given, so `non_yeo_uni` and
# `non_yeo` refer to the same (now extended) DataFrame.
non_yeo_uni = univariate_outliers(df=non_yeo, quantile=0.9999)

# Investigate distribution of outlier count per case

In [None]:
print("Number of low outliers per person with any low outlier.")
# Histogram of the per-case low-outlier count, restricted to cases with a count > 0.
non_yeo_uni["any_uni_low_outlier"].loc[lambda counts: counts > 0].hist()

In [None]:
print("Number of high outliers per person with any high outlier.")
# Histogram of the per-case high-outlier count, restricted to cases with a count > 0.
non_yeo_uni["any_uni_high_outlier"].loc[lambda counts: counts > 0].hist()

In [None]:
print("Number of cases that show both high and low outliers.")
# The 'any_uni_*' columns hold per-case COUNTS, not 0/1 flags. Compare them
# against zero before combining: a bitwise AND on the raw integers
# (e.g. 2 & 1 == 0) would miss cases and sum bit patterns instead of cases.
(
    (non_yeo_uni["any_uni_high_outlier"] > 0)
    & (non_yeo_uni["any_uni_low_outlier"] > 0)
).sum()

In [None]:
# Names of all columns whose label contains "high_outlier" / "low_outlier"
# (this includes the aggregated 'any_uni_*' counters).
high_outlier_cols = non_yeo_uni.filter(like="high_outlier").columns
low_outlier_cols = non_yeo_uni.filter(like="low_outlier").columns

# Isolation forest

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
# Inspect the available column names before selecting the model inputs.
non_yeo.columns

In [None]:
# Fit the forest on every column whose name does not match "uni", "POD" or
# "POCD" (i.e. without the univariate flag columns and the outcomes).
# NOTE(review): "uni" is a plain substring/regex match, so a feature whose
# name happens to contain "uni" would be dropped as well -- confirm.
# fit_predict labels outliers with -1 and inliers with 1.
out_inliers = IsolationForest(random_state=0).fit_predict(
    non_yeo.loc[:, ~non_yeo.columns.str.contains("uni|POD|POCD")]
)

In [None]:
print("Number of outliers in a multivariate sense as classified by the IsolationForest")
# Outliers carry the label -1.
(out_inliers == -1).sum()

## Distribution of the number of outliers (univariate) for the multivariate outliers

### Do they generally have more univariate outlier values than the other cases? -> Yes, they tend to

In [None]:
print("Number of high univariate outliers for cases **that are** multivariate outliers")
# Boolean-mask the per-case high-outlier counts with the forest's outlier label (-1).
non_yeo_uni["any_uni_high_outlier"][out_inliers == -1].hist()

In [None]:
print("Number of high univariate outliers for cases that are **not** multivariate outliers")
# Boolean-mask the per-case high-outlier counts with the forest's inlier label (1).
non_yeo_uni["any_uni_high_outlier"][out_inliers == 1].hist()

In [None]:
print("Number of low univariate outliers for cases **that are** multivariate outliers")
# Boolean-mask the per-case low-outlier counts with the forest's outlier label (-1).
non_yeo_uni["any_uni_low_outlier"][out_inliers == -1].hist()

In [None]:
print("Number of low univariate outliers for cases that are **not** multivariate outliers")
# Boolean-mask the per-case low-outlier counts with the forest's inlier label (1).
non_yeo_uni["any_uni_low_outlier"][out_inliers == 1].hist()

## Closer look at the multivariate outliers

In [None]:
# Positions (0-based) of the cases labelled -1 by the forest.
np.nonzero(out_inliers == -1)

### In sum, they all have at least three outlying values (univariate) 

In [None]:
# Rows of the multivariate outliers, restricted to the columns whose name
# contains "_outlier".
non_yeo_uni.loc[out_inliers == -1].filter(like="_outlier")

### Do they have more missing values than the other cases? -> No

In [None]:
# One "Missingness" entry per case, as a plain array.
missings = static["Missingness"].values

In [None]:

# Sum each case's "Missingness" entry into a single float per case
# (presumably each entry is an indicator array -- TODO confirm).
sums = np.array([np.sum(entry) for entry in missings], dtype=float)

plt.hist(sums)

In [None]:
# Mean and (population) standard deviation of the per-case sums, rounded to one decimal.
print(f"Average amount of missings per person {np.round(sums.mean(), 1)} and standard dev. {np.round(sums.std(), 1)}")

In [None]:
print("Small/ normal amount of missings for the multivariate outliers.")
# Per-case missingness sums of the cases labelled -1 (outlier) by the forest.
sums[out_inliers == -1] 

### Do they show patterns w.r.t. age, gender, height, etc.? -> Gender is balanced, values are within the normal range, weights are diverse; no obvious pattern here

In [None]:
# Static variables of the multivariate outliers: label-based slice over all
# columns from the first one up to and including "Gewicht" (German: weight).
static.loc[out_inliers == -1, : "Gewicht"]

### What are the outlying values?

In [None]:
# All columns whose name contains "_outlier" (per-feature flags and 'any_uni_*' counters).
outlier_cols = non_yeo_uni.filter(like="_outlier").columns


In [None]:
# Collects, over all multivariate outliers, the names of the features in which
# at least one of them is a univariate outlier.
outlying_cols_all = []

# Cast to object so that the 'normal' placeholder can be written into numeric
# columns without an incompatible-dtype warning/error in recent pandas.
# astype returns a new frame, so non_yeo_uni itself is left untouched.
outliers_df = non_yeo_uni.loc[out_inliers == -1, :].astype(object)

# Per-feature flag columns only. The aggregated 'any_uni_*' counters are
# matched by prefix so that a feature merely containing "any" is not dropped.
feature_flag_cols = [col for col in outlier_cols if not col.startswith("any_uni_")]

for idx in outliers_df.index:
    # Strip the "_uni_low_outlier" / "_uni_high_outlier" suffix to recover the
    # feature name of every flag that is set for this case.
    outlying_cols = [
        col.split('_uni_', 1)[0]
        for col in feature_flag_cols
        if non_yeo_uni.loc[idx, col] == 1
    ]
    # Blank out everything that is not an outlying feature for this case.
    outliers_df.loc[idx, ~outliers_df.columns.isin(outlying_cols)] = 'normal'
    outlying_cols_all += outlying_cols

# De-duplicate while keeping first-appearance order. A list (not a set) is
# needed because pandas >= 2.0 rejects sets as indexers, and it keeps the
# column order reproducible between runs.
outlying_cols_all = list(dict.fromkeys(outlying_cols_all))

In [None]:
# pandas >= 2.0 raises a TypeError when a set is used as an indexer, so pass
# a list; sorting makes the column order reproducible.
outliers_df[sorted(outlying_cols_all)]

In [None]:
# NOTE(review): the slice [10:] skips the first ten multivariate outliers --
# confirm that this is intended.
# A sorted list is used for column selection because pandas >= 2.0 rejects
# sets as indexers; sorting also keeps the output order reproducible.
for idx in outliers_df.index[10:]:
    print(f"Case {idx}:\n", outliers_df.loc[idx, sorted(outlying_cols_all)], "\n\n")