In [None]:
import os
import sys

import math
import pandas as pd
import matplotlib.pyplot as plt
import umap.plot
import bokeh

# Add parent folder to syspath to include local util functions
sys.path.insert(0, os.path.abspath('..'))
from utils.plot_utils import plot_hist

# Very basic processing:

In [None]:
# Locations of the input data and of the generated plots, relative to this notebook:
data_dir = '../../data/'
plots_dir = '../../plots/'

# Load the blood-data spreadsheet and display it:
raw_data_path = data_dir + "blutdaten.xlsx"
df = pd.read_excel(raw_data_path)
df

In [None]:
# Make naming uniform: every T1 variable should start with "T1_" (some currently start with just "T1").
# NOTE(review): `cols` is not referenced again in the visible part of the notebook.
cols = [*df.columns]
def rename_start(name):
    """Normalise a column label so that T1 variables start with ``T1_``.

    A label that begins with ``"T1"`` directly followed by any character other
    than an underscore gets the underscore inserted (``"T1Leptin"`` ->
    ``"T1_Leptin"``). Every other label is returned unchanged, including:

    * labels that already start with ``"T1_"``,
    * the bare label ``"T1"`` (nothing follows the prefix; indexing position 2
      used to raise ``IndexError`` here),
    * non-string labels such as integers (slicing them used to raise
      ``TypeError``).

    Note that the check is purely positional, so a label such as ``"T10_x"``
    is also rewritten (to ``"T1_0_x"``).

    Parameters
    ----------
    name : hashable
        A single column label.

    Returns
    -------
    The normalised label (a new ``str``) or ``name`` itself if no change applies.
    """
    if not isinstance(name, str):
        return name
    if name.startswith("T1") and len(name) > 2 and name[2] != "_":
        return "T1_" + name[2:]
    return name
# Apply the renaming rule to every column label:
df = df.rename(columns=rename_start)

In [None]:
# Replace the "#NULL!" placeholder strings by NaN.
# Older pandas versions silently converted the affected object columns back to
# numeric dtypes inside `replace`; that implicit downcast is deprecated, so the
# dtypes are re-inferred explicitly to keep the columns numeric for the
# min/max/mean computations further down.
df = df.replace("#NULL!", math.nan).infer_objects()

In [None]:
# Rows without a POD value have no target: report how many there are, then remove them.
n_without_pod = df["POD"].isna().sum()
print("Data points without POD: ", n_without_pod)
df = df.dropna(subset=["POD"])

In [None]:
# Encode sex numerically (male -> 0, female -> 1) and print the distinct
# values before and after the replacement:
print(df["sex"].unique())
for sex_label, sex_code in (("male", 0), ("female", 1)):
    df.loc[df["sex"] == sex_label, "sex"] = sex_code
print(df["sex"].unique())

# Looking at weird values:

In [None]:
# Inspect every column whose name contains "NTproBNP": value range first,
# then the dtype / non-null summary.
NTproBNP = df.loc[:, df.columns.str.contains("NTproBNP")]
print("min_vals\n", NTproBNP.min(), "\nmax_vals\n", NTproBNP.max())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
NTproBNP.info()

In [None]:
# Inspect every column whose name contains "Troponin": value range first,
# then the dtype / non-null summary.
Troponin = df.loc[:, df.columns.str.contains("Troponin")]
print("min_vals\n", Troponin.min(), "\nmax_vals\n", Troponin.max())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
Troponin.info()

In [None]:
# Inspect every column whose name contains "UnreifeGranulozyten": value range
# first, then the dtype / non-null summary.
UnreifeGranulozyten = df.loc[:, df.columns.str.contains("UnreifeGranulozyten")]
print("min_vals\n", UnreifeGranulozyten.min(), "\nmax_vals\n", UnreifeGranulozyten.max())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
UnreifeGranulozyten.info()

## All values are non-pathological (because they are all < 10%)

In [None]:
# Inspect every column whose name contains "IL6": value range first,
# then the dtype / non-null summary.
IL6 = df.loc[:, df.columns.str.contains("IL6")]
print("min_vals\n", IL6.min(), "\nmax_vals\n", IL6.max())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
IL6.info()

In [None]:
# Inspect every column whose name contains "SORL1": value range first,
# then the dtype / non-null summary.
SORL1 = df.loc[:, df.columns.str.contains("SORL1")]
print("min_vals\n", SORL1.min(), "\nmax_vals\n", SORL1.max())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
SORL1.info()

In [None]:
# Inspect every column whose name contains "Leptin_Lab": value range first,
# then the dtype / non-null summary.
Leptin = df.loc[:, df.columns.str.contains("Leptin_Lab")]
print("min_vals\n", Leptin.min(), "\nmax_vals\n", Leptin.max())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
Leptin.info()

# Histograms:

In [None]:
# Plot histograms:
# plot_hist is the project-local helper imported from utils.plot_utils.
# NOTE(review): presumably it draws one histogram per column and stores the
# figures under plots_dir -- confirm against the helper's implementation.
plot_hist(df, plots_dir=plots_dir)

In [None]:
# List every column whose smallest value is below zero:
column_minima = df.min()
column_minima[column_minima < 0]

In [None]:
# Every value in this column is 0, so it carries no information:
# show its summary statistics once, then remove the column.
erythroblast_col = "T1_Erythroblasten_Percent"
print(df[erythroblast_col].describe())
df = df.drop(columns=[erythroblast_col])

In [None]:
# T1_ALAT_GPT_U_L looks oddly distributed (possible outliers): compare the
# histogram of all values with the histogram restricted to values below 200.
alat = df["T1_ALAT_GPT_U_L"]
print(alat.describe())

alat.hist()
plt.title("All of T1_ALAT_GPT_U_L")
plt.show()

alat[alat < 200].hist()
plt.title("T1_ALAT_GPT_U_L < 200")

In [None]:
# T1_ASAT_GOT_U_L looks oddly distributed (possible outliers): compare the
# histogram of all values with the histogram restricted to values below 200.
asat = df["T1_ASAT_GOT_U_L"]
print(asat.describe())

asat.hist()
plt.title("All of T1_ASAT_GOT_U_L")
plt.show()

asat[asat < 200].hist()
plt.title("T1_ASAT_GOT_U_L < 200")

Conclusion: 
- T1_Erythroblasten_Percent was dropped as all values are 0
- T1_ALAT_GPT_U_L has some very large values that might not be realistic
- T1_ASAT_GOT_U_L has some very large values that might not be realistic

# Investigating NANs:

In [None]:
# Boolean frame of the same shape as df: True wherever a value is missing.
missing = df.isnull()

In [None]:
# Fraction of missing values per column, most incomplete columns first:
missing_share = missing.mean()
missing_share.sort_values(ascending=False)

Conclusion: 
- IL10, T1_eGFR, T1_IL18_pgml_Boraschi and T1_Reticulated_Platelets_Percent all have a very high percentage of missing values -> is that realistic?

# Outlier analysis

In [None]:
# Mean-impute the feature columns only.
# The two target columns are deliberately left out of the imputation:
# filling a dichotomous label with the column mean would create label values
# that are neither 0 nor 1.
target_cols = ["POD", "POCD_dichotomous_T2"]

# Make sure numeric data stored in object columns takes part in the means, and
# restrict the mean to numeric columns so that a non-numeric column does not
# raise on pandas >= 2.
numeric_df = df.infer_objects()
feature_means = numeric_df.mean(numeric_only=True).drop(labels=target_cols, errors="ignore")

# fillna with a Series only fills the columns named in that Series.
filled_X = numeric_df.fillna(feature_means)
Y = filled_X["POD"]                    # complete: rows without POD were dropped earlier
Y_2 = filled_X["POCD_dichotomous_T2"]  # may still contain NaN (not imputed on purpose)
X = filled_X.drop(columns=target_cols)

## Unsupervised UMAP:

In [None]:
# Calc UMAP (unsupervised):
# UMAP is stochastic. Without a fixed seed the embedding -- and with it the
# outliers detected on the embedding and the saved plots -- changes on every
# run, so the seed is pinned to make the analysis reproducible.
reducer = umap.UMAP(set_op_mix_ratio=0.25, random_state=42)
embedded = reducer.fit_transform(X)

In [None]:
# Run the Local Outlier Factor on the UMAP embedding (contamination 1 %)
# and show the rows of df that were flagged.
from sklearn.neighbors import LocalOutlierFactor

clf = LocalOutlierFactor(contamination=0.01)
y_pred = clf.fit_predict(embedded)

# fit_predict labels outliers with -1; keep the flags as a boolean mask.
outliers_UMAP = y_pred == -1
X_scores = clf.negative_outlier_factor_

df.loc[outliers_UMAP]

In [None]:
# Scatter plot of the unsupervised embedding, coloured by the LOF outlier flag; saved as PDF.
ax = umap.plot.points(reducer, labels=outliers_UMAP, values=None, theme='fire')
ax.set_title("Outliers calculated on UMAP embeddings. Coloring=Marked as outlier")
ax.get_figure().savefig(plots_dir + "UMAP_outliers.pdf")

In [None]:
# Scatter plot of the unsupervised embedding, coloured by the POD label; saved as PDF.
ax = umap.plot.points(reducer, labels=Y, values=None, theme='fire')
ax.set_title("UMAP embedding. Coloring=POD")
ax.get_figure().savefig(plots_dir + "UMAP_POD.pdf")

## Supervised UMAP

In [None]:
# Supervised UMAP: the POD labels are passed as target to guide the embedding.
# UMAP is stochastic. Without a fixed seed the embedding -- and with it the
# outliers detected on the embedding and the saved plots -- changes on every
# run, so the seed is pinned to make the analysis reproducible.
reducer = umap.UMAP(set_op_mix_ratio=0.25, random_state=42)
embedded = reducer.fit_transform(X, Y)

In [None]:
# Re-fit the LOF detector created earlier (`clf`) on the supervised embedding
# and show the rows of df that were flagged.
y_pred = clf.fit_predict(embedded)

# fit_predict labels outliers with -1; keep the flags as a boolean mask.
outliers_UMAP = y_pred == -1
X_scores = clf.negative_outlier_factor_

df.loc[outliers_UMAP]

In [None]:
# Scatter plot of the supervised embedding, coloured by the LOF outlier flag; saved as PDF.
ax = umap.plot.points(reducer, labels=outliers_UMAP, values=None, theme='fire')
ax.set_title("Outliers calculated on supervised UMAP embeddings. Coloring=Marked as outlier")
# NOTE(review): the file name spells "superised" (missing "v"); kept unchanged
# in case other documents already reference this file.
ax.get_figure().savefig(plots_dir + "UMAP_outliers_superised.pdf")

In [None]:
# Display the current state of the frame (columns used as hover data in the next cell).
df

In [None]:
# Interactive (bokeh) view of the supervised embedding, coloured by the outlier
# flag, with patient attributes shown on hover; also saved as standalone HTML.
bokeh.plotting.output_notebook()

hover_columns = ["Alter", "sex", "POCD", "MMSE", "ISCED_three_categories", "Groesse", "Gewicht"]
# umap.plot.interactive assigns each hover column to its own frame of embedding
# coordinates, which carries a default 0..n-1 index, and pandas aligns such an
# assignment by index label. Rows were dropped from df earlier (missing POD), so
# its labels have gaps; without resetting the index the tooltips would be
# attached to the wrong points or come out as NaN.
hover_data = df[hover_columns].reset_index(drop=True)

p = umap.plot.interactive(reducer, labels=outliers_UMAP, hover_data=hover_data, point_size=4, theme="fire")
umap.plot.show(p)
bokeh.plotting.save(p, plots_dir + "Interactive_supervised_UMAP_showing_outliers.html")

In [None]:
# Scatter plot of the supervised embedding, coloured by the POD label; saved as PDF.
ax = umap.plot.points(reducer, labels=Y, values=None, theme='fire')
ax.set_title("UMAP embedding supervised on POD. Coloring=POD")
ax.get_figure().savefig(plots_dir + "UMAP_POD_supervised.pdf")

### Initial conclusion

There were no strong outliers found. Supervised UMAP indicates that some patients with POD might be outliers, but those extreme values could be due to those patients being in the POD group -> exactly what we want to classify.

In [None]:
# Persist the investigated frame as CSV (without the index column):
investigated_path = data_dir + "investigated.csv"
df.to_csv(investigated_path, index=False)