In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the clinical core data set (latin1-encoded CSV), then turn cells that
# consist of exactly one space character into NaN.
raw_clinical = pd.read_csv(
    '../../data/core_data_set_20200211_adalab_clinical.csv',
    encoding='latin1',
)
df = raw_clinical.replace({' ': np.nan})

In [445]:
# Filter T2 events: columns listed here are dropped from `df` to build `df_clean`.
filter_events = [
    'subject',
    'GDS_imputed_T2_trial',
    'Status_3Monate',
    'ÜberlebenBis90Tage',
    'DreiMonatsmortalität',
    'ICUdays',
    'complication',
    'inhouse_mortality_yes_no',
]
filter_events.append('LOSdays')   # Not sure if this needs to be removed
filter_events.append('OP_Dauer_min')  # OP dauer might only be known after the OP, but it might be an estimate of the duration
# Leaks information about the success of the operation.
# `extend` (not `append`) is required here: `append` would add the whole list as
# a single nested element, which never equals a column name, so 'pain_yes_no'
# would silently stay in the data.
filter_events.extend(['complication', 'pain_yes_no'])
df_clean = df[[col for col in df.columns if col not in filter_events]]

In [447]:
# Keep only the rows that have a POD value, then split the POD column (target)
# from the remaining columns.
# NOTE(review): `df_no_sparse` is not defined in any visible cell; it presumably
# came from a cell that was deleted (execution count 446 is missing) - confirm.
has_pod = df_no_sparse["POD"].notna()
df_pod = df_no_sparse[has_pod]
pod = df_pod["POD"]
df_no_pod = df_pod.drop("POD", axis=1)

In [449]:
# Encode the sex column dm0020_v1 as a number: 'male' -> 0, 'female' -> 1.
# Any other value in the column is left as it is.
# NOTE(review): `df_no_pocd` is not defined in any visible cell - confirm where
# it is created.
sex_codes = {'male': 0, 'female': 1}
for sex_label, sex_code in sex_codes.items():
    df_no_pocd.loc[df_no_pocd["dm0020_v1"] == sex_label, "dm0020_v1"] = sex_code

In [450]:
# Standardise every column: subtract the column mean and divide by the column
# standard deviation.
# NOTE(review): the statistics are computed on all rows, before the train/test
# split further down, and the cell overwrites its own input, so running it
# twice re-normalises already normalised data.
column_means = df_no_pocd.mean()
column_stds = df_no_pocd.std()
df_no_pocd = (df_no_pocd - column_means) / column_stds

In [None]:
# Only use these features:
#df_no_pocd = df_no_pocd[["OP_Dauer_min", "op0270_v2", "ie0072_v1"]]

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [496]:
# Classifier used by the fit/predict cells below. The gradient-boosting variant
# is kept as a commented-out alternative.
# NOTE(review): LogisticRegression exposes `coef_`, not `feature_importances_`
# (the latter exists on GradientBoostingClassifier).
#clf = GradientBoostingClassifier(n_estimators=100, max_depth=3)
clf = LogisticRegression()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Build float feature/target arrays and hold out 20% of the rows for testing,
# stratified on the target so both parts keep the same class balance.
# NOTE(review): assumes the rows of `df_no_pocd` and `pod` are aligned - confirm.
x = df_no_pocd.to_numpy().astype(float)
y = pod.to_numpy().astype(float)
# A fixed random_state makes the split (and every metric computed from it)
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# One minus the mean test label; for 0/1 labels this is the share of zeros,
# i.e. the accuracy of always predicting 0 (assumes binary labels - confirm).
1 - np.mean(y_test)

In [None]:
# Train on the training split, predict the held-out rows and show the
# fraction of predictions that match the true labels (accuracy).
clf.fit(x_train, y_train)
preds = clf.predict(x_test)
np.mean(preds == y_test)

In [None]:
# Rank the input features by how much the fitted classifier relies on them.
# Tree ensembles expose `feature_importances_`; linear models such as
# LogisticRegression only expose `coef_`, so fall back to the absolute
# coefficient (averaged over classes) instead of raising AttributeError.
if hasattr(clf, "feature_importances_"):
    importance_values = clf.feature_importances_
else:
    importance_values = np.abs(clf.coef_).mean(axis=0)
# Sorted ascending, so the most important features are in the last rows.
importance_df = pd.DataFrame({"value": importance_values, "feature": df_no_pocd.columns}).sort_values("value")

In [None]:
# The frame is sorted ascending by "value", so the last three rows are the
# three highest-ranked features.
importance_df.tail(3)

In [None]:
# ROC AUC on the held-out rows, scored with the predicted probability of the
# class in column 1 of predict_proba.
from sklearn.metrics import roc_auc_score
positive_class_scores = clf.predict_proba(x_test)[:, 1]
auc = roc_auc_score(y_test, positive_class_scores)
auc

In [None]:
import matplotlib.pyplot as plt

In [None]:
import umap
import sklearn

In [None]:
# Dimensionality reducer used before re-fitting the classifier: 10 principal
# components. UMAP is kept as a commented-out alternative.
# Import PCA explicitly: a bare `import sklearn` does not guarantee that the
# `sklearn.decomposition` submodule has been loaded, so attribute access on the
# package only works if some other import happened to pull it in.
from sklearn.decomposition import PCA

mapper = PCA(n_components=10)
#mapper = umap.UMAP(n_components=10, n_neighbors=5)

In [None]:
# Fit the reducer on the training rows only, project both splits with it, then
# train the classifier on the embedding and show its test accuracy.
train_embedding = mapper.fit_transform(x_train)
test_embedding = mapper.transform(x_test)
clf.fit(train_embedding, y_train)
preds = clf.predict(test_embedding)
np.mean(preds == y_test)

In [None]:
# Same reference value as before: one minus the mean test label, to compare
# against the accuracy obtained on the embedding.
1 - np.mean(y_test)

In [None]:
#umap.plot.points(mapper)

In [None]:
# First two embedding dimensions of the training rows, coloured by label.
plt.scatter(x=train_embedding[:, 0], y=train_embedding[:, 1], c=y_train)

In [None]:
# First two embedding dimensions of the held-out rows, coloured by label.
plt.scatter(x=test_embedding[:, 0], y=test_embedding[:, 1], c=y_test)

# Create allow and remove lists

In [None]:
# Show every column name of the loaded data frame (presumably the starting
# point for choosing the remove list below).
df.columns

In [None]:
# Clinical features that must not end up in the allow list.
remove_list = [
    'GDS_imputed_T2_trial',
    'Status_3Monate',
    'ÜberlebenBis90Tage',
    'DreiMonatsmortalität',
    'ICUdays',
    'complication',
    'inhouse_mortality_yes_no',
    'LOSdays',
    'OP_Dauer_min',
    'pain_yes_no',
    'cc_score_post',
    'LackOfCompliance_ed2',
    'DeceasedBeforeFU',
    # admscore for v2 and above: they are all post-op
    'admscore_v2',
    'admscore_v3',
    'admscore_v4',
    'admscore_v5',
    'admscore_v6',
    'admscore_v7',
    'admscore_v8',
    'admscore_v9',
    'GDS_imputed_T1_trial',
    # localisation: drop this feature and instead use the one-hot encoded
    # versions localisation_ThrAbdPlv and localisation_intracranial
    'localisation',
    # anesthComb: has only nan and 0
    'anesthComb',
    'icd0300_v1',
]


In [None]:
# Write the remove list to a text file, one feature name per line.
# The context manager closes the file even if a write raises.
# NOTE(review): no encoding is passed, so the platform default is used; some
# feature names contain non-ASCII characters.
with open('../preprocess_utils/feature_lists/removelist_clinical_features.txt', 'w') as dest_file:
    dest_file.writelines(feat + '\n' for feat in remove_list)

In [None]:
# Allow list = every column of `df` that is not on the remove list.
# Built by walking the columns in order (instead of a set difference) so the
# result keeps the data frame's column order and is identical on every run;
# set iteration order over strings changes between interpreter sessions.
removed_features = set(remove_list)
allow_list = [col for col in df.columns if col not in removed_features]
print(allow_list)


In [None]:
# Store the allow list as a .npy file in the feature_lists directory.
np.save('../preprocess_utils/feature_lists/allowlist_clinical_data.npy', allow_list)

### feature "localisation_intracranial" and case "intracranial" in feature "localisation" are identical -> remove "localisation_intracranial" and do one-hot later

In [None]:
# Convert every column except 'subject' to numbers; values that cannot be
# parsed become NaN (errors='coerce').
# The columns are assigned by name rather than through `df.loc[:, mask] = ...`:
# on pandas >= 2.0 the `.loc` form writes into the existing arrays, which keeps
# object dtype for columns that were read as strings, while column assignment
# replaces them with the numeric result.
numeric_cols = [col for col in df.columns if col != 'subject']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

In [None]:
# Number of rows where the one-hot flag localisation_ThrAbdPlv agrees with
# `localisation` being 2 or 3.
# `isin([2, 3])` is needed: the expression `(2 or 3)` evaluates to just 2, so
# a comparison against it would never match rows with localisation == 3.
((df['localisation_ThrAbdPlv'] == 1) == df['localisation'].isin([2, 3])).sum()

In [None]:
# Number of rows where the one-hot flag localisation_intracranial agrees with
# `localisation` being 1.
# Compare against the number 1, not the string '1': the columns were converted
# to numeric values in an earlier cell, so a string comparison is False on
# both sides for every row and the check would trivially report full agreement.
((df['localisation_intracranial'] == 1) == (df['localisation'] == 1)).sum()

In [None]:
# Count the missing values in column mna0080_v1.
df['mna0080_v1'].isna().sum()

# Looking at distributions

In [None]:
# Put the parent directory first on the module search path so the imports in
# the following cells can find packages located there.
# NOTE(review): '..' is resolved against the current working directory, and
# each re-run of this cell inserts another copy.
import sys
sys.path.insert(0, '..')

In [None]:

from utils.plot_utils import plot_hist

# Output directory handed to plot_hist.
plots_dir = '../../plots/new_data/'

# Plot all allow-listed columns except the 'subject' column.
clinical_plot_cols = [col for col in allow_list if col != 'subject']
plot_hist(df.loc[:, clinical_plot_cols], name="clinical", plots_dir=plots_dir)

### Yeo changes bin variables --> exclude those from transformation

In [None]:
from preprocess_utils.preprocessing_utils import apply_yeojohnson
from scipy.stats import yeojohnson


In [None]:
# Histogram of ASA_bin before any transformation.
df['ASA_bin'].hist()

In [None]:
import matplotlib.pyplot as plt

# Histogram of ASA_bin after the Yeo-Johnson transform. Without a given lambda,
# scipy returns (transformed data, fitted lambda); only the data is plotted.
asa_bin_yeo = yeojohnson(df['ASA_bin'])[0]
plt.hist(asa_bin_yeo)

In [None]:
# Mean of ASA_bin on its original scale.
df['ASA_bin'].mean()

In [None]:
# Mean of ASA_bin after the Yeo-Johnson transform ([0] selects the transformed
# data from scipy's return value), for comparison with the untransformed mean.
yeojohnson(df['ASA_bin'])[0].mean()

In [None]:
# Features to leave out of the Yeo-Johnson transformation.
no_yeo_list = [
    # brain features
    'brain_lh_rostralmiddlefrontal_area',
    'brain_BrainSegVol',
    'brain_BrainSegVolNotVentSurf',
    'brain_lateraloccipital_area',
    'brain_RightCerebellumWhiteMatter',
    'brain_superiorfrontal_area',
    'brain_TotalGrayVol',
    'brain_lh_superiorfrontal_volume',
    'brain_lhCerebralWhiteMatterVol',
    'brain_superiorfrontal_volume',
    'brain_lh_rostralmiddlefrontal_volume',
    'brain_lateraloccipital_volume',
    'brain_inferiorparietal_area',
    'brain_lh_postcentral_volume',
    'brain_lh_superiorparietal_area',
    'brain_lh_precuneus_volume',
    'brain_rostralmiddlefrontal_volume',
    'brain_rh_inferiorparietal_area',
    'brain_lh_superiorparietal_volume',
    'brain_rh_lateraloccipital_area',
    'brain_lingual_volume',
    'brain_rh_lateraloccipital_volume',
    'brain_rh_superiorparietal_volume',
    'brain_rh_superiorfrontal_area',
    'brain_precuneus_volume',
    'brain_lh_inferiortemporal_volume',
    'brain_SupraTentorialVol',
    'brain_superiorparietal_volume',
    'brain_lh_superiorfrontal_area',
    'brain_lh_G_front_sup_area',
    'brain_LeftCerebellumWhiteMatter',
    'brain_MaskVol',
    'brain_superiorparietal_area',
    'brain_rh_rostralmiddlefrontal_volume',
    'brain_BrainStem',
    'brain_lh_medialorbitofrontal_volume',
    'brain_SupraTentorialVolNotVent',
    'brain_postcentral_volume',
    'brain_rostralmiddlefrontal_area',
    'brain_BrainSegVolNotVent',
    'brain_rh_rostralmiddlefrontal_area',
    'brain_lh_lingual_volume',
    'brain_SupraTentorialVolNotVentVox',
    'brain_lh_G_parietal_sup_volume',
    'brain_CerebellumWhiteMatter',
    'brain_rh_superiorfrontal_volume',
    # blood features
    'blood_T1_IL10_pgml',
    'blood_T1_Volk_IL8_pgml',
    'blood_T1_IL18_pgml_Boraschi',
]

In [None]:
# Store the Yeo-Johnson ignore list as a .npy file in the feature_lists
# directory. A path relative to the notebook is used, matching the other
# feature-list files written above, instead of an absolute path into one
# user's home directory that does not exist on other machines.
np.save('../preprocess_utils/feature_lists/ignorelist_yeo.npy', no_yeo_list)