In [1]:
import sys
import os

import pandas as pd
import matplotlib.pyplot as plt

sys.path.insert(0, os.path.abspath('../..'))
sys.path.insert(0, os.path.abspath('..'))

from src.utils.load_data_utils import get_data, get_train_eval_data

In [2]:
# pharmaimage_good_dataset
# regular path: yeo_Y/z/median/uni_clip_0.9999/multi_clip_Y

In [3]:
class Empty:
    def __init__(self):
        pass

def load_args(data_path):
    args = Empty()
    args.df = data_path
    args.use_pod = 1
    args.use_pocd = 0
    args.eval_only_pod = 0
    args.v = 0
    args.features = None
    args.blood = 0
    args.clinical = 1
    args.imaging = 0
    args.imaging_pca = 0
    args.static = 1
    args.miss_feats = 1
    args.nf = 0
    args.split = 'dev/test'
    return args

In [4]:
import pickle5 as pickle
import importlib
class PickleProtocol:
    def __init__(self, level):
        self.previous = pickle.HIGHEST_PROTOCOL
        self.level = level

    def __enter__(self):
        importlib.reload(pickle)
        pickle.HIGHEST_PROTOCOL = self.level

    def __exit__(self, *exc):
        importlib.reload(pickle)
        pickle.HIGHEST_PROTOCOL = self.previous

def pickle_protocol(level):
    return PickleProtocol(level)
#with pickle_protocol(5):

In [25]:
import umap
import umap.plot

In [50]:
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from xgboost import XGBClassifier
from sklearn.metrics import average_precision_score


def eval_split(data_path, remove_outliers=0, do_umap=False):
    args = load_args(data_path)
    print("Path: ", args.df)
    x_train, y_train, x_eval, y_eval, _, feature_names, class_weights = get_data(args) 
    dev_df_target =  pd.DataFrame(data=y_train[0], columns=["POD"])
    test_df_target =  pd.DataFrame(data=y_eval[0], columns=["POD"])
    print("Diff POD:", (dev_df_target.mean() - test_df_target.mean()).abs().iloc[0])
    
    dev_df = pd.DataFrame(data=x_train[0], columns=feature_names)
    test_df = pd.DataFrame(data=x_eval[0], columns=feature_names)
    
    if "Unnamed: 0" in dev_df.columns:
        dev_df = dev_df.drop(columns=["Unnamed: 0"])
        test_df = test_df.drop(columns=["Unnamed: 0"])
    
    
    #plt.hist(dev_df["clinical_anaesthesia_duration"], bins=30, label="dev", density=True, alpha=1)
    #plt.hist(test_df["clinical_anaesthesia_duration"], bins=30, label="test", density=True, alpha=0.7)
    #plt.legend()
    #plt.show()
    
    fract_diff = (dev_df.mean() - test_df.mean()).abs()
    #print(fract_diff.sort_values().iloc[:10])
    #print(fract_diff.sort_values().iloc[-10:])
    print("Mean diff: ", fract_diff.mean())
    print(len(dev_df), len(test_df))
    combined_data = pd.concat([dev_df, test_df], axis=0)
    non_na_cols = ~dev_df.columns.str.contains("_nan")
    clf = IsolationForest(random_state=0, contamination=0.95, n_estimators=100)
    clf.fit(dev_df.loc[:, non_na_cols])#
    iso_dev_preds = clf.predict(dev_df.loc[:, non_na_cols])
    iso_dev_mask = iso_dev_preds == 1
    print("Isolation Forest:")
    print("Total outliers in dev set: ", iso_dev_mask.sum(), "Prop: ", iso_dev_mask.mean())
    iso_test_preds = clf.predict(test_df.loc[:, non_na_cols])
    iso_test_mask = iso_test_preds == 1
    print("Total outliers in test set: ", iso_test_mask.sum(), "Prop: ", iso_test_mask.mean())
    print()
    
    #print("LOF:")
    #clf = LocalOutlierFactor(n_neighbors=20, contamination=0.48)
    #preds = clf.fit_predict(combined_data)
    #dev_preds = preds[:len(dev_df)]
    #test_preds = preds[:len(test_df)]
    #dev_mask = dev_preds == 1
    #test_mask = test_preds == 1
    #print("Total outliers in dev set: ", dev_mask.sum(), "Prop: ", dev_mask.mean())
    #print("Total outliers in test set: ", test_mask.sum(), "Prop: ", test_mask.mean())
    #print()
    
    # remove devset outliers (should decrease performance)
    train_targets = y_train[0]#[~iso_dev_mask]
    
    if remove_outliers:
        dev_df = dev_df.iloc[np.where(~iso_dev_mask)]
        train_targets = train_targets[~iso_dev_mask]
    
    clf = XGBClassifier(n_estimators=100, max_depth=1, learning_rate=0.09)
    clf.fit(dev_df, train_targets)
    preds = clf.predict_proba(test_df)[:, 1]
    targets = y_eval[0]
    ap = average_precision_score(targets, preds)
    print("AP:", ap)
    
    if do_umap:
        umapper = umap.UMAP(n_components=2)
        umap_dev_embs = umapper.fit_transform(dev_df)

        umap.plot.points(umapper, labels=train_targets, theme="fire")
        print("test data")
        umap_test_embs = umapper.transform(test_df)
        umapper.embedding_ = umap_test_embs
        umap.plot.points(umapper, labels=targets, theme="fire")
        # For fun train a model:
        clf = XGBClassifier(n_estimators=100, max_depth=1, learning_rate=0.09)
        clf.fit(umap_dev_embs, train_targets)
        preds = clf.predict_proba(umap_test_embs)[:, 1]
        targets = y_eval[0]
        ap = average_precision_score(targets, preds)
        print("AP UMAP: ", ap)
        
        # check refitting on test data
        #print("test retrain")
        #umapper.fit_transform(test_df)
        #umap.plot.points(umapper, labels=targets, theme="fire")


In [52]:
eval_split("pharmaimage_good_dataset", do_umap=False)

Path:  pharmaimage_good_dataset
Diff POD: 0.0058212829069336225
Mean diff:  0.04636260970049596
725 186
Isolation Forest:
Total outliers in dev set:  37 Prop:  0.05103448275862069
Total outliers in test set:  9 Prop:  0.04838709677419355

AP: 0.6814940253147471


In [53]:
eval_split("pharmaimage_good_dataset", remove_outliers=1)

Path:  pharmaimage_good_dataset
Diff POD: 0.0058212829069336225
Mean diff:  0.04636260970049596
725 186
Isolation Forest:
Total outliers in dev set:  37 Prop:  0.05103448275862069
Total outliers in test set:  9 Prop:  0.04838709677419355

AP: 0.6805417413044623


In [54]:
eval_split("yeo_Y/z/median/uni_clip_0.9999/multi_clip_Y")

Path:  yeo_Y/z/median/uni_clip_0.9999/multi_clip_Y
Diff POD: 0.003869786127850633
Mean diff:  0.03599501876843749
728 186
Isolation Forest:
Total outliers in dev set:  37 Prop:  0.050824175824175824
Total outliers in test set:  4 Prop:  0.021505376344086023

AP: 0.6370867829683413


In [55]:
eval_split("yeo_Y/z/median/uni_clip_0.9999/multi_clip_Y", remove_outliers=1)

Path:  yeo_Y/z/median/uni_clip_0.9999/multi_clip_Y
Diff POD: 0.003869786127850633
Mean diff:  0.03599501876843749
728 186
Isolation Forest:
Total outliers in dev set:  37 Prop:  0.050824175824175824
Total outliers in test set:  4 Prop:  0.021505376344086023

AP: 0.6489318552656391


In [56]:
eval_split("yeo_Y/z/median/uni_clip_0.9999/multi_clip_N")

Path:  yeo_Y/z/median/uni_clip_0.9999/multi_clip_N
Diff POD: 0.0010781632150971554
Mean diff:  0.03551868572201418
743 186
Isolation Forest:
Total outliers in dev set:  38 Prop:  0.05114401076716016
Total outliers in test set:  15 Prop:  0.08064516129032258

AP: 0.4651123645479984


In [17]:
eval_split("yeo_Y/z/median/uni_clip_0.9999/multi_clip_N", remove_outliers=1)

Path:  yeo_Y/z/median/uni_clip_0.9999/multi_clip_N
Diff POD: 0.0010781632150971554
Mean diff:  0.03551868572201418
743 186
Isolation Forest:
Total outliers in dev set:  38 Prop:  0.05114401076716016
Total outliers in test set:  15 Prop:  0.08064516129032258

AP: 0.46682889129781274


In [18]:
eval_split("yeo_Y/z/median/uni_clip_0.95/multi_clip_Y")

Path:  yeo_Y/z/median/uni_clip_0.95/multi_clip_Y
Diff POD: 0.007606403208519158
Mean diff:  0.0348366394337834
622 186
Isolation Forest:
Total outliers in dev set:  32 Prop:  0.05144694533762058
Total outliers in test set:  15 Prop:  0.08064516129032258

AP: 0.4468360052401391


In [19]:
eval_split("yeo_Y/z/median/uni_clip_0.95/multi_clip_N")

Path:  yeo_Y/z/median/uni_clip_0.95/multi_clip_N
Diff POD: 0.0010781632150971554
Mean diff:  0.03289160507674584
743 186
Isolation Forest:
Total outliers in dev set:  38 Prop:  0.05114401076716016
Total outliers in test set:  13 Prop:  0.06989247311827956

AP: 0.47609244485058194


In [20]:
eval_split("yeo_Y/z/median/uni_clip_0.95/multi_clip_N", remove_outliers=1)

Path:  yeo_Y/z/median/uni_clip_0.95/multi_clip_N
Diff POD: 0.0010781632150971554
Mean diff:  0.03289160507674584
743 186
Isolation Forest:
Total outliers in dev set:  38 Prop:  0.05114401076716016
Total outliers in test set:  13 Prop:  0.06989247311827956

AP: 0.4777579478223559
