In [1]:
import sys
import os
import joblib
import time
import ast

import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score
import optuna

sys.path.insert(0, os.path.abspath('../..'))
sys.path.insert(0, os.path.abspath('..'))

from src.train import _init_model, start_training, get_args
from src.utils.load_data_utils import get_data, get_train_eval_data
import src.rfe as rfe
from src.eval import evaluate_model
from src.tune import get_tune_args


In [2]:
class Empty:
    """Blank attribute holder.

    Instances start without any attributes of their own; callers attach the
    fields they need after construction (used as a lightweight stand-in for
    an argument namespace).
    """

  and should_run_async(code)


In [3]:
def get_args_from_study(study_name, storage="sqlite:///optuna.db"):
    """Rebuild a training-argument object from the best trial of an Optuna study.

    The best trial's hyper-parameters are translated into the attribute names
    the training code reads, and the remaining attributes are filled with the
    fixed data/feature configuration used in this notebook.

    Args:
        study_name: Name of the Optuna study to load.
        storage: Optuna storage URL holding the study. Defaults to the local
            ``optuna.db`` SQLite file.

    Returns:
        An ``Empty`` instance carrying the arguments as attributes.

    Note:
        Only MLP studies (recognised by an ``"mlp_act"`` parameter) are
        translated. For any other study the model attribute ``m`` and the
        model hyper-parameters are NOT set on the returned object.
    """
    # Keyword arguments keep this call valid on Optuna versions where
    # load_study only accepts keywords.
    study = optuna.study.load_study(study_name=study_name, storage=storage)
    params = study.best_trial.params

    args = Empty()

    if "mlp_act" in params:
        # The study stores depth and width separately; the MLP wants one
        # width entry per hidden layer. At least one layer is always emitted.
        layers = params["layers"]
        nodes = params["nodes"]
        args.mlp_hl = (nodes,) * max(layers, 1)
        args.mlp_act = params["mlp_act"]
        args.mlp_solver = 'sgd'
        args.mlp_lr = params["mlp_lr"]
        args.mlp_lr_init = params['mlp_lr_init']
        args.mlp_max_iter = params['mlp_max_iter']
        args.m = "mlp"

    # Run-control flags.
    args.v = 0
    args.save = 0
    args.from_config = 0

    # Data set variant and feature-group selection.
    args.df = "yeo_Y/z/median/uni_clip_0.9999/multi_clip_Y"
    args.features = None
    args.blood = 0
    args.clinical = 1
    args.imaging = 0
    args.imaging_pca = 0
    args.static = 1
    args.miss_feats = 1

    # Target selection.
    args.use_pod = 1
    args.use_pocd = 0
    args.eval_only_pod = 0

    # Remaining training options.
    args.log = 0
    args.use_class_weights = 0
    args.pos_weight = 0
    args.mlp_tol = 1e-4
    args.mlp_v = 0
    return args

In [4]:
# Names of previously run studies that can be plugged in below:
# good xgboost: Nov-14-21:23:35_xgb_prauc_3_clinical_20000
# new xgboost: Dec-04-16:41:56_xgb_prauc_3_clinical_10000
# mlp: Nov-16-10:55:21_mlp_prauc_3_clinical_20000
# torch: 2020-10-19-23-27-19_torch_median_auc_3

############ Specify this: ########################
# The study whose best arguments are retrained on shrinking training sets.
study_name = "Dec-04-16:41:56_xgb_prauc_3_clinical_10000"  #"Nov-16-10:52:25_torch_prauc_3_blood_20000_median"
####################################################

def get_best_trial(study_name):
    """Load the pickled argument object saved alongside a study, if any.

    Looks in ``optuna_studies/<study_name>`` for files whose name ends with
    ``rank2.pkl``. If several files match, the last one in sorted order is
    loaded, so the choice is deterministic.

    Args:
        study_name: Name of the study folder inside ``optuna_studies``.

    Returns:
        The object unpickled by joblib, or the empty string ``''`` when the
        study folder does not exist or contains no matching file. Callers
        use ``''`` as the "nothing found" sentinel.
    """
    study_folder = os.path.join("optuna_studies", study_name)
    # A missing folder means "nothing saved", not an error, so that callers
    # can fall back to another source of arguments.
    if not os.path.isdir(study_folder):
        return ''

    matches = sorted(
        file for file in os.listdir(study_folder) if file.endswith("rank2.pkl")
    )
    if not matches:
        return ''

    # NOTE: joblib.load unpickles arbitrary objects; only point this at
    # study folders you trust.
    return joblib.load(os.path.join(study_folder, matches[-1]))

# First try the argument object pickled in the study folder; the result is
# '' when no "rank2.pkl" file is found there.
best_args = get_best_trial(study_name)
best_args

Namespace(act_fnc='relu', alpha_dropout=0.0, augm_std=0.0, batch_size=32, beta=0.9, blood=False, bn=0, cache_dir='.cache/tune_trials/Fri_Dec__4_16:41:56_2020', clinical=True, df='yeo_Y/z/median/uni_clip_0.9999/multi_clip_Y', dropout=0.0, dt='data', dts='clinical', e=None, ensemble_bootstrap=0, ensemble_k=0, ensemble_prior=0, eval_only_pod=0, features=None, fill_method='median', freeze_prepro='1', from_config=None, hidden_size=128, imaging=False, imaging_pca=False, imaging_pca_var=0.8, l=None, log=False, lr=0.005, m='xgb', max_eps=50, max_est=200, metric='prauc', min_est=1, miss_feats=1, mlp_act='relu', mlp_hl=(100,), mlp_lr='constant', mlp_lr_init=0.001, mlp_max_iter=200, mlp_solver='adam', mlp_tol=0.0001, mlp_v=0, momentum=0.9, n_layers=2, nf=3, norm_method='z', nt=2000, optimizer='adam', pocd_weight=1.0, pod_weight=1.0, pos_weight=0, pp=0, pruner='median', remove_multi_outliers=1, remove_outliers=0.9999, rf_max_depth=None, rf_num_est=100, saliency_n_samples=50, saliency_std=0.01, sav

In [5]:
# '' is the "nothing found" sentinel from get_best_trial: in that case
# rebuild the arguments from the best trial stored in the Optuna database.
if best_args == "":
    best_args = get_args_from_study(study_name)

In [6]:
# Evaluate on the dev set: override the split settings on the loaded arguments.
# NOTE(review): presumably nf is the number of cross-validation folds and 0
# means a single split — confirm against get_data.
best_args.nf = 0
best_args.split = "dev/test" 

# Uncomment to save the retrained model.
# best_args.save = True

# Load the train/eval data once; the cells below index x_train and y_train
# per split. The fifth return value is not used in this notebook.
x_train, y_train, x_eval, y_eval, _, feature_names, class_weights = get_data(best_args) 

In [7]:
# Sanity check: how many training splits get_data returned.
len(x_train)

  and should_run_async(code)


1

In [8]:
# Shape of the first training split (presumably samples x features — confirm).
x_train[0].shape

(726, 90)

In [9]:
# Fractions of the training split to retrain on: from 0.1 up to (but not
# including) 1.0 in steps of 0.02.
perc_list = np.arange(0.1, 1, 0.02)

In [10]:
# Number of rows in the full first training split; the reduced training-set
# sizes are computed as fractions of this.
orig_len = x_train[0].shape[0]

In [11]:
import numpy as np

In [11]:
def _take_rows(data, row_idcs):
    """Return the rows of `data` at the given integer positions.

    pandas objects are indexed positionally via ``.iloc``; anything else is
    assumed to support integer-array indexing (e.g. a numpy array).
    """
    if hasattr(data, "iloc"):
        return data.iloc[row_idcs]
    return data[row_idcs]


n_repeats = 5  # independent repetitions per training-set fraction

# all_scores[r][k] is the score of repetition r at fraction perc_list[k].
all_scores = []
for _ in range(n_repeats):
    scores = []
    for perc in perc_list:
        reduced_len = int(orig_len * perc)
        # Draw a fresh random subset for every run. Sampling is without
        # replacement so no row is duplicated, and the indices are sorted so
        # the selected rows keep their original relative order.
        reduced_idcs = np.sort(np.random.choice(orig_len, reduced_len, replace=False))
        x_train_red = [_take_rows(x_train[0], reduced_idcs)]
        y_train_red = [_take_rows(y_train[0], reduced_idcs)]
        # New model seed per run so repetitions also differ in initialisation.
        best_args.seed = np.random.randint(0, 1000)
        # retrain with dataset subselection
        score, models = start_training(best_args, None, 'prauc', x_train_red, y_train_red, x_eval, y_eval, feature_names, class_weights)
        print("Num samples now: ", x_train_red[0].shape[0], "Score:", f'{score:.3f}')
        scores.append(score)
    all_scores.append(scores)

660
Num samples now:  72 Score: 0.481
645
Num samples now:  87 Score: 0.461
987
Num samples now:  101 Score: 0.434
428
Num samples now:  116 Score: 0.517
212
Num samples now:  130 Score: 0.493
203
Num samples now:  145 Score: 0.476
219
Num samples now:  159 Score: 0.507
255
Num samples now:  174 Score: 0.406
611
Num samples now:  188 Score: 0.446
918


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/anton/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-8ec56511224e>", line 11, in <module>
    score, models = start_training(best_args, None, 'prauc', x_train_red, y_train_red, x_eval, y_eval, feature_names, class_weights)
  File "/home/anton/AdaLab/pharmaimage/src/train.py", line 279, in start_training
    models = _train_model(feature_names, x_train, y_train, x_eval, y_eval, class_weights, callbacks, args)
  File "../src/utils/logging.py", line 10, in wrapper
    result = func(*args, **kwargs)
  File "/home/anton/AdaLab/pharmaimage/src/train.py", line 244, in _train_model
    model.fit(x_train[i], y_train[i])
  File "/home/anton/anaconda3/lib/python3.7/site-packages/xgboost/sklearn.py", line 824, in fit
    callbacks=callbacks)
  File "/home/anton/anaconda3/lib/python3.7/site-packages/xgboost/training.py", li

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/anton/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-8ec56511224e>", line 11, in <module>
    score, models = start_training(best_args, None, 'prauc', x_train_red, y_train_red, x_eval, y_eval, feature_names, class_weights)
  File "/home/anton/AdaLab/pharmaimage/src/train.py", line 279, in start_training
    models = _train_model(feature_names, x_train, y_train, x_eval, y_eval, class_weights, callbacks, args)
  File "../src/utils/logging.py", line 10, in wrapper
    result = func(*args, **kwargs)
  File "/home/anton/AdaLab/pharmaimage/src/train.py", line 244, in _train_model
    model.fit(x_train[i], y_train[i])
  File "/home/anton/anaconda3/lib/python3.7/site-packages/xgboost/sklearn.py", line 824, in fit
    callbacks=callbacks)
  File "/home/anton/anaconda3/lib/python3.7/site-packages/xgboost/training.py", li

TypeError: object of type 'NoneType' has no len()

In [None]:
# Number of training-set fractions that were evaluated.
len(perc_list)

In [None]:
# Averaging over axis 0 (the repetitions) should leave one value per fraction.
np.mean(all_scores, axis=0).shape

In [None]:
# Collapse the repetition axis: one mean and one standard deviation per
# training-set fraction.
score_matrix = np.asarray(all_scores)
mean_scores = score_matrix.mean(axis=0)
std_scores = score_matrix.std(axis=0)

In [None]:
import matplotlib.pyplot as plt

# Mean score per training-set fraction, with a +/- one standard deviation
# band drawn in the same colour as the line.
p = plt.plot(perc_list, mean_scores)
lower_band = mean_scores - std_scores
upper_band = mean_scores + std_scores
plt.fill_between(perc_list, lower_band, upper_band, color=p[0].get_color(), alpha=0.5)
plt.xlabel("Fraction of training data")
plt.ylabel("AP")

# Write the same figure once per output format.
for out_path in (
    f"train_data_effect_{study_name}.jpg",
    f"train_data_effect_{study_name}.png",
    f"train_data_effect_{study_name}.pdf",
):
    plt.savefig(out_path)