In [4]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from matplotlib import pyplot as plt
from os.path import join as oj
import numpy as np
from copy import deepcopy
plt.style.use('dark_background')
import sys
sys.path.append('../src')
import data
import pickle as pkl
import viz
from viz import *
import analyze_helper, train
from sklearn import metrics
from config import *
import pandas as pd
from tqdm import tqdm
outcome_def = 'y_consec_thresh'
out_dir = oj(DIR_RESULTS, 'may7_1') # mar7_2 is 0.95, mar8_1 is 0.96
results = analyze_helper.load_results(out_dir)

# get data
df = data.get_data()
n = df.shape[0]
df_cv = df[df.valid == 1] # exclude test cells, short/long tracks, hotspots
X, y, norms = analyze_helper.normalize(df_cv, outcome_def)

# select model
r = results
r = r.sort_values('accuracy', ascending=False)
idx = np.array(r.index)
accs = np.array(r.accuracy)
# model_name = idx[0]
model_name = 'mlp2_17_select_lasso=7_ros=1_h=0_cal=True'
print('using model', model_name)
for i in range(5):
    print(f'\t{accs[i]:.3f}', idx[i])
    

# load model + preds
d_full_cv, idxs_cv = analyze_helper.get_data_over_folds(model_name, out_dir, df_cv.cell_num, X, y)
y_full_cv = df_cv[outcome_def].iloc[idxs_cv].values.astype(np.int)
preds_cv = d_full_cv[model_name].values
preds_proba_cv = d_full_cv[model_name + '_proba'].values

results_individual = pkl.load(open(oj(out_dir, f'{model_name}.pkl'), 'rb'))
assert np.sum(idxs_cv == np.arange(idxs_cv.size)) == idxs_cv.size, \
       'points not in same order'
assert np.mean(preds_cv==y_full_cv) == np.average(results_individual['cv']['accuracy'], 
                                               weights=results_individual['num_pts_by_fold_cv']), \
        'did not properly load model/data'
tp, tn, fp, fn = analyze_helper.calc_errs(preds_cv, y_full_cv)
print('succesfully loaded!')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload




using model mlp2_17_select_lasso=7_ros=1_h=0_cal=True
	0.760 svm_17_select_lasso=9_ros=1.2_h=1_cal=True
	0.760 svm_17_select_lasso=9_ros=1_h=1_cal=True
	0.759 mlp2_17_select_lasso=9_ros=1_h=1_cal=True
	0.758 mlp2_17_select_lasso=15_ros=1_h=0_cal=True
	0.757 mlp2_17_select_lasso=9_ros=1_h=0_cal=True
succesfully loaded!




# accuracies on different test datasets

**calculate predictions on diff datasets**

In [47]:
orig_dset = 'clath_aux+gak_a7d2'
normalize_by_train = False
df = data.get_data()

# training data
CELLS_TRAIN = config.DSETS[orig_dset]['train']
df_train = df[df.cell_num.isin(CELLS_TRAIN)]
X_train = df_train[data.get_feature_names(df_train)]
X_mean_train = X_train.mean()
X_std_train = X_train.std()

# testing data
CELLS_TEST = config.DSETS[orig_dset]['test']
df_test = df[df.cell_num.isin(CELLS_TEST)]
X_test = df_test[data.get_feature_names(df_test)]
if normalize_by_train:
    X_test = (X_test - X_mean_train) / X_std_train
else:
    X_test = (X_test - X_test.mean()) / X_test.std()
y_test = df_test[outcome_def].values

# get model
m0 = results_individual['imps']['model'][0]
feat_names_selected = results_individual['feat_names_selected']
preds_test = m0.predict(X_test[feat_names_selected]) 
preds_proba_test = m0.predict_proba(X_test[feat_names_selected]) [:, 1]

# set up lists
dsets = ['validation', 'test']
ys_list = [y_full_cv, y_test]
preds_list = [preds_cv, preds_test]
preds_proba_list = [preds_proba_cv, preds_proba_test]
Y_max_list = [df_cv['Y_max'], df_test['Y_max']]

def normalize_and_predict(dset_name, normalize_by_train):
    df_new = data.get_data(dset=dset_name, use_processed=True,
                           use_processed_dicts=True, outcome_def=outcome_def,
                           previous_meta_file=oj(DIR_PROCESSED,
                                                 'metadata_clath_aux+gak_a7d2.pkl'))
    df_new = df_new[df_new['valid']] # exclude test cells, short/long tracks, hotspots
    
    # impute (only does anything for dynamin data)
    df_new = df_new.fillna(df_new.median())
    
    X_new = df_new[data.get_feature_names(df_new)]
    if normalize_by_train:
        X_new = (X_new - X_mean_train) / X_std_train
    else:
        X_new = (X_new - X_new.mean()) / X_new.std()
    y_new = df_new[outcome_def].values
    preds_new = m0.predict(X_new[feat_names_selected]) 
    preds_proba_new = m0.predict_proba(X_new[feat_names_selected])[:, 1]
    Y_maxes = df_new['Y_max']
    return df_new, y_new, preds_new, preds_proba_new, Y_maxes

# loop over new datasets
dset_names = [k for k in config.DSETS.keys() if not k == orig_dset]
for dset_name in tqdm(dset_names):
    _, y_new, preds_new, preds_proba_new, Y_maxes = normalize_and_predict(dset_name, normalize_by_train)
    dsets.append(dset_name)
    ys_list.append(deepcopy(y_new))
    preds_list.append(deepcopy(preds_new))
    preds_proba_list.append(deepcopy(preds_proba_new))
    Y_max_list.append(Y_maxes)




  0%|          | 0/7 [00:00<?, ?it/s][A[A[A


 14%|█▍        | 1/7 [00:00<00:03,  1.98it/s][A[A[A


 29%|██▊       | 2/7 [00:05<00:08,  1.77s/it][A[A[A


 43%|████▎     | 3/7 [00:22<00:25,  6.32s/it][A[A[A


 57%|█████▋    | 4/7 [00:25<00:16,  5.48s/it][A[A[A


 71%|███████▏  | 5/7 [00:26<00:08,  4.09s/it][A[A[A


 86%|████████▌ | 6/7 [00:28<00:03,  3.39s/it][A[A[A


100%|██████████| 7/7 [00:28<00:00,  2.56s/it][A[A[A

**show metrics on different dsets**

In [11]:
def roundd(x):
    try:
        return x.round(2)
    except:
        return [y.round(2) for y in x]
    
def mean_diff(vals, preds):
    return np.mean(vals[preds==1]) - np.mean(vals[preds==0])

r_long = {}
n = len(dsets)
for s in train.scorers:
    if s == 'roc_auc':
        r_long[s] = [roundd(metrics.roc_auc_score(ys_list[i],
                                                  preds_proba_list[i])) for i in range(n)]
    elif 'curve' not in s and 'acc' in s:
        r_long[s] = [roundd(train.scorers[s](ys_list[i], preds_list[i])) for i in range(n)]        
r_long['diff_aux_max_by_class'] = [mean_diff(Y_max_list[i], preds_list[i]) for i in range(n)]
r_long['aux+ ratio'] = [np.mean(ys_list[i]) for i in range(n)]

r = pd.DataFrame.from_dict(r_long).transpose()
r.columns = dsets
r.round(2)

Unnamed: 0,validation,test,clath_aux+gak,clath_aux+gak_a7d2_new,clath_aux_dynamin,clath_aux+gak_new,clath_gak,clath_pi4p_notreatment,ap2_pi4p
balanced_accuracy,0.74,0.77,0.75,0.66,0.69,0.76,0.69,0.75,0.79
accuracy,0.74,0.69,0.75,0.64,0.63,0.76,0.67,0.54,0.58
roc_auc,0.81,0.88,0.81,0.71,0.79,0.83,0.74,0.84,0.83
diff_aux_max_by_class,481.56,552.96,1346.36,1838.8,539.72,1253.91,509.87,5276.23,6708.42
aux+ ratio,0.5,0.22,0.6,0.7,0.26,0.52,0.4,0.96,1.0


In [12]:
print(r.to_latex())

\begin{tabular}{lrrrrrrrrr}
\toprule
{} &  validation &        test &  clath\_aux+gak &  clath\_aux+gak\_a7d2\_new &  clath\_aux\_dynamin &  clath\_aux+gak\_new &   clath\_gak &  clath\_pi4p\_notreatment &     ap2\_pi4p \\
\midrule
balanced\_accuracy     &    0.740000 &    0.770000 &       0.750000 &                0.660000 &           0.690000 &           0.760000 &    0.690000 &                 0.75000 &     0.790000 \\
accuracy              &    0.740000 &    0.690000 &       0.750000 &                0.640000 &           0.630000 &           0.760000 &    0.670000 &                 0.54000 &     0.580000 \\
roc\_auc               &    0.810000 &    0.880000 &       0.810000 &                0.710000 &           0.790000 &           0.830000 &    0.740000 &                 0.84000 &     0.830000 \\
diff\_aux\_max\_by\_class &  481.561862 &  552.964877 &    1346.360305 &             1838.796365 &         539.723075 &        1253.913174 &  509.865985 &              5276.23233 &  6708.

# diff dataset summaries

In [13]:
# process original data (and save out lifetime thresholds)
dset_names = [k for k in sorted(config.DSETS.keys()) if not 'pi4p' in k]
# dsets = ['clath_aux', 'orig_gak', 'clath_aux_no_a7d2', 'clath_aux_a8', 'clath_pi4p_notreatment']

NUM_DSETS = len(dset_names)
rs = {
    k: [] for k in ['X_mean', 'Y_max']
}
ds = {
    k: [] for k in ['lifetime']
}
for dset in tqdm(dset_names):
    # process new data (using lifetime thresholds from original data)
    df = data.get_data(dset=dset,
                  previous_meta_file='processed/metadata_orig.pkl')
    for k in rs.keys():
        rs[k].append(df[k].mean())
    for k in ds.keys():
        ds[k].append(df[k])



  0%|          | 0/6 [00:00<?, ?it/s][A[A

 17%|█▋        | 1/6 [00:00<00:01,  4.87it/s][A[A

 33%|███▎      | 2/6 [00:00<00:01,  2.80it/s][A[A

 50%|█████     | 3/6 [00:04<00:03,  1.25s/it][A[A

 67%|██████▋   | 4/6 [00:04<00:02,  1.03s/it][A[A

 83%|████████▎ | 5/6 [00:15<00:03,  3.84s/it][A[A

100%|██████████| 6/6 [00:16<00:00,  3.09s/it][A[A

**means of some features**

In [None]:
R, C = 1, 2
plt.figure(figsize=(8, 3), dpi=500)
for i, k in enumerate(rs.keys()):
    plt.subplot(R, C, i + 1)
    plt.barh(dset_names, rs[k], color=cb)
    plt.xlabel('Average ' + k)
plt.tight_layout()
plt.show()

In [None]:
R, C = 1, len(ds.keys())
plt.figure(figsize=(8, 3), dpi=500)
for i, k in enumerate(ds.keys()):
    ax = plt.subplot(R, C, i + 1)
    # plt.barh(dset_names, [np.mean(x) for x in ds[k]], color=cb)
    ax.violinplot([val.values for val in ds[k]], vert=False, widths=1, showmedians=True, showextrema=True) #, quantiles=[25, 50])
    plt.yticks(np.arange(len(dset_names)) + 1, dset_names)
    plt.xlabel(k)
    plt.xscale('log')
plt.tight_layout()
plt.show()

# lower res data

In [None]:
accs = []
DOWNSAMPLE_list = range(1, 21)
for DOWNSAMPLE in tqdm(DOWNSAMPLE_list):

    # downsample
    df_cv = deepcopy(df[df.valid == 1]) # exclude test cells, short/long tracks, hotspots
    df_cv['X'] = [x[::DOWNSAMPLE] for x in df_cv.X]
    df_cv['X_extended'] = [x[::DOWNSAMPLE] for x in df_cv.X_extended]
    df_cv['lifetime'] = [len(x) for x in df_cv.X]
    df_cv = data.add_features(df_cv)


    # get data
    X, y, norms = analyze_helper.normalize(df_cv, outcome_def)
    d_full_cv, idxs_cv = analyze_helper.get_data_over_folds(model_name, out_dir, df_cv.cell_num, X, y)
    y_full_cv = df_cv[outcome_def].iloc[idxs_cv].values.astype(np.int)
    preds = d_full_cv[model_name].values
    preds_proba = d_full_cv[model_name + '_proba'].values
    acc = np.mean(preds==y_full_cv)
    accs.append(acc)
#     print(f'downsampling rate {DOWNSAMPLE} acc {acc.round(3)}')

In [None]:
plt.figure(dpi=300)
plt.plot(DOWNSAMPLE_list, accs, '.-', color=cb)
plt.xlabel('Downsamping factor')
plt.ylabel('Accuracy on difficult region')
plt.savefig('downampling.pdf')
plt.show()

In [None]:
# plot of example track
track_num = 3
ex = deepcopy(df[df.valid == 1]).iloc[track_num]
viz.plot_example(ex)
plt.plot(np.arange(len(ex.X))[::3], ex.X[::3], 'o', color='w', alpha=0.5)
plt.show()

# viz biggest errs

In [45]:
dset_name = 'clath_aux_dynamin'
df_new, y_new, preds_new, preds_proba_new, Y_maxes = normalize_and_predict(dset_name, normalize_by_train=False)

['lifetime', 'cell_num', 'catIdx', 't', 'mean_total_displacement', 'mean_square_displacement', 'x_pos_seq', 'y_pos_seq', 'x_pos', 'y_pos', 'X_pvals', 'X_extended', 'X', 'X_starts', 'X_ends', 'Y_pvals', 'Y', 'Y_starts', 'Y_ends', 'Z_pvals', 'Z', 'Z_starts', 'Z_ends', 'lifetime_extended', 'pid', 'valid', 'X_max', 'X_max_extended', 'X_min', 'X_mean', 'X_std', 'Y_max', 'Y_mean', 'Y_std', 'X_peak_idx', 'Y_peak_idx', 'X_peak_time_frac', 'slope_end', 'X_peak_last_15', 'X_peak_last_5', 'rise', 'fall', 'rise_extended', 'fall_extended', 'fall_late_extended', 'rise_slope', 'fall_slope', 'rise_local_3', 'fall_local_3', 'rise_local_11', 'fall_local_11', 'max_diff', 'min_diff', 'y_score', 'y_thresh', 'y', 'y_num_sig', 'y_single_sig', 'y_double_sig', 'y_conservative_thresh', 'y_consec_sig', 'y_sig_min_diff', 'y_consec_thresh', 'sig_idxs', 'hotspots', 'Y_peak_time_frac', 'y_z_score', 'X_max_around_Y_peak', 'X_max_after_Y_peak', 'X_max_diff', 'y_rule_based', 'short', 'long']


In [None]:
# visualize the biggest errors
num_to_plot = 25
# print('total pts', preds.shape[0])
# for idxs, name in zip([fp, fn, tp, tn], ['fp', 'fn', 'tp', 'tn']):
for idxs, name in zip([fp, fn], ['fp', 'fn']):
# for idxs, name in zip([tp, tn], ['tp', 'tn']):
    print(name)
    inds = viz.viz_biggest_errs(df_new, None, None,
                                y_new,
                                preds_new,
                                preds_proba_new,
                                num_to_plot,
                                plot_z=True, xlim_constant=False)
#     plt.savefig(f'{name}.pdf')
    plt.show()