In [50]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from matplotlib import pyplot as plt
from os.path import join as oj
import numpy as np
from copy import deepcopy
plt.style.use('dark_background')
import data
from matplotlib_venn import venn3, venn2
import pickle as pkl
import viz
from style import *
import analyze_helper
from sklearn import decomposition
from sklearn.calibration import calibration_curve
import config
import train 
import pandas as pd
from tqdm import tqdm
outcome_def = 'y_consec_thresh'
out_dir = oj('/scratch/users/vision/abc', 'apr28_1') # mar7_2 is 0.95, mar8_1 is 0.96
results = analyze_helper.load_results(out_dir)

# get data
df = data.get_data()
n = df.shape[0]
df_cv = df[df.valid == 1] # exclude test cells, short/long tracks, hotspots
X, y, norms = analyze_helper.normalize(df_cv, outcome_def)

# select model
r = results
r = r.sort_values('accuracy', ascending=False)
idx = np.array(r.index)
accs = np.array(r.accuracy)
model_name = idx[0]
# model_name = 'svm_16_ros=1.2_select_rf=3'
print('using model', model_name)
for i in range(5):
    print(f'\t{accs[i]:.3f}', idx[i])
    

# load model + preds
d_full_cv, idxs_cv = analyze_helper.get_data_over_folds(model_name, out_dir, df_cv.cell_num, X, y)
y_full_cv = df_cv[outcome_def].iloc[idxs_cv].values.astype(np.int)
preds_cv = d_full_cv[model_name].values
preds_proba_cv = d_full_cv[model_name + '_proba'].values

results_individual = pkl.load(open(oj(out_dir, f'{model_name}.pkl'), 'rb'))
assert np.sum(idxs_cv == np.arange(idxs_cv.size)) == idxs_cv.size, \
       'points not in same order'
assert np.mean(preds_cv==y_full_cv) == np.average(results_individual['cv']['accuracy'], 
                                               weights=results_individual['num_pts_by_fold_cv']), \
        'did not properly load model/data'
tp, tn, fp, fn = analyze_helper.calc_errs(preds_cv, y_full_cv)
print('succesfully loaded!')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
using model svm_16_None=3_ros=1_h=1_cal=True
	0.760 svm_16_None=3_ros=1_h=1_cal=True
	0.758 mlp2_16_None=3_ros=1.2_h=0_cal=True
	0.757 svm_16_None=3_ros=1_h=0_cal=True
	0.756 svm_16_None=3_ros=1.2_h=1_cal=True
	0.756 mlp2_16_None=3_ros=1_h=-1_cal=True
succesfully loaded!


# lower res data

**data at lower res**

In [46]:
accs = []
DOWNSAMPLE_list = [1, 2, 3, 4, 5, 10, 20]
for DOWNSAMPLE in tqdm(DOWNSAMPLE_list):


    # downsample
    df_cv = deepcopy(df[df.valid == 1]) # exclude test cells, short/long tracks, hotspots
    df_cv['X'] = [x[::DOWNSAMPLE] for x in df_cv.X]
    df_cv['X_extended'] = [x[::DOWNSAMPLE] for x in df_cv.X_extended]
    df_cv['lifetime'] = [len(x) for x in df_cv.X]
    df_cv = data.add_features(df_cv)


    # get data
    X, y, norms = analyze_helper.normalize(df_cv, outcome_def)
    d_full_cv, idxs_cv = analyze_helper.get_data_over_folds(model_name, out_dir, df_cv.cell_num, X, y)
    y_full_cv = df_cv[outcome_def].iloc[idxs_cv].values.astype(np.int)
    preds = d_full_cv[model_name].values
    preds_proba = d_full_cv[model_name + '_proba'].values
    acc = np.mean(preds==y_full_cv)
    accs.append(acc)
#     print(f'downsampling rate {DOWNSAMPLE} acc {acc.round(3)}')

100%|██████████| 7/7 [00:28<00:00,  3.66s/it]


In [None]:
plt.figure(dpi=300)
plt.plot(DOWNSAMPLE_list, accs, '.-', color=cb)
plt.xlabel('downsampling factor')
plt.ylabel('hard acc')
plt.show()

In [None]:
track_num = 3
ex = deepcopy(df[df.valid == 1]).iloc[track_num]
# print(ex)
viz.plot_example(ex)
plt.plot(np.arange(len(ex.X))[::3], ex.X[::3], 'o', color='w', alpha=0.5)
plt.show()

# look at test data

In [26]:
CELLS_TRAIN = config.SPLITS['orig']['train']
df_train = df[df.cell_num.isin(CELLS_TRAIN)]
X_train = df_train[data.get_feature_names(df_train)]
X_mean_train = X_train.mean()
X_std_train = X_train.std()

CELLS_TEST = config.SPLITS['orig']['test']
df_test = df[df.cell_num.isin(CELLS_TEST)]
X_test = df_test[data.get_feature_names(df_test)]
X_test = (X_test - X_mean_train) / X_std_train
y_test = df_test[outcome_def].values

df_new = data.get_data(dset='clath_aux', use_processed=True,
                   use_processed_dicts=True, outcome_def=outcome_def,
                   previous_meta_file='processed/metadata_orig.pkl')
X_new = df_new[data.get_feature_names(df_new)]
X_new = (X_new - X_mean_train) / X_std_train
y_new = df_new[outcome_def].values

In [31]:
m0 = results_individual['imps']['model'][0]
feat_names_selected = results_individual['feat_names_selected']
preds_test = m0.predict(X_test[feat_names_selected]) 
preds_new = m0.predict(X_new[feat_names_selected]) 

In [51]:
def roundd(x):
    try:
        return x.round(2)
    except:
        return [y.round(2) for y in x]
r_long = {}
for s in train.scorers:
    r_long[s] = [roundd(train.scorers[s](y_full_cv, preds_cv)),
                 roundd(train.scorers[s](y_test, preds_test)), 
                 roundd(train.scorers[s](y_new, preds_new))]

def mean_diff(df, k, preds):
    return np.mean(df[k][preds==1] - df[k][preds==0])
    
for k in ['X_max']:
    r_long[k] = [mean_diff(df_cv, k, preds)]

IndexError: boolean index did not match indexed array along dimension 0; dimension is 2936 but corresponding boolean dimension is 1237

In [44]:
r = pd.DataFrame.from_dict(r_long).transpose()
r.columns = ['train', 'test', 'new']
r

Unnamed: 0,train,test,new
balanced_accuracy,0.76,0.76,0.66
accuracy,0.76,0.66,0.76
precision,0.75,0.39,0.59
recall,0.78,0.93,0.43
f1,0.76,0.55,0.5
roc_auc,0.76,0.76,0.66
precision_recall_curve,"[[0.5, 0.75, 1.0], [1.0, 0.78, 0.0], [0.0, 1.0]]","[[0.22, 0.39, 1.0], [1.0, 0.93, 0.0], [0.0, 1.0]]","[[0.28, 0.59, 1.0], [1.0, 0.43, 0.0], [0.0, 1.0]]"
roc_curve,"[[0.0, 0.26, 1.0], [0.0, 0.78, 1.0], [2.0, 1.0...","[[0.0, 0.41, 1.0], [0.0, 0.93, 1.0], [2.0, 1.0...","[[0.0, 0.12, 1.0], [0.0, 0.43, 1.0], [2.0, 1.0..."


In [None]:
viz.plot_confusion_matrix(y_test, preds, 
                          classes=np.array(['aux-', 'aux+']), normalize=False)

In [None]:
for s in train.scorers:
    print(s, f'train: {roundd(train.scorers[s](y_full_cv, preds_cv))}\ttest: {roundd(train.scorers[s](y_new, preds_test))}')    