## RF: random-forest stacking of per-model TTA predictions

In [1]:
# import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from rsnautils import *

In [2]:
# Collect every `*_tta.pkl` file found under `path_pred` into an `L` list.
# NOTE(review): `path_pred` and `L` are not defined in this notebook —
# presumably they come from `from rsnautils import *`; confirm.
pkls = L(path_pred.glob('*_tta.pkl'))

In [3]:
# First prediction file; used further down as a sample (its targets are loaded with `fn.load()`).
fn = pkls[0]

In [4]:
def read_preds(fn):
    """Load the predictions stored at `fn` into a DataFrame.

    `fn.load()` must return a `(preds, targs)` pair; only `preds` is used.
    Columns are named `<model>:<label>`, one per entry of `htypes`, where
    `<model>` is the file stem without its last four characters.
    """
    preds, _unused_targs = fn.load()
    # Strip the trailing four characters of the stem ("_tta" for the files globbed in this notebook).
    model_name = fn.stem[:-4]
    frame = pd.DataFrame(preds.numpy())
    frame.columns = [':'.join((model_name, htype)) for htype in htypes]
    return frame

In [5]:
# NOTE(review): `dbch` is not referenced again in the visible cells, and the name
# `get_data` is rebound further down by `def get_data(label, x, y)` — confirm this
# cell is still needed and consider a distinct name for one of the two.
get_data = get_rsna_data_func(1, nw=8)
dbch = get_data(128, None, use_wgt=False)

In [6]:
# One prediction DataFrame per pickle file, produced by `read_preds`.
pred_dfs = pkls.map(read_preds)

In [7]:
# Join the per-model frames column-wise (axis=1) so each row carries every model's predictions.
Xdf = pd.concat(pred_dfs, axis=1)

In [30]:
def split_data(df):
    """Return `(other_positions, validation_positions)` for the rows of `df`.

    A row counts as validation when its index label is a member of `val_sops`.
    """
    positions = L.range(df)
    in_valid = df.index.isin(set(val_sops))
    non_valid_positions = positions[~in_valid]
    valid_positions = positions[in_valid]
    return non_valid_positions, valid_positions

In [16]:
# Load the (preds, targs) pair stored in the sample file.
preds,targs = fn.load()

In [25]:
# Targets as a numpy array with the last two rows dropped.
# NOTE(review): `-2` is a magic number — presumably it trims the targets to the
# length of `y_valid`; confirm and derive it from the data instead.
t = targs.numpy()[:-2]

In [31]:
# (non-validation, validation) row positions of `df_comb`, as returned by `split_data`.
splits = split_data(df_comb)

In [33]:
# Label columns (`htypes`) of the validation rows, selected by position.
y_valid = df_comb.iloc[splits[1]][htypes]

In [34]:
# Fraction of label entries that agree position-by-position between the pickled targets and `y_valid`.
# NOTE(review): the recorded value (~0.90) and the differing non-zero row positions shown by the
# next two cells suggest the two arrays are not in the same row order — confirm before relying
# on positional alignment between the pickled predictions and `y_valid`.
np.equal(t, y_valid.values).mean()

0.8986414202337001

In [36]:
# (row, column) positions of the non-zero entries in the pickled targets.
np.nonzero(t)

(array([    37,     37,     38,     38, ..., 136742, 136742, 136743, 136743]),
 array([0, 5, 0, 4, ..., 0, 5, 0, 5]))

In [35]:
# (row, column) positions of the non-zero entries in `y_valid`, for comparison with the cell above.
np.nonzero(y_valid.values)

(array([   256,    256,    257,    257, ..., 136762, 136762, 136763, 136763]),
 array([0, 5, 0, 5, ..., 0, 3, 0, 3]))

In [9]:
# Loss callable used by the scoring cells below.
# NOTE(review): `get_loss` is not defined in this notebook — presumably from `rsnautils`; confirm which loss it builds.
loss = get_loss()

In [30]:
def calc_loss(df):
    """Score one model's prediction frame against the validation targets `y_valid`.

    Args:
        df: DataFrame of per-label predictions for one model; its first
            `len(y_valid)` rows are compared with `y_valid` row by row.

    Returns:
        `(loss, accuracy_multi)` as Python floats.

    NOTE(review): rows are matched by position, so `df` is assumed to be in the
    same row order as `y_valid` — confirm. `logit` is applied because the stored
    predictions are presumably probabilities while `loss` expects logits.
    """
    # Trim to the number of target rows instead of a hard-coded row count, so the
    # function keeps working when the validation set changes size.
    n_valid = len(y_valid)
    ypred = logit(to_device(tensor(df.values)))[:n_valid]
    targ = to_device(tensor(y_valid))
    return loss(ypred, targ).cpu().item(), accuracy_multi(ypred, targ).cpu().item()

# Score every prediction file with `calc_loss`, keyed by file stem, and show the scores sorted.
model_names = [p.stem for p in pkls]
model_scores = L(pred_dfs).map(calc_loss)
losses = dict(zip(model_names, model_scores))
pd.Series(losses).sort_values()

xserxt18_wgtd_tta                (0.576793372631073, 0.9059083461761475)
xrn18_wgtd_tta                  (0.5804729461669922, 0.9056243896484375)
rn18_wgtd_no_pre_tta            (0.5809637308120728, 0.9058936834335327)
train_rn18_orig_tta             (0.5880447626113892, 0.9056402444839478)
impre-xrn34-freeze2-deep_tta     (0.597140371799469, 0.9049774408340454)
impre-xrn34-freeze2_tta         (0.5978264212608337, 0.9049201607704163)
train_rn18_all_tta              (0.5992218255996704, 0.9053905010223389)
rn18_wgtd_freeze2_tta           (0.6032768487930298, 0.9054087400436401)
train_rn34_tta                   (0.603354811668396, 0.9052966237068176)
xrn34_wgtd_deep_tta             (0.6039314270019531, 0.9047532081604004)
xrn34_wgtd_deepish_tta          (0.6092332005500793, 0.9053575992584229)
train_rn50_all_tta              (0.6109864115715027, 0.9048433899879456)
rn18_wgtd_freeze2_rep_tta       (0.6111599206924438, 0.9053490161895752)
train_rn18_nofr_tta              (0.612043917179107

In [21]:
# Full `df_comb` rows for the validation samples, selected by `y_valid`'s index labels.
valid_df = df_comb.loc[y_valid.index]

In [22]:
# Split the validation rows 80/20 by patient, so that all rows of one PatientID
# land on the same side. The 80% part trains the stacking forests; the 20% part scores them.
# NOTE(review): `set_seed` presumably seeds numpy's global RNG, which `np.random.shuffle` uses — confirm.
set_seed(42)
val_patients = valid_df.PatientID.unique()
np.random.shuffle(val_patients)  # shuffles the array of unique patient ids in place
split_idx = int(0.8*len(val_patients))
train_p, val_p = val_patients[0:split_idx], val_patients[split_idx:]
# Despite the `_idx` names these are boolean row masks over `valid_df`, not positions.
train_idx = valid_df.PatientID.isin(train_p).values
val_idx = valid_df.PatientID.isin(val_p).values

# NOTE(review): the masks are applied positionally, which assumes `Xdf` and `y_valid`
# have the same number of rows, in the same order, as `valid_df` — confirm.
train_x  = Xdf.loc[train_idx]
train_y  = y_valid.loc[train_idx]

val_x  = Xdf.loc[val_idx]
val_y  = y_valid.loc[val_idx]


In [23]:
# Every label in `htypes` except the aggregate 'any' label.
labels = [htype for htype in htypes if htype != 'any']

In [24]:
# train label

In [25]:
def feature_cols(df, label):
    """Return the columns of `df` whose second ':'-separated field equals `label`."""
    return [c for c in df.columns if c.split(':')[1] == label]

def model_cols(df, model):
    """Return the columns of `df` whose first ':'-separated field equals `model`."""
    return [c for c in df.columns if c.split(':')[0] == model]

def pull_features(df, label):
    """Select the `label` columns of `df` followed by the 'any' columns."""
    any_cols = feature_cols(df, 'any')
    feat_cols = feature_cols(df, label)
    return df[feat_cols+any_cols]

def pull_any_features(df, model):
    """Select the 'any' columns of `df` plus every column belonging to `model`.

    Columns appear once each, in first-seen order: all 'any' columns (in `df`
    order), then the remaining `model` columns (in `df` order).
    """
    any_cols = feature_cols(df, 'any')
    m_cols = model_cols(df, model)
    # De-duplicate with dict.fromkeys rather than set(): set iteration order for
    # strings changes between interpreter sessions (hash randomisation), which made
    # the feature order — and therefore the fitted forest — differ from run to run.
    cols = list(dict.fromkeys(any_cols+m_cols))
    return df[cols]

In [26]:
def get_data(label, x, y):
    """Return `(features, target)` for training or scoring the forest of `label`.

    For 'any' the features come from `pull_any_features` (with `label` passed as
    its `model` argument); for every other label they come from `pull_features`.
    The target is the `label` entry of `y`.

    NOTE(review): this definition rebinds the name `get_data` that an earlier
    cell assigned from `get_rsna_data_func(...)`.
    """
    if label == 'any':
        features = pull_any_features(x, label)
    else:
        features = pull_features(x, label)
    return features, y[label]

In [27]:
def train_label(label, x, y):
    """Fit a random forest for `label` on the features that `get_data` selects from `x`.

    The fitted classifier is returned with an extra `feat_imp_cols` attribute
    holding the feature column names, used later when listing feature importances.
    """
    print(f'training {label}')
    rf_params = dict(n_estimators=100, min_samples_leaf=10, max_features=0.9, n_jobs=16, oob_score=True)
    forest = RandomForestClassifier(**rf_params)
    features, target = get_data(label, x, y)
    forest = forest.fit(features, target)
    # Remember which columns the forest was trained on, in training order.
    forest.feat_imp_cols = features.columns
    return forest

In [28]:
def train(label):
    """Fit the forest for `label` on the notebook-level training split (`train_x`, `train_y`)."""
    return train_label(label, train_x, train_y)

# One fitted forest per label in `htypes`, keyed by label name.
rfs = {htype: train(htype) for htype in htypes}

training any
training epidural
training intraparenchymal
training intraventricular
training subarachnoid
training subdural


In [29]:
def predict_rfs(rfs, x, labels):
    """Predict the positive-class probability of every label with its fitted forest.

    Args:
        rfs: mapping from label name to a fitted classifier exposing
            `predict_proba` and `classes_`.
        x: DataFrame of `<model>:<label>` prediction columns.
        labels: label names to predict; each must be a key of `rfs`.

    Returns:
        DataFrame indexed like `x` with one column per entry of `labels`,
        holding the predicted probability of class 1.
    """
    preds = []
    for label in labels:
        print(f'predict {label}')
        xl = pull_any_features(x, label) if label == 'any' else pull_features(x, label)
        clf = rfs[label]
        proba = clf.predict_proba(xl)
        # predict_proba columns follow clf.classes_, so look the positive class up
        # instead of assuming it is column 1: a forest fitted on a single class
        # only has one column and `proba[:, 1]` would raise IndexError.
        pos = np.flatnonzero(clf.classes_ == 1)
        if len(pos):
            preds.append(proba[:, pos[0]])
        else:
            # The forest never saw a positive example, so it assigns it probability 0.
            preds.append(np.zeros(len(xl)))
    # np.stack rejects an empty list, so build an (n_rows, 0) array when no labels were requested.
    values = np.stack(preds).T if preds else np.empty((len(x), 0))
    return pd.DataFrame(values, index=x.index, columns=labels)

In [30]:
# Forest probabilities for every label in `htypes` on the held-out rows `val_x`.
ypred = predict_rfs(rfs, val_x, htypes)

predict any
predict epidural
predict intraparenchymal
predict intraventricular
predict subarachnoid
predict subdural


In [31]:
# Convert the forest probabilities and the held-out targets (cast to float) to tensors on the device.
# Note that `t` here rebinds the `t` assigned in an earlier sanity-check cell.
yp = tensor(ypred)
p,t = to_device((yp,tensor(val_y.values).float()))

In [32]:
# (loss, multi-label accuracy) of the stacked forest predictions on the held-out rows.
# NOTE(review): `logit(p)` is passed presumably because `loss` and `accuracy_multi` expect logits — confirm.
loss(logit(p),t).item(), accuracy_multi(logit(p), t).item()

(0.06706075370311737, 0.9789391756057739)

In [34]:
# Print, per label, the forest's feature importances (rounded to 4 decimals), largest first.
for label in htypes:
    print('')
    print(f'feat importances for {label}')
    print('---------------------------------------------------')
    forest = rfs[label]
    ranked = sorted(
        ((round(importance, 4), col) for importance, col in zip(forest.feature_importances_, forest.feat_imp_cols)),
        reverse=True,
    )
    for entry in ranked:
        print(entry)


feat importances for any
---------------------------------------------------
(0.2813, 'xrn34_wgtd_deepish:any')
(0.1748, 'train_rn18:any')
(0.1262, 'xrn34_wgtd_deep:any')
(0.1187, 'rn18_wgtd_freeze2_rep:any')
(0.078, 'rn18_wgtd:any')
(0.0554, 'train_rn18_all:any')
(0.0368, 'xrn50_wgtd:any')
(0.0247, 'train_rn18_all_flip:any')
(0.0076, 'xrn50_wgtd_flip:any')
(0.0053, 'xrn18_wgtd:any')
(0.0052, 'xrn18_wgtd_flip:any')
(0.0051, 'xrn50_wgtd_deep_flip:any')
(0.0051, 'xrn50_wgtd_deep:any')
(0.005, 'xrn34_wgtd_deep_flip:any')
(0.0048, 'train_rn18_flip:any')
(0.0046, 'rn18_wgtd_hist:any')
(0.0045, 'xrn34_wgtd_deepish_flip:any')
(0.0043, 'rn18_wgtd_hist_flip:any')
(0.0043, 'rn18_wgtd_freeze2:any')
(0.0043, 'rn18_wgtd_flip:any')
(0.0042, 'rn18_wgtd_no_pre:any')
(0.0041, 'xrn34_wgtd_flip:any')
(0.0038, 'xrn34_wgtd:any')
(0.0038, 'rn18_wgtd_freeze2_rep_flip:any')
(0.0037, 'impre-xrn34-freeze2:any')
(0.0036, 'rn18_wgtd_no_pre_flip:any')
(0.0036, 'rn18_wgtd_freeze:any')
(0.0036, 'rn18_wgtd_freeze2_f

(0.2656, 'xrn34_wgtd_deep:subdural')
(0.2314, 'train_rn18:subdural')
(0.1367, 'train_rn18_all:subdural')
(0.0583, 'xrn34_wgtd_deepish:subdural')
(0.0183, 'rn18_wgtd:subdural')
(0.0167, 'xrn34_wgtd:subdural')
(0.014, 'xrn18_wgtd:subdural')
(0.0128, 'train_rn18_flip:subdural')
(0.0126, 'rn18_wgtd_no_pre:subdural')
(0.0119, 'xrn34_wgtd_deep_flip:subdural')
(0.0098, 'xrn50_wgtd:subdural')
(0.0094, 'rn18_wgtd_freeze2:subdural')
(0.0084, 'xrn50_wgtd_deep_flip:subdural')
(0.0078, 'train_rn18_all_flip:subdural')
(0.0074, 'xrn50_wgtd_flip:subdural')
(0.0074, 'xrn18_wgtd_flip:subdural')
(0.0072, 'xrn50_wgtd_deep:subdural')
(0.006, 'rn18_wgtd_no_pre_flip:subdural')
(0.0059, 'xrn34_wgtd_deepish_flip:subdural')
(0.0058, 'rn18_wgtd_hist:subdural')
(0.0055, 'xrn34_wgtd_flip:subdural')
(0.0055, 'impre-xrn34-freeze2-deep:subdural')
(0.0054, 'rn18_wgtd_hist_flip:subdural')
(0.0051, 'rn18_wgtd_freeze:subdural')
(0.005, 'impre-xrn34-freeze2:subdural')
(0.0049, 'rn18_wgtd_freeze2_rep:subdural')
(0.0049, 'r

In [29]:
def calc_loss_rf_valid(df):
    """Score one model's prediction frame on the held-out rows only.

    The rows of `df` selected by the notebook-level mask `val_idx` are compared
    with `val_y`; the result is `(loss, accuracy_multi)` as Python floats.
    """
    holdout_rows = df.loc[val_idx]
    ypred = logit(to_device(tensor(holdout_rows.values)))
    targ = to_device(tensor(val_y))
    loss_value = loss(ypred, targ).cpu().item()
    accuracy_value = accuracy_multi(ypred, targ).cpu().item()
    return loss_value, accuracy_value

# Score every base model on the held-out rows, keyed by file stem, and show the scores sorted.
# NOTE(review): `all_preds` is not defined in the visible cells (earlier cells use `pkls`) — confirm where it comes from.
model_names = [p.stem for p in all_preds]
model_scores = L(pred_dfs).map(calc_loss_rf_valid)
losses = dict(zip(model_names, model_scores))
pd.Series(losses).sort_values()

rn18_wgtd                        (0.06856484711170197, 0.9775579571723938)
xrn34_wgtd_deep                  (0.06873700767755508, 0.9772636890411377)
rn18_wgtd_freeze2                (0.06893426924943924, 0.9776120185852051)
rn18_wgtd_hist                   (0.06914354115724564, 0.9777020812034607)
xrn34_wgtd                       (0.06921088695526123, 0.9772636890411377)
xrn34_wgtd_deep_flip             (0.06938355416059494, 0.9772456884384155)
rn18_wgtd_freeze2_rep            (0.06949618458747864, 0.9775159358978271)
rn18_wgtd_freeze_flip            (0.06960073858499527, 0.9772937297821045)
rn18_wgtd_flip                    (0.06961469352245331, 0.977449893951416)
rn18_wgtd_freeze                 (0.06966757029294968, 0.9774558544158936)
rn18_wgtd_hist_flip               (0.0698188841342926, 0.9773477911949158)
xrn34_wgtd_deepish               (0.06982376426458359, 0.9772156476974487)
xrn34_wgtd_flip                   (0.06985139846801758, 0.977041482925415)
xrn50_wgtd_deep          

In [30]:
# Print, per label, the forest's feature importances (rounded to 4 decimals), largest first.
# This repeats the report of an earlier cell for the current contents of `rfs`.
for label in htypes:
    print('')
    print(f'feat importances for {label}')
    print('---------------------------------------------------')
    forest = rfs[label]
    ranked = sorted(
        ((round(importance, 4), col) for importance, col in zip(forest.feature_importances_, forest.feat_imp_cols)),
        reverse=True,
    )
    for entry in ranked:
        print(entry)


feat importances for any
---------------------------------------------------
(0.423, 'xrn34_wgtd_deepish:any')
(0.1413, 'xrn34_wgtd_deep:any')
(0.139, 'rn18_wgtd_freeze2_rep:any')
(0.1124, 'rn18_wgtd:any')
(0.0389, 'xrn50_wgtd:any')
(0.0169, 'rn18_wgtd_freeze2:any')
(0.0097, 'xrn50_wgtd_flip:any')
(0.0094, 'xrn34_wgtd_flip:any')
(0.0078, 'rn18_wgtd_hist:any')
(0.0071, 'xrn34_wgtd_deep_flip:any')
(0.0071, 'rn18_wgtd_no_pre:any')
(0.0069, 'xrn34_wgtd:any')
(0.0065, 'xrn18_wgtd_flip:any')
(0.0061, 'xrn50_wgtd_deep_flip:any')
(0.0061, 'impre-xrn34-freeze2:any')
(0.006, 'rn18_wgtd_hist_flip:any')
(0.0058, 'xrn50_wgtd_deep:any')
(0.0054, 'rn18_wgtd_freeze2_rep_flip:any')
(0.0052, 'xrn18_wgtd:any')
(0.0052, 'rn18_wgtd_freeze2_flip:any')
(0.0052, 'rn18_wgtd_flip:any')
(0.005, 'xrn34_wgtd_deepish_flip:any')
(0.0044, 'rn18_wgtd_no_pre_flip:any')
(0.0044, 'impre-xrn34-freeze2-deep:any')
(0.0039, 'rn18_wgtd_freeze:any')
(0.0039, 'impre-xrn34-freeze2-deep_flip:any')
(0.0037, 'rn18_wgtd_freeze_flip

(0.4376, 'xrn34_wgtd_deep:subdural')
(0.0865, 'xrn34_wgtd_deepish:subdural')
(0.0677, 'rn18_wgtd:subdural')
(0.0609, 'rn18_wgtd_no_pre:subdural')
(0.0347, 'xrn34_wgtd_deep_flip:subdural')
(0.028, 'xrn50_wgtd:subdural')
(0.0256, 'rn18_wgtd_freeze2:subdural')
(0.0196, 'xrn34_wgtd:subdural')
(0.016, 'xrn18_wgtd:subdural')
(0.0134, 'rn18_wgtd_no_pre_flip:subdural')
(0.0129, 'xrn18_wgtd_flip:subdural')
(0.0095, 'xrn50_wgtd_flip:subdural')
(0.0086, 'xrn50_wgtd_deep_flip:subdural')
(0.0084, 'xrn50_wgtd_deep:subdural')
(0.0071, 'xrn34_wgtd_deepish_flip:subdural')
(0.0069, 'rn18_wgtd_hist:subdural')
(0.0068, 'xrn34_wgtd_flip:subdural')
(0.0065, 'rn18_wgtd_freeze2_flip:subdural')
(0.0062, 'rn18_wgtd_hist_flip:subdural')
(0.0061, 'impre-xrn34-freeze2:subdural')
(0.0059, 'rn18_wgtd_freeze2_rep:subdural')
(0.0059, 'impre-xrn34-freeze2-deep:subdural')
(0.0058, 'rn18_wgtd_freeze:subdural')
(0.0053, 'rn18_wgtd_flip:subdural')
(0.0052, 'rn18_wgtd_freeze_flip:subdural')
(0.0051, 'impre-xrn34-freeze2_fli