In [86]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os
from os.path import join as oj
import sys
sys.path.append('../src')
import numpy as np
import seaborn as sns
import torch
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
plt.style.use('dark_background')
import data
from skorch.callbacks import Checkpoint
from skorch import NeuralNetRegressor
from config import *
from tqdm import tqdm
import train_reg
import config
import pandas as pd
import features
from scipy.stats import skew, pearsonr
import outcomes
from sklearn.model_selection import KFold
from torch import nn, optim
from torch.nn import functional as F

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# prepare data

In [174]:
# Build the regression dataframe `df` used by the training cell below.
# For every dataset: load it, keep only long training tracks, resample each
# track to a fixed length, normalize, attach the regression response and drop
# all columns that are not features or metadata.
dsets = ['clath_aux+gak_a7d2_new'] # this is one of the large datasets
# first entry is the track column (presumably created by
# features.normalize_track below -- confirm); the remaining entries are each
# passed through features.normalize_feature
feat_names = ['X_same_length_normalized', 'mean_total_displacement',
              'mean_square_displacement', 'lifetime']
meta = ['cell_num', 'Y_sig_mean', 'Y_sig_mean_normalized']
min_lifetime = 15  # tracks with lifetime <= this are discarded
length = 40        # number of points every track is downsampled to
df_full = None
for dset in dsets:
    df = data.get_data(dset=dset)
    df = df[df.lifetime > min_lifetime] # only keep hard tracks
    # exclude held-out test data; take an explicit copy so that the column
    # assignment below writes to an independent frame instead of a filtered
    # view (avoids pandas' SettingWithCopyWarning)
    df = df[df.cell_num.isin(config.DSETS[dset]['train'])].copy()

    # downsample tracks (iterate the column directly rather than
    # materializing a full row with df.iloc[i] for every track)
    df['X_same_length'] = [features.downsample(track, length)
                           for track in df['X']]

    # normalize features/tracks
    df = features.normalize_track(df, track='X_same_length')
    for feat in feat_names[1:]:
        df = features.normalize_feature(df, feat)

    # regression response
    df = train_reg.add_sig_mean(df)

    # remove extraneous feats
    df = df[feat_names + meta]
    df = df.dropna()

    # NOTE: merging across datasets is currently disabled, so `df_full` stays
    # None and only the frame of the last dataset survives in `df` (the
    # training cell relies on the leaked `df` and `dset`).
    # TODO: if more datasets are added to `dsets`, collect the per-dataset
    # frames in a list and combine them once after the loop.

  sigs = np.array(r[f'{track}_pvals']) < 0.05


## train neural net

In [83]:
# Directory the trained models / cv results are written to by the training
# cell and read back from by the analysis cell. It must be defined here,
# otherwise a fresh-kernel "Run All" fails with NameError on `out_dir`.
# NOTE(review): confirm which of the two locations holds the existing results.
# out_dir = f'{DIR_RESULTS}/dec10_deep'
out_dir = 'results/regression/deep_learning/Dec10'

In [170]:
# Fit one regressor per model type on `df` and store each result as a pickle
# inside `out_dir` (created on demand).
os.makedirs(out_dir, exist_ok=True)

# response column and network size / training length handed to train_reg
outcome_def = 'Y_sig_mean_normalized'
num_epochs = 100
num_hidden = 40

# model types seen so far: 'nn_lstm', 'fcnn', 'nn_cnn', 'nn_attention'
for model_type in ('nn_lstm',):
    train_reg.train_reg(
        df,
        feat_names=feat_names,
        track_name='X_same_length_normalized',
        model_type=model_type,
        outcome_def=outcome_def,
        # `dset` is the loop variable left over from the data-preparation cell
        out_name=os.path.join(out_dir, f'{dset}_{outcome_def}_{model_type}.pkl'),
        fcnn_hidden_neurons=num_hidden,
        fcnn_epochs=num_epochs,
    )

0it [00:00, ?it/s]

Looping over cv...
fitting dnn...


1it [07:33, 453.70s/it]

0.35182113105726776
fitting dnn...


2it [14:32, 443.14s/it]

0.4015514160392738
fitting dnn...


3it [21:50, 441.76s/it]

0.400747189612112
fitting dnn...


4it [29:27, 446.19s/it]

0.33849530559320296
fitting dnn...


5it [37:31, 457.45s/it]

0.30469280129626386
fitting dnn...


6it [45:33, 465.01s/it]

0.3157889507198859
fitting dnn...


7it [53:49, 474.35s/it]

0.39279609947230987
fitting dnn...


8it [1:01:26, 460.80s/it]

0.37963092609616433
Training with full data...
fitting dnn...





# analyze results

In [173]:
# Load the saved results from `out_dir`, hide every column whose name contains
# 'std' or '_f', and rank the remaining rows by r2 (best first).
results = train_reg.load_results(out_dir)
shown_cols = [col for col in results if 'std' not in col and '_f' not in col]
r = results[shown_cols].sort_values(by=['r2'], ascending=False)
# r = r[r.index.str.contains('ros')] # only use random sampling
# r.style.background_gradient(cmap='viridis', axis=None) # all values on same cmap
r

{'r2': [0.31163279314100656, 0.3453041601512681, 0.39706460939750887, 0.30246224065343696, 0.2809839933189948, 0.3054622017270293, 0.3570558211887991, 0.338949112417699], 'pearsonr': [0.5583629549382731, 0.5883389359726974, 0.6325927541751527, 0.5525102004287371, 0.5316454577400022, 0.5539241225633553, 0.5989862186629875, 0.5835063679131884]}
dict_keys(['r2', 'pearsonr'])
{'r2': [0.33521232098052733, 0.3889407525530788, 0.41655853395869635, 0.3315478305412497, 0.2878354054474871, 0.3134636081493648, 0.40023319427487514, 0.3650294111763208], 'pearsonr': [0.581252573324601, 0.623760069904918, 0.6471940871096604, 0.5815456321990675, 0.5450916045150992, 0.5623113039255007, 0.6326850791058214, 0.606304709982837]}
dict_keys(['r2', 'pearsonr'])
{'r2': [0.35182113105726776, 0.4015514160392738, 0.400747189612112, 0.33849530559320296, 0.30469280129626386, 0.3157889507198859, 0.39279609947230987, 0.37963092609616433], 'pearsonr': [0.5996547919485246, 0.6348907684608562, 0.6479268746710987, 0.5849

Unnamed: 0_level_0,cv_accuracy_by_cell,pearsonr,r2
model_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
clath_aux+gak_a7d2_new_Y_sig_mean_normalized_nn_lstm,"[0.35182113105726776, 0.4015514160392738, 0.40...",0.604,0.361
clath_aux+gak_a7d2_new_Y_sig_mean_normalized_fcnn,"[0.33521232098052733, 0.3889407525530788, 0.41...",0.597,0.354
clath_aux+gak_a7d2_new_Y_sig_mean_normalized_nn_cnn,"[0.31163279314100656, 0.3453041601512681, 0.39...",0.574,0.328
