In [34]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os
from os.path import join as oj
import sys
sys.path.append('../src')
import numpy as np
import seaborn as sns
import torch
import matplotlib
from matplotlib import pyplot as plt
from tqdm import tqdm
import pandas as pd
from functools import partial
import pickle as pkl
from sklearn import metrics
from sklearn.linear_model import LinearRegression

import data
import config
import features
import train_reg
import neural_networks

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
df = pd.read_pickle(oj(config.DIR_PROCESSED, 'df_full.pkl'))
df = features.normalize_track(df, track='X_same_length', by_time_point=False) # adds X_same_length_normalized
df = df[~df.short & ~df.hotspots] # filter out easy/invalid tracks

In [21]:
# each dataset has different number of cells each with different number of tracks
# the 'split' variable says whether a track is in the train or test set
df.groupby('dset')['split'].value_counts()

dset                    split
clath_aux+gak           train     2018
                        test       535
clath_aux+gak_a7d2      train     3327
                        test      1067
clath_aux+gak_a7d2_new  train     7877
                        test      1703
clath_aux+gak_new       train     3404
                        test       679
clath_aux_dynamin       train    34559
                        test      9367
clath_gak               train     3496
                        test      1498
Name: split, dtype: int64

# fit models

In [38]:
np.random.seed(42)
for dset_name in df['dset'].unique():
    d = df[(df['dset'] == dset_name) & (df['split'] == 'train')]
    checkpoint_fname = oj(config.DIR_MODELS, 'dnn_individual', f'dnn_{dset_name}.pkl')
    dnn = neural_networks.neural_net_sklearn(D_in=40, H=20, p=0, arch='lstm', epochs=100)
    dnn.fit(X=d[['X_same_length_normalized']],
            y=d['Y_sig_mean_normalized'].values,
            verbose=False, checkpoint_fname=checkpoint_fname)
    pkl.dump({'model_state_dict': dnn.model.state_dict()}, open(checkpoint_fname, 'wb'))
    
    # fit dasc
    checkpoint_fname = oj(config.DIR_MODELS, 'dasc_individual', f'dasc_{dset_name}.pkl')
    dasc_model = LinearRegression().fit(d['X_d1'].values.reshape(-1, 1), d['Y_sig_mean_normalized'])
    pkl.dump(dasc_model, open(checkpoint_fname, 'wb'))

# look at model performance on corresponding test set

In [49]:
scores = {
    'r2': [],
    'acc': [],
    'r2_dasc': [],
    'acc_dasc': [],
}
dset_names = sorted(df['dset'].unique())
for dset_name in dset_names:
    # load models
    dnn = neural_networks.neural_net_sklearn(D_in=40, H=20, p=0, arch='lstm')
    ckpt = pkl.load(open(oj(config.DIR_MODELS, 'dnn_individual', f'dnn_{dset_name}.pkl'), 'rb'))
    dnn.model.load_state_dict(ckpt['model_state_dict'])
    dasc_model = pkl.load(open(oj(config.DIR_MODELS, 'dasc_individual', f'dasc_{dset_name}.pkl'), 'rb'))
    
    # test on test set
    d = df[(df['dset'] == dset_name) & (df['split'] == 'test')]
    preds_reg = dnn.predict(d[['X_same_length_normalized']])
    preds_class = (preds_reg).astype(int)
    preds_reg_dasc = dasc_model.predict(d['X_d1'].values.reshape(-1, 1))
    scores['r2'].append(metrics.r2_score(d['Y_sig_mean_normalized'], preds_reg))
    scores['acc'].append(metrics.accuracy_score(d['y_consec_thresh'], preds_class))
    scores['r2_dasc'].append(metrics.r2_score(d['Y_sig_mean_normalized'], preds_reg_dasc))
    scores['acc_dasc'].append(metrics.accuracy_score(d['y_consec_thresh'], (preds_reg_dasc > 0).astype(int)))    
scores = pd.DataFrame.from_dict(scores)
scores.index = dset_names
scores

Unnamed: 0,r2,acc,r2_dasc,acc_dasc
clath_aux+gak,0.382298,0.691589,0.230255,0.71028
clath_aux+gak_a7d2,0.28764,0.722587,0.160431,0.545455
clath_aux+gak_a7d2_new,0.449184,0.512038,0.04907,0.722842
clath_aux+gak_new,0.399651,0.777614,0.17401,0.387334
clath_aux_dynamin,0.428417,0.725846,0.232492,0.442084
clath_gak,0.26584,0.654206,0.121623,0.542056


# look at model performance on all test sets

In [None]:
dset_names = sorted(df['dset'].unique())
ks = []
for d in dset_names:
    ks += [f'r2_{d}', f'r2_dasc_{d}', f'acc_{d}', f'acc_dasc_{d}']
scores = {
    k: [] for k in ks
}
for dset_name in dset_names:
    # load models
    dnn = neural_networks.neural_net_sklearn(D_in=40, H=20, p=0, arch='lstm')
    ckpt = pkl.load(open(oj(config.DIR_MODELS, 'dnn_individual', f'dnn_{dset_name}.pkl'), 'rb'))
    dnn.model.load_state_dict(ckpt['model_state_dict'])
    dasc_model = pkl.load(open(oj(config.DIR_MODELS, 'dasc_individual', f'dasc_{dset_name}.pkl'), 'rb'))
    
    # test on all test sets
    for test_set_name in dset_names:
        d = df[(df['dset'] == test_set_name) & (df['split'] == 'test')]
        preds_reg = dnn.predict(d[['X_same_length_normalized']])
        preds_class = (preds_reg).astype(int)
        preds_reg_dasc = dasc_model.predict(d['X_d1'].values.reshape(-1, 1))
        scores['r2'].append(metrics.r2_score(d['Y_sig_mean_normalized'], preds_reg))
        scores['acc'].append(metrics.accuracy_score(d['y_consec_thresh'], preds_class))
        scores['r2_dasc'].append(metrics.r2_score(d['Y_sig_mean_normalized'], preds_reg_dasc))
        scores['acc_dasc'].append(metrics.accuracy_score(d['y_consec_thresh'], (preds_reg_dasc > 0).astype(int)))    
scores = pd.DataFrame.from_dict(scores)
scores.index = dset_names
scores