## Cross-validation

This notebook follows the sBNN methodology, which can be found [here](https://github.com/berenslab/sBNN).

In [2]:
# Import packages
import numpy as np
import pandas as pd
import seaborn as sns

import sparseRRR
import matplotlib.pyplot as plt
import importlib

import time
import pickle
import warnings

### Load data

In [20]:
# Load the pre-computed V1 feature bundle. A context manager is used so the
# file handle is closed deterministically instead of being left to the GC.
# NOTE: pickle can execute arbitrary code on load -- only open trusted files.
with open('../../code/pickles/V1_features.pickle', 'rb') as f:
    V1_25degree = pickle.load(f)
# Column names of X_o with the last column dropped.
ephys_features = np.array(V1_25degree['X_o'].columns)[:-1]
Xo = V1_25degree['X_o'].copy()
V1_names = V1_25degree['V1_names']

# Metadata, keyed by ephys session id and put in the same row order as Xo
# (sessions absent from the metadata become all-NaN rows after reindex).
v1_meta = pd.read_csv('../../data/Gouwens (2020)/20200625_patchseq_metadata_mouse.csv')
v1_meta = v1_meta.set_index('ephys_session_id')
v1_meta = v1_meta.reindex(Xo.index.astype('int64'))
celltypes = v1_meta['corresponding_AIT2.3.1_alias']

# Count table: transposed so that rows are samples, then put in the same
# row order as the metadata via the transcriptomics sample id.
v1_genes = pd.read_csv('../../data/Gouwens (2020)/20200513_Mouse_PatchSeq_Release_count.csv')
v1_genes = v1_genes.set_index('Unnamed: 0').T
v1_genes = v1_genes.reindex(v1_meta['transcriptomics_sample_id'].values)
gene_names = v1_genes.columns.values

In [24]:
# Restrict the count table to genes listed in the GO term summary file plus
# four marker genes, and require at least 10 counts summed over all samples.
original_gene_names = np.array(v1_genes.columns)
IonChannelGenes = pd.read_csv(
    "../../data/GO_term_summary_20211104_075621.txt", header=0, sep="\t"
)
marker_genes = ["Pvalb", "Lamp5", "Vip", "Sst"]

# Vectorised membership test (replaces a per-gene Python `in` scan).
selectedGenes = np.asarray(
    v1_genes.columns.isin(IonChannelGenes["MGI Gene/Marker ID"].values)
    | v1_genes.columns.isin(marker_genes)
)
enough_counts = np.sum(v1_genes, axis=0) >= 10

# Combine both criteria once and reuse the mask for the data and the names.
keep_gene = selectedGenes & enough_counts.values
data_exons = v1_genes.loc[:, keep_gene]
exonCounts = data_exons.values
gene_names = original_gene_names[keep_gene]

In [26]:
# Keep only the rows whose summed counts are not NaN.
# Both the mask and the filtered matrix are derived from `data_exons` rather
# than from `exonCounts` itself, so re-running this cell yields the same
# `non_nans` (its length must keep matching the unfiltered row count, because
# the mask is reused later to filter the model parameters).
non_nans = ~np.isnan(data_exons.values.sum(axis=1))
exonCounts = data_exons.values[non_nans,:]
exonCounts.shape

(3559, 420)

In [4]:
# One row per model parameter: (display name, prior lower bound, prior upper bound).
# The three sequences below are derived from this table so they stay aligned.
_param_table = [
    ('C',                 0.1,  15),
    (r'$R_{input}$',      20,   1000),
    (r'$\tau$',           0.1,  70),
    (r'$g_{Nat}$',        0,    250),
    (r'$g_{Na}$',         0,    100),
    (r'$g_{Kd}$',         0,    30),
    (r'$g_{M}$',          0,    3),
    (r'$g_{Kv31}$',       0,    250),
    (r'$g_{L}$',          0,    3),
    (r'$E_{leak}$',       -130, -50),
    (r'$\tau_{max}$',     50,   4000),
    ('VT',                -90,  -35),
    ('rate_to_SS_factor', 0.1,  3),
    ('I',                 20,   870),
]
model_param_names = np.array([name for name, _, _ in _param_table])
prior_min = [low for _, low, _ in _param_table]
prior_max = [high for _, _, high in _param_table]

In [5]:
# Training-schedule tag; it is formatted into the file name of the pickle
# holding the fitted model parameters.
fav_tr_schedule = '2d'

In [7]:
ttypes_tasic = np.load('../../data/Gouwens (2020)/tasic-ttypes.npy')
colors_tasic = np.load('../../data/Gouwens (2020)/tasic-colors.npy')

# Map every distinct t-type to the colour stored at its first occurrence.
unique_ttypes, first_seen = np.unique(ttypes_tasic, return_index=True)
color_map = dict(zip(unique_ttypes, colors_tasic[first_seen]))

# One colour per cell; cell types without an entry in the map are drawn black.
colors = np.array([color_map.get(c_type, 'black') for c_type in celltypes])

In [None]:
# Load the fitted model parameters for the chosen training schedule.
# NOTE: pickle can execute arbitrary code on load -- only open trusted files.
with open('../save_model_parameters/V1/training_schedule_{}.pickle'.format(fav_tr_schedule), 'rb') as f:
    THETA = pickle.load(f)

# Stack the per-cell highest-posterior samples in the row order of Xo.
# Cells without an entry get a NaN row; its length must equal the number of
# model parameters (it was hard-coded to 13 before, which made the reshape
# fail as soon as a single cell was missing).
n_model_params = len(model_param_names)
best_samples = THETA['highest posterior samples']
highest_posterior_samples = np.concatenate(
    [
        best_samples[cell] if cell in best_samples else np.full((n_model_params,), np.nan)
        for cell in Xo.index
    ]
).reshape((Xo.shape[0], n_model_params))

In [8]:
# Tabulate the highest-posterior samples: dictionary keys become the row
# index, and the columns are labelled with the model parameter names.
# NOTE(review): this expects THETA to still be the loaded dictionary; a later
# cell rebinds THETA to an array, so run the loading cell first when re-running.
model_params_df = pd.DataFrame.from_dict(
    data=THETA['highest posterior samples'],
    columns=model_param_names,
    orient='index',
)

### Preprocessing

In [27]:
# Gene preprocessing: library size normalization, log2(x + 1) and Z-scoring
library_size = np.sum(exonCounts, axis=1, keepdims=True)
genes = exonCounts / library_size * np.median(library_size)
genes = np.log2(genes + 1)
genes = genes - np.mean(genes, axis=0)
genes = (genes / np.std(genes, axis=0)).astype('float32')

# Z-scoring the model parameters (rows filtered with the same mask as the genes)
# NOTE(review): assumes model_params_df rows are in the same order as the
# unfiltered count matrix -- confirm against the loading cells.
params_kept = model_params_df.values[non_nans,:]
THETA_mean = params_kept.mean(axis=0)
THETA_std = params_kept.std(axis=0)
THETA = ((params_kept - THETA_mean) / THETA_std).astype('float32')

In [28]:
# Sanity check: both matrices must have the same number of rows (cells).
print(genes.shape, THETA.shape, sep='\n')

(3559, 420)
(3559, 14)


### sRRR

In [54]:
# CV sRRR rank = 2
#
# 10-fold cross-validation of sparseRRR.elastic_rrr over a grid of alphas.
# For every fold/alpha the sparse fit is evaluated, then refit on the selected
# genes only with l1_ratio=0 ("relaxation") and evaluated again.

alphas = np.concatenate((np.arange(.05,1,.05),np.arange(1,4,1)))
l1_ratios = np.array([1])

rank = 2
folds = 10
seed = 42

# Metric arrays start as NaN (not 0): combinations that are skipped below must
# stay missing, otherwise the later nanmean/nanstd would count them as R^2 = 0.
metric_shape = (folds, l1_ratios.shape[0], alphas.shape[0])
r2_sRRR_rank2 = np.full(metric_shape, np.nan)
r2_sRRR_rank2_tr = np.full(metric_shape, np.nan)
r2_sRRR_relaxed_rank2 = np.full(metric_shape, np.nan)
r2_sRRR_relaxed_rank2_tr = np.full(metric_shape, np.nan)
nonzero_rank2 = np.full(metric_shape, np.nan)
corrs_rank2 = np.full(metric_shape + (rank,), np.nan)
corrs_relaxed_rank2 = np.full(metric_shape + (rank,), np.nan)


t = time.time()

# Shuffle the cells once with a fixed seed so the folds are reproducible.
np.random.seed(seed)
n = genes.shape[0]
ind = np.random.permutation(n)
X = genes[ind,:]
Y = THETA[ind,:]

# NOTE: the fold size is truncated, so the last n % folds shuffled cells are
# never part of a test fold (they are always in the training set).
fold_size = int(n/folds)

# CV folds
for cvfold in range(folds):
    print('\nCVFOLD: ', cvfold)
    indtest  = np.arange(cvfold*fold_size, (cvfold+1)*fold_size)
    indtrain = np.setdiff1d(np.arange(n), indtest)
    x_train = np.copy(X[indtrain,:])
    y_train = np.copy(Y[indtrain,:])
    x_test = np.copy(X[indtest,:])
    y_test = np.copy(Y[indtest,:])

    # Mean centering (like sRRR); test data is centered with training means
    X_mean = np.mean(x_train, axis=0)
    x_train -= X_mean
    x_test -= X_mean
    Y_mean = np.mean(y_train, axis=0)
    y_train -= Y_mean
    y_test -= Y_mean

    x_train = x_train.astype('float64')
    y_train = y_train.astype('float64')
    x_test = x_test.astype('float64')
    y_test = y_test.astype('float64')

    for i,a in enumerate(alphas):
        for j, b in enumerate(l1_ratios):
            vx,vy = sparseRRR.elastic_rrr(x_train, y_train, alpha=a, l1_ratio=b, rank=rank, sparsity='row-wise')

            # Genes with at least one non-zero coefficient
            nz = np.sum(np.abs(vx), axis=1) != 0
            if np.sum(nz) < rank:
                # Fewer selected genes than components: leave all metrics NaN
                nonzero_rank2[cvfold, j, i] = np.nan
                continue

            if np.allclose(np.std(x_test @ vx, axis=0), 0):
                # Constant test projections: leave all metrics NaN
                nonzero_rank2[cvfold, j, i] = np.nan
                continue

            nonzero_rank2[cvfold, j, i] = np.sum(nz)
            r2_sRRR_rank2[cvfold, j, i] = 1 - np.sum((y_test - x_test @ vx @ vy.T)**2) / np.sum(y_test**2)
            r2_sRRR_rank2_tr[cvfold, j, i] = 1 - np.sum((y_train - x_train @ vx @ vy.T)**2) / np.sum(y_train**2)
            for r in range(rank):
                corrs_rank2[cvfold, j, i, r] = np.corrcoef(x_test @ vx[:,r], y_test @ vy[:,r], rowvar=False)[0,1]

            # Relaxation: refit on the selected genes only, with l1_ratio=0
            vx[nz,:],vy = sparseRRR.elastic_rrr(x_train[:,nz], y_train, alpha=a, l1_ratio=0, rank=rank, sparsity='row-wise')

            if np.allclose(np.std(x_test @ vx, axis=0), 0):
                # Relaxed fit degenerate: mark the gene count as missing too
                nonzero_rank2[cvfold, j, i] = np.nan
                continue

            r2_sRRR_relaxed_rank2[cvfold, j, i] = 1 - np.sum((y_test - x_test @ vx @ vy.T)**2) / np.sum(y_test**2)
            r2_sRRR_relaxed_rank2_tr[cvfold, j, i] = 1 - np.sum((y_train - x_train @ vx @ vy.T)**2) / np.sum(y_train**2)
            for r in range(rank):
                corrs_relaxed_rank2[cvfold, j, i, r] = np.corrcoef(x_test @ vx[:,r], y_test @ vy[:,r], rowvar=False)[0,1]

t = time.time() - t
m,s = divmod(t, 60)
h,m = divmod(m, 60)
print('Time: {}h {:2.0f}m {:2.0f}s'.format(h,m,s))


CVFOLD:  0

CVFOLD:  1

CVFOLD:  2

CVFOLD:  3

CVFOLD:  4

CVFOLD:  5

CVFOLD:  6

CVFOLD:  7

CVFOLD:  8

CVFOLD:  9
Time: 0.0h  0m 42s


In [55]:
# Persist the rank-2 CV results; the list order is what the loading cell unpacks.
with open('pickles/V1/sRRR_vs_Bottleneck_rank2_CV_lasso_ion_channel_marker.pickle', 'wb') as f:
    pickle.dump(
        [
            r2_sRRR_rank2,
            r2_sRRR_relaxed_rank2,
            r2_sRRR_rank2_tr,
            r2_sRRR_relaxed_rank2_tr,
            nonzero_rank2,
            corrs_rank2,
            corrs_relaxed_rank2,
        ],
        f,
    )

In [56]:
# CV sRRR full rank
#
# Same 10-fold cross-validation as for rank 2, but with the rank set to the
# number of model parameters (columns of THETA).

alphas = np.concatenate((np.arange(.05,1,.05),np.arange(1,4,1)))
l1_ratios = np.array([1])

rank = THETA.shape[1]
folds = 10
seed = 42

# Metric arrays start as NaN (not 0): combinations that are skipped below must
# stay missing, otherwise the later nanmean/nanstd would count them as R^2 = 0.
metric_shape = (folds, l1_ratios.shape[0], alphas.shape[0])
r2_sRRR_rank14 = np.full(metric_shape, np.nan)
r2_sRRR_rank14_tr = np.full(metric_shape, np.nan)
r2_sRRR_relaxed_rank14 = np.full(metric_shape, np.nan)
r2_sRRR_relaxed_rank14_tr = np.full(metric_shape, np.nan)
nonzero_rank14 = np.full(metric_shape, np.nan)
corrs_rank14 = np.full(metric_shape + (rank,), np.nan)
corrs_relaxed_rank14 = np.full(metric_shape + (rank,), np.nan)


t = time.time()

# Shuffle the cells once with a fixed seed so the folds are reproducible.
np.random.seed(seed)
n = genes.shape[0]
ind = np.random.permutation(n)
X = genes[ind,:]
Y = THETA[ind,:]

# NOTE: the fold size is truncated, so the last n % folds shuffled cells are
# never part of a test fold (they are always in the training set).
fold_size = int(n/folds)

# CV folds
for cvfold in range(folds):
    print('\nCVFOLD: ', cvfold)
    indtest  = np.arange(cvfold*fold_size, (cvfold+1)*fold_size)
    indtrain = np.setdiff1d(np.arange(n), indtest)
    x_train = np.copy(X[indtrain,:])
    y_train = np.copy(Y[indtrain,:])
    x_test = np.copy(X[indtest,:])
    y_test = np.copy(Y[indtest,:])

    # Mean centering (like sRRR); test data is centered with training means
    X_mean = np.mean(x_train, axis=0)
    x_train -= X_mean
    x_test -= X_mean
    Y_mean = np.mean(y_train, axis=0)
    y_train -= Y_mean
    y_test -= Y_mean

    x_train = x_train.astype('float64')
    y_train = y_train.astype('float64')
    x_test = x_test.astype('float64')
    y_test = y_test.astype('float64')

    for i,a in enumerate(alphas):
        for j, b in enumerate(l1_ratios):
            vx,vy = sparseRRR.elastic_rrr(x_train, y_train, alpha=a, l1_ratio=b, rank=rank, sparsity='row-wise')

            # Genes with at least one non-zero coefficient
            nz = np.sum(np.abs(vx), axis=1) != 0
            if np.sum(nz) < rank:
                # Fewer selected genes than components: leave all metrics NaN
                nonzero_rank14[cvfold, j, i] = np.nan
                continue

            if np.allclose(np.std(x_test @ vx, axis=0), 0):
                # Constant test projections: leave all metrics NaN
                nonzero_rank14[cvfold, j, i] = np.nan
                continue

            nonzero_rank14[cvfold, j, i] = np.sum(nz)
            r2_sRRR_rank14[cvfold, j, i] = 1 - np.sum((y_test - x_test @ vx @ vy.T)**2) / np.sum(y_test**2)
            r2_sRRR_rank14_tr[cvfold, j, i] = 1 - np.sum((y_train - x_train @ vx @ vy.T)**2) / np.sum(y_train**2)
            for r in range(rank):
                corrs_rank14[cvfold, j, i, r] = np.corrcoef(x_test @ vx[:,r], y_test @ vy[:,r], rowvar=False)[0,1]

            # Relaxation: refit on the selected genes only, with l1_ratio=0
            vx[nz,:],vy = sparseRRR.elastic_rrr(x_train[:,nz], y_train, alpha=a, l1_ratio=0, rank=rank, sparsity='row-wise')

            if np.allclose(np.std(x_test @ vx, axis=0), 0):
                # Relaxed fit degenerate: mark the gene count as missing too
                nonzero_rank14[cvfold, j, i] = np.nan
                continue

            r2_sRRR_relaxed_rank14[cvfold, j, i] = 1 - np.sum((y_test - x_test @ vx @ vy.T)**2) / np.sum(y_test**2)
            r2_sRRR_relaxed_rank14_tr[cvfold, j, i] = 1 - np.sum((y_train - x_train @ vx @ vy.T)**2) / np.sum(y_train**2)
            for r in range(rank):
                corrs_relaxed_rank14[cvfold, j, i, r] = np.corrcoef(x_test @ vx[:,r], y_test @ vy[:,r], rowvar=False)[0,1]

t = time.time() - t
m,s = divmod(t, 60)
h,m = divmod(m, 60)
print('Time: {}h {:2.0f}m {:2.0f}s'.format(h,m,s))


CVFOLD:  0

CVFOLD:  1

CVFOLD:  2

CVFOLD:  3

CVFOLD:  4

CVFOLD:  5

CVFOLD:  6

CVFOLD:  7

CVFOLD:  8

CVFOLD:  9
Time: 0.0h  1m 28s


In [57]:
# Persist the full-rank CV results; the list order is what the loading cell unpacks.
with open('pickles/V1/sRRR_vs_Bottleneck_rank14_CV_lasso_ion_channel_marker.pickle', 'wb') as f:
    pickle.dump(
        [
            r2_sRRR_rank14,
            r2_sRRR_relaxed_rank14,
            r2_sRRR_rank14_tr,
            r2_sRRR_relaxed_rank14_tr,
            nonzero_rank14,
            corrs_rank14,
            corrs_relaxed_rank14,
        ],
        f,
    )

Check:

In [58]:
# Regularisation grid used in the CV runs: a fine sweep below 1 followed by
# the integers 1, 2, 3; a single l1_ratio of 1.
fine_alphas = np.arange(.05, 1, .05)
coarse_alphas = np.arange(1, 4, 1)
alphas = np.concatenate((fine_alphas, coarse_alphas))
l1_ratios = np.array([1])

In [59]:
# Reload the rank-2 CV results; the unpacking order mirrors the saved list.
with open('pickles/V1/sRRR_vs_Bottleneck_rank2_CV_lasso_ion_channel_marker.pickle', 'rb') as f:
    (
        r2_sRRR_rank2,
        r2_sRRR_relaxed_rank2,
        r2_sRRR_rank2_tr,
        r2_sRRR_relaxed_rank2_tr,
        nonzero_rank2,
        corrs_rank2,
        corrs_relaxed_rank2,
    ) = pickle.load(f)

In [60]:
# Reload the full-rank CV results; the unpacking order mirrors the saved list.
with open('pickles/V1/sRRR_vs_Bottleneck_rank14_CV_lasso_ion_channel_marker.pickle', 'rb') as f:
    (
        r2_sRRR_rank14,
        r2_sRRR_relaxed_rank14,
        r2_sRRR_rank14_tr,
        r2_sRRR_relaxed_rank14_tr,
        nonzero_rank14,
        corrs_rank14,
        corrs_relaxed_rank14,
    ) = pickle.load(f)

In [61]:
# Average over folds (ignoring NaN) for the single l1_ratio: number of selected
# genes and relaxed test R^2, per alpha, for both ranks.
# "Mean of empty slice" RuntimeWarnings (alphas where every fold is NaN) are suppressed.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)
    n_rank2 = np.nanmean(nonzero_rank2[:, 0, :], axis=0)
    r_rank2_r = np.nanmean(r2_sRRR_relaxed_rank2[:, 0, :], axis=0)
    n_rank14 = np.nanmean(nonzero_rank14[:, 0, :], axis=0)
    r_rank14_r = np.nanmean(r2_sRRR_relaxed_rank14[:, 0, :], axis=0)

In [62]:
# Fold-averaged number of selected genes per alpha, full-rank model
# (NaN where no fold produced a valid fit).
n_rank14

array([328.1,  93.6,  52.2,  33.8,  19.2,   nan,   nan,   nan,   nan,
         nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
         nan,   nan,   nan,   nan])

In [63]:
# Report the relaxed test R^2 (mean +/- std across folds) of the full-rank
# model at the alpha whose average number of selected genes is closest to 25.
# nanargmin picks the truly closest alpha and ignores NaN entries, instead of
# taking the first alpha that happens to fall within a fixed tolerance
# (which raised IndexError when none did).
target_n_genes = 25
closest = np.nanargmin(np.abs(n_rank14 - target_n_genes))
print(r_rank14_r[closest], '+/-',
      np.nanstd(r2_sRRR_relaxed_rank14, axis=0)[0, :][closest])

0.13225230895219492 +/- 0.012206411786443265


In [64]:
# Fold-averaged number of selected genes per alpha, rank-2 model
# (NaN where no fold produced a valid fit).
n_rank2

array([90. , 41.6, 24.7, 16.2, 11.7,  9. ,  6.2,  5. ,  4.9,  4.4,  4. ,
        3.3,  2. ,  2. ,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan])

In [65]:
# Report the relaxed test R^2 (mean +/- std across folds) of the rank-2
# model at the alpha whose average number of selected genes is closest to 25.
# nanargmin picks the truly closest alpha and ignores NaN entries, instead of
# taking the first alpha that happens to fall within a fixed tolerance
# (which raised IndexError when none did).
target_n_genes = 25
closest = np.nanargmin(np.abs(n_rank2 - target_n_genes))
print(r_rank2_r[closest], '+/-',
      np.nanstd(r2_sRRR_relaxed_rank2, axis=0)[0, :][closest])

0.11391024887055759 +/- 0.009828143360845296
