# Perform and pickle cross-validation for all datasets

In [1]:
%matplotlib notebook

import numpy as np
import pylab as plt
import seaborn as sns
import pickle
import matplotlib

import sparseRRR

In [2]:
def preprocess(data):
    """Build standardized expression (X) and ephys (Y) matrices from a dataset dict.

    Reads ``data['counts']``, ``data['mostVariableGenes']`` and ``data['ephys']``.
    The selected gene columns are divided by each row's total count, scaled by
    1e6, log2(x + 1) transformed, and then centered and scaled to unit standard
    deviation per column. ``data['ephys']`` is centered and scaled per column
    in the same way.

    Returns the tuple ``(X, Y)``.
    """
    def _zscore(M):
        # Center first; the standard deviation is then taken on the centered matrix.
        centered = M - np.mean(M, axis=0)
        return centered / np.std(centered, axis=0)

    counts = data['counts']
    selected = counts[:, data['mostVariableGenes']]
    # NOTE(review): the row sums must broadcast against the rows of `selected`
    # (i.e. behave like a column) -- confirm the type of data['counts'].
    per_row_total = np.sum(counts, axis=1)
    expr = np.array(selected / per_row_total * 1e+6)
    expr = np.log2(expr + 1)

    return (_zscore(expr), _zscore(data['ephys']))

## The main cross-validation setup

In [5]:
# Load the scala2020 dataset and standardize it with preprocess().
# The context manager closes the file handle as soon as the pickle is read.
with open('data/scala2020.pickle', 'rb') as f:
    data = pickle.load(f)
X,Y = preprocess(data)
print('Shape of X:', X.shape, '\nShape of Y:', Y.shape)

Shape of X: (1213, 1000) 
Shape of Y: (1213, 16)


In [55]:
%%time

lambdas = np.concatenate((np.arange(.1,1,.1), np.arange(1,11)))
alphas = np.array([.25, .5, .75, 1])

cvresults = sparseRRR.elastic_rrr_cv(X, Y, rank=2, reps=1, folds=10, alphas=alphas, lambdas=lambdas)

alphas = np.array([1])
ranks = np.arange(1, Y.shape[1]+1)

cvresults_rank = {}
for r in ranks:
    cvresults_rank[r] = sparseRRR.elastic_rrr_cv(X, Y, rank=r, reps=1, folds=10, alphas=alphas, lambdas=lambdas)
    
pickle.dump([cvresults, cvresults_rank], open('pickles/cvresults-scala2020.pickle', 'wb'))

1.......... Time: 0.0h  3m 41s
1.......... Time: 0.0h  0m 28s
1.......... Time: 0.0h  0m 38s
1.......... Time: 0.0h  0m 53s
1.......... Time: 0.0h  1m 17s
1.......... Time: 0.0h  1m 45s
1.......... Time: 0.0h  3m  3s
1.......... Time: 0.0h  3m 47s
1.......... Time: 0.0h  5m 36s
1.......... Time: 0.0h  4m 31s
1.......... Time: 0.0h  4m  3s
1.......... Time: 0.0h  4m 13s
1.......... Time: 0.0h  4m 57s
1.......... Time: 0.0h  3m 24s
1.......... Time: 0.0h  5m  7s
1.......... Time: 0.0h  2m 39s
1.......... Time: 0.0h  1m 34s
CPU times: user 2h 3min 18s, sys: 50.2 s, total: 2h 4min 8s
Wall time: 51min 35s


In [6]:
# Load the scala2019 dataset and standardize it with preprocess().
# The context manager closes the file handle as soon as the pickle is read.
with open('data/scala2019.pickle', 'rb') as f:
    data = pickle.load(f)
X,Y = preprocess(data)
print('Shape of X:', X.shape, '\nShape of Y:', Y.shape)

Shape of X: (102, 1000) 
Shape of Y: (102, 13)


In [7]:
%%time

lambdas = np.concatenate([np.arange(.1,3,.1), np.arange(3,11)])
alphas = [.25, .5, .75, 1]

cvresults = sparseRRR.elastic_rrr_cv(X, Y, rank=2, reps=10, folds=10, alphas=alphas, lambdas=lambdas)

alphas = np.array([.5])
ranks = np.arange(1, Y.shape[1]+1)

cvresults_rank = {}
for r in ranks:
    cvresults_rank[r] = sparseRRR.elastic_rrr_cv(X, Y, lambdas=lambdas, alphas=alphas, reps=10, rank=r, folds=10)
    
pickle.dump([cvresults, cvresults_rank], open('pickles/cvresults-scala2019.pickle', 'wb'))

1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h 10m  6s
1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h  1m 42s
1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h  2m 21s
1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h  6m 51s
1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h  9m 26s
1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h 12m 18s
1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h 11m 56s
1.......... 2.......

In [8]:
# Load the cadwell2016 dataset and standardize it with preprocess().
# The context manager closes the file handle as soon as the pickle is read.
with open('data/cadwell2016.pickle', 'rb') as f:
    data = pickle.load(f)
X,Y = preprocess(data)
print('Shape of X:', X.shape, '\nShape of Y:', Y.shape)

Shape of X: (44, 3000) 
Shape of Y: (44, 11)


In [9]:
%%time

lambdas = np.concatenate([np.arange(.1,3,.1), np.arange(3,11)])
alphas = [.25, .5, .75, 1]

cvresults = sparseRRR.elastic_rrr_cv(X, Y, rank=2, reps=10, folds=11, alphas=alphas, lambdas=lambdas)

alphas = np.array([.5])
ranks = np.arange(1, Y.shape[1]+1)

cvresults_rank = {}
for r in ranks:
    cvresults_rank[r] = sparseRRR.elastic_rrr_cv(X, Y, lambdas=lambdas, alphas=alphas, reps=10, rank=r, folds=11)
    
pickle.dump([cvresults, cvresults_rank], open('pickles/cvresults-cadwell2016.pickle', 'wb'))

1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........... 9........... 10........... Time: 0.0h 18m 32s
1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........... 9........... 10........... Time: 0.0h  1m 36s
1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........... 9........... 10........... Time: 0.0h  4m 39s
1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........... 9........... 10........... Time: 0.0h  9m  9s
1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........... 9........... 10........... Time: 0.0h 18m 40s
1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........... 9........... 10........... Time: 0.0h 17m 36s
1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........

In [13]:
# Load the fuzik2016 dataset and standardize it with preprocess().
# The context manager closes the file handle as soon as the pickle is read.
with open('data/fuzik2016.pickle', 'rb') as f:
    data = pickle.load(f)
X,Y = preprocess(data)
print('Shape of X:', X.shape, '\nShape of Y:', Y.shape)

Shape of X: (80, 1313) 
Shape of Y: (80, 80)


In [14]:
# Single alpha, lambda sweep, for ranks 1 and 2 only.
lambdas = np.arange(.1,6,.25)
alphas = np.array([.5])
ranks = np.array([1,2])

# Unlike the other datasets, the pickle holds just this rank-keyed dict.
cvresults = {}
for r in ranks:
    cvresults[r] = sparseRRR.elastic_rrr_cv(X, Y, lambdas=lambdas, alphas=alphas, reps=10, rank=r, folds=10)

# Write through a context manager so the file is flushed and closed deterministically.
with open('pickles/cvresults-fuzik2016.pickle', 'wb') as f:
    pickle.dump(cvresults, f)

1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h  2m 22s
1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h  4m 40s


In [4]:
# Load the gouwens2020 dataset; the context manager closes the file handle
# as soon as the pickle is read.
with open('data/gouwens2020.pickle', 'rb') as f:
    data = pickle.load(f)
# Already preprocessed, so preprocess() is skipped: only densify and cast to float64.
X=data['counts'].toarray().astype('float64')
Y=data['ephys']
print('Shape of X:', X.shape, '\nShape of Y:', Y.shape)

Shape of X: (3395, 1252) 
Shape of Y: (3395, 55)


In [12]:
%%time

lambdas = np.concatenate((np.arange(.3,1,.1), np.arange(1,11)))
alphas = np.array([.25, .5, .75, 1])

cvresults = sparseRRR.elastic_rrr_cv(X, Y, rank=2, reps=1, folds=10, alphas=alphas, lambdas=lambdas)

alphas = np.array([1])
lambdas = lambdas = np.concatenate((np.arange(.3,1,.1), np.arange(1,11)))
ranks = np.arange(1, 17)

cvresults_rank = {}
for r in ranks:
    cvresults_rank[r] = sparseRRR.elastic_rrr_cv(X, Y, rank=r, reps=1, folds=10, alphas=alphas, lambdas=lambdas)
    
pickle.dump([cvresults, cvresults_rank], open('pickles/cvresults-gouwens2020.pickle', 'wb'))

1.......... Time: 0.0h 42m 32s
1.......... Time: 0.0h  2m 15s
1.......... Time: 0.0h  5m 14s
1.......... Time: 0.0h  2m 36s
1.......... Time: 0.0h 12m 23s
1.......... Time: 0.0h 17m  7s
1.......... Time: 0.0h 12m 44s
1.......... Time: 0.0h 14m 38s
1.......... Time: 0.0h 10m 23s
1.......... Time: 0.0h 17m 11s
1.......... Time: 0.0h 54m 10s
1.......... Time: 1.0h 20m 56s
1.......... Time: 1.0h  9m 51s
1.......... Time: 1.0h 17m  4s
1.......... Time: 1.0h  4m  2s
1.......... Time: 1.0h  9m 13s
1.......... Time: 0.0h 59m  8s
CPU times: user 17h 42min 53s, sys: 1d 15h 12min 48s, total: 2d 8h 55min 42s
Wall time: 10h 11min 25s


## Preprocessing variants

In [3]:
import rnaseqTools

def cv_preprocessing_variants(data, filename, lambdas=[1,2,3], alpha=.5, reps=1, folds=10, n_genes=1000):
    """Cross-validate rank-2 sparse RRR under five preprocessing variants and pickle the results.

    Parameters
    ----------
    data : dict
        Dataset with 'counts', 'mostVariableGenes' and 'ephys' entries.
        'counts' must support ``.todense()``.
    filename : str
        Path of the pickle file to write.
    lambdas : sequence
        Lambda values forwarded to ``sparseRRR.elastic_rrr_cv``.
    alpha : float
        Single alpha value; wrapped into a one-element array.
    reps, folds : int
        Forwarded to ``sparseRRR.elastic_rrr_cv``.
    n_genes : int
        Number of genes requested from ``rnaseqTools.geneSelection`` in the
        per-fold variant (cv5).

    The pickled list is ``[cv1, cv2, cv3, cv4, cv5]``:
      cv1 -- output of ``preprocess(data)`` (selected genes, centered and scaled);
      cv2 -- selected genes, centered but not scaled;
      cv3 -- all genes with a positive column sum, centered and scaled;
      cv4 -- all genes, centered but not scaled;
      cv5 -- raw dense counts, with gene selection and normalization delegated
             to ``cv_preprocess`` through the ``preprocess`` argument.
    Y from ``preprocess(data)`` is reused for every variant.
    """
    X,Y = preprocess(data)

    alphas = np.array([alpha])
    cv1 = sparseRRR.elastic_rrr_cv(X, Y, rank=2, reps=reps, 
                                   folds=folds, alphas=alphas, lambdas=lambdas)

    # Variant 2: same gene subset as preprocess(), but without scaling to unit std.
    X = data['counts'][:,data['mostVariableGenes']] / np.sum(data['counts'], axis=1) * 1e+6
    X = np.array(X)
    X = np.log2(X + 1)
    X = X - np.mean(X, axis=0)
    cv2 = sparseRRR.elastic_rrr_cv(X, Y, rank=2, reps=reps,
                                   folds=folds, alphas=alphas, lambdas=lambdas)

    # Variant 4 (computed before variant 3 because variant 3 builds on this X):
    # all genes, centered only.
    X = data['counts'] / np.sum(data['counts'], axis=1) * 1e+6
    X = np.array(X)
    X = np.log2(X + 1)
    # Mask must be taken before centering: after centering every column sums to ~0.
    ind = np.sum(X, axis=0) > 0
    X = X - np.mean(X, axis=0)
    cv4 = sparseRRR.elastic_rrr_cv(X, Y, rank=2, reps=reps,
                                   folds=folds, alphas=alphas, lambdas=lambdas)

    # Variant 3: drop the masked-out genes, then scale the (already centered) columns.
    X = X[:, ind]
    X = X / np.std(X, axis=0)
    cv3 = sparseRRR.elastic_rrr_cv(X, Y, rank=2, reps=reps,
                                   folds=folds, alphas=alphas, lambdas=lambdas)
    
    
    def cv_preprocess(Xtrain, Xtest, n_genes=n_genes):
        """Select genes and fit normalization on Xtrain, then apply both to Xtest."""
        mostVariableGenes = rnaseqTools.geneSelection(Xtrain, n=n_genes, threshold=32, plot=False, verbose=0)

        X = Xtrain[:,mostVariableGenes] / np.sum(Xtrain, axis=1, keepdims=True) * 1e+6
        X = np.array(X)
        X = np.log2(X + 1)
        ind = np.sum(X, axis=0) > 0
        X = X[:, ind]
        mu = np.mean(X, axis=0)
        std = np.std(X, axis=0)
        X = (X - mu) / std
    
        # The test block reuses the training gene subset, mask, mean and std.
        Xt = Xtest[:,mostVariableGenes] / np.sum(Xtest, axis=1, keepdims=True) * 1e+6
        Xt = np.array(Xt)
        Xt = np.log2(Xt + 1)
        Xt = Xt[:, ind]
        Xt = (Xt - mu) / std
    
        return(X, Xt) 
    
    # Variant 5: pass raw counts and the callback.
    # NOTE(review): presumably elastic_rrr_cv calls it once per fold with the
    # train/test split of X -- confirm against sparseRRR.
    X = np.array(data['counts'].todense())
    cv5 = sparseRRR.elastic_rrr_cv(X, Y, rank=2, reps=reps,     
                                   folds=folds, alphas=alphas, lambdas=lambdas,
                                   preprocess = cv_preprocess)

    # Write through a context manager so the file is flushed and closed deterministically.
    with open(filename, 'wb') as f:
        pickle.dump([cv1, cv2, cv3, cv4, cv5], f)

In [4]:
# Preprocessing-variant comparison for scala2020.
# The context manager closes the file handle as soon as the pickle is read.
with open('data/scala2020.pickle', 'rb') as f:
    data = pickle.load(f)
lambdas = np.concatenate((np.arange(.1,1,.1), np.arange(1,11)))
cv_preprocessing_variants(data, 'pickles/cvresults-scala2020-variants.pickle', 
                          lambdas=lambdas, alpha=1, reps=1, folds=10)

1.......... Time: 0.0h  0m 39s
1.......... Time: 0.0h  3m 32s
1.......... Time: 0.0h 35m  7s
1.......... Time: 0.0h 18m 19s
1.......... Time: 0.0h  1m 28s


In [5]:
# Preprocessing-variant comparison for scala2019.
# The context manager closes the file handle as soon as the pickle is read.
with open('data/scala2019.pickle', 'rb') as f:
    data = pickle.load(f)
lambdas = np.concatenate([np.arange(.1,3,.1), np.arange(3,11)])
cv_preprocessing_variants(data, 'pickles/cvresults-scala2019-variants.pickle', 
                          lambdas=lambdas, alpha=.5, reps=10, folds=10)

1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h  5m 23s
1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h 18m 25s
1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h 59m 55s
1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h 40m 58s
1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h  2m 28s


In [4]:
# Preprocessing-variant comparison for cadwell2016 (11 folds, 3000 genes).
# The context manager closes the file handle as soon as the pickle is read.
with open('data/cadwell2016.pickle', 'rb') as f:
    data = pickle.load(f)
lambdas = np.concatenate([np.arange(.1,3,.1), np.arange(3,11)])
cv_preprocessing_variants(data, 'pickles/cvresults-cadwell2016-variants.pickle', 
                          lambdas=lambdas, alpha=.5, reps=10, folds=11, n_genes=3000)

1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........... 9........... 10........... Time: 0.0h 10m 34s
1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........... 9........... 10........... Time: 0.0h 20m 49s
1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........... 9........... 10........... Time: 0.0h 45m 17s
1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........... 9........... 10........... Time: 0.0h 34m 56s
1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........... 9........... 10........... Time: 0.0h  7m  4s


## Witten and Suo comparisons

In [3]:
import spls_scca

In [4]:
data = pickle.load(open('data/scala2020.pickle', 'rb'))
X,Y = preprocess(data)

lx_scan = np.arange(500, 2000, 100)
%time corr_witten, nonz_witten = spls_scca.witten_cv(X, Y, lx_scan, ncomps=2, reps=1)

lx_scan = np.arange(1,15)
%time corr_suo, nonz_suo = spls_scca.suo_cv(X, Y, lx_scan, reps=1)

#pickle.dump([corr_witten, nonz_witten, corr_suo, nonz_suo], 
#            open('pickles/cvresults-scala2020-wittensuo.pickle', 'wb'))

. done
CPU times: user 20.7 s, sys: 1min 19s, total: 1min 39s
Wall time: 2.57 s
. done
CPU times: user 2h 50min 53s, sys: 8h 2min 37s, total: 10h 53min 30s
Wall time: 16min 23s


In [4]:
data = pickle.load(open('data/scala2019.pickle', 'rb'))
X,Y = preprocess(data)

lx_scan = np.arange(20, 75, 5)
%time corr_witten, nonz_witten = spls_scca.witten_cv(X, Y, lx_scan, ncomps=2, reps=10)

lx_scan = np.concatenate((np.arange(.1,1,.1), np.arange(1, 5, .5)))
%time corr_suo, nonz_suo = spls_scca.suo_cv(X, Y, lx_scan, reps=10)

pickle.dump([corr_witten, nonz_witten, corr_suo, nonz_suo], 
            open('pickles/cvresults-scala2019-wittensuo.pickle', 'wb'))

.......... done
CPU times: user 34.7 s, sys: 55.8 s, total: 1min 30s
Wall time: 11.9 s
.......... done
CPU times: user 1h 58min 31s, sys: 2h 46min 37s, total: 4h 45min 8s
Wall time: 36min 7s


In [5]:
data = pickle.load(open('data/cadwell2016.pickle', 'rb'))
X,Y = preprocess(data)

lx_scan = np.arange(20, 75, 5)
%time corr_witten, nonz_witten = spls_scca.witten_cv(X, Y, lx_scan, ncomps=2, reps=10)

lx_scan = np.concatenate((np.arange(.1,1,.1), np.arange(1, 5, .5)))
%time corr_suo, nonz_suo = spls_scca.suo_cv(X, Y, lx_scan, reps=10)

pickle.dump([corr_witten, nonz_witten, corr_suo, nonz_suo], 
            open('pickles/cvresults-cadwell2016-wittensuo.pickle', 'wb'))

.......... done
CPU times: user 19.6 s, sys: 28.5 s, total: 48.1 s
Wall time: 6.14 s
.......... done
CPU times: user 2h 8min 35s, sys: 2h 58min 2s, total: 5h 6min 37s
Wall time: 38min 45s


In [70]:
data = pickle.load(open('data/gouwens2020.pickle', 'rb'))
# Already preprocessed
X=data['counts'].toarray().astype('float64')
Y=data['ephys']

lx_scan = np.arange(5000, 100000, 1000)
%time corr_witten, nonz_witten = spls_scca.witten_cv(X, Y, lx_scan, ncomps=2, reps=1)

lx_scan = np.arange(1,15)
%time corr_suo, nonz_suo = spls_scca.suo_cv(X, Y, lx_scan, reps=1)

pickle.dump([corr_witten, nonz_witten, corr_suo, nonz_suo], 
            open('pickles/cvresults-gouwens2020-wittensuo.pickle', 'wb'))

. done
CPU times: user 2min 42s, sys: 5min 46s, total: 8min 28s
Wall time: 12.8 s
. done
CPU times: user 5h 4min 38s, sys: 12h 33min 3s, total: 17h 37min 42s
Wall time: 26min 36s


# Sanity check

In [68]:
corr_witten, nonz_witten, corr_suo, nonz_suo = pickle.load(open('pickles/cvresults-gouwens2020-wittensuo.pickle', 'rb'))
data = pickle.load(open('data/gouwens2020.pickle', 'rb'))
# Already preprocessed
X=data['counts'].toarray().astype('float64')
Y=data['ephys']
lx_scan = np.arange(10000, 14000, 1000)
%time corr_witten, nonz_witten = spls_scca.witten_cv_sanity_check(X, Y, lx_scan, genes=data['genes'],\
                                        ncomps=2, reps=1, keep_comp1_param_cte=False)

.1st component: 
corr coeff:  0.8559262858333232
# genes selected:  49
reg param=10000 , genes selected:  ['Adarb2' 'Prox1' 'Nxph1' 'Igf1' 'Ptprm' 'Synpr' 'Cacna2d1' 'Cacna2d3'
 'Fam135b' 'Tmem44' 'Tmem132c' 'Sorcs3' 'Cpne7' 'Kctd8' 'Kit' 'Kcns3'
 'Adcy2' 'Sox5' 'Itpr1' 'Galnt14' 'Penk' 'Pde4b' 'Pvalb' 'Coro6' 'Gabrd'
 'Crtac1' 'Fxyd6' 'Sorcs1' 'Iqgap2' 'Col19a1' 'Nek7' 'Rapgef5' 'Rgs8'
 'Eya4' 'Cntnap4' 'Vip' 'Kcnt2' 'Slit2' 'Lingo2' 'Man1c1' '1700019L22Rik'
 'Tmem132d' 'Egfem1' 'Plch2' 'Myo1e' 'Esrrg' 'Runx2' 'Mitf' 'Epb4.1']
1st component: 
corr coeff:  0.8380994433948786
# genes selected:  31
reg param=11000 , genes selected:  ['Adarb2' 'Prox1' 'Nxph1' 'Igf1' 'Ptprm' 'Synpr' 'Cacna2d3' 'Fam135b'
 'Tmem44' 'Tmem132c' 'Sorcs3' 'Cpne7' 'Kit' 'Adcy2' 'Galnt14' 'Pvalb'
 'Coro6' 'Gabrd' 'Fxyd6' 'Sorcs1' 'Iqgap2' 'Nek7' 'Rapgef5' 'Eya4' 'Vip'
 'Kcnt2' 'Slit2' 'Lingo2' 'Tmem132d' 'Egfem1' 'Runx2']
1st component: 
corr coeff:  0.8124905970369488
# genes selected:  20
reg param=12000 , genes

In [70]:
# To re-run only the Witten CV (keeping the previously saved Suo results), uncomment the lines below.

# corr_witten, nonz_witten, corr_suo, nonz_suo = pickle.load(open('pickles/cvresults-scala2020-wittensuo.pickle', 'rb'))
# data = pickle.load(open('data/scala2020.pickle', 'rb'))
# X,Y = preprocess(data)
# lx_scan = np.arange(500, 2000, 100)
# %time corr_witten, nonz_witten = spls_scca.witten_cv(X, Y, lx_scan, ncomps=2, reps=1)
# pickle.dump([corr_witten, nonz_witten, corr_suo, nonz_suo], 
#             open('pickles/cvresults-scala2020-wittensuo.pickle', 'wb'))