# Perform and pickle cross-validation for all datasets

In [1]:
%matplotlib notebook

import numpy as np
import pylab as plt
import seaborn as sns
import pickle
import matplotlib

import sparseRRR

In [2]:
def preprocess(data, normalize='cpm'):
    X = data['counts'][:,data['mostVariableGenes']] / np.sum(data['counts'], axis=1)
    if normalize=='cpm':
        X *= 1e+6
    elif normalize=='median':
        X *= np.median(np.array(np.sum(data['counts'],axis=1)))
    X = np.array(X)
    X = np.log2(X + 1)
    X = X - np.mean(X, axis=0)
    X = X / np.std(X, axis=0)

    Y = data['ephys']
    Y = Y - np.mean(Y, axis=0)
    Y = Y / np.std(Y, axis=0)
    
    return (X,Y)

## The main cross-validation setup

In [27]:
data = pickle.load(open('data/scala2020.pickle', 'rb'))
X,Y = preprocess(data)
print('Shape of X:', X.shape, '\nShape of Y:', Y.shape)

Shape of X: (1213, 1000) 
Shape of Y: (1213, 16)


In [55]:
%%time

lambdas = np.concatenate((np.arange(.1,1,.1), np.arange(1,11)))
alphas = np.array([.25, .5, .75, 1])

cvresults = sparseRRR.elastic_rrr_cv(X, Y, rank=2, reps=1, folds=10, alphas=alphas, lambdas=lambdas)

alphas = np.array([1])
ranks = np.arange(1, Y.shape[1]+1)

cvresults_rank = {}
for r in ranks:
    cvresults_rank[r] = sparseRRR.elastic_rrr_cv(X, Y, rank=r, reps=1, folds=10, alphas=alphas, lambdas=lambdas)
    
pickle.dump([cvresults, cvresults_rank], open('pickles/cvresults-scala2020.pickle', 'wb'))

1.......... Time: 0.0h  3m 41s
1.......... Time: 0.0h  0m 28s
1.......... Time: 0.0h  0m 38s
1.......... Time: 0.0h  0m 53s
1.......... Time: 0.0h  1m 17s
1.......... Time: 0.0h  1m 45s
1.......... Time: 0.0h  3m  3s
1.......... Time: 0.0h  3m 47s
1.......... Time: 0.0h  5m 36s
1.......... Time: 0.0h  4m 31s
1.......... Time: 0.0h  4m  3s
1.......... Time: 0.0h  4m 13s
1.......... Time: 0.0h  4m 57s
1.......... Time: 0.0h  3m 24s
1.......... Time: 0.0h  5m  7s
1.......... Time: 0.0h  2m 39s
1.......... Time: 0.0h  1m 34s
CPU times: user 2h 3min 18s, sys: 50.2 s, total: 2h 4min 8s
Wall time: 51min 35s


In [6]:
data = pickle.load(open('data/scala2019.pickle', 'rb'))
X,Y = preprocess(data)
print('Shape of X:', X.shape, '\nShape of Y:', Y.shape)

Shape of X: (102, 1000) 
Shape of Y: (102, 13)


In [7]:
%%time

lambdas = np.concatenate([np.arange(.1,3,.1), np.arange(3,11)])
alphas = [.25, .5, .75, 1]

cvresults = sparseRRR.elastic_rrr_cv(X, Y, rank=2, reps=10, folds=10, alphas=alphas, lambdas=lambdas)

alphas = np.array([.5])
ranks = np.arange(1, Y.shape[1]+1)

cvresults_rank = {}
for r in ranks:
    cvresults_rank[r] = sparseRRR.elastic_rrr_cv(X, Y, lambdas=lambdas, alphas=alphas, reps=10, rank=r, folds=10)
    
pickle.dump([cvresults, cvresults_rank], open('pickles/cvresults-scala2019.pickle', 'wb'))

1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h 10m  6s
1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h  1m 42s
1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h  2m 21s
1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h  6m 51s
1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h  9m 26s
1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h 12m 18s
1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h 11m 56s
1.......... 2.......

In [8]:
data = pickle.load(open('data/cadwell2016.pickle', 'rb'))
X,Y = preprocess(data)
print('Shape of X:', X.shape, '\nShape of Y:', Y.shape)

Shape of X: (44, 3000) 
Shape of Y: (44, 11)


In [9]:
%%time

lambdas = np.concatenate([np.arange(.1,3,.1), np.arange(3,11)])
alphas = [.25, .5, .75, 1]

cvresults = sparseRRR.elastic_rrr_cv(X, Y, rank=2, reps=10, folds=11, alphas=alphas, lambdas=lambdas)

alphas = np.array([.5])
ranks = np.arange(1, Y.shape[1]+1)

cvresults_rank = {}
for r in ranks:
    cvresults_rank[r] = sparseRRR.elastic_rrr_cv(X, Y, lambdas=lambdas, alphas=alphas, reps=10, rank=r, folds=11)
    
pickle.dump([cvresults, cvresults_rank], open('pickles/cvresults-cadwell2016.pickle', 'wb'))

1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........... 9........... 10........... Time: 0.0h 18m 32s
1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........... 9........... 10........... Time: 0.0h  1m 36s
1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........... 9........... 10........... Time: 0.0h  4m 39s
1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........... 9........... 10........... Time: 0.0h  9m  9s
1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........... 9........... 10........... Time: 0.0h 18m 40s
1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........... 9........... 10........... Time: 0.0h 17m 36s
1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........

In [25]:
data = pickle.load(open('data/fuzik2016.pickle', 'rb'))
X,Y = preprocess(data, normalize='median')
print('Shape of X:', X.shape, '\nShape of Y:', Y.shape)

Shape of X: (80, 1313) 
Shape of Y: (80, 80)


In [26]:
lambdas = np.arange(.1,6,.25)
alphas = np.array([.5])
ranks = np.array([1,2])

cvresults = {}
for r in ranks:
    cvresults[r] = sparseRRR.elastic_rrr_cv(X, Y, lambdas=lambdas, alphas=alphas, reps=10, rank=r, folds=10)
    
pickle.dump(cvresults, open('pickles/cvresults-fuzik2016.pickle', 'wb'))

1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h  2m  1s
1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h  4m 53s


In [4]:
data = pickle.load(open('data/gouwens2020.pickle', 'rb'))
# Already preprocessed
X=data['counts'].toarray().astype('float64')
Y=data['ephys']
print('Shape of X:', X.shape, '\nShape of Y:', Y.shape)

Shape of X: (3395, 1252) 
Shape of Y: (3395, 55)


In [12]:
%%time

lambdas = np.concatenate((np.arange(.3,1,.1), np.arange(1,11)))
alphas = np.array([.25, .5, .75, 1])

cvresults = sparseRRR.elastic_rrr_cv(X, Y, rank=2, reps=1, folds=10, alphas=alphas, lambdas=lambdas)

alphas = np.array([1])
lambdas = lambdas = np.concatenate((np.arange(.3,1,.1), np.arange(1,11)))
ranks = np.arange(1, 17)

cvresults_rank = {}
for r in ranks:
    cvresults_rank[r] = sparseRRR.elastic_rrr_cv(X, Y, rank=r, reps=1, folds=10, alphas=alphas, lambdas=lambdas)
    
pickle.dump([cvresults, cvresults_rank], open('pickles/cvresults-gouwens2020.pickle', 'wb'))

1.......... Time: 0.0h 42m 32s
1.......... Time: 0.0h  2m 15s
1.......... Time: 0.0h  5m 14s
1.......... Time: 0.0h  2m 36s
1.......... Time: 0.0h 12m 23s
1.......... Time: 0.0h 17m  7s
1.......... Time: 0.0h 12m 44s
1.......... Time: 0.0h 14m 38s
1.......... Time: 0.0h 10m 23s
1.......... Time: 0.0h 17m 11s
1.......... Time: 0.0h 54m 10s
1.......... Time: 1.0h 20m 56s
1.......... Time: 1.0h  9m 51s
1.......... Time: 1.0h 17m  4s
1.......... Time: 1.0h  4m  2s
1.......... Time: 1.0h  9m 13s
1.......... Time: 0.0h 59m  8s
CPU times: user 17h 42min 53s, sys: 1d 15h 12min 48s, total: 2d 8h 55min 42s
Wall time: 10h 11min 25s


## Nested CV

In [5]:
data = pickle.load(open('data/scala2020.pickle', 'rb'))
X,Y = preprocess(data)
print('Shape of X:', X.shape, '\nShape of Y:', Y.shape, '\n')

lambdas = np.concatenate([np.arange(.1,1,.1), np.arange(1,11)])
alphas = np.array([.25, .5, .75, 1])
%time sparseRRR.nested_cv(X, Y, lambdas, alphas)

Shape of X: (1213, 1000) 
Shape of Y: (1213, 16) 

1.......... Time: 0.0h  6m  4s
Optimal alpha: 1.0, lambda to get 20 genes: 0.4, test R2 = 0.27
1.......... Time: 0.0h  7m 47s
Optimal alpha: 1.0, lambda to get 20 genes: 0.4, test R2 = 0.45
1.......... Time: 0.0h  7m 46s
Optimal alpha: 1.0, lambda to get 20 genes: 0.4, test R2 = 0.50
1.......... Time: 0.0h  8m  8s
Optimal alpha: 1.0, lambda to get 20 genes: 0.4, test R2 = 0.30
1.......... Time: 0.0h  8m  8s
Optimal alpha: 1.0, lambda to get 20 genes: 0.4, test R2 = 0.42
1.......... Time: 0.0h  7m 60s
Optimal alpha: 1.0, lambda to get 20 genes: 0.4, test R2 = 0.32
1.......... Time: 0.0h  6m 44s
Optimal alpha: 1.0, lambda to get 20 genes: 0.4, test R2 = 0.21
1.......... Time: 0.0h  6m 60s
Optimal alpha: 1.0, lambda to get 20 genes: 0.4, test R2 = 0.22
1.......... Time: 0.0h  6m 27s
Optimal alpha: 1.0, lambda to get 20 genes: 0.4, test R2 = 0.24
1.......... Time: 0.0h  7m 43s
Optimal alpha: 1.0, lambda to get 20 genes: 0.5, test R2 = 0.24

array([0.26718463, 0.45175032, 0.49800972, 0.30037888, 0.4217728 ,
       0.31849165, 0.21365848, 0.21795643, 0.24189906, 0.23515745])

In [6]:
data = pickle.load(open('data/scala2019.pickle', 'rb'))
X,Y = preprocess(data)
print('Shape of X:', X.shape, '\nShape of Y:', Y.shape, '\n')

lambdas = np.concatenate([np.arange(.1,3,.1), np.arange(3,11)])
alphas = np.array([.25, .5, .75, 1])
%time sparseRRR.nested_cv(X, Y, lambdas, alphas)

Shape of X: (102, 1000) 
Shape of Y: (102, 13) 

1.......... Time: 0.0h  0m 53s
Optimal alpha: 0.75, lambda to get 20 genes: 0.6, test R2 = 0.12
1.......... Time: 0.0h  1m 15s
Optimal alpha: 1.0, lambda to get 20 genes: 0.4, test R2 = 0.13
1.......... Time: 0.0h  1m 10s
Optimal alpha: 0.75, lambda to get 20 genes: 0.6, test R2 = 0.03
1.......... Time: 0.0h  1m 23s
Optimal alpha: 0.75, lambda to get 20 genes: 0.6, test R2 = 0.08
1.......... Time: 0.0h  1m 20s
Optimal alpha: 0.25, lambda to get 20 genes: 2.4, test R2 = 0.12
1.......... Time: 0.0h  1m  7s
Optimal alpha: 1.0, lambda to get 20 genes: 0.4, test R2 = 0.06
1.......... Time: 0.0h  1m 10s
Optimal alpha: 0.5, lambda to get 20 genes: 1.1, test R2 = 0.19
1.......... Time: 0.0h  1m  9s
Optimal alpha: 0.75, lambda to get 20 genes: 0.6, test R2 = 0.21
1.......... Time: 0.0h  1m 24s
Optimal alpha: 0.25, lambda to get 20 genes: 2.3, test R2 = 0.20
1.......... Time: 0.0h  0m 56s
Optimal alpha: 0.25, lambda to get 20 genes: 2.4, test R2 =

array([0.12283947, 0.13461875, 0.0250806 , 0.07561174, 0.123654  ,
       0.05508009, 0.1885724 , 0.20958745, 0.19594025, 0.2167022 ])

In [7]:
data = pickle.load(open('data/cadwell2016.pickle', 'rb'))
X,Y = preprocess(data)
print('Shape of X:', X.shape, '\nShape of Y:', Y.shape, '\n')

lambdas = np.concatenate([np.arange(.1,3,.1), np.arange(3,11)])
alphas = np.array([.25, .5, .75, 1])
%time sparseRRR.nested_cv(X, Y, lambdas, alphas)

Shape of X: (44, 3000) 
Shape of Y: (44, 11) 

1.......... Time: 0.0h  1m 22s
Optimal alpha: 0.25, lambda to get 20 genes: 4.0, test R2 = 0.10
1.......... Time: 0.0h  1m 12s
Optimal alpha: 0.5, lambda to get 20 genes: 1.7, test R2 = 0.24
1.......... Time: 0.0h  1m 22s
Optimal alpha: 0.5, lambda to get 20 genes: 1.8, test R2 = 0.36
1.......... Time: 0.0h  1m 26s
Optimal alpha: 0.5, lambda to get 20 genes: 1.8, test R2 = 0.08
1.......... Time: 0.0h  1m  4s
Optimal alpha: 1.0, lambda to get 20 genes: 0.5, test R2 = 0.06
1.......... Time: 0.0h  1m 22s
Optimal alpha: 0.25, lambda to get 20 genes: 4.0, test R2 = 0.26
1.......... Time: 0.0h  1m  5s
Optimal alpha: 0.25, lambda to get 20 genes: 4.0, test R2 = 0.25
1.......... Time: 0.0h  1m 29s
Optimal alpha: 0.75, lambda to get 20 genes: 0.9, test R2 = 0.18
1.......... Time: 0.0h  1m 42s
Optimal alpha: 0.5, lambda to get 20 genes: 1.7, test R2 = 0.04
1.......... Time: 0.0h  1m 35s
Optimal alpha: 0.75, lambda to get 20 genes: 0.9, test R2 = 0.1

array([0.09996219, 0.2395461 , 0.36457844, 0.08296144, 0.06100068,
       0.2637304 , 0.24654567, 0.17585261, 0.03773421, 0.12465749])

In [8]:
data = pickle.load(open('data/gouwens2020.pickle', 'rb'))
# Already preprocessed
X=data['counts'].toarray().astype('float64')
Y=data['ephys']
print('Shape of X:', X.shape, '\nShape of Y:', Y.shape, '\n')

lambdas = np.concatenate((np.arange(.3,1,.1), np.arange(1,11)))
alphas = np.array([.25, .5, .75, 1])
%time sparseRRR.nested_cv(X, Y, lambdas, alphas)

Shape of X: (3395, 1252) 
Shape of Y: (3395, 55) 

1.......... Time: 0.0h 42m  7s
Optimal alpha: 0.75, lambda to get 20 genes: 1.0, test R2 = 0.19
1.......... Time: 0.0h 51m 14s
Optimal alpha: 0.75, lambda to get 20 genes: 1.0, test R2 = 0.18
1.......... Time: 0.0h 48m 22s
Optimal alpha: 1.0, lambda to get 20 genes: 1.0, test R2 = 0.21
1.......... Time: 0.0h 46m 28s
Optimal alpha: 0.75, lambda to get 20 genes: 1.0, test R2 = 0.20
1.......... Time: 0.0h 42m 28s
Optimal alpha: 1.0, lambda to get 20 genes: 1.0, test R2 = 0.17
1.......... Time: 0.0h 44m  5s
Optimal alpha: 1.0, lambda to get 20 genes: 1.0, test R2 = 0.15
1.......... Time: 0.0h 39m  5s
Optimal alpha: 1.0, lambda to get 20 genes: 1.0, test R2 = 0.20
1.......... Time: 0.0h 37m  9s
Optimal alpha: 1.0, lambda to get 20 genes: 1.0, test R2 = 0.17
1.......... Time: 0.0h 39m  0s
Optimal alpha: 1.0, lambda to get 20 genes: 1.0, test R2 = 0.22
1.......... Time: 0.0h 39m 36s
Optimal alpha: 1.0, lambda to get 20 genes: 1.0, test R2 = 0

array([0.18581167, 0.17906378, 0.21283055, 0.19926951, 0.16719289,
       0.14552697, 0.19858184, 0.17058389, 0.21916345, 0.19818346])

## Preprocessing variants

In [3]:
import rnaseqTools

def cv_preprocessing_variants(data, filename, lambdas=[1,2,3], alpha=.5, reps=1, folds=10, n_genes=1000):
    X,Y = preprocess(data)

    alphas = np.array([alpha])
    cv1 = sparseRRR.elastic_rrr_cv(X, Y, rank=2, reps=reps, 
                                   folds=folds, alphas=alphas, lambdas=lambdas)

    X = data['counts'][:,data['mostVariableGenes']] / np.sum(data['counts'], axis=1) * 1e+6
    X = np.array(X)
    X = np.log2(X + 1)
    X = X - np.mean(X, axis=0)
    cv2 = sparseRRR.elastic_rrr_cv(X, Y, rank=2, reps=reps,
                                   folds=folds, alphas=alphas, lambdas=lambdas)

    X = data['counts'] / np.sum(data['counts'], axis=1) * 1e+6
    X = np.array(X)
    X = np.log2(X + 1)
    ind = np.sum(X, axis=0) > 0
    X = X - np.mean(X, axis=0)
    cv4 = sparseRRR.elastic_rrr_cv(X, Y, rank=2, reps=reps,
                                   folds=folds, alphas=alphas, lambdas=lambdas)

    X = X[:, ind]
    X = X / np.std(X, axis=0)
    cv3 = sparseRRR.elastic_rrr_cv(X, Y, rank=2, reps=reps,
                                   folds=folds, alphas=alphas, lambdas=lambdas)
    
    
    def cv_preprocess(Xtrain, Xtest, n_genes=n_genes):
        mostVariableGenes = rnaseqTools.geneSelection(Xtrain, n=n_genes, threshold=32, plot=False, verbose=0)

        X = Xtrain[:,mostVariableGenes] / np.sum(Xtrain, axis=1, keepdims=True) * 1e+6
        X = np.array(X)
        X = np.log2(X + 1)
        ind = np.sum(X, axis=0) > 0
        X = X[:, ind]
        mu = np.mean(X, axis=0)
        std = np.std(X, axis=0)
        X = (X - mu) / std
    
        Xt = Xtest[:,mostVariableGenes] / np.sum(Xtest, axis=1, keepdims=True) * 1e+6
        Xt = np.array(Xt)
        Xt = np.log2(Xt + 1)
        Xt = Xt[:, ind]
        Xt = (Xt - mu) / std
    
        return(X, Xt) 
    
    X = np.array(data['counts'].todense())
    cv5 = sparseRRR.elastic_rrr_cv(X, Y, rank=2, reps=reps,     
                                   folds=folds, alphas=alphas, lambdas=lambdas,
                                   preprocess = cv_preprocess)

    pickle.dump([cv1, cv2, cv3, cv4, cv5], open(filename, 'wb'))

In [4]:
data = pickle.load(open('data/scala2020.pickle', 'rb'))
lambdas = np.concatenate((np.arange(.1,1,.1), np.arange(1,11)))
cv_preprocessing_variants(data, 'pickles/cvresults-scala2020-variants.pickle', 
                          lambdas=lambdas, alpha=1, reps=1, folds=10)

1.......... Time: 0.0h  0m 39s
1.......... Time: 0.0h  3m 32s
1.......... Time: 0.0h 35m  7s
1.......... Time: 0.0h 18m 19s
1.......... Time: 0.0h  1m 28s


In [5]:
data = pickle.load(open('data/scala2019.pickle', 'rb'))
lambdas = np.concatenate([np.arange(.1,3,.1), np.arange(3,11)])
cv_preprocessing_variants(data, 'pickles/cvresults-scala2019-variants.pickle', 
                          lambdas=lambdas, alpha=.5, reps=10, folds=10)

1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h  5m 23s
1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h 18m 25s
1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h 59m 55s
1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h 40m 58s
1.......... 2.......... 3.......... 4.......... 5.......... 6.......... 7.......... 8.......... 9.......... 10.......... Time: 0.0h  2m 28s


In [4]:
data = pickle.load(open('data/cadwell2016.pickle', 'rb'))
lambdas = np.concatenate([np.arange(.1,3,.1), np.arange(3,11)])
cv_preprocessing_variants(data, 'pickles/cvresults-cadwell2016-variants.pickle', 
                          lambdas=lambdas, alpha=.5, reps=10, folds=11, n_genes=3000)

1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........... 9........... 10........... Time: 0.0h 10m 34s
1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........... 9........... 10........... Time: 0.0h 20m 49s
1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........... 9........... 10........... Time: 0.0h 45m 17s
1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........... 9........... 10........... Time: 0.0h 34m 56s
1........... 2........... 3........... 4........... 5........... 6........... 7........... 8........... 9........... 10........... Time: 0.0h  7m  4s


## Witten and Suo comparisons

In [3]:
import spls_scca

In [None]:
# from rpy2.robjects.packages import importr
# utils = importr('utils')
# utils.install_packages('PMA')

In [7]:
import rpy2.robjects.numpy2ri
rpy2.robjects.numpy2ri.activate()

from rpy2.robjects.packages import importr
pma = importr("PMA")

def witten(X, Y, ncomps=1, lx=1):
    out = pma.CCA(X, Y, typex="standard", typez="standard", standardize=False, 
                  K=ncomps, penaltyx=lx)
    d = { key : out.rx2(key) for key in out.names }
    w = np.asarray(d['u'])
    v = np.asarray(d['v'])
    return (w,v)

def witten_cv(X, Y, reg_params, reps=1, folds=10, seed=42, ncomps=1):
    n = X.shape[0]
    testcorrs = np.zeros((folds, reps, len(reg_params), ncomps))
    nonzero = np.zeros((folds, reps, len(reg_params), ncomps))

    # CV repetitions
    np.random.seed(seed)
    for rep in range(reps):
        print('.', end='')
        ind = np.random.permutation(n)
        X = X[ind,:]
        Y = Y[ind,:]
        
        # CV folds
        for cvfold in range(folds):
            indtest  = np.arange(cvfold*int(n/folds), (cvfold+1)*int(n/folds))
            indtrain = np.setdiff1d(np.arange(n), indtest)
            Xtrain = np.copy(X[indtrain,:])
            Ytrain = np.copy(Y[indtrain,:])
            Xtest  = np.copy(X[indtest,:])
            Ytest  = np.copy(Y[indtest,:])
            
            # mean centering
            X_mean = np.mean(Xtrain, axis=0)
            Xtrain -= X_mean
            Xtest  -= X_mean
            Y_mean = np.mean(Ytrain, axis=0)
            Ytrain -= Y_mean
            Ytest  -= Y_mean

            # loop over regularization parameters
            for i,r in enumerate(reg_params):    
                vx,vy = witten(Xtrain, Ytrain, ncomps=ncomps, lx=r)
                
                if np.sum(np.sum(vx!=0,axis=0)==0)>0 or np.sum(np.sum(vy!=0,axis=0)==0)>0:
                    nonzero[cvfold, rep, i, :] = np.nan
                    continue
                
                for ncomp in range(ncomps):
                    testcorrs[cvfold, rep, i, ncomp] = np.corrcoef((Xtest @ vx[:,ncomp]).T, 
                                                                   (Ytest @ vy[:,ncomp]).T)[0,1]
                    nonzero[cvfold, rep, i, ncomp] = np.sum(vx[:,ncomp]!=0)
        
    print(' done')
    return testcorrs, nonzero

In [9]:
data = pickle.load(open('data/scala2020.pickle', 'rb'))
X,Y = preprocess(data)

lx_scan = np.arange(.05, .20, .01)
%time corr_witten, nonz_witten = witten_cv(X, Y, lx_scan, ncomps=2, reps=1)

pickle.dump([corr_witten, nonz_witten], 
           open('pickles/cvresults-scala2020-witten-new.pickle', 'wb'))

.12
1234567891011
12
123456789101112131415
12
123456789101112131415
12
12345678
12
1234
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
1234
12
1234
12
1234
12
123456789
12
1234
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123456789101112131415
12
123
12
123
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
1234
12
123456789
12
12345678
12
123456789
12
12345678
12
1234
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
1234567891011
12
12345678910
12
123456789
12
12345678910
12
123456789
12
1234
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
12
12
12
12
12
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
123
12
12
12
123456
12
12345678910
12
123456789
12
12345678910
12
1234
12
123
12
123
12
123
12
123


In [4]:
data = pickle.load(open('data/scala2020.pickle', 'rb'))
X,Y = preprocess(data)

lx_scan = np.arange(500, 2000, 100)
%time corr_witten, nonz_witten = spls_scca.witten_cv(X, Y, lx_scan, ncomps=2, reps=1)

lx_scan = np.arange(1,15)
%time corr_suo, nonz_suo = spls_scca.suo_cv(X, Y, lx_scan, reps=1)

#pickle.dump([corr_witten, nonz_witten, corr_suo, nonz_suo], 
#            open('pickles/cvresults-scala2020-wittensuo.pickle', 'wb'))

. done
CPU times: user 20.7 s, sys: 1min 19s, total: 1min 39s
Wall time: 2.57 s
. done
CPU times: user 2h 50min 53s, sys: 8h 2min 37s, total: 10h 53min 30s
Wall time: 16min 23s


In [14]:
data = pickle.load(open('data/scala2019.pickle', 'rb'))
X,Y = preprocess(data)

lx_scan = np.arange(.05, .20, .01)
%time corr_witten, nonz_witten = witten_cv(X, Y, lx_scan, ncomps=2, reps=1)

pickle.dump([corr_witten, nonz_witten], 
           open('pickles/cvresults-scala2019-witten-new.pickle', 'wb'))

.1234
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
123
12
12
123456
12
1234
12
1234
12
123
1234567891011121314
123
1234567891011
1234
1234567891011121314
123456
1234
123456789101112131415
1234
123456789101112131415
1234
123456789101112131415
123
12
123
12
123
12
123
12
123
12
123
12
12345
12
123456
12
1234567891011121314
12
1234567891011
12
1234
12
1234
12
1234567
12
123456789101112131415
12
123456789101112131415
12
1234567891011121314
12
12345678910111213
12
12345678910111213
12
123456789101112
12
12
12
12
12
12
12
123456789101112
12
123456789
12
1234567
12
1234567
12
123456789101112
12
12345678910111213
12
12
12
12
12
12
12
12
12
123
12
1234
12
123456
12
123456789101112131415
12
123456789101112131415
12
123456789101112131415
12
12
12
12
12
123
12
123456789101112
12
123
12
123
12
12
12
12
12
12
12
12
12
123
12
123
12
123
12
123
12
123
12
123
12
123
123
123
123
123
12
123
12
123
12
123
12
123
12
12
12
12
12
12
12
12
12
12
12
123
12
123
12
123
1

In [4]:
data = pickle.load(open('data/scala2019.pickle', 'rb'))
X,Y = preprocess(data)

lx_scan = np.arange(20, 75, 5)
%time corr_witten, nonz_witten = spls_scca.witten_cv(X, Y, lx_scan, ncomps=2, reps=10)

lx_scan = np.concatenate((np.arange(.1,1,.1), np.arange(1, 5, .5)))
%time corr_suo, nonz_suo = spls_scca.suo_cv(X, Y, lx_scan, reps=10)

pickle.dump([corr_witten, nonz_witten, corr_suo, nonz_suo], 
            open('pickles/cvresults-scala2019-wittensuo.pickle', 'wb'))

.......... done
CPU times: user 34.7 s, sys: 55.8 s, total: 1min 30s
Wall time: 11.9 s
.......... done
CPU times: user 1h 58min 31s, sys: 2h 46min 37s, total: 4h 45min 8s
Wall time: 36min 7s


In [15]:
data = pickle.load(open('data/cadwell2016.pickle', 'rb'))
X,Y = preprocess(data)

lx_scan = np.arange(.05, .20, .01)
%time corr_witten, nonz_witten = witten_cv(X, Y, lx_scan, ncomps=2, reps=1)

pickle.dump([corr_witten, nonz_witten], 
           open('pickles/cvresults-cadwell2016-witten-new.pickle', 'wb'))

.12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12


  c /= stddev[:, None]
  c /= stddev[None, :]


12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
 done
CPU times: user 28.3 s, sys: 540 ms, total: 28.8 s
Wall time: 7.14 s


In [5]:
data = pickle.load(open('data/cadwell2016.pickle', 'rb'))
X,Y = preprocess(data)

lx_scan = np.arange(20, 75, 5)
%time corr_witten, nonz_witten = spls_scca.witten_cv(X, Y, lx_scan, ncomps=2, reps=10)

lx_scan = np.concatenate((np.arange(.1,1,.1), np.arange(1, 5, .5)))
%time corr_suo, nonz_suo = spls_scca.suo_cv(X, Y, lx_scan, reps=10)

pickle.dump([corr_witten, nonz_witten, corr_suo, nonz_suo], 
            open('pickles/cvresults-cadwell2016-wittensuo.pickle', 'wb'))

.......... done
CPU times: user 19.6 s, sys: 28.5 s, total: 48.1 s
Wall time: 6.14 s
.......... done
CPU times: user 2h 8min 35s, sys: 2h 58min 2s, total: 5h 6min 37s
Wall time: 38min 45s


In [16]:
data = pickle.load(open('data/gouwens2020.pickle', 'rb'))
# Already preprocessed
X=data['counts'].toarray().astype('float64')
Y=data['ephys']

lx_scan = np.arange(.05, .20, .01)
%time corr_witten, nonz_witten = witten_cv(X, Y, lx_scan, ncomps=2, reps=1)

pickle.dump([corr_witten, nonz_witten], 
           open('pickles/cvresults-gouwens2020-witten-new.pickle', 'wb'))

.123456789
123456789101112131415
1234567891011
1234567891011121314
1234567891011
123456789101112131415
12345678910
123456789101112131415
12345678910
123456789101112131415
123456789
123456789101112131415
123456789
1234567891011121314
123456789
1234567891011121314
123456789
1234567891011121314
123456789
1234567891011121314
123456789
123456789101112131415
123456789
123456789101112131415
123456789
123456789101112131415
123456789
123456789101112131415
123456789
123456789101112131415
123456789
123456789101112131415
12345678910
123456789101112131415
123456789101112
1234567891011121314
1234567891011
123456789101112131415
1234567891011
123456789101112131415
12345678910
1234567891011121314
12345678910
1234567891011121314
123456789
1234567891011121314
123456789
1234567891011121314
123456789
1234567891011121314
123456789
1234567891011121314
123456789
1234567891011121314
123456789
123456789101112131415
123456789
123456789101112131415
123456789
123456789101112131415
123456789
123456789101112131415
1

In [70]:
data = pickle.load(open('data/gouwens2020.pickle', 'rb'))
# Already preprocessed
X=data['counts'].toarray().astype('float64')
Y=data['ephys']

lx_scan = np.arange(5000, 100000, 1000)
%time corr_witten, nonz_witten = spls_scca.witten_cv(X, Y, lx_scan, ncomps=2, reps=1)

lx_scan = np.arange(1,15)
%time corr_suo, nonz_suo = spls_scca.suo_cv(X, Y, lx_scan, reps=1)

pickle.dump([corr_witten, nonz_witten, corr_suo, nonz_suo], 
            open('pickles/cvresults-gouwens2020-wittensuo.pickle', 'wb'))

. done
CPU times: user 2min 42s, sys: 5min 46s, total: 8min 28s
Wall time: 12.8 s
. done
CPU times: user 5h 4min 38s, sys: 12h 33min 3s, total: 17h 37min 42s
Wall time: 26min 36s


# Sanity check

In [68]:
corr_witten, nonz_witten, corr_suo, nonz_suo = pickle.load(open('pickles/cvresults-gouwens2020-wittensuo.pickle', 'rb'))
data = pickle.load(open('data/gouwens2020.pickle', 'rb'))
# Already preprocessed
X=data['counts'].toarray().astype('float64')
Y=data['ephys']
lx_scan = np.arange(10000, 14000, 1000)
%time corr_witten, nonz_witten = spls_scca.witten_cv_sanity_check(X, Y, lx_scan, genes=data['genes'],\
                                        ncomps=2, reps=1, keep_comp1_param_cte=False)

.1st component: 
corr coeff:  0.8559262858333232
# genes selected:  49
reg param=10000 , genes selected:  ['Adarb2' 'Prox1' 'Nxph1' 'Igf1' 'Ptprm' 'Synpr' 'Cacna2d1' 'Cacna2d3'
 'Fam135b' 'Tmem44' 'Tmem132c' 'Sorcs3' 'Cpne7' 'Kctd8' 'Kit' 'Kcns3'
 'Adcy2' 'Sox5' 'Itpr1' 'Galnt14' 'Penk' 'Pde4b' 'Pvalb' 'Coro6' 'Gabrd'
 'Crtac1' 'Fxyd6' 'Sorcs1' 'Iqgap2' 'Col19a1' 'Nek7' 'Rapgef5' 'Rgs8'
 'Eya4' 'Cntnap4' 'Vip' 'Kcnt2' 'Slit2' 'Lingo2' 'Man1c1' '1700019L22Rik'
 'Tmem132d' 'Egfem1' 'Plch2' 'Myo1e' 'Esrrg' 'Runx2' 'Mitf' 'Epb4.1']
1st component: 
corr coeff:  0.8380994433948786
# genes selected:  31
reg param=11000 , genes selected:  ['Adarb2' 'Prox1' 'Nxph1' 'Igf1' 'Ptprm' 'Synpr' 'Cacna2d3' 'Fam135b'
 'Tmem44' 'Tmem132c' 'Sorcs3' 'Cpne7' 'Kit' 'Adcy2' 'Galnt14' 'Pvalb'
 'Coro6' 'Gabrd' 'Fxyd6' 'Sorcs1' 'Iqgap2' 'Nek7' 'Rapgef5' 'Eya4' 'Vip'
 'Kcnt2' 'Slit2' 'Lingo2' 'Tmem132d' 'Egfem1' 'Runx2']
1st component: 
corr coeff:  0.8124905970369488
# genes selected:  20
reg param=12000 , genes

In [70]:
# If you'd like to do witten again only

# corr_witten, nonz_witten, corr_suo, nonz_suo = pickle.load(open('pickles/cvresults-scala2020-wittensuo.pickle', 'rb'))
# data = pickle.load(open('data/scala2020.pickle', 'rb'))
# X,Y = preprocess(data)
# lx_scan = np.arange(500, 2000, 100)
# %time corr_witten, nonz_witten = spls_scca.witten_cv(X, Y, lx_scan, ncomps=2, reps=1)
# pickle.dump([corr_witten, nonz_witten, corr_suo, nonz_suo], 
#             open('pickles/cvresults-scala2020-wittensuo.pickle', 'wb'))