## Load Tasic data

In [1]:
import numpy as np
import pickle
import scanpy as sc
import anndata
import openTSNE
from openTSNE.initialization import rescale as rescale_pca
from sklearn.decomposition import PCA
sc.settings.verbosity = 2
from tqdm import tqdm
from readcount_tools import pearson_residuals_compound, get_tag

#### Quick load Tasic data

In [2]:
# Directory prefix for the Tasic dataset; also used as prefix for the files written at the end of the notebook.
basepath = 'data/tasic/'
# Load the dataset from its h5ad file into an AnnData object.
adata = anndata.read_h5ad(f'{basepath}adata.h5ad')

  utils.warn_names_duplicates("var")


In [3]:
# Shape before filtering, printed so the effect of the filter is visible.
print(adata.shape)
# Drop genes detected in fewer than 5 cells; this modifies adata in place.
sc.pp.filter_genes(adata,min_cells=5)
# Shape after filtering.
print(adata.shape)

(23822, 42776)


tcmalloc: large alloc 1729986560 bytes == 0xb081e000 @ 


filtered out 4266 genes that are detected in less than 5 cells
(23822, 38510)


  utils.warn_names_duplicates("var")


### Compound Pearson residuals

In [4]:
def compute_residuals(adata,alpha,theta,clipping=True,tag_suffix=''):
    """Compute compound Pearson residuals of ``adata.X`` and store them on ``adata``.

    The residuals are written to ``adata.layers[tag]`` and their variance along
    axis 0 (one value per column of ``X``) to ``adata.var[tag + '_var']``, where
    ``tag = get_tag(alpha, theta, clipping) + tag_suffix``. The tag is printed.

    Parameters
    ----------
    adata : AnnData
        Object whose ``X`` holds the counts, either as a scipy sparse matrix
        or as a dense array. Modified in place.
    alpha, theta, clipping
        Passed unchanged to ``get_tag`` and ``pearson_residuals_compound``.
    tag_suffix : str
        Appended to the tag, e.g. to keep residuals recomputed on a gene
        subset apart from the ones computed on all genes.

    Returns
    -------
    None
    """
    infostr = get_tag(alpha,theta,clipping) + tag_suffix
    print(infostr)
    # Densify sparse count matrices; dense input (no ``toarray``) is used as is.
    X = adata.X
    counts = X.toarray() if hasattr(X,'toarray') else np.asarray(X)
    adata.layers[infostr] = pearson_residuals_compound(counts=counts,theta=theta,alpha=alpha,clipping=clipping)
    adata.var[infostr+'_var'] = np.var(adata.layers[infostr],axis=0)

In [5]:
# alpha values to evaluate: four log-spaced values from 1 to 1000, plus 50.
alphas = np.append(np.logspace(0,3,num=4),50)
# compute_residuals works purely by side effect (it writes to adata), so use a
# plain loop instead of a list comprehension that builds and displays [None, ...].
for alpha in tqdm(alphas):
    compute_residuals(adata,alpha=alpha,theta=100)

  0%|          | 0/5 [00:00<?, ?it/s]tcmalloc: large alloc 3669549056 bytes == 0x13e7be000 @ 


pr_theta100_alpha1.0


tcmalloc: large alloc 3669549056 bytes == 0x22aa72000 @ 
tcmalloc: large alloc 3669549056 bytes == 0x305600000 @ 
tcmalloc: large alloc 3669549056 bytes == 0x3e098e000 @ 
tcmalloc: large alloc 3669549056 bytes == 0x4bb51c000 @ 
tcmalloc: large alloc 3669549056 bytes == 0x4bb51c000 @ 
tcmalloc: large alloc 3669549056 bytes == 0x13e7be000 @ 
 20%|██        | 1/5 [00:20<01:23, 20.93s/it]tcmalloc: large alloc 3669549056 bytes == 0x13e7be000 @ 


pr_theta100_alpha10.0


tcmalloc: large alloc 3669549056 bytes == 0x22aa72000 @ 
tcmalloc: large alloc 3669549056 bytes == 0x3e098e000 @ 
 40%|████      | 2/5 [00:37<00:54, 18.15s/it]

pr_theta100_alpha100.0


 60%|██████    | 3/5 [00:54<00:35, 17.77s/it]

pr_theta100_alpha1000.0


 80%|████████  | 4/5 [01:11<00:17, 17.46s/it]

pr_theta100_alpha50.0


100%|██████████| 5/5 [01:28<00:00, 17.61s/it]


[None, None, None, None, None]

In [6]:
# Gene subsetting happened above, so the per-gene means of X are recomputed here.
adata.var['gene_mean'] = np.asarray(adata.X.mean(axis=0)).ravel()

In [7]:
def select_hvgs(adata,alpha,theta,n_hvgs=3000,clipping=True):
    """Flag the genes with the largest residual variance as highly variable.

    Reads ``adata.var[tag + '_var']`` with
    ``tag = get_tag(alpha=alpha, theta=theta, clipping=clipping)`` and writes a
    boolean column ``adata.var['top<n_hvgs>_<tag>']`` that is True for every
    gene whose variance is at least the ``n_hvgs``-th largest value. Genes that
    tie with that threshold are all kept, so more than ``n_hvgs`` genes can be
    flagged when ties occur.

    Parameters
    ----------
    adata : AnnData
        Must already hold the ``<tag>_var`` column. Modified in place.
    alpha, theta, clipping
        Passed unchanged to ``get_tag`` to build the column names.
    n_hvgs : int
        Number of genes to select; must be between 1 and the number of genes.

    Raises
    ------
    ValueError
        If ``n_hvgs`` is out of range. Without this check ``n_hvgs=0`` would
        select every gene (index ``-0`` is index ``0``, the smallest variance)
        and a too-large ``n_hvgs`` would fail with a bare IndexError.
    """
    tag = get_tag(alpha=alpha,theta=theta,clipping=clipping)
    resvar = adata.var[tag+'_var']

    n_genes = len(resvar)
    if not 1 <= n_hvgs <= n_genes:
        raise ValueError('n_hvgs must be between 1 and the number of genes (%d), got %r'%(n_genes,n_hvgs))

    # Variance of the n_hvgs-th most variable gene acts as the inclusive cut-off.
    threshold = np.sort(resvar)[-n_hvgs]
    adata.var['top%u_%s'%(n_hvgs,tag)] = resvar >= threshold

In [8]:
# select_hvgs only writes columns to adata.var, so loop for the side effect
# instead of building (and displaying) a list of None.
for alpha in tqdm(alphas):
    select_hvgs(adata,alpha=alpha,theta=100,n_hvgs=3000,clipping=True)

100%|██████████| 5/5 [00:00<00:00, 308.73it/s]


[None, None, None, None, None]

### Run PCA for all alphas and keep the first 1000 PCs

In [9]:
# Display the layer keys: one residual layer is expected per alpha computed above.
adata.layers

Layers with keys: pr_theta100_alpha1.0, pr_theta100_alpha10.0, pr_theta100_alpha100.0, pr_theta100_alpha1000.0, pr_theta100_alpha50.0

In [10]:
# One HVG-subset AnnData per alpha, collected in the order of `alphas`.
ads_hvg = []
theta=100
ncomp=1000
# Number of HVGs used in the 'top<n_hvgs>_<tag>' column names written by select_hvgs.
n_hvgs=3000

for alpha in tqdm(alphas):
    
    # Build the tag from the same `theta` that is passed to compute_residuals below,
    # so the HVG column, the residual layer and `full_tag` always refer to one setting.
    tag = get_tag(alpha=alpha, theta=theta,clipping=True)
    
    #subset to HVGs
    ad = adata[:,adata.var[f'top{n_hvgs}_{tag}']].copy()
    ad.uns['alpha'] = alpha
    
    #recompute residuals on the HVG subset; stored under a suffixed tag so the
    #residuals computed on all genes (layer `tag`) stay available
    tag_suffix='_afterHVG'
    full_tag = tag + tag_suffix
    print(full_tag)
    compute_residuals(ad,theta=theta,alpha=alpha,tag_suffix=tag_suffix)
    
    #compute full PCA, rescale it, and keep the first `ncomp` components separately
    pca = PCA(random_state=42)
    ad.obsm['pca'] = rescale_pca(pca.fit_transform(ad.layers[full_tag]))
    ad.obsm[f'pca{ncomp}'] = ad.obsm['pca'][:,:ncomp]
    
    #if alpha = 50, also compute PCA without recomputing residuals (control experiment)
    if alpha==50:
        print('running control exp on', tag)
        pca = PCA(random_state=42)
        ad.obsm['pca_control'] = rescale_pca(pca.fit_transform(ad.layers[tag]))
        ad.obsm[f'pca{ncomp}_control'] = ad.obsm['pca_control'][:,:ncomp]
    
    ads_hvg.append(ad)

  utils.warn_names_duplicates("var")


pr_theta100_alpha1.0_afterHVG
pr_theta100_alpha1.0_afterHVG


 20%|██        | 1/5 [00:20<01:23, 20.90s/it]

pr_theta100_alpha10.0_afterHVG
pr_theta100_alpha10.0_afterHVG


 40%|████      | 2/5 [00:41<01:02, 20.71s/it]

pr_theta100_alpha100.0_afterHVG
pr_theta100_alpha100.0_afterHVG


  utils.warn_names_duplicates("var")


pr_theta100_alpha1000.0_afterHVG
pr_theta100_alpha1000.0_afterHVG


 80%|████████  | 4/5 [01:24<00:21, 21.26s/it]

pr_theta100_alpha50.0_afterHVG
pr_theta100_alpha50.0_afterHVG
running control exp on pr_theta100_alpha50.0


100%|██████████| 5/5 [01:57<00:00, 23.51s/it]


### Set the first two PCs as shared init for the supplementary figure

In [11]:
#define joint PCA init for suppl figure
init_alpha = 10
for ad in ads_hvg:
    if ad.uns['alpha']==init_alpha:
        #use first 2 PCs to initialize the init
        pca_init = ad.obsm['pca'][:,:2]
        adata.obsm['pca_shared_init'] = pca_init

### compute tSNEs
Supplementary-figure tSNEs (alpha = 1, 10, 100, 1000) use the shared PCA init; the main-figure tSNE (alpha = 50) uses the default init (its own first two PCs).

In [12]:
ncomp = 1000
for ad in tqdm(ads_hvg):
    
    if ad.uns['alpha'] in [1,10,100,1000]:
        #use shared PCA init
        init=adata.obsm['pca_shared_init']
    elif ad.uns['alpha']==50:
        #use default PCA init (first 2 PCs)
        init=ad.obsm['pca'][:,:2]
    else:
        #fail loudly instead of silently reusing `init` from the previous iteration
        raise ValueError(f"no tSNE initialization defined for alpha={ad.uns['alpha']}")
            
    #tSNE on the first `ncomp` PCs of the residuals recomputed after HVG selection
    pca_data_after_HVG = ad.obsm[f'pca{ncomp}']
    tsne = openTSNE.TSNE(random_state=42,verbose=True,n_jobs=38)
    ad.obsm['tsne'] = np.array(tsne.fit(X=pca_data_after_HVG,initialization=init))
    
    if ad.uns['alpha'] == 50:
        #compute control tSNE where residuals were not re-computed after HVG selection / before PCA
        pca_data_after_HVG = ad.obsm[f'pca{ncomp}_control']
        tsne = openTSNE.TSNE(random_state=42,verbose=True,n_jobs=38)
        ad.obsm['tsne_hvg_control'] = np.array(tsne.fit(X=pca_data_after_HVG,initialization=init))

  0%|          | 0/5 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
TSNE(n_jobs=38, random_state=42, verbose=True)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 7.34 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.28 seconds
===> Running optimization with exaggeration=12.00, lr=1985.17 for 250 iterations...
Iteration   50, KL divergence 5.9971, 50 iterations in 1.6028 sec
Iteration  100, KL divergence 4.6969, 50 iterations in 1.6963 sec
Iteration  150, KL divergence 4.4263, 50 iterations in 2.3782 sec
Iteration  200, KL divergence 4.2650, 50 iterations in 2.4978 sec
Iteration  250, KL divergence 4.1859, 50 iterations in 2.4451 sec
   --> Time elapsed: 10.62 seconds
===> Running optimization with exaggeration=1.00, lr=1985.17 for 500 iterations...
Iteration   50, KL divergence 3.0453, 50 iterations in 2.1287 

 20%|██        | 1/5 [01:19<05:19, 79.94s/it]

Iteration  500, KL divergence 2.1353, 50 iterations in 8.3937 sec
   --> Time elapsed: 49.44 seconds
--------------------------------------------------------------------------------
TSNE(n_jobs=38, random_state=42, verbose=True)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 7.53 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.34 seconds
===> Running optimization with exaggeration=12.00, lr=1985.17 for 250 iterations...
Iteration   50, KL divergence 5.5512, 50 iterations in 2.1618 sec
Iteration  100, KL divergence 4.4510, 50 iterations in 2.0197 sec
Iteration  150, KL divergence 4.1906, 50 iterations in 1.7325 sec
Iteration  200, KL divergence 4.0650, 50 iterations in 1.7479 sec
Iteration  250, KL divergence 3.9927, 50 iterations in 1.7246 sec
   --> Time elapsed: 9.39 seconds
===> Running optimization with exaggeration=1

 40%|████      | 2/5 [02:32<03:47, 75.76s/it]

--------------------------------------------------------------------------------
TSNE(n_jobs=38, random_state=42, verbose=True)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 8.61 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.32 seconds
===> Running optimization with exaggeration=12.00, lr=1985.17 for 250 iterations...
Iteration   50, KL divergence 5.5577, 50 iterations in 1.8682 sec
Iteration  100, KL divergence 4.4898, 50 iterations in 1.6879 sec
Iteration  150, KL divergence 4.2380, 50 iterations in 1.5092 sec
Iteration  200, KL divergence 4.1201, 50 iterations in 1.6008 sec
Iteration  250, KL divergence 4.0503, 50 iterations in 1.5458 sec
   --> Time elapsed: 8.21 seconds
===> Running optimization with exaggeration=1.00, lr=1985.17 for 500 iterations...
Iteration   50, KL divergence 3.0064, 50 iterations in 1.4554 s

 60%|██████    | 3/5 [03:43<02:26, 73.45s/it]

--------------------------------------------------------------------------------
TSNE(n_jobs=38, random_state=42, verbose=True)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 7.12 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.27 seconds
===> Running optimization with exaggeration=12.00, lr=1985.17 for 250 iterations...
Iteration   50, KL divergence 5.7264, 50 iterations in 1.4526 sec
Iteration  100, KL divergence 4.5457, 50 iterations in 1.4477 sec
Iteration  150, KL divergence 4.2805, 50 iterations in 1.4972 sec
Iteration  200, KL divergence 4.1568, 50 iterations in 1.5177 sec
Iteration  250, KL divergence 4.0849, 50 iterations in 1.4808 sec
   --> Time elapsed: 7.40 seconds
===> Running optimization with exaggeration=1.00, lr=1985.17 for 500 iterations...
Iteration   50, KL divergence 3.0247, 50 iterations in 1.4466 s

 80%|████████  | 4/5 [04:54<01:12, 72.62s/it]

Iteration  500, KL divergence 1.8411, 50 iterations in 11.0282 sec
   --> Time elapsed: 56.34 seconds
--------------------------------------------------------------------------------
TSNE(n_jobs=38, random_state=42, verbose=True)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 7.18 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.26 seconds
===> Running optimization with exaggeration=12.00, lr=1985.17 for 250 iterations...
Iteration   50, KL divergence 5.9875, 50 iterations in 1.5226 sec
Iteration  100, KL divergence 4.5360, 50 iterations in 1.6052 sec
Iteration  150, KL divergence 4.2422, 50 iterations in 1.7217 sec
Iteration  200, KL divergence 4.1145, 50 iterations in 1.8412 sec
Iteration  250, KL divergence 4.0366, 50 iterations in 1.7543 sec
   --> Time elapsed: 8.45 seconds
===> Running optimization with exaggeration=

100%|██████████| 5/5 [07:02<00:00, 84.55s/it]

Iteration  500, KL divergence 1.8087, 50 iterations in 9.7372 sec
   --> Time elapsed: 47.63 seconds





### Run scanpy baseline preprocessing for comparison

In [13]:
# Number of highly variable genes to select with the Seurat flavor.
n_hvgs=3000
# Work on a copy: normalize_total and log1p below are applied to adata_seurat, not to adata.
adata_seurat = adata.copy()
sc.pp.normalize_total(adata_seurat)
sc.pp.log1p(adata_seurat)
# inplace=False: the result is returned (and indexed by column below) instead of being stored on adata_seurat.
hvg_seurat = sc.pp.highly_variable_genes(adata_seurat,flavor='seurat',n_top_genes=n_hvgs,inplace=False)

# Store the selection as a column on the original adata and subset adata (not adata_seurat) to the selected genes.
adata.var[f'top{n_hvgs}_seurat'] = np.array(hvg_seurat['highly_variable'])
ad_hvg_seurat = adata[:,adata.var[f'top{n_hvgs}_seurat']].copy()
# Metadata describing how the genes were selected.
ad_hvg_seurat.uns['hvg'] = 'Seurat'
ad_hvg_seurat.uns['hvg_plotlabel'] = 'Seurat'
# NOTE(review): hvg_seurat was computed on all genes of adata_seurat, so this column
# presumably covers all genes, not only the selected ones - confirm this is intended.
ad_hvg_seurat.uns['hvg_criterion'] = hvg_seurat['dispersions_norm']

def logmedian_PCA(ad,ncomp):
    """Normalize, log-transform and PCA-embed ``ad``, storing the results on ``ad``.

    Steps:

    1. ``sc.pp.normalize_total`` with default settings is applied to ``ad``
       without modifying ``X``; the result goes to ``ad.layers['logmedian']``.
    2. ``sc.pp.log1p`` is applied to that layer in place.
    3. A full PCA (``random_state=42``) of the layer is rescaled with
       ``rescale_pca`` and stored in ``ad.obsm['pca']``; its first ``ncomp``
       columns are stored in ``ad.obsm['pca<ncomp>']``.

    Parameters
    ----------
    ad : AnnData
        Modified in place.
    ncomp : int
        Number of leading principal components to keep in ``obsm['pca<ncomp>']``.

    Returns
    -------
    None
    """
    ad.layers['logmedian'] =  sc.pp.normalize_total(ad,inplace=False)['X']
    sc.pp.log1p(ad,layer='logmedian')
    
    # Densify with .toarray(): the `.A` shorthand is deprecated on scipy sparse
    # matrices and missing in recent SciPy. Dense layers are used as is.
    logmedian = ad.layers['logmedian']
    logmedian_dense = logmedian.toarray() if hasattr(logmedian,'toarray') else np.asarray(logmedian)
    
    pca = PCA(random_state=42)
    ad.obsm['pca'] = rescale_pca(pca.fit_transform(logmedian_dense))
    ad.obsm[f'pca{ncomp}'] = ad.obsm['pca'][:,:ncomp]

# `ncomp` is not defined in this cell; it is the global set in an earlier cell.
logmedian_PCA(ad_hvg_seurat,ncomp=ncomp)

# tSNE on the first `ncomp` PCs, initialized with the first two PCs of the same PCA.
pca_data_after_HVG = ad_hvg_seurat.obsm[f'pca{ncomp}']
tsne = openTSNE.TSNE(random_state=42,verbose=True,n_jobs=38)
# Note: this rebinds the global name `pca_init`.
pca_init = ad_hvg_seurat.obsm['pca'][:,:2]
ad_hvg_seurat.obsm['tsne'] = np.array(tsne.fit(X=pca_data_after_HVG,initialization=pca_init))

  utils.warn_names_duplicates("var")


normalizing counts per cell
    finished (0:00:01)
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:04)
normalizing counts per cell
    finished (0:00:00)
--------------------------------------------------------------------------------
TSNE(n_jobs=38, random_state=42, verbose=True)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 7.03 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.26 seconds
===> Running optimization with exaggeration=12.00, lr=1985.17 for 250 iterations...
Iteration   50, KL divergence 5.4656, 50 iterations in 1.4089 sec
Iteration  100, KL divergence 4.4254, 50 iterations in 1.6354 sec
Iteration  150, KL divergence 4.1669, 50 iterations in 1.5159 sec
Iteration  200, KL divergence 4.0382, 50 iterations in 1.7217 sec
Iteration  250, KL divergence 3.9628

### Save results

In [14]:
# Persist the list of per-alpha HVG AnnData objects as one pickle file ...
ads_hvg_file = f'{basepath}ads_hvg.pickle'
with open(ads_hvg_file,'wb') as handle:
    pickle.dump(ads_hvg,handle)

# ... and the full dataset, with everything added to it above, as h5ad.
adata.write_h5ad(f'{basepath}adata_residuals.h5ad')

In [15]:
# Store the HVG criterion as a plain array before writing
# (presumably because a pandas Series in .uns cannot be written to h5ad - confirm).
# np.asarray leaves an existing array untouched, so this cell can be re-run;
# `.values` would raise AttributeError on the second run.
ad_hvg_seurat.uns['hvg_criterion'] = np.asarray(ad_hvg_seurat.uns['hvg_criterion'])
ad_hvg_seurat.write(f'{basepath}ad_hvg_seurat.h5ad')