## Load Tasic data

In [1]:
import numpy as np
import pickle
import scanpy as sc
import anndata
import openTSNE
from sklearn.decomposition import PCA
from openTSNE.initialization import rescale as rescale_pca
sc.settings.verbosity = 2
from tqdm import tqdm
from readcount_tools import get_tag, compute_residuals, select_hvgs, scanpy_preproc_baseline

#### Quick load Tasic data

In [2]:
# Folder that holds the Tasic dataset files.
basepath = 'data/tasic/'

# Read the stored AnnData object from that folder.
adata_file = f'{basepath}adata.h5ad'
adata = anndata.read_h5ad(adata_file)

  utils.warn_names_duplicates("var")


In [3]:
# Report the matrix dimensions before and after gene filtering.
shape_before = adata.shape
print(shape_before)

# Keep only genes detected in at least 5 cells; adata is changed in place.
sc.pp.filter_genes(adata, min_cells=5)

shape_after = adata.shape
print(shape_after)

(23822, 42776)


tcmalloc: large alloc 1729986560 bytes == 0xb0518000 @ 


filtered out 4266 genes that are detected in less than 5 cells
(23822, 38510)


  utils.warn_names_duplicates("var")


### Compound Pearson residuals

In [4]:
# alpha values to compare: 1, 10, 100, 1000 (log-spaced), followed by 50.
alphas = np.append(np.logspace(0, 3, num=4), 50)

# compute_residuals is called only for its effect on adata (one residual layer
# per alpha), so a plain loop is used instead of a list comprehension that would
# build and display a throwaway list of None values.
for alpha in tqdm(alphas):
    compute_residuals(adata, alpha=alpha, theta=100)

  0%|          | 0/5 [00:00<?, ?it/s]tcmalloc: large alloc 3669549056 bytes == 0x13e360000 @ 


pr_theta100_alpha1.0


tcmalloc: large alloc 3669549056 bytes == 0x22a7ca000 @ 
tcmalloc: large alloc 3669549056 bytes == 0x305358000 @ 
tcmalloc: large alloc 3669549056 bytes == 0x3e06e6000 @ 
tcmalloc: large alloc 3669549056 bytes == 0x4bb274000 @ 
tcmalloc: large alloc 3669549056 bytes == 0x4bb274000 @ 
tcmalloc: large alloc 3669549056 bytes == 0x13e360000 @ 
 20%|██        | 1/5 [00:18<01:15, 18.85s/it]tcmalloc: large alloc 3669549056 bytes == 0x13e360000 @ 


pr_theta100_alpha10.0


tcmalloc: large alloc 3669549056 bytes == 0x22a7ca000 @ 
tcmalloc: large alloc 3669549056 bytes == 0x3e06e6000 @ 
 40%|████      | 2/5 [00:36<00:55, 18.36s/it]

pr_theta100_alpha100.0


 60%|██████    | 3/5 [00:54<00:36, 18.06s/it]

pr_theta100_alpha1000.0


 80%|████████  | 4/5 [01:11<00:17, 17.56s/it]

pr_theta100_alpha50.0


100%|██████████| 5/5 [01:27<00:00, 17.49s/it]


[None, None, None, None, None]

In [5]:
# Recompute the per-gene means because adata was subset after they were first computed.
per_gene_mean = np.mean(adata.X, axis=0)
# Convert to a flat 1-D array before storing it as a column of adata.var.
adata.var['gene_mean'] = np.array(per_gene_mean).flatten()

In [6]:
# Select the top 3000 highly variable genes for every alpha.
# select_hvgs is called only for its effect on adata, so a plain loop is used
# instead of a list comprehension that would build and display a list of None values.
for alpha in tqdm(alphas):
    select_hvgs(adata, alpha=alpha, theta=100, n_hvgs=3000, clipping=True)

100%|██████████| 5/5 [00:00<00:00, 135.12it/s]


[None, None, None, None, None]

### Run PCA with 1000 PCs for all alphas

In [7]:
# Display the layers currently stored on adata (sanity check of the available residual layers).
adata.layers

Layers with keys: pr_theta100_alpha1.0, pr_theta100_alpha10.0, pr_theta100_alpha100.0, pr_theta100_alpha1000.0, pr_theta100_alpha50.0

In [8]:
# For every alpha: subset adata to the HVGs selected for that alpha, recompute the
# residuals on the subset, and run PCA on the recomputed residuals.
# The resulting AnnData objects are collected in ads_hvg (one per alpha, in the order of alphas).
ads_hvg = []
theta = 100
ncomp = 1000
# suffix marking residual layers that were recomputed after HVG selection
tag_suffix = '_afterHVG'

for alpha in tqdm(alphas):

    # use the shared theta variable (not a second hard-coded 100) so the tag always
    # matches the residuals computed with compute_residuals below
    tag = get_tag(alpha=alpha, theta=theta, clipping=True)

    #subset to HVGs
    ad = adata[:, adata.var[f'top3000_{tag}']].copy()
    ad.uns['alpha'] = alpha

    #recompute residuals
    full_tag = tag + tag_suffix
    print(full_tag)
    compute_residuals(ad, theta=theta, alpha=alpha, tag_suffix=tag_suffix)

    #compute PCA (all components), keep the rescaled scores and the first ncomp of them
    pca = PCA(random_state=42)
    ad.obsm['pca'] = rescale_pca(pca.fit_transform(ad.layers[full_tag]))
    ad.obsm[f'pca{ncomp}'] = ad.obsm['pca'][:, :ncomp]

    #if alpha = 50, also compute PCA without recomputing residuals (control experiment)
    if alpha == 50:
        print('running control exp on', tag)
        pca = PCA(random_state=42)
        # the layer named `tag` holds the residuals carried over from the full dataset
        ad.obsm['pca_control'] = rescale_pca(pca.fit_transform(ad.layers[tag]))
        ad.obsm[f'pca{ncomp}_control'] = ad.obsm['pca_control'][:, :ncomp]

    ads_hvg.append(ad)

  utils.warn_names_duplicates("var")


pr_theta100_alpha1.0_afterHVG
pr_theta100_alpha1.0_afterHVG


 20%|██        | 1/5 [00:16<01:07, 16.98s/it]

pr_theta100_alpha10.0_afterHVG
pr_theta100_alpha10.0_afterHVG


 40%|████      | 2/5 [00:33<00:50, 16.88s/it]

pr_theta100_alpha100.0_afterHVG
pr_theta100_alpha100.0_afterHVG


  utils.warn_names_duplicates("var")


pr_theta100_alpha1000.0_afterHVG
pr_theta100_alpha1000.0_afterHVG


 80%|████████  | 4/5 [01:05<00:16, 16.10s/it]

pr_theta100_alpha50.0_afterHVG
pr_theta100_alpha50.0_afterHVG
running control exp on pr_theta100_alpha50.0


100%|██████████| 5/5 [01:32<00:00, 18.59s/it]


### set first two PCs as shared init for suppl. figure

In [9]:
#define joint PCA init for suppl figure
init_alpha = 10

# AnnData objects whose alpha equals init_alpha (exactly one is expected)
init_candidates = [ad for ad in ads_hvg if ad.uns['alpha'] == init_alpha]
if not init_candidates:
    # fail here with a clear message instead of leaving 'pca_shared_init' unset,
    # which would only surface later as a KeyError
    raise ValueError(f'no entry of ads_hvg has alpha == {init_alpha}; cannot define the shared PCA init')

#use first 2 PCs to initialize the init (last match wins, as with the original loop)
pca_init = init_candidates[-1].obsm['pca'][:, :2]
adata.obsm['pca_shared_init'] = pca_init

### compute tSNEs
Supplementary-figure t-SNEs (alpha = 1, 10, 100, 1000) use the shared PCA init; the main-figure t-SNE (alpha = 50) uses the default init (its own first two PCs).

In [10]:
ncomp = 1000
# number of worker threads passed to openTSNE
n_jobs = 38
# alphas whose t-SNE starts from the shared PCA init
shared_init_alphas = [1, 10, 100, 1000]
# alpha whose t-SNE starts from its own first two PCs and gets the control run
main_alpha = 50


def _fit_tsne(data, init):
    """Fit a t-SNE on `data` starting from `init` and return the embedding as an array."""
    tsne = openTSNE.TSNE(random_state=42, verbose=True, n_jobs=n_jobs)
    return np.array(tsne.fit(X=data, initialization=init))


for ad in tqdm(ads_hvg):

    ad_alpha = ad.uns['alpha']

    if ad_alpha in shared_init_alphas:
        #use shared PCA init
        init = adata.obsm['pca_shared_init']
    elif ad_alpha == main_alpha:
        #use default PCA init (first 2 PCs)
        init = ad.obsm['pca'][:, :2]
    else:
        # without this branch `init` would be undefined on the first iteration or
        # silently reused from the previous iteration
        raise ValueError(f'no t-SNE initialization defined for alpha={ad_alpha}')

    ad.obsm['tsne'] = _fit_tsne(ad.obsm[f'pca{ncomp}'], init)

    if ad_alpha == main_alpha:
        #compute control tSNE where residuals were not re-computed after HVG selection / before PCA
        #(same init as the main run; only the PCA input differs)
        ad.obsm['tsne_hvg_control'] = _fit_tsne(ad.obsm[f'pca{ncomp}_control'], init)

  0%|          | 0/5 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
TSNE(n_jobs=38, random_state=42, verbose=True)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 7.27 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.33 seconds
===> Running optimization with exaggeration=12.00, lr=1985.17 for 250 iterations...
Iteration   50, KL divergence 5.9971, 50 iterations in 1.7796 sec
Iteration  100, KL divergence 4.6969, 50 iterations in 1.6073 sec
Iteration  150, KL divergence 4.4263, 50 iterations in 1.5620 sec
Iteration  200, KL divergence 4.2650, 50 iterations in 1.6657 sec
Iteration  250, KL divergence 4.1859, 50 iterations in 1.6947 sec
   --> Time elapsed: 8.31 seconds
===> Running optimization with exaggeration=1.00, lr=1985.17 for 500 iterations...
Iteration   50, KL divergence 3.0453, 50 iterations in 1.6123 s

 20%|██        | 1/5 [01:04<04:18, 64.63s/it]

Iteration  500, KL divergence 2.1353, 50 iterations in 6.4367 sec
   --> Time elapsed: 37.38 seconds
--------------------------------------------------------------------------------
TSNE(n_jobs=38, random_state=42, verbose=True)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 8.09 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.25 seconds
===> Running optimization with exaggeration=12.00, lr=1985.17 for 250 iterations...
Iteration   50, KL divergence 5.5512, 50 iterations in 1.5163 sec
Iteration  100, KL divergence 4.4510, 50 iterations in 1.3975 sec
Iteration  150, KL divergence 4.1906, 50 iterations in 1.5069 sec
Iteration  200, KL divergence 4.0650, 50 iterations in 1.3944 sec
Iteration  250, KL divergence 3.9927, 50 iterations in 1.7286 sec
   --> Time elapsed: 7.54 seconds
===> Running optimization with exaggeration=1

 40%|████      | 2/5 [02:08<03:12, 64.19s/it]

--------------------------------------------------------------------------------
TSNE(n_jobs=38, random_state=42, verbose=True)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 7.21 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.25 seconds
===> Running optimization with exaggeration=12.00, lr=1985.17 for 250 iterations...
Iteration   50, KL divergence 5.5577, 50 iterations in 1.6917 sec
Iteration  100, KL divergence 4.4898, 50 iterations in 1.7616 sec
Iteration  150, KL divergence 4.2380, 50 iterations in 1.7966 sec
Iteration  200, KL divergence 4.1201, 50 iterations in 1.7783 sec
Iteration  250, KL divergence 4.0503, 50 iterations in 1.8125 sec
   --> Time elapsed: 8.84 seconds
===> Running optimization with exaggeration=1.00, lr=1985.17 for 500 iterations...
Iteration   50, KL divergence 3.0064, 50 iterations in 1.4606 s

 60%|██████    | 3/5 [03:12<02:08, 64.07s/it]

Iteration  500, KL divergence 1.8211, 50 iterations in 9.3200 sec
   --> Time elapsed: 47.38 seconds
--------------------------------------------------------------------------------
TSNE(n_jobs=38, random_state=42, verbose=True)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 7.27 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.27 seconds
===> Running optimization with exaggeration=12.00, lr=1985.17 for 250 iterations...
Iteration   50, KL divergence 5.7264, 50 iterations in 1.5823 sec
Iteration  100, KL divergence 4.5457, 50 iterations in 1.6690 sec
Iteration  150, KL divergence 4.2805, 50 iterations in 1.4719 sec
Iteration  200, KL divergence 4.1568, 50 iterations in 1.5615 sec
Iteration  250, KL divergence 4.0849, 50 iterations in 1.4904 sec
   --> Time elapsed: 7.78 seconds
===> Running optimization with exaggeration=1

 80%|████████  | 4/5 [04:15<01:03, 63.48s/it]

Iteration  500, KL divergence 1.8411, 50 iterations in 8.3345 sec
   --> Time elapsed: 47.05 seconds
--------------------------------------------------------------------------------
TSNE(n_jobs=38, random_state=42, verbose=True)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 7.37 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.26 seconds
===> Running optimization with exaggeration=12.00, lr=1985.17 for 250 iterations...
Iteration   50, KL divergence 5.9875, 50 iterations in 1.6773 sec
Iteration  100, KL divergence 4.5360, 50 iterations in 1.7754 sec
Iteration  150, KL divergence 4.2422, 50 iterations in 1.6580 sec
Iteration  200, KL divergence 4.1145, 50 iterations in 1.6969 sec
Iteration  250, KL divergence 4.0366, 50 iterations in 1.6444 sec
   --> Time elapsed: 8.45 seconds
===> Running optimization with exaggeration=1

100%|██████████| 5/5 [06:29<00:00, 77.83s/it]

Iteration  500, KL divergence 1.8087, 50 iterations in 10.5529 sec
   --> Time elapsed: 55.20 seconds





### Run scanpy baseline preprocessing for comparison

In [11]:
# Run the baseline scanpy pipeline from readcount_tools on the full dataset, requesting
# 3000 HVGs and 1000 components; the logged output shows normalization, HVG extraction and a t-SNE run.
# NOTE(review): presumably returns an HVG-subset AnnData holding the baseline results - confirm in readcount_tools.
ad_hvg_seurat = scanpy_preproc_baseline(adata,n_hvgs=3000,n_comps=1000)

  utils.warn_names_duplicates("var")


normalizing counts per cell
    finished (0:00:01)
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:05)
normalizing counts per cell
    finished (0:00:00)
--------------------------------------------------------------------------------
TSNE(n_jobs=38, random_state=42, verbose=True)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 7.04 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.28 seconds
===> Running optimization with exaggeration=12.00, lr=1985.17 for 250 iterations...
Iteration   50, KL divergence 5.4656, 50 iterations in 1.4626 sec
Iteration  100, KL divergence 4.4254, 50 iterations in 1.5978 sec
Iteration  150, KL divergence 4.1669, 50 iterations in 1.5271 sec
Iteration  200, KL divergence 4.0382, 50 iterations in 1.6023 sec
Iteration  250, KL divergence 3.9628

### Save results

In [12]:
# Persist the list of per-alpha AnnData objects as a single pickle file.
ads_hvg_file = f'{basepath}ads_hvg.pickle'
with open(ads_hvg_file, 'wb') as pickle_out:
    pickle.dump(ads_hvg, pickle_out)

# Write the full dataset to its own h5ad file.
residuals_file = f'{basepath}adata_residuals.h5ad'
adata.write_h5ad(residuals_file)

In [13]:
# Replace the stored hvg_criterion with its `.values` before writing
# (presumably a pandas object that cannot be written to h5ad as-is - TODO confirm).
hvg_criterion = ad_hvg_seurat.uns['hvg_criterion']
# Fall back to the object itself when it has no `.values`, so the cell can be re-run:
# a second run of the unconditional `.values` access would fail once the stored value
# no longer has that attribute (e.g. a NumPy array).
ad_hvg_seurat.uns['hvg_criterion'] = getattr(hvg_criterion, 'values', hvg_criterion)
ad_hvg_seurat.write(f'{basepath}ad_hvg_seurat.h5ad')