In [1]:
import os

import anndata
import numpy as np
from openTSNE import TSNE
from sklearn.decomposition import PCA
from umap import UMAP

  from .autonotebook import tqdm as notebook_tqdm


### Load datasets

In [2]:
# Read every preprocessed dataset from disk as an AnnData object.
adata_exut = anndata.read_h5ad("../data/preprocessed/exut_adata.h5ad")
adata_merfish = anndata.read_h5ad("../data/preprocessed/merfish_adata.h5ad")
adata_smartseq = anndata.read_h5ad("../data/preprocessed/smartseq_adata.h5ad")
adata_mnist = anndata.read_h5ad("../data/preprocessed/mnist_adata.h5ad")
adata_sim = anndata.read_h5ad(
    "../data/preprocessed/exut-sim-theta-10-real-seqdepths_adata.h5ad"
)

# Processing order used by the cells below.
datasets = [
    adata_exut,
    adata_merfish,
    adata_smartseq,
    adata_mnist,
    adata_sim,
]

In [3]:
# Print each dataset's name followed by its number of distinct cluster labels.
for ad in datasets:
    n_classes = len(np.unique(ad.obs['clusterlabels']))
    print(ad.uns['dataset'], n_classes)

exut 19
merfish 25
smartseq 28
mnist 10
exut-sim-theta-10-real-seqdepths 19


In [4]:
def compute_embeddings(ad,seeds):
    """Compute PCA, UMAP and t-SNE embeddings of one dataset for several seeds.

    For every seed the high-dimensional matrix is reduced with a 2-component
    PCA and with a 50-component PCA; UMAP and t-SNE are then fitted on the
    50-component PCA output. Every embedding is stored in ``ad.obsm`` under the
    key ``x{_scaled}_{method}_seed_{seed}`` and saved as
    ``../results/embeddings/npy/{dataset}_{key}.npy``. Finally the seeds are
    recorded in ``ad.uns['seeds']`` and ``ad`` is written to
    ``../results/embeddings/{dataset}_adata_standard_embeddings.h5ad``.

    Parameters
    ----------
    ad
        AnnData-like object providing ``uns['dataset']``, ``X``, ``obsm``,
        ``write_h5ad`` and, for the 'exut', 'merfish' and 'smartseq' datasets,
        ``layers['X_scaled']``. It is modified in place.
    seeds
        Sequence of random seeds. Each one is passed as ``random_state`` to
        PCA, UMAP and TSNE, and the sequence itself is stored in
        ``ad.uns['seeds']``.

    Returns
    -------
    None
        Results are delivered through ``ad`` and the files written to disk.
    """
    dataset = ad.uns['dataset']

    # Create the output folders up front so that a missing directory is not
    # discovered only after the first (slow) embedding has been computed.
    # Creating the npy folder also creates its parent, which receives the h5ad.
    out_dir = '../results/embeddings'
    npy_dir = f'{out_dir}/npy'
    os.makedirs(npy_dir, exist_ok=True)

    #compute for scaled and unscaled data for main datasets
    if dataset in ['exut','merfish','smartseq']:
        use_scaled_modes = [False, True]
        print('running on scaled and unscaled data for', dataset)
    #only use unscaled for additional datasets
    else:
        use_scaled_modes = [False]
        print('running only on unscaled data for', dataset)

    for use_scaled in use_scaled_modes:

        if use_scaled:
            x_hd = ad.layers['X_scaled']
            scaled_str = '_scaled'
        else:
            x_hd = ad.X
            scaled_str = ''

        for seed in seeds:

            print(dataset, 'seed:', seed, 'scaled:', use_scaled)

            pca2 = PCA(random_state=seed,n_components=2)
            pca50 = PCA(random_state=seed,n_components=50)
            umap=UMAP(random_state=seed,verbose=True)
            tsne=TSNE(random_state=seed,verbose=True)

            id_str = f'seed_{seed}'

            print('PCA2')
            x_pca2 = pca2.fit_transform(x_hd)
            print('PCA50')
            x_pca50 = pca50.fit_transform(x_hd)
            # UMAP and t-SNE are both fitted on the 50-component PCA output.
            print('UMAP')
            x_umap = umap.fit_transform(x_pca50)
            print('TSNE')
            # Convert the fit result to a plain array once, so that the copy
            # kept in obsm and the copy saved to disk are the same object type.
            x_tsne = np.array(tsne.fit(x_pca50))

            embeddings = {
                'pca2': x_pca2,
                'pca50': x_pca50,
                'umap': x_umap,
                'tsne': x_tsne,
            }
            for method, x_embedded in embeddings.items():
                key = f'x{scaled_str}_{method}_{id_str}'
                ad.obsm[key] = x_embedded
                np.save(f'{npy_dir}/{dataset}_{key}.npy',x_embedded,allow_pickle=False)

    ad.uns['seeds']=seeds
    ad.write_h5ad(f'{out_dir}/{dataset}_adata_standard_embeddings.h5ad')

In [None]:
# Run the embedding pipeline on every dataset with seeds 0-4.
# A plain loop is used because compute_embeddings returns None and is called
# only for its side effects (updating each AnnData object and writing files).
for ad in datasets:
    compute_embeddings(ad=ad, seeds=np.arange(5))

### Package versions

In [5]:
# Display the installed NumPy version.
np.__version__

'1.24.3'

In [6]:
# Display the installed anndata version.
anndata.__version__

'0.10.3'

In [7]:
# Display the installed scikit-learn version.
import sklearn
sklearn.__version__

'1.3.0'

In [8]:
# Display the installed umap-learn version.
import umap
umap.__version__

'0.5.5'

In [9]:
# Display the installed openTSNE version.
import openTSNE
openTSNE.__version__

'1.0.1'