In [1]:
import os
import pandas as pd
import numpy as np
import scanpy as sc
import pyranges as pr
import warnings
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import coo_matrix
import SEACells
import anndata
import h5py
from scipy.io import mmread
import tqdm

findfont: Font family ['Raleway'] not found. Falling back to DejaVu Sans.
findfont: Font family ['Lato'] not found. Falling back to DejaVu Sans.


In [3]:
# Load the precomputed per-cell inputs (embedding, metadata, UMAP, count triplets,
# barcodes and peak names) from the HDF5 file.
h5_path = "../../../data/mHSCAging10xMultiome/SEACells.h5"

# The context manager closes the file even if one of the reads below raises,
# so a failed cell no longer leaves an open handle behind in the kernel.
with h5py.File(h5_path, 'r') as hf:
    # Each dataset is transposed on load.
    # NOTE(review): presumably because the file was written from R — confirm.
    # The resulting arrays are used by later cells after the file is closed.
    cell_LSI = np.transpose(hf.get('cellEmbedding'))
    cell_meta = np.transpose(hf.get('cellMetadata'))
    cell_UMAP = np.transpose(hf.get('UMAP'))
    counts_ijv = np.transpose(hf.get('counts'))

    # Barcodes and peak names are stored as bytes; decode them to str.
    barcodes = [bc.decode('ascii') for bc in hf.get('barcodes')]
    peak_ranges = pd.Series([pk.decode('ascii') for pk in hf.get('peaks')])

In [4]:
# Assemble the sparse peak-by-cell count matrix from (row, col, value) triplets.
# The first two columns of counts_ijv hold 1-based indices (R convention), so they
# are cast to int and shifted down by one for Python's 0-based indexing.
zero_based_ij = np.array(counts_ijv[:, :2], dtype = int) - 1
count_row = zero_based_ij[:, 0]
count_col = zero_based_ij[:, 1]
count_data = counts_ijv[:, 2]

# Rows are peaks and columns are cells (barcodes).
matrix_shape = (len(peak_ranges), len(barcodes))
counts = coo_matrix((count_data, (count_row, count_col)), shape = matrix_shape)

In [5]:
# Wrap the counts in an AnnData object. `counts` is peaks x cells, so it is
# transposed to put cells on the observation axis and peaks on the variable axis.
ad = sc.AnnData(counts.T)

# Store the matrix in CSR format.
ad.X = ad.X.tocsr()

# Label observations with cell barcodes and variables with peak names.
ad.var_names = peak_ranges.values
ad.obs_names = barcodes

# Attach the embedding used downstream, keeping columns 1..19 (column 0 is dropped).
# NOTE(review): presumably the first LSI component is excluded on purpose — confirm.
ad.obsm['LSI'] = cell_LSI[:, 1:20]

  ad = sc.AnnData(counts.T)


In [6]:
# Leiden and UMAP
# Build the neighbor graph on the LSI embedding, then compute the UMAP layout and
# Leiden clusters from it.
#
# Warnings are silenced only inside this block. catch_warnings() restores the
# previous filter configuration on exit, whereas a trailing
# filterwarnings('default') would install a blanket 'default' filter in front of
# whatever filters were active before this cell ran.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    sc.pp.neighbors(ad, use_rep='LSI')
    sc.tl.umap(ad)
    sc.tl.leiden(ad)

2023-08-08 17:42:45.652507: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-08-08 17:42:45.652538: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [8]:
# Display the AnnData summary to check its dimensions and the annotations added so far
ad

AnnData object with n_obs × n_vars = 12906 × 298813
    obs: 'leiden'
    uns: 'neighbors', 'umap', 'leiden'
    obsm: 'LSI', 'X_umap'
    obsp: 'distances', 'connectivities'

In [9]:
## User defined parameters

## Core parameters
n_SEACells = 100 # Number of SEACells (metacells) passed to the model
build_kernel_on = 'LSI' # Key in ad.obsm to use for computing metacells
                          # NOTE(review): the earlier remark about swapping in 'X_svd' for ATAC data
                          # looks like a leftover; this notebook stores its embedding in ad.obsm['LSI']

## Additional parameters
n_waypoint_eigs = 10 # Number of eigenvalues to consider when initializing metacells
waypoint_proportion = 0.9 # Proportion of metacells to initialize using waypoint analysis;
                        # the remaining metacells are initialized by greedy selection

In [10]:
# Gather the model settings defined in the parameter cell, then build the
# SEACells model on the AnnData object.
seacells_kwargs = dict(
    build_kernel_on=build_kernel_on,
    n_SEACells=n_SEACells,
    n_waypoint_eigs=n_waypoint_eigs,
    waypt_proportion=waypoint_proportion,
    convergence_epsilon=1e-5,
)
model = SEACells.core.SEACells(ad, **seacells_kwargs)

Building kernel...
Computing kNN graph using scanpy NN ...
Computing radius for adaptive bandwidth kernel...


  self._set_arrayXarray(i, j, x)


  0%|          | 0/12906 [00:00<?, ?it/s]

Making graph symmetric...
Computing RBF kernel...




  0%|          | 0/12906 [00:00<?, ?it/s]

Building similarity LIL matrix...




  0%|          | 0/12906 [00:00<?, ?it/s]

Constructing CSR matrix...


In [11]:
# Initialize archetypes, then fit the model
model.initialize_archetypes()
# NOTE(review): the recorded output reports "Converged after 20 iterations", i.e.
# exactly n_iter — confirm this is genuine convergence rather than the iteration cap.
model.fit(n_iter=20)

Building kernel on LSI
Computing diffusion components from LSI for waypoint initialization ... 
Determing nearest neighbor graph...


  temp = sc.AnnData(data_df.values)


Done.
Sampling waypoints ...
Done.
Selecting 82 cells from waypoint initialization.
Initializing residual matrix using greedy column selection
Initializing f and g...


  0%|          | 0/28 [00:00<?, ?it/s]

Selecting 18 cells from greedy initialization.
Randomly initialized A matrix.
Setting convergence threshold at 0.00495055127102785
Starting iteration 1.
Completed iteration 1.
Starting iteration 10.
Completed iteration 10.
Starting iteration 20.
Completed iteration 20.
Converged after 20 iterations.


In [12]:
# Save results to file
# Writes one tab-separated row per cell: the obs index (cell barcode) followed by
# its SEACell label, with no header row.
# NOTE(review): assumes the 'SEACell' column was added to ad.obs by the fitting step — confirm.
seacell_assignments = ad.obs[['SEACell']]

# header=False is the documented way to omit the header row; the parameter is
# typed as a bool or a list of column names, so None is avoided here.
seacell_assignments.to_csv("../../../data/mHSCAging10xMultiome/SEACells.tsv",
                           sep = "\t", header = False)