In [1]:
import scanpy as sc
import pandas as pd
import os 
import anndata 
from concurrent.futures import ThreadPoolExecutor, as_completed,ProcessPoolExecutor
from tqdm import tqdm
import matplotlib.pyplot as plt
sc.settings.n_jobs = 30




In [6]:
meta_data=pd.read_csv("/home/workspace/IHA_Figure_Revision/Dataset/scRNA_meta_data-2024-05-09.csv")

In [7]:
%%time
file_names= ['/home/workspace/IHA_Figure_Revision/Dataset/scRNA/BRI/h5ad/sample_h5ad/'+x+".h5ad" for x in meta_data['pbmc_sample_id'].tolist()]
adata_list = []

CPU times: user 127 μs, sys: 120 μs, total: 247 μs
Wall time: 253 μs


In [None]:
def load_and_subsample(fp):
    adata = sc.read_h5ad(fp)
    if adata.shape[0] > 2000:
        sc.pp.subsample(adata, n_obs=2000, random_state=3030)
    return adata

with ProcessPoolExecutor(max_workers=30) as executor:
    futures = [executor.submit(load_and_subsample,file) for file in file_names]
    
    for future in tqdm(as_completed(futures), total=len(futures)):
        adata = future.result()
        if adata is not None:
            adata_list.append(adata)

  8%|▊         | 72/868 [00:03<00:31, 24.93it/s]

In [None]:
%%time
def chunked_concat(lst, chunk_size=10):
    chunks = [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
    merged = [anndata.concat(chunk, axis=0, merge="same", index_unique=None) for chunk in chunks]
    return anndata.concat(merged, axis=0, merge="same", index_unique=None)

sampled_adata = chunked_concat(adata_list, chunk_size=10)

In [None]:
sampled_adata.write_h5ad("adata.h5ad")

In [2]:
sampled_adata=sc.read_h5ad("adata.h5ad")

In [3]:
pattern = 'B cell|monocyte|pDC|cDC|Plasm'
sampled_adata_subset = sampled_adata[~sampled_adata.obs['AIFI_L3'].str.contains(pattern, na=False)].copy()

In [4]:
del sampled_adata

In [5]:
%%time
sampled_adata_subset.raw=sampled_adata_subset
sc.pp.normalize_total(sampled_adata_subset, target_sum=1e4)
sc.pp.log1p(sampled_adata_subset)

sc.pp.highly_variable_genes(sampled_adata_subset)
sampled_adata_subset = sampled_adata_subset[:, sampled_adata_subset.var_names[sampled_adata_subset.var['highly_variable']]]

sc.pp.scale(sampled_adata_subset)
sc.tl.pca(sampled_adata_subset, svd_solver='arpack')
sc.pp.neighbors(sampled_adata_subset, n_neighbors=50,use_rep='X_pca', n_pcs=20)
sc.tl.umap(sampled_adata_subset,min_dist=0.45,random_state=0)

  view_to_actual(adata)
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out


CPU times: user 5h 10min 32s, sys: 13min 27s, total: 5h 24min
Wall time: 37min 9s


IOStream.flush timed out


In [None]:
sampled_adata_subset.write_h5ad("adata_processed.h5ad")