In [1]:
import pandas as pd 
import numpy as np 
import scanpy as sc
import matplotlib.pyplot as plt
import concurrent.futures
import pickle
import warnings
from datetime import date
import hisepy
import os
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed,ProcessPoolExecutor
from tqdm import tqdm
import anndata
import gc
# Notebook-wide configuration.
# Suppress every warning category for the rest of the session.
warnings.simplefilter("ignore")
# Number of parallel jobs scanpy is allowed to use.
sc.settings.n_jobs = 60
# Record where the notebook runs; the final output file is written relative to this directory.
print(f"Current working directory: {os.getcwd()}")

Current working directory: /home/jupyter/BRI_Figures_Final_V1/Figure4/06_UMAP


# Read Metadata

In [2]:
# Load the scRNA metadata table; the columns used below are
# 'sample.visitName' and 'pbmc_sample_id'.
meta_data = pd.read_csv(
    "/home/jupyter/BRI_Figures_Final_V1//Dataset/scRNA_meta_data-2024-05-09.csv"
)

# Assemble Data

In [3]:
# Keep only the rows whose visit label is 'Flu Year 1 Day 0'.
visit_labels = ['Flu Year 1 Day 0']
meta_data_subset = meta_data[meta_data['sample.visitName'].isin(visit_labels)]

In [4]:
%%time
# One .h5ad path per selected sample, in the same order as meta_data_subset.
file_names = [
    '/home/jupyter/BRI_Figures_Final_V1/Dataset/scRNA/BRI/h5ad/sample_h5ad/' + x + ".h5ad"
    for x in meta_data_subset['pbmc_sample_id'].tolist()
]

# Read the files concurrently on a thread pool.
# executor.map yields results in submission order, so adata_list[i] always
# corresponds to file_names[i]. Collecting via as_completed() instead would
# order the list by whichever read happened to finish first, making the row
# order of the concatenated object (and anything downstream that depends on
# row order) vary from run to run.
# Any exception raised by a read is re-raised here while the results are consumed.
with ThreadPoolExecutor(max_workers=30) as executor:
    adata_list = list(
        tqdm(executor.map(sc.read_h5ad, file_names), total=len(file_names))
    )

100% 92/92 [00:19<00:00,  4.65it/s]

CPU times: user 11.8 s, sys: 8.89 s, total: 20.7 s
Wall time: 19.8 s





In [5]:
# Stack the per-sample objects into a single AnnData (anndata.concat defaults).
adata = anndata.concat(adata_list)

# The per-sample list is no longer needed; drop the reference and ask the
# garbage collector to reclaim it now.
del adata_list
gc.collect()

0

In [6]:
# Snapshot the current object into .raw before X is transformed below.
adata.raw = adata

# Normalize per-cell totals to target_sum=1e4, then apply log1p.
sc.pp.normalize_total(
    adata,
    target_sum=1e4,
)
sc.pp.log1p(adata)

In [7]:
# Collect gene names by prefix: IGL, IGK and IGH (presumably the
# immunoglobulin light/heavy-chain genes — confirm the prefixes match the
# annotation in use). Note "IGH" matches every IGH-prefixed name.
igl_genes, igk_genes, ighc_genes = (
    [gene for gene in adata.var_names if gene.startswith(prefix)]
    for prefix in ("IGL", "IGK", "IGH")
)
exl_genes = [*igl_genes, *igk_genes, *ighc_genes]

# Boolean mask over variables: True for every gene that is NOT excluded.
mask = np.logical_not(adata.var_names.isin(exl_genes))
adata = adata[:, mask]

In [8]:
# Flag highly variable genes (scanpy defaults); this fills adata.var['highly_variable'].
sc.pp.highly_variable_genes(adata)

# Restrict the object to the flagged genes, selected by name.
hvg_names = adata.var_names[adata.var['highly_variable']]
adata = adata[:, hvg_names]

In [9]:
%%time
# Scale each gene (scanpy defaults, no max_value clipping).
sc.pp.scale(adata)

# PCA with the ARPACK solver; the result is read back below as 'X_pca'.
sc.tl.pca(adata, svd_solver='arpack')

# Neighbor graph on the first 30 PCs with 50 neighbors per cell.
sc.pp.neighbors(
    adata,
    n_neighbors=50,
    use_rep='X_pca',
    n_pcs=30,
)

CPU times: user 1h 13min 52s, sys: 31min 26s, total: 1h 45min 18s
Wall time: 11min 1s


In [10]:
%%time
# UMAP embedding from the neighbor graph; random_state is fixed to 0.
sc.tl.umap(
    adata,
    min_dist=0.45,
    random_state=0,
)

  0%|          | 0/200 [00:00<?, ?it/s]

	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
CPU times: user 7h 40min 24s, sys: 11.5 s, total: 7h 40min 36s
Wall time: 38min 53s


In [11]:
# Rebuild a full AnnData from the .raw snapshot taken before the gene
# filtering in the earlier cells.
# NOTE(review): presumably this carries the computed embeddings along with
# the obs annotations — confirm on the saved file.
adata = adata.raw.to_adata()

# Snapshot again, then redo the same normalization + log1p on the restored matrix.
adata.raw = adata
sc.pp.normalize_total(
    adata,
    target_sum=1e4,
)
sc.pp.log1p(adata)

# Persist the result; the path is relative to the current working directory.
adata.write_h5ad('BRI_Y1D0.h5ad')

