In [1]:
import scanpy as sc
import pandas as pd
import os 
import anndata 
from concurrent.futures import ThreadPoolExecutor, as_completed,ProcessPoolExecutor
from tqdm import tqdm
import matplotlib.pyplot as plt
# Global scanpy setting: number of parallel jobs scanpy may use.
# NOTE(review): 30 is hard-coded (the sample-loading cell also uses max_workers=30) —
# confirm the host actually has that many cores available.
sc.settings.n_jobs = 30




In [2]:
# Per-sample scRNA metadata table; the path is absolute and specific to this workspace.
meta_csv_path = "/home/workspace/IHA_Figure_Revision/Dataset/scRNA_meta_data-2024-05-09.csv"
meta_data = pd.read_csv(meta_csv_path)

In [3]:
# Keep only the rows whose visit is one of the two Flu Year 1 time points (Day 0 and Day 7).
flu_year1_visits = ["Flu Year 1 Day 0", "Flu Year 1 Day 7"]
is_flu_year1_visit = meta_data["sample.visitName"].isin(flu_year1_visits)
meta_data = meta_data[is_flu_year1_visit]

In [4]:
%%time
# Build one .h5ad path per `pbmc_sample_id` in the filtered metadata.
sample_h5ad_dir = '/home/workspace/IHA_Figure_Revision/Dataset/scRNA/BRI/h5ad/sample_h5ad/'
pbmc_sample_ids = meta_data['pbmc_sample_id'].tolist()
file_names = [sample_h5ad_dir + sample_id + ".h5ad" for sample_id in pbmc_sample_ids]
# Container for the per-sample AnnData objects; it is filled by the loading cell.
adata_list = []

CPU times: user 88 μs, sys: 7 μs, total: 95 μs
Wall time: 99.7 μs


In [22]:
def load_and_subsample(fp):
    """Load one sample and return a size-capped subset of selected cells.

    Reads the .h5ad file at ``fp``, keeps the cells whose ``AIFI_L3`` label
    contains (case-insensitively) 'effector B cell', 'Plasma' or 'memory B',
    and, when more than 2000 such cells remain, randomly subsamples them down
    to 2000 with a fixed seed (3030).

    Parameters
    ----------
    fp
        Path of the .h5ad file to read.

    Returns
    -------
    A standalone (copied) AnnData holding at most 2000 matching cells.
    """
    sample = sc.read_h5ad(fp)

    # Labels are cast to str before matching, so missing labels simply fail to match.
    labels = sample.obs['AIFI_L3'].astype(str)
    keep = labels.str.contains(
        'effector B cell|Plasma|memory B', regex=True, case=False, na=False
    )

    # Copy so the subset no longer references the full sample.
    subset = sample[keep, :].copy()
    if subset.shape[0] <= 2000:
        return subset

    # In-place down-sampling; the fixed random_state keeps the draw repeatable per file.
    sc.pp.subsample(subset, n_obs=2000, random_state=3030)
    return subset

# Start from an empty list every time this cell runs; appending to a list created in an
# earlier cell would add a second copy of every sample when the cell is re-executed.
adata_list = []

with ProcessPoolExecutor(max_workers=30) as executor:
    # `executor.map` returns results in the order of `file_names`. Collecting with
    # `as_completed` instead yields them in completion order, which differs between runs
    # and therefore changes the row order of the concatenated dataset (and may change
    # order-dependent downstream results such as the UMAP layout).
    results = executor.map(load_and_subsample, file_names)

    # Any exception raised in a worker is re-raised here when its result is consumed.
    for adata in tqdm(results, total=len(file_names)):
        if adata is not None:
            adata_list.append(adata)

100%|██████████| 184/184 [00:05<00:00, 35.53it/s]


In [23]:
%%time
def chunked_concat(lst, chunk_size=10):
    """Concatenate many AnnData objects in two stages.

    ``lst`` is cut into consecutive groups of at most ``chunk_size`` objects,
    each group is concatenated on its own, and the per-group results are then
    concatenated once more into a single object.

    Every ``anndata.concat`` call uses ``axis=0``, ``merge="same"`` and
    ``index_unique=None``; with ``index_unique=None`` the input obs names are
    kept unchanged, so duplicates across inputs are not disambiguated.

    Parameters
    ----------
    lst
        Sequence of objects to concatenate, in order.
    chunk_size
        Maximum number of objects merged per first-stage call.

    Returns
    -------
    The result of the final ``anndata.concat`` call.
    """
    concat_opts = dict(axis=0, merge="same", index_unique=None)

    partial_merges = []
    for start in range(0, len(lst), chunk_size):
        batch = lst[start:start + chunk_size]
        partial_merges.append(anndata.concat(batch, **concat_opts))

    return anndata.concat(partial_merges, **concat_opts)

# Merge the per-sample objects, four at a time, into one AnnData.
sampled_adata = chunked_concat(adata_list, chunk_size=4)
# The samples are concatenated with their obs names unchanged, and the run output shows
# anndata warning that observation names are duplicated. Make them unique right away so
# label-based indexing is unambiguous and the warning is not repeated by later steps.
sampled_adata.obs_names_make_unique()

CPU times: user 27.9 s, sys: 971 ms, total: 28.8 s
Wall time: 29.1 s


  utils.warn_names_duplicates("obs")


In [24]:
%%time
# Snapshot the concatenated matrix (all genes, before normalisation) in `.raw` so it can be
# restored later for plotting genes that are dropped below.
sampled_adata.raw = sampled_adata
sc.pp.normalize_total(sampled_adata, target_sum=1e4)
sc.pp.log1p(sampled_adata)

# Genes whose names start with IGL / IGK / IGH are excluded before feature selection, so
# they cannot be picked as highly variable genes or drive the embedding.
igl_genes = [gene for gene in sampled_adata.var_names if gene.startswith("IGL")]
igk_genes = [gene for gene in sampled_adata.var_names if gene.startswith("IGK")]
ighc_genes = [gene for gene in sampled_adata.var_names if gene.startswith("IGH")]
exl_genes = igl_genes + igk_genes + ighc_genes

mask = ~sampled_adata.var_names.isin(exl_genes)
# Take explicit copies of the subsets: the following scanpy calls write into the object,
# and doing that on a view makes anndata convert it implicitly and emit
# "initializing view as actual" warnings. The resulting data are the same.
sampled_adata = sampled_adata[:, mask].copy()
sc.pp.highly_variable_genes(sampled_adata)
sampled_adata = sampled_adata[:, sampled_adata.var_names[sampled_adata.var['highly_variable']]].copy()

# Scale the selected genes, then PCA -> neighbour graph (50 neighbours on the first 20 PCs)
# -> UMAP with a fixed seed.
sc.pp.scale(sampled_adata)
sc.tl.pca(sampled_adata, svd_solver='arpack')
sc.pp.neighbors(sampled_adata, n_neighbors=50, use_rep='X_pca', n_pcs=20)
sc.tl.umap(sampled_adata,random_state=3030)

  adata.uns["hvg"] = {"flavor": flavor}
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  view_to_actual(adata)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


CPU times: user 50min 54s, sys: 2min 56s, total: 53min 50s
Wall time: 7min 9s


In [25]:
# Remove the 'barcodes' column from the cell annotations. The membership check makes the
# cell safe to re-run: a bare `del` raises KeyError once the column is already gone.
if 'barcodes' in sampled_adata.obs.columns:
    del sampled_adata.obs['barcodes']

In [27]:
sampled_adata

AnnData object with n_obs × n_vars = 358180 × 1574
    obs: 'original_barcodes', 'cell_name', 'batch_id', 'pool_id', 'chip_id', 'well_id', 'n_genes', 'n_reads', 'n_umis', 'total_counts_mito', 'pct_counts_mito', 'doublet_score', 'predicted_AIFI_L1', 'AIFI_L1_score', 'AIFI_L1', 'predicted_AIFI_L2', 'AIFI_L2_score', 'AIFI_L2', 'predicted_AIFI_L3', 'AIFI_L3_score', 'AIFI_L3', 'sample.sampleKitGuid', 'cohort.cohortGuid', 'subject.subjectGuid', 'subject.biologicalSex', 'subject.cmv', 'subject.bmi', 'subject.race', 'subject.ethnicity', 'subject.birthYear', 'subject.ageAtFirstDraw', 'sample.visitName', 'sample.drawDate', 'sample.subjectAgeAtDraw', 'specimen.specimenGuid', 'pipeline.fileGuid'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

In [26]:
# Save the processed object (HVG-restricted matrix plus PCA / neighbours / UMAP results)
# to the current working directory.
processed_h5ad_path = "adata_processed_IgG_removed.h5ad"
sampled_adata.write_h5ad(processed_h5ad_path)

In [None]:
%%time
# Replace the HVG-restricted, scaled matrix with the full-gene matrix kept in `.raw`,
# then redo the total-count normalisation and log1p so any gene can be plotted.
# NOTE(review): `.raw` was stored before normalisation, so it presumably holds counts — confirm.
# NOTE(review): the object returned by `raw.to_adata()` presumably carries no `.raw`, so this
# cell is expected to fail if executed a second time — confirm.
sampled_adata = sampled_adata.raw.to_adata()
normalize_kwargs = dict(target_sum=1e4)
sc.pp.normalize_total(sampled_adata, **normalize_kwargs)
sc.pp.log1p(sampled_adata)

In [None]:
# Split the cells by visit so the two time points can be plotted side by side.
visit_name = sampled_adata.obs["sample.visitName"]
sampled_adata_D0 = sampled_adata[visit_name == "Flu Year 1 Day 0"]
sampled_adata_D7 = sampled_adata[visit_name == "Flu Year 1 Day 7"]

In [None]:
# DAPP1 expression on the UMAP for Day 0 and Day 7. Both panels share the same options,
# including vmax=4.5, so their colour scales are directly comparable.
# `use_raw=False` is set to match the CD38 plots: expression is always read from `.X`,
# never from `.raw`, even if a `.raw` attribute happens to be present.
dapp1_umap_kwargs = dict(
    color='DAPP1',
    size=5,
    show=False,
    use_raw=False,
    frameon=False,
    legend_loc=None,
    vmax=4.5,
)
sc.pl.umap(sampled_adata_D0, **dapp1_umap_kwargs)

sc.pl.umap(sampled_adata_D7, **dapp1_umap_kwargs)


In [None]:
# CD38 expression (read from `.X`, not `.raw`) on the UMAP for Day 0 and Day 7.
# NOTE(review): no vmax is given here, so each panel presumably gets its own colour
# scale — confirm that is intended when comparing the two time points.
cd38_umap_kwargs = dict(
    color='CD38',
    size=5,
    show=False,
    use_raw=False,
    frameon=False,
    legend_loc=None,
)
sc.pl.umap(sampled_adata_D0, **cd38_umap_kwargs)

sc.pl.umap(sampled_adata_D7, **cd38_umap_kwargs)


In [None]:
# UMAP of all cells coloured by their AIFI_L3 label, drawn without frame or legend.
aifi_l3_umap_kwargs = dict(color='AIFI_L3', size=1, show=False, frameon=False, legend_loc=None)
sc.pl.umap(sampled_adata, **aifi_l3_umap_kwargs)