In [3]:
import pandas as pd 
import numpy as np 
import scanpy as sc
import matplotlib.pyplot as plt
import concurrent.futures
import pickle
import warnings
from datetime import date
import hisepy
import os
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed,ProcessPoolExecutor
from tqdm import tqdm
import anndata
import milopy
import milopy.core as milo
import gc
# Report the working directory; the relative CSV paths written later in this
# notebook resolve against it.
print("Current working directory:", os.getcwd())
# Set scanpy's default number of jobs/CPUs for parallel computing.
sc.settings.n_jobs = 60
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides anndata/scanpy warnings about implicit copies of views.
warnings.filterwarnings("ignore")

Current working directory: /home/jupyter/Additional_Analysis/CMV_Tao


# Read MetaData

In [4]:
# Load the SF4 metadata CSV; its `pbmc_sample_id` column is used below to
# locate one .h5ad file per sample.
meta_data = pd.read_csv(
    filepath_or_buffer="/home/jupyter/BRI_Figures/Dataset/SF4_meta_data-2024-05-05.csv"
)

# Assemble Year 1 Day 0

In [5]:
%%time
# One .h5ad path per sample id listed in the metadata table.
file_names = [
    '/home/jupyter/BRI_Figures/Dataset/scRNA/SF4/h5ad/' + sample_id + ".h5ad"
    for sample_id in meta_data['pbmc_sample_id'].tolist()
]

# Read the files on 10 threads.
# executor.map yields results in the order of `file_names`, so `adata_list`
# (and therefore the cell order after concatenation) is the same on every run.
# Collecting with as_completed() made the order depend on thread timing.
# sc.read_h5ad returns an AnnData or raises, so no `is not None` filter is needed;
# any read error propagates out of this cell.
with ThreadPoolExecutor(max_workers=10) as executor:
    adata_list = list(
        tqdm(executor.map(sc.read_h5ad, file_names), total=len(file_names))
    )

100% 235/235 [08:46<00:00,  2.24s/it]

CPU times: user 31.4 s, sys: 1min 15s, total: 1min 47s
Wall time: 9min 12s





In [8]:
# Stack all per-sample AnnData objects into one object. join="outer" keeps the
# union of variables across samples rather than only the shared ones.
adata = anndata.concat(
    adata_list,
    join="outer",
)

In [23]:
# Keep only the cells whose `cmv_status` annotation is present.
# The result is an AnnData view of the concatenated object, not a copy.
has_cmv_status = adata.obs['cmv_status'].notna()
adata = adata[has_cmv_status]

In [26]:
# For every `celltypist_l3` label, list the genes expressed in at least 10% of
# that label's cells. The result has one row per (gene, label) pair, with the
# columns `gene` and `AIFI_L3`.
#
# NOTE: the variable name `fitlered_gene` (sic) is kept because the next cell
# writes it to disk under that name.
per_label_gene_tables = []
for i in adata.obs['celltypist_l3'].unique():
    print(i)
    adata_subset = adata[adata.obs['celltypist_l3'] == i]
    # 10% of the cells carrying this label; n_obs avoids materialising X just
    # to read its row count.
    min_cells = round(adata_subset.n_obs * 0.1)
    # inplace=False only returns the boolean gene mask (and per-gene counts),
    # so the view is not modified and no implicit full copy of it is made.
    gene_mask, _ = sc.pp.filter_genes(adata_subset, min_cells=min_cells, inplace=False)
    # Building the frame from a dict also works when no gene passes the filter.
    gene_list = pd.DataFrame({'gene': list(adata_subset.var_names[gene_mask])})
    gene_list['AIFI_L3'] = i
    per_label_gene_tables.append(gene_list)

# Concatenate once after the loop instead of re-copying the growing table on
# every iteration. Row indices restart at 0 for each label, as before.
fitlered_gene = pd.concat(per_label_gene_tables) if per_label_gene_tables else pd.DataFrame()

GZMK+ CD56dim NK cell
GZMK- CD56dim NK cell
Proliferating NK cell
CD56bright NK cell
ISG+ CD56dim NK cell
Adaptive NK cell
CD95 memory B cell
Transitional B cell
Core memory B cell
Activated memory B cell
CD27- effector B cell
ISG+ naive B cell
CD27+ effector B cell
Core naive B cell
Early memory B cell
Type 2 polarized memory B cell
GZMK+ Vd2 gdT
CD8aa
GZMK- CD27+ EM CD8 T cell
KLRF1+ GZMB+ CD27- EM CD8 T cell
Memory CD8 Treg
CD4 MAIT
SOX4+ Vd1 gdT
KLRB1+ memory CD8 Treg
GZMB+ Vd2 gdT
GZMB- CD27+ EM CD4 T cell
GZMK+ memory CD4 Treg
KLRB1+ memory CD4 Treg
CM CD4 T cell
GZMK+ CD27+ EM CD8 T cell
Core naive CD8 T cell
SOX4+ naive CD4 T cell
KLRF1+ effector Vd1 gdT
ISG+ naive CD8 T cell
CD8 MAIT
CM CD8 T cell
ISG+ naive CD4 T cell
Proliferating T cell
SOX4+ naive CD8 T cell
Core naive CD4 T cell
ISG+ memory CD8 T cell
Naive Vd1 gdT
KLRF1- GZMB+ CD27- EM CD8 T cell
Naive CD4 Treg
GZMB- CD27- EM CD4 T cell
ISG+ memory CD4 T cell
ISG+ MAIT
Memory CD4 Treg
DN T cell
HLA-DRhi cDC2
CD14+ cDC2
I

In [27]:
# Write the per-label gene table to the working directory (index column included).
fitlered_gene.to_csv(path_or_buf="fitlered_gene_SF4.csv")

# Get Counts

In [6]:
# One .h5ad path per sample id in the metadata table.
# NOTE(review): get_counts() below builds its own path from the sample id, so
# this list looks unused by the counting cells — confirm before removing.
file_names = [
    '/home/jupyter/BRI_Figures/Dataset/scRNA/SF4/h5ad/' + sample_id + ".h5ad"
    for sample_id in meta_data['pbmc_sample_id']
]

In [10]:
def get_counts(sample_id, h5ad_dir='/home/jupyter/BRI_Figures/Dataset/scRNA/SF4/h5ad/'):
    """Count cells per `celltypist_l3` label for one sample.

    The sample's .h5ad file is opened in backed read-only mode, so the
    expression matrix stays on disk and only the annotations are used.

    Parameters
    ----------
    sample_id : str
        Sample identifier; the file read is ``<h5ad_dir>/<sample_id>.h5ad``.
    h5ad_dir : str, optional
        Directory holding the per-sample .h5ad files. Defaults to the SF4
        location used throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        The ``value_counts()`` of ``obs["celltypist_l3"]`` with the index reset,
        plus a ``pbmc_sample_id`` column set to ``sample_id``. The names of the
        label and count columns depend on the installed pandas version.
    """
    adata = sc.read_h5ad(os.path.join(h5ad_dir, sample_id + ".h5ad"), backed='r')
    try:
        counts_df = pd.DataFrame(adata.obs["celltypist_l3"].value_counts()).reset_index()
    finally:
        # A backed AnnData keeps its HDF5 file open; close it explicitly so
        # long-lived worker processes do not accumulate open file handles.
        adata.file.close()
    counts_df['pbmc_sample_id'] = sample_id
    return counts_df

In [11]:
# Run get_counts for every sample on a pool of 60 worker processes.
# executor.map yields results in the order of the sample ids, and tqdm shows
# progress as they are consumed.
with ProcessPoolExecutor(max_workers=60) as executor:
    results = list(
        tqdm(
            executor.map(get_counts, meta_data['pbmc_sample_id']),
            total=len(meta_data['pbmc_sample_id']),
        )
    )

100% 235/235 [00:00<00:00, 378.95it/s]


In [12]:
# Stack the per-sample count tables into one frame with a fresh 0..n-1 index,
# then save it to the working directory (index column included).
final_df = pd.concat(objs=results, ignore_index=True)
final_df.to_csv(path_or_buf="SF4_AIFI_L3_Counts.csv")