In [1]:
import scanpy as sc
import pandas as pd 
import numpy as np
import anndata
import re
import h5py
import scipy.sparse as scs
import concurrent.futures
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csc_matrix
from concurrent.futures import ThreadPoolExecutor, as_completed
import umap
import random
import multiprocessing
random.seed(123)
from joblib import Parallel, delayed
import warnings
#warnings.filterwarnings('ignore')
import os
from tqdm import tqdm


In [2]:
def read_mat(h5_con):
    mat = scs.csc_matrix(
        (h5_con['matrix']['data'][:], # Count values
         h5_con['matrix']['indices'][:], # Row indices
         h5_con['matrix']['indptr'][:]), # Pointers for column positions
        shape = tuple(h5_con['matrix']['shape'][:]) # Matrix dimensions
    )
    return mat


def read_obs(h5con):
    bc = h5con['matrix']['barcodes'][:]
    bc = [x.decode('UTF-8') for x in bc]

    # Initialized the DataFrame with cell barcodes
    obs_df = pd.DataFrame({ 'barcodes' : bc })

    # Get the list of available metadata columns
    obs_columns = h5con['matrix']['observations'].keys()

    # For each column
    for col in obs_columns:
        # Read the values
        values = h5con['matrix']['observations'][col][:]
        # Check for byte storage
        if(isinstance(values[0], (bytes, bytearray))):
            # Decode byte strings
            values = [x.decode('UTF-8') for x in values]
        # Add column to the DataFrame
        obs_df[col] = values
    
    return obs_df
# define a function to construct anndata object from a h5 file
def read_h5_anndata(h5_file):
    h5_con = h5py.File(h5_file, mode = 'r')
    # extract the expression matrix
    mat = read_mat(h5_con)
    # extract gene names
    genes = h5_con['matrix']['features']['name'][:]
    genes = [x.decode('UTF-8') for x in genes]
    # extract metadata
    obs_df = read_obs(h5_con)
    # construct anndata
    adata = anndata.AnnData(mat.T,
                             obs = obs_df)
    # make sure the gene names aligned
    adata.var_names = genes

    adata.var_names_make_unique()
    return adata
def get_last_pattern(inputstr):
    pattern = r"[^/]+(?=$)"
    match = re.search(pattern, inputstr)
    if match:
        return match.group(0)
    else:
        return ""
def process_file(file_name):
    adata = read_h5_anndata(file_name)
    output_file = 'h5ad_flu_after_qc/'+adata.obs['pbmc_sample_id'][0]+'.h5ad'
    if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
        print(f"File {output_file} already exists and is not empty. Skipping processing.")
        return
    adata.obs=adata.obs.merge(meta_data[col_list], on='pbmc_sample_id', how='left')
    doublet_scores=pd.read_csv('/home/jupyter/BRI_Analysis/scRNA/Doublet_Scores/'+adata.obs['pbmc_sample_id'][0]+'.csv', index_col=0)
    L1_labels=pd.read_csv('/home/jupyter/BRI_Analysis/scRNA/Labels/'+adata.obs['pbmc_sample_id'][0]+'_L1_predicted_labels.csv', index_col=0)
    L2_labels=pd.read_csv('/home/jupyter/BRI_Analysis/scRNA/Labels/'+adata.obs['pbmc_sample_id'][0]+'_L2_predicted_labels.csv', index_col=0)
    L3_labels=pd.read_csv('/home/jupyter/BRI_Analysis/scRNA/Labels/'+adata.obs['pbmc_sample_id'][0]+'_L3_predicted_labels.csv', index_col=0)
    L3_5_labels=pd.read_csv('/home/jupyter/BRI_Analysis/scRNA/Labels/'+adata.obs['pbmc_sample_id'][0]+'_L3.5_predicted_labels.csv', index_col=0)
    L1_labels.columns=[ 'barcodes', 'AIFI_L1']
    L2_labels.columns=[ 'barcodes', 'AIFI_L2']
    L3_labels.columns=[ 'barcodes', 'AIFI_L3']
    L3_5_labels.columns=[ 'barcodes', 'AIFI_L3.5']
    print(doublet_scores['barcodes'].tolist()==adata.obs['barcodes'].tolist(),
          L1_labels['barcodes'].tolist()==adata.obs['barcodes'].tolist(),
          L2_labels['barcodes'].tolist()==adata.obs['barcodes'].tolist(),
          L3_labels['barcodes'].tolist()==adata.obs['barcodes'].tolist(),
          L3_5_labels['barcodes'].tolist()==adata.obs['barcodes'].tolist())
    adata.obs= pd.merge(adata.obs, doublet_scores, on='barcodes', how='left')
    adata.obs= pd.merge(adata.obs, L1_labels, on='barcodes', how='left')
    adata.obs= pd.merge(adata.obs, L2_labels, on='barcodes', how='left')
    adata.obs= pd.merge(adata.obs, L3_labels, on='barcodes', how='left')
    adata.obs= pd.merge(adata.obs, L3_5_labels, on='barcodes', how='left')
    adata.obs.index=adata.obs['barcodes']
    adata.var["mito"] = adata.var_names.str.startswith("MT-")
    adata=adata[adata.obs['predicted_doublet']==False]
    sc.pp.calculate_qc_metrics(adata, qc_vars=["mito"], inplace=True)
    adata=adata[(adata.obs["pct_counts_mito"] <10) & (adata.obs["n_genes"] <5000) &(adata.obs["n_genes"] >200) ]
    adata.write_h5ad('h5ad_flu_after_qc/'+adata.obs['pbmc_sample_id'][0]+'.h5ad')

In [3]:
col_list=['subject.biologicalSex','subject.ethnicity', 'subject.partnerCode',
          'subject.race', 'subject.subjectGuid', 'cohort.cohortGuid', 'sample.visitName', 
          'sample.visitDetails', 'subject.birthYear','CMV.IgG.Serology.Result.Interpretation', 
          'BMI','pbmc_sample_id']

In [4]:
meta_data=pd.read_csv('/home//jupyter/BRI_Analysis/scRNA/hise_meta_data_2024-01-23_fixed.csv')

# Basic QC and doublet removal from scrublet

In [22]:
%%time
file_names = meta_data['file.path'].tolist()

with multiprocessing.Pool(processes=60) as pool:
    pool.map(process_file, file_names)


File h5ad_flu_after_qc/PB00294-02.h5ad already exists and is not empty. Skipping processing.
File h5ad_flu_after_qc/PB00295-02.h5ad already exists and is not empty. Skipping processing.
File h5ad_flu_after_qc/PB00296-02.h5ad already exists and is not empty. Skipping processing.
File h5ad_flu_after_qc/PB00297-02.h5ad already exists and is not empty. Skipping processing.
File h5ad_flu_after_qc/PB00625-02.h5ad already exists and is not empty. Skipping processing.
File h5ad_flu_after_qc/PB00165-03.h5ad already exists and is not empty. Skipping processing.
File h5ad_flu_after_qc/PB00034-01.h5ad already exists and is not empty. Skipping processing.
File h5ad_flu_after_qc/PB00360-01.h5ad already exists and is not empty. Skipping processing.
File h5ad_flu_after_qc/PB00323-01.h5ad already exists and is not empty. Skipping processing.
File h5ad_flu_after_qc/PB00007-01.h5ad already exists and is not empty. Skipping processing.
File h5ad_flu_after_qc/PB00537-01.h5ad already exists and is not empty

# Read all h5 and split into individual H5

In [5]:
def load_file(file_name):
    try:
        result = sc.read_h5ad("h5ad_flu_after_qc/" + file_name + '.h5ad')
        return result
    except Exception as e:
        print(f'Error reading {file_name}: {e}')
        return None

In [7]:
%%time
file_names= meta_data["pbmc_sample_id"].tolist()
h5_list = []
with ThreadPoolExecutor(max_workers=60) as executor:
    future_to_file = {executor.submit(load_file, file_name): file_name for file_name in file_names}
    for future in tqdm(as_completed(future_to_file), total=len(file_names)):
        result = future.result()
        if result is not None:
            h5_list.append(result)

100% 868/868 [21:39<00:00,  1.50s/it]

CPU times: user 3min 28s, sys: 3min 48s, total: 7min 17s
Wall time: 21min 39s





In [14]:
def subset_adata(adata, celltype):
    adata_subset = adata[adata.obs['AIFI_L3'] == celltype].copy()
    return adata_subset

In [84]:
if h5_list[26].obs['AIFI_L3'].nunique() == 71:
    for i in range(0,71):
        celltypes_to_process = [h5_list[26].obs['AIFI_L3'].unique()[i]]
        adata_list = []
        print(celltypes_to_process[0])

        with ThreadPoolExecutor(max_workers=60) as executor:
            future_to_adata = {executor.submit(subset_adata, adata_file, celltype): adata_file for celltype in celltypes_to_process for adata_file in h5_list}
            for future in tqdm(as_completed(future_to_adata), total=len(future_to_adata)):
                result = future.result()
                if result is not None:
                    adata_list.append(result)
        combined= anndata.concat(adata_list)
        combined.write_h5ad('h5_by_celltype/'+celltypes_to_process[0]+'.h5ad')

KLRF1- GZMB+ CD27- EM CD8 T cell


100% 868/868 [00:25<00:00, 34.18it/s]


KLRF1+ GZMB+ CD27- EM CD8 T cell


100% 868/868 [00:15<00:00, 54.72it/s] 


GZMB- CD27- EM CD4 T cell


100% 868/868 [00:20<00:00, 42.62it/s]


Core naive B cell


100% 868/868 [00:23<00:00, 36.99it/s]


Core naive CD4 T cell


100% 868/868 [00:44<00:00, 19.38it/s]


CD27- effector B cell


100% 868/868 [00:15<00:00, 56.42it/s] 


Core CD16 monocyte


100% 868/868 [00:19<00:00, 44.85it/s] 


GZMK- CD56dim NK cell


100% 868/868 [00:19<00:00, 44.13it/s]


Core CD14 monocyte


100% 868/868 [00:26<00:00, 33.17it/s]


ISG+ CD16 monocyte


100% 868/868 [00:14<00:00, 59.58it/s] 


Transitional B cell


100% 868/868 [00:12<00:00, 67.52it/s] 


KLRF1- effector Vd1 gdT


100% 868/868 [00:12<00:00, 69.47it/s]  


Core naive CD8 T cell 


100% 868/868 [00:20<00:00, 42.27it/s]


Memory CD4 Treg


100% 868/868 [00:14<00:00, 59.80it/s] 


CM CD4 T cell


100% 868/868 [00:20<00:00, 41.92it/s]


Naive CD4 Treg


100% 868/868 [00:15<00:00, 54.77it/s] 


KLRF1- GZMB+ CD27- memory CD4 T cell


100% 868/868 [00:14<00:00, 58.12it/s] 


GZMK+ CD27+ EM CD8 T cell


100% 868/868 [00:17<00:00, 48.47it/s]


Core memory B cell


100% 868/868 [00:16<00:00, 54.19it/s] 


HLA-DRhi cDC2


100% 868/868 [00:13<00:00, 65.11it/s] 


CMP cell


100% 868/868 [00:09<00:00, 95.19it/s]  


Adaptive NK cell


100% 868/868 [00:14<00:00, 59.67it/s] 


CD8aa


100% 868/868 [00:11<00:00, 73.59it/s] 


ISG+ CD14 monocyte


100% 868/868 [00:18<00:00, 47.14it/s]


GZMB- CD27+ EM CD4 T cell


100% 868/868 [00:18<00:00, 45.91it/s] 


GZMK+ CD56dim NK cell


100% 868/868 [00:16<00:00, 52.19it/s]


IL1B+ CD14 monocyte


100% 868/868 [00:12<00:00, 68.04it/s] 


C1Q+ CD16 monocyte


100% 868/868 [00:12<00:00, 70.33it/s] 


Platelet


100% 868/868 [00:13<00:00, 63.04it/s]  


CD8 MAIT


100% 868/868 [00:16<00:00, 52.74it/s] 


DN T cell


100% 868/868 [00:11<00:00, 73.40it/s] 


SOX4+ naive CD4 T cell


100% 868/868 [00:13<00:00, 62.50it/s] 


CD27+ effector B cell


100% 868/868 [00:12<00:00, 68.31it/s]  


CD4 MAIT


100% 868/868 [00:12<00:00, 72.19it/s] 


CD56bright NK cell


100% 868/868 [00:11<00:00, 75.74it/s] 


CLP cell


100% 868/868 [00:11<00:00, 74.58it/s] 


ISG+ naive B cell


100% 868/868 [00:12<00:00, 69.60it/s] 


CM CD8 T cell


100% 868/868 [00:15<00:00, 56.82it/s] 


GZMK- CD27+ EM CD8 T cell


100% 868/868 [00:14<00:00, 60.68it/s] 


SOX4+ naive CD8 T cell


100% 868/868 [00:12<00:00, 71.81it/s]  


GZMK+ Vd2 gdT


100% 868/868 [00:14<00:00, 60.99it/s]


SOX4+ Vd1 gdT


100% 868/868 [00:08<00:00, 98.62it/s]  


CD95 memory B cell


100% 868/868 [00:10<00:00, 85.72it/s]  


ISG+ naive CD4 T cell


100% 868/868 [00:12<00:00, 68.35it/s] 


pDC


100% 868/868 [00:13<00:00, 66.18it/s]  


CD14+ cDC2


100% 868/868 [00:13<00:00, 62.06it/s] 


ISG+ memory CD4 T cell


100% 868/868 [00:10<00:00, 86.76it/s]  


ASDC


100% 868/868 [00:08<00:00, 96.54it/s]  


Plasma cell


100% 868/868 [00:12<00:00, 69.68it/s] 


Erythrocyte


100% 868/868 [00:12<00:00, 69.25it/s]  


KLRF1+ effector Vd1 gdT


100% 868/868 [00:12<00:00, 71.91it/s] 


ISG+ cDC2


100% 868/868 [00:12<00:00, 72.31it/s] 


Type 2 polarized memory B cell


100% 868/868 [00:10<00:00, 85.42it/s]  


KLRB1+ memory CD8 Treg


100% 868/868 [00:08<00:00, 98.35it/s] 


GZMB+ Vd2 gdT


100% 868/868 [00:13<00:00, 62.38it/s]  


Intermediate monocyte


100% 868/868 [00:15<00:00, 55.67it/s] 


Proliferating T cell


100% 868/868 [00:12<00:00, 70.30it/s]  


KLRB1+ memory CD4 Treg


100% 868/868 [00:11<00:00, 73.02it/s]  


Early memory B cell


100% 868/868 [00:05<00:00, 164.51it/s] 


Activated memory B cell


100% 868/868 [00:12<00:00, 69.66it/s] 


ISG+ CD56dim NK cell


100% 868/868 [00:09<00:00, 90.63it/s] 


ISG+ naive CD8 T cell


100% 868/868 [00:08<00:00, 101.78it/s] 


Naive Vd1 gdT


100% 868/868 [00:00<00:00, 30720.50it/s]


cDC1


100% 868/868 [00:03<00:00, 238.46it/s] 


Proliferating NK cell


100% 868/868 [00:09<00:00, 88.42it/s]  


ILC


100% 868/868 [00:05<00:00, 172.18it/s] 


ISG+ memory CD8 T cell


100% 868/868 [00:09<00:00, 92.07it/s]  


BaEoMaP cell


100% 868/868 [00:07<00:00, 116.16it/s] 


ISG+ MAIT


100% 868/868 [00:08<00:00, 103.53it/s]


Memory CD8 Treg


100% 868/868 [00:08<00:00, 101.09it/s] 


GZMK+ memory CD4 Treg


100% 868/868 [00:09<00:00, 86.80it/s] 


# Clustering

In [5]:
import os

def list_files(directory):
    try:
        files = os.listdir(directory)
        return files
    except Exception as e:
        return str(e)

directory_path = 'h5_by_celltype/'
files = list_files(directory_path)
print(files)

['cDC1.h5ad', 'KLRF1- effector Vd1 gdT.h5ad', 'ISG+ CD16 monocyte.h5ad', 'CD14+ cDC2.h5ad', 'HLA-DRhi cDC2.h5ad', 'GZMK+ Vd2 gdT.h5ad', 'CD8aa.h5ad', 'GZMK+ CD56dim NK cell.h5ad', 'C1Q+ CD16 monocyte.h5ad', 'CD95 memory B cell.h5ad', 'GZMK- CD27+ EM CD8 T cell.h5ad', 'KLRF1+ GZMB+ CD27- EM CD8 T cell.h5ad', 'GZMK- CD56dim NK cell.h5ad', 'Memory CD8 Treg.h5ad', 'Proliferating NK cell.h5ad', 'CD4 MAIT.h5ad', 'Intermediate monocyte.h5ad', 'CMP cell.h5ad', 'SOX4+ Vd1 gdT.h5ad', 'KLRB1+ memory CD8 Treg.h5ad', 'pDC.h5ad', 'Erythrocyte.h5ad', 'Core CD14 monocyte.h5ad', 'GZMB+ Vd2 gdT.h5ad', 'Platelet.h5ad', 'ILC.h5ad', 'GZMB- CD27+ EM CD4 T cell.h5ad', 'GZMK+ memory CD4 Treg.h5ad', 'Transitional B cell.h5ad', 'CD56bright NK cell.h5ad', 'Core memory B cell.h5ad', 'KLRB1+ memory CD4 Treg.h5ad', 'CM CD4 T cell.h5ad', 'GZMK+ CD27+ EM CD8 T cell.h5ad', 'Activated memory B cell.h5ad', 'IL1B+ CD14 monocyte.h5ad', 'SOX4+ naive CD4 T cell.h5ad', 'BaEoMaP cell.h5ad', 'KLRF1+ effector Vd1 gdT.h5ad', 'CD

In [23]:
def clustering(input_h5ad_path):
        adata = sc.read_h5ad('h5_by_celltype/'+input_h5ad_path)
        output_file = 'h5_by_celltype_clustered/'+adata.obs['AIFI_L3'][0]+'.h5ad'
        if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
            print(f"File {output_file} already exists and is not empty. Skipping processing.")
            return 
        print("Start Processing: " +adata.obs['AIFI_L3'][0] )
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        sc.pp.highly_variable_genes(adata)
        adata.raw = adata
        adata = adata[:, adata.var.highly_variable]
        sc.pp.scale(adata, max_value=10)
        sc.tl.pca(adata, svd_solver='arpack')
        sc.pp.neighbors(adata, n_neighbors=50, n_pcs=30)
        sc.tl.umap(adata)
        print("Start Clustering: " +adata.obs['AIFI_L3'][0] )
        sc.tl.leiden(adata)
        adata.write_h5ad('h5_by_celltype_clustered/'+adata.obs['AIFI_L3'][0]+'.h5ad')
        del adata
        gc.collect()

In [None]:
input_h5ad_paths =files
with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
    executor.map(clustering, input_h5ad_paths)

  output_file = 'h5_by_celltype_clustered/'+adata.obs['AIFI_L3'][0]+'.h5ad'


File h5_by_celltype_clustered/cDC1.h5ad already exists and is not empty. Skipping processing.


  output_file = 'h5_by_celltype_clustered/'+adata.obs['AIFI_L3'][0]+'.h5ad'


File h5_by_celltype_clustered/KLRF1- effector Vd1 gdT.h5ad already exists and is not empty. Skipping processing.


  output_file = 'h5_by_celltype_clustered/'+adata.obs['AIFI_L3'][0]+'.h5ad'


File h5_by_celltype_clustered/CD8aa.h5ad already exists and is not empty. Skipping processing.


  output_file = 'h5_by_celltype_clustered/'+adata.obs['AIFI_L3'][0]+'.h5ad'


File h5_by_celltype_clustered/ISG+ CD16 monocyte.h5ad already exists and is not empty. Skipping processing.


  output_file = 'h5_by_celltype_clustered/'+adata.obs['AIFI_L3'][0]+'.h5ad'


File h5_by_celltype_clustered/CD14+ cDC2.h5ad already exists and is not empty. Skipping processing.


  output_file = 'h5_by_celltype_clustered/'+adata.obs['AIFI_L3'][0]+'.h5ad'


File h5_by_celltype_clustered/GZMK+ Vd2 gdT.h5ad already exists and is not empty. Skipping processing.


  output_file = 'h5_by_celltype_clustered/'+adata.obs['AIFI_L3'][0]+'.h5ad'


File h5_by_celltype_clustered/HLA-DRhi cDC2.h5ad already exists and is not empty. Skipping processing.


  output_file = 'h5_by_celltype_clustered/'+adata.obs['AIFI_L3'][0]+'.h5ad'


File h5_by_celltype_clustered/GZMK+ CD56dim NK cell.h5ad already exists and is not empty. Skipping processing.


  output_file = 'h5_by_celltype_clustered/'+adata.obs['AIFI_L3'][0]+'.h5ad'


File h5_by_celltype_clustered/CD95 memory B cell.h5ad already exists and is not empty. Skipping processing.


  output_file = 'h5_by_celltype_clustered/'+adata.obs['AIFI_L3'][0]+'.h5ad'


File h5_by_celltype_clustered/Memory CD8 Treg.h5ad already exists and is not empty. Skipping processing.


  output_file = 'h5_by_celltype_clustered/'+adata.obs['AIFI_L3'][0]+'.h5ad'


File h5_by_celltype_clustered/C1Q+ CD16 monocyte.h5ad already exists and is not empty. Skipping processing.


  output_file = 'h5_by_celltype_clustered/'+adata.obs['AIFI_L3'][0]+'.h5ad'


File h5_by_celltype_clustered/Proliferating NK cell.h5ad already exists and is not empty. Skipping processing.


  output_file = 'h5_by_celltype_clustered/'+adata.obs['AIFI_L3'][0]+'.h5ad'


File h5_by_celltype_clustered/GZMK- CD27+ EM CD8 T cell.h5ad already exists and is not empty. Skipping processing.


  output_file = 'h5_by_celltype_clustered/'+adata.obs['AIFI_L3'][0]+'.h5ad'


File h5_by_celltype_clustered/CD4 MAIT.h5ad already exists and is not empty. Skipping processing.


  output_file = 'h5_by_celltype_clustered/'+adata.obs['AIFI_L3'][0]+'.h5ad'


File h5_by_celltype_clustered/SOX4+ Vd1 gdT.h5ad already exists and is not empty. Skipping processing.


  output_file = 'h5_by_celltype_clustered/'+adata.obs['AIFI_L3'][0]+'.h5ad'


File h5_by_celltype_clustered/CMP cell.h5ad already exists and is not empty. Skipping processing.


  output_file = 'h5_by_celltype_clustered/'+adata.obs['AIFI_L3'][0]+'.h5ad'


File h5_by_celltype_clustered/KLRB1+ memory CD8 Treg.h5ad already exists and is not empty. Skipping processing.


  output_file = 'h5_by_celltype_clustered/'+adata.obs['AIFI_L3'][0]+'.h5ad'


File h5_by_celltype_clustered/Erythrocyte.h5ad already exists and is not empty. Skipping processing.


In [None]:
clustering