In [2]:
import gc
import re
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from multiprocessing import Pool

import anndata
import celltypist
import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scanpy.external as sce
import scipy.sparse as scs
from adjustText import adjust_text
from celltypist import models
from tqdm import tqdm

In [3]:
def gen_mpl_labels(
    adata, groupby, exclude=(), ax=None, adjust_kwargs=None, text_kwargs=None
):
    """Draw one text label per group at the median of its ``X_umap`` coordinates.

    Parameters
    ----------
    adata
        Object exposing ``.obs`` (grouped by ``groupby``) and ``.obsm["X_umap"]``.
    groupby
        Column of ``adata.obs`` that defines the groups to label.
    exclude
        Group names that should not receive a label.
    ax
        Axes to draw on; when ``None`` the labels go through ``plt.text``.
    adjust_kwargs
        Keyword arguments for ``adjust_text``; defaults to
        ``{"text_from_points": False}``.
    text_kwargs
        Keyword arguments forwarded to every ``text`` call; defaults to ``{}``.
    """
    adjust_kwargs = (
        {"text_from_points": False} if adjust_kwargs is None else adjust_kwargs
    )
    text_kwargs = {} if text_kwargs is None else text_kwargs

    # Median embedding position of the members of each non-excluded group
    centers = {
        label: np.median(adata[members].obsm["X_umap"], axis=0)
        for label, members in adata.obs.groupby(groupby).groups.items()
        if label not in exclude
    }

    # Same call signature either way; only the drawing target differs
    draw_text = plt.text if ax is None else ax.text
    placed = [
        draw_text(x=cx, y=cy, s=label, **text_kwargs)
        for label, (cx, cy) in centers.items()
    ]

    # Let adjustText move the labels apart
    adjust_text(placed, **adjust_kwargs)

In [4]:
def read_mat(h5_con):
    """Assemble a SciPy CSC matrix from the ``matrix`` group of an h5 handle.

    Reads the ``data``, ``indices``, ``indptr`` and ``shape`` entries of
    ``h5_con['matrix']`` and passes them to ``scipy.sparse.csc_matrix``.
    """
    group = h5_con['matrix']
    counts = group['data'][:]          # stored (non-zero) values
    row_indices = group['indices'][:]  # row index of every stored value
    col_pointers = group['indptr'][:]  # start offset of every column
    dims = tuple(group['shape'][:])    # matrix dimensions
    return scs.csc_matrix((counts, row_indices, col_pointers), shape=dims)


def read_obs(h5con):
    """Read the per-cell metadata of an h5 handle into a DataFrame.

    The first column, ``barcodes``, holds the UTF-8 decoded entries of
    ``h5con['matrix']['barcodes']``. Every key of
    ``h5con['matrix']['observations']`` then becomes one further column;
    columns stored as byte strings are decoded to ``str``.

    Returns
    -------
    pandas.DataFrame
        One row per barcode, with a default integer index.
    """
    matrix = h5con['matrix']

    # Initialize the column set with the decoded cell barcodes
    columns = {'barcodes': [bc.decode('UTF-8') for bc in matrix['barcodes'][:]]}

    observations = matrix['observations']
    for col in observations.keys():
        values = observations[col][:]
        # Only peek at the first element when there is one: an empty column
        # (file without cells) must not raise IndexError.
        if len(values) > 0 and isinstance(values[0], (bytes, bytearray)):
            values = [x.decode('UTF-8') for x in values]
        columns[col] = values

    # Build the frame once instead of inserting the columns one at a time
    return pd.DataFrame(columns)
# define a function to construct anndata object from a h5 file
def read_h5_anndata(h5_file):
    """Construct an AnnData object from one h5 file.

    The expression matrix comes from ``read_mat``, the gene names from
    ``matrix/features/name`` and the cell metadata from ``read_obs``. The
    matrix is transposed before it is handed to AnnData, and the gene names
    are made unique.

    Parameters
    ----------
    h5_file
        Path of the h5 file to read.

    Returns
    -------
    anndata.AnnData
    """
    # The context manager closes the file even when reading fails; everything
    # needed afterwards is copied into memory inside the block.
    with h5py.File(h5_file, mode='r') as h5_con:
        # extract the expression matrix
        mat = read_mat(h5_con)
        # extract gene names
        genes = [x.decode('UTF-8') for x in h5_con['matrix']['features']['name'][:]]
        # extract metadata
        obs_df = read_obs(h5_con)

    # construct anndata from the transposed matrix
    adata = anndata.AnnData(mat.T, obs=obs_df)
    # make sure the gene names are aligned with the matrix columns
    adata.var_names = genes
    adata.var_names_make_unique()
    return adata
def get_last_pattern(inputstr):
    """Return the trailing run of non-``/`` characters of ``inputstr``.

    For a path-like string this is the part after the last ``/``. An empty
    string is returned when there is no such run, i.e. for an empty input or
    one that ends with ``/``.

    Requires the module-level ``import re``.
    """
    match = re.search(r"[^/]+(?=$)", inputstr)
    return match.group(0) if match else ""
def process_file(file_name):
    """Load one sample and attach doublet scores, AIFI labels and QC metrics.

    Steps:
      1. read the h5 file with ``read_h5_anndata``;
      2. read ``Doublet_Scores/<sample>.csv`` and the three
         ``Labels/<sample>_L{1,2,3}_predicted_labels.csv`` tables, where
         ``<sample>`` is the first ``pbmc_sample_id`` value of the file;
      3. print, for each of the four tables, whether its barcode list equals
         the barcode list of the loaded cells;
      4. left-merge the tables into ``adata.obs`` on ``barcodes`` and use the
         barcodes as the obs index;
      5. flag genes starting with ``MT-`` and compute scanpy QC metrics.

    Returns
    -------
    anndata.AnnData
        The annotated object for this sample.
    """
    adata = read_h5_anndata(file_name)

    # .iloc[0] is explicitly positional, so it does not depend on what the obs
    # index labels are; a bare [0] relies on a deprecated integer-key fallback.
    sample_id = adata.obs['pbmc_sample_id'].iloc[0]

    doublet_scores = pd.read_csv('Doublet_Scores/' + sample_id + '.csv', index_col=0)
    annotations = [doublet_scores]
    for level in ('L1', 'L2', 'L3'):
        labels = pd.read_csv(
            'Labels/' + sample_id + '_' + level + '_predicted_labels.csv',
            index_col=0,
        )
        labels.columns = ['barcodes', 'AIFI_' + level]
        annotations.append(labels)

    # One boolean per table, emitted by a single print call
    barcodes = adata.obs['barcodes'].tolist()
    print(*(table['barcodes'].tolist() == barcodes for table in annotations))

    for table in annotations:
        adata.obs = pd.merge(adata.obs, table, on='barcodes', how='left')
    adata.obs.index = adata.obs['barcodes']

    adata.var["mito"] = adata.var_names.str.startswith("MT-")
    sc.pp.calculate_qc_metrics(adata, qc_vars=["mito"], inplace=True)

    return adata

# Read metadata

In [5]:
# Sample sheet; its combined_sample_id column names the per-sample h5 files
meta_data = pd.read_csv('meta_data_GEO.csv')
file_list = [
    "GSE214546_Data/" + sample_id + '.h5'
    for sample_id in meta_data['combined_sample_id']
]


# Combine RNA data

In [6]:
%%time
file_names= file_list
h5_list = []
with ThreadPoolExecutor(max_workers=16) as executor:
    future_to_file = {executor.submit(process_file, file_name): file_name for file_name in file_names}
    for future in tqdm(as_completed(future_to_file), total=len(file_names)):
        result = future.result()
        if result is not None:
            h5_list.append(result)

  0% 0/16 [00:00<?, ?it/s]

True True True True
True True True True


  6% 1/16 [00:16<04:10, 16.67s/it]

True True True True
True True True True
True True True True
True True True True
True True True True
True True True True
True True True True
True True True True
True True True True
True True True True
True True True True
True True True True
True True True True
True True True True


100% 16/16 [00:30<00:00,  1.90s/it]

CPU times: user 55.1 s, sys: 8.75 s, total: 1min 3s
Wall time: 30.4 s





In [7]:
# Concatenate the per-sample AnnData objects collected in h5_list into one object
combined= anndata.concat(h5_list)

In [8]:
# Cell-level QC: keep cells with < 15 % mitochondrial counts,
# n_genes_by_counts < 2500 and n_genes > 200
qc_pass = (
    (combined.obs["pct_counts_mito"] < 15)
    & (combined.obs["n_genes_by_counts"] < 2500)
    & (combined.obs["n_genes"] > 200)
)
combined = combined[qc_pass]

# Keep only cells whose predicted_doublet flag equals False
not_doublet = combined.obs['predicted_doublet'] == False
combined = combined[not_doublet]

# Keep T cells only

In [9]:
# AIFI_L3 labels to discard; every cell carrying one of these labels is removed
excluded_l3_labels = [
    "Platelet",
    "Plasma cell",
    "Core memory B cell",
    "Adaptive NK cell",
    "CD56bright NK cell",
    "ILC",
    "CD14+ cDC2",
    "HLA-DRhi cDC2",
    "pDC",
    "Core naive B cell",
    "ISG+ CD16 monocyte",
    "ISG+ CD14 monocyte",
    "CD95 memory B cell",
    "ISG+ CD56dim NK cell",
    "IL1B+ CD14 monocyte",
    "Core CD14 monocyte",
    "GZMK- CD56dim NK cell",
    "Erythrocyte",
    "Core CD16 monocyte",
    "GZMK+ CD56dim NK cell",
    "ISG+ naive B cell",
    "Transitional B cell",
    "Proliferating NK cell",
    "CD27- effector B cell",
]
is_excluded = combined.obs['AIFI_L3'].isin(excluded_l3_labels)
combined_T = combined[~is_excluded]

# Processing

In [10]:
# The steps below modify the object in place. Turn a subset view into a real
# object first instead of relying on an implicit copy.
if combined_T.is_view:
    combined_T = combined_T.copy()

# Normalise every cell to 10,000 total counts, then log1p-transform
sc.pp.normalize_total(combined_T, target_sum=1e4)
sc.pp.log1p(combined_T)

# Flag highly variable genes
sc.pp.highly_variable_genes(combined_T, min_mean=0.0125, max_mean=3, min_disp=0.5)

# Keep the object as it is at this point (all genes) in .raw before subsetting
combined_T.raw = combined_T

# Subset to the highly variable genes; .copy() makes the result a real object
# so that scaling does not operate on a view
combined_T = combined_T[:, combined_T.var.highly_variable].copy()
sc.pp.scale(combined_T, max_value=10)

In [11]:
# Run PCA on combined_T with the ARPACK SVD solver
sc.tl.pca(combined_T, svd_solver="arpack")

In [12]:
# Harmony integration with batch_id as the batch key, at most 20 iterations
sce.pp.harmony_integrate(
    combined_T,
    'batch_id',
    max_iter_harmony=20,
)

# Neighbour graph built on the X_pca_harmony representation
sc.pp.neighbors(
    combined_T,
    n_neighbors=50,
    use_rep='X_pca_harmony',
    n_pcs=30,
)

# UMAP embedding computed from that neighbour graph
sc.tl.umap(combined_T)

2024-06-01 18:57:38,010 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
Computing initial centroids with sklearn.KMeans...
2024-06-01 19:03:02,665 - harmonypy - INFO - Iteration 2 of 20
Iteration 2 of 20
2024-06-01 19:05:47,952 - harmonypy - INFO - Iteration 3 of 20
Iteration 3 of 20
2024-06-01 19:08:31,818 - harmonypy - INFO - Iteration 4 of 20
Iteration 4 of 20
2024-06-01 19:11:32,204 - harmonypy - INFO - Iteration 5 of 20
Iteration 5 of 20
2024-06-01 19:14:09,746 - harmonypy - INFO - Iteration 6 of 20
Iteration 6 of 20
2024-06-01 19:15:28,721 - harmonypy - INFO - Iteration 7 of 20
Iteration 7 of 20
2024-06-01 19:16:25,975 - harmonypy - INFO - Converged after 7 iterations
Converged after 7 iterations


  0%|          | 0/200 [00:00<?, ?it/s]

	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs


In [13]:
# Save the processed object to TEA_scRNA.h5ad in the working directory
combined_T.write_h5ad('TEA_scRNA.h5ad')

... storing 'adt_qc_flag' as categorical
... storing 'batch_id' as categorical
... storing 'cell_name' as categorical
... storing 'chip_id' as categorical
... storing 'hto_barcode' as categorical
... storing 'hto_category' as categorical
... storing 'original_barcodes' as categorical
... storing 'pbmc_sample_id' as categorical
... storing 'pool_id' as categorical
... storing 'seurat_pbmc_type' as categorical
... storing 'well_id' as categorical
... storing 'AIFI_L1' as categorical
... storing 'AIFI_L2' as categorical
... storing 'AIFI_L3' as categorical
