In [19]:
import sys
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import celltypist
import gc
import anndata
from celltypist import models
import h5py
import scipy.sparse as scs
from multiprocessing import Pool
import glob
# Fetch CellTypist's published models into the local model directory.
# Files that are already present are skipped (see the "file exists" lines in the cell output).
models.download_models()

📂 Storing models in /root/.celltypist/data/models
⏩ Skipping [1/44]: Immune_All_Low.pkl (file exists)
⏩ Skipping [2/44]: Immune_All_High.pkl (file exists)
⏩ Skipping [3/44]: Adult_CynomolgusMacaque_Hippocampus.pkl (file exists)
⏩ Skipping [4/44]: Adult_Human_PancreaticIslet.pkl (file exists)
⏩ Skipping [5/44]: Adult_Human_Skin.pkl (file exists)
⏩ Skipping [6/44]: Adult_Mouse_Gut.pkl (file exists)
⏩ Skipping [7/44]: Adult_Mouse_OlfactoryBulb.pkl (file exists)
⏩ Skipping [8/44]: Adult_Pig_Hippocampus.pkl (file exists)
⏩ Skipping [9/44]: Adult_RhesusMacaque_Hippocampus.pkl (file exists)
⏩ Skipping [10/44]: Autopsy_COVID19_Lung.pkl (file exists)
⏩ Skipping [11/44]: COVID19_HumanChallenge_Blood.pkl (file exists)
⏩ Skipping [12/44]: COVID19_Immune_Landscape.pkl (file exists)
⏩ Skipping [13/44]: Cells_Fetal_Lung.pkl (file exists)
⏩ Skipping [14/44]: Cells_Intestinal_Tract.pkl (file exists)
⏩ Skipping [15/44]: Cells_Lung_Airway.pkl (file exists)
⏩ Skipping [16/44]: Developing_Human_Brain.pkl (

In [6]:
def read_mat(h5_con):
    """Build a SciPy CSC sparse matrix from the 'matrix' group of an open HDF5 handle.

    Parameters
    ----------
    h5_con : mapping-like (e.g. an open h5py.File)
        Must expose 'data', 'indices', 'indptr' and 'shape' under the
        'matrix' key.

    Returns
    -------
    scipy.sparse.csc_matrix
        Matrix with the dimensions stored in 'shape'.
    """
    matrix_grp = h5_con['matrix']

    # The three arrays of the compressed-sparse-column layout
    counts = matrix_grp['data'][:]          # non-zero values
    row_idx = matrix_grp['indices'][:]      # row index of each value
    col_ptr = matrix_grp['indptr'][:]       # start offset of each column
    dims = tuple(matrix_grp['shape'][:])    # matrix dimensions

    return scs.csc_matrix((counts, row_idx, col_ptr), shape=dims)

# define a function to read the observations (i.e. metadata)

def read_obs(h5con):
    """Read the barcodes and all observation columns into a DataFrame.

    Parameters
    ----------
    h5con : mapping-like (e.g. an open h5py.File)
        Must expose 'barcodes' and an 'observations' group under 'matrix'.

    Returns
    -------
    pandas.DataFrame
        First column is 'barcodes'; one further column per entry in
        'matrix/observations', in the order the group lists its keys.
        Byte-string values are decoded as UTF-8.
    """
    matrix_grp = h5con['matrix']

    # Barcodes are stored as byte strings; decode them up front
    columns = {
        'barcodes': [bc.decode('UTF-8') for bc in matrix_grp['barcodes'][:]]
    }

    obs_grp = matrix_grp['observations']
    for col in obs_grp.keys():
        values = obs_grp[col][:]
        # Only peek at the first element when there is one: a column with no
        # rows would otherwise raise IndexError.
        if len(values) > 0 and isinstance(values[0], (bytes, bytearray)):
            values = [x.decode('UTF-8') for x in values]
        columns[col] = values

    # Build the frame in one go rather than inserting column by column
    return pd.DataFrame(columns)
# define a function to construct anndata object from a h5 file
def read_h5_anndata(h5_file):
    """Load one .h5 file into an AnnData object (cells x genes).

    Parameters
    ----------
    h5_file : str or path-like
        Path to an HDF5 file with a 'matrix' group in the layout read by
        `read_mat` and `read_obs`, plus gene names under
        'matrix/features/name'.

    Returns
    -------
    anndata.AnnData
        X is the transpose of the stored matrix, obs is the metadata table
        from `read_obs`, and var_names are the gene names, made unique.
    """
    # The context manager closes the HDF5 handle even if parsing fails.
    # Everything is copied into memory (via [:]) before the file is closed.
    with h5py.File(h5_file, mode='r') as h5_con:
        # extract the expression matrix
        mat = read_mat(h5_con)
        # extract gene names (stored as byte strings)
        genes = [g.decode('UTF-8') for g in h5_con['matrix']['features']['name'][:]]
        # extract metadata
        obs_df = read_obs(h5_con)

    # The matrix is transposed so that rows line up with obs_df
    adata = anndata.AnnData(mat.T, obs=obs_df)
    # make sure the gene names are aligned with the columns
    adata.var_names = genes
    adata.var_names_make_unique()
    return adata

In [7]:
def process_and_annotate_pbmc(args):
    """Annotate one sample with the AIFI CellTypist models (L1, L2, L3, L3.5).

    NOTE: a later cell in this notebook defines another function with the
    same name; whichever cell was executed last is the one that runs.

    Parameters
    ----------
    args : tuple
        (data_file, model_base_path, output_base_path), packed into a single
        argument so the function can be used with `Pool.map`.

    Returns
    -------
    None
        For every level, three files are written under output_base_path:
        predicted labels (CSV), probability matrix and decision matrix
        (both Parquet).
    """
    # Read data
    data_file, model_base_path, output_base_path = args
    pbmc = read_h5_anndata(data_file)

    # The first sample id found is used to name the output files
    pbmc_sample_id = pbmc.obs['pbmc_sample_id'].unique().tolist()[0]
    pbmc.obs.index = pbmc.obs['barcodes']

    # Normalization (to 10,000 counts per cell) and log transformation
    sc.pp.normalize_total(pbmc, target_sum=1e4)
    sc.pp.log1p(pbmc)

    # Classifier flavour per annotation level. Named model_types so the
    # imported `celltypist.models` module is not shadowed inside the function.
    model_types = {'L1': 'ovr', 'L2': 'ovr', 'L3': 'multinomial', 'L3.5': 'multinomial'}

    # Create the output directory up front; exist_ok makes this safe when
    # several workers reach this line at the same time.
    os.makedirs(output_base_path, exist_ok=True)

    for level, model_type in model_types.items():
        model_file = f'{model_base_path}/model_AIFI_{level}_{model_type}.pkl'
        prediction = celltypist.annotate(pbmc, model=model_file)

        out_prefix = f'{output_base_path}/{pbmc_sample_id}_{level}'
        # Write out labels
        prediction.predicted_labels.reset_index().to_csv(f'{out_prefix}_predicted_labels.csv')
        # Write out probability matrix
        prediction.probability_matrix.reset_index().to_parquet(f'{out_prefix}_probability_matrix.parquet')
        # Write out decision matrix
        prediction.decision_matrix.reset_index().to_parquet(f'{out_prefix}_decision_matrix.parquet')

In [24]:
def process_and_annotate_pbmc(args):
    """Annotate one sample with the reference CellTypist models (L1, L2, L3).

    For each level, the model file is located with the glob pattern
    `ref_pbmc_clean_celltypist_model_<level>_*.pkl` under model_base_path.

    Parameters
    ----------
    args : tuple
        (data_file, model_base_path, output_base_path), packed into a single
        argument so the function can be used with `Pool.map`.

    Returns
    -------
    None
        For every level, three files are written under output_base_path:
        predicted labels (CSV), probability matrix and decision matrix
        (both Parquet).

    Raises
    ------
    FileNotFoundError
        If no model file matches the pattern for a level.
    ValueError
        If more than one model file matches the pattern for a level.
    """
    # Read data
    data_file, model_base_path, output_base_path = args
    pbmc = read_h5_anndata(data_file)

    # The first sample id found is used to name the output files
    pbmc_sample_id = pbmc.obs['pbmc_sample_id'].unique().tolist()[0]
    pbmc.obs.index = pbmc.obs['barcodes']

    # Normalization (to 10,000 counts per cell) and log transformation
    sc.pp.normalize_total(pbmc, target_sum=1e4)
    sc.pp.log1p(pbmc)

    # Create the output directory up front; exist_ok makes this safe when
    # several workers reach this line at the same time.
    os.makedirs(output_base_path, exist_ok=True)

    for level in ('L1', 'L2', 'L3'):
        pattern = f'{model_base_path}/ref_pbmc_clean_celltypist_model_{level}_*.pkl'
        matches = sorted(glob.glob(pattern))
        # glob.glob returns a list, but celltypist.annotate takes one model.
        # Resolve to exactly one path and fail loudly otherwise, so that a
        # missing model is never passed on as an empty value.
        if not matches:
            raise FileNotFoundError(f'No model file matches {pattern}')
        if len(matches) > 1:
            raise ValueError(f'Multiple model files match {pattern}: {matches}')
        model_file = matches[0]
        print(model_file)

        prediction = celltypist.annotate(pbmc, model=model_file)

        out_prefix = f'{output_base_path}/{pbmc_sample_id}_{level}'
        # Write out labels
        prediction.predicted_labels.reset_index().to_csv(f'{out_prefix}_predicted_labels.csv')
        # Write out probability matrix
        prediction.probability_matrix.reset_index().to_parquet(f'{out_prefix}_probability_matrix.parquet')
        # Write out decision matrix
        prediction.decision_matrix.reset_index().to_parquet(f'{out_prefix}_decision_matrix.parquet')

In [9]:
def process_file(file_name):
    """Run Scrublet on one sample and save its doublet calls as CSV.

    The output is written to 'Doublet_Scores/<pbmc_sample_id>.csv'. If that
    file already exists and is non-empty, the sample is skipped.

    Parameters
    ----------
    file_name : str or path-like
        Path to the sample's .h5 file.

    Returns
    -------
    None
    """
    result = read_h5_anndata(file_name)

    # .iloc[0] selects the first row by position. Plain [0] would be treated
    # as a label lookup on a non-integer index, which pandas has deprecated.
    sample_id = result.obs['pbmc_sample_id'].iloc[0]
    output_file = 'Doublet_Scores/' + sample_id + '.csv'

    if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
        print(f"File {output_file} already exists and is not empty. Skipping processing.")
        return

    # Create the target directory before the expensive step so that a missing
    # folder does not waste a finished Scrublet run.
    os.makedirs('Doublet_Scores', exist_ok=True)

    sc.external.pp.scrublet(result)
    result.obs[['barcodes', 'predicted_doublet', 'doublet_score']].to_csv(output_file)

# Labels

In [17]:
# Sample-level metadata table; its 'combined_sample_id' column is used to build the input file names
meta_data = pd.read_csv('meta_data_GEO.csv')

In [None]:
# One input .h5 file per combined sample id
file_list = [sample_id + '.h5' for sample_id in meta_data['combined_sample_id']]

# Location of the trained models and destination of the prediction files
model_base_path = '/home/jupyter/BRI_Analysis/scRNA/AIFI_Model_Celltypist_CertPro'
output_base_path = 'Labels_CertPro/'

# Pool.map passes a single argument per task, so the three inputs are bundled into a tuple
args_list = [
    (h5_path, model_base_path, output_base_path)
    for h5_path in file_list
]

# Annotate the samples in parallel worker processes
with Pool(processes=60) as pool:
    pool.map(process_and_annotate_pbmc, args_list)

['L1', 'L2', 'L3']
['L1', 'L2', 'L3']


🔬 Input data has 11559 cells and 36601 genes
🔗 Matching reference genes in the model


['L1', 'L2', 'L3']


🔬 Input data has 9885 cells and 36601 genes
🔗 Matching reference genes in the model
🔬 Input data has 12795 cells and 36601 genes
🔗 Matching reference genes in the model


['L1', 'L2', 'L3']
['L1', 'L2', 'L3']


🔬 Input data has 13723 cells and 36601 genes
🔗 Matching reference genes in the model


['L1', 'L2', 'L3']


🔬 Input data has 13490 cells and 36601 genes
🔗 Matching reference genes in the model


['L1', 'L2', 'L3']


🔬 Input data has 13236 cells and 36601 genes
🔗 Matching reference genes in the model


['L1', 'L2', 'L3']


🔬 Input data has 13729 cells and 36601 genes
🔗 Matching reference genes in the model
🔬 Input data has 15516 cells and 36601 genes
🔗 Matching reference genes in the model


['L1', 'L2', 'L3']


🔬 Input data has 27387 cells and 36601 genes
🔗 Matching reference genes in the model


['L1', 'L2', 'L3']


🔬 Input data has 29859 cells and 36601 genes
🔗 Matching reference genes in the model


['L1', 'L2', 'L3']
['L1', 'L2', 'L3']


🔬 Input data has 30218 cells and 36601 genes
🔗 Matching reference genes in the model


['L1', 'L2', 'L3']


🔬 Input data has 30597 cells and 36601 genes
🔗 Matching reference genes in the model
🔬 Input data has 30987 cells and 36601 genes
🔗 Matching reference genes in the model


['L1', 'L2', 'L3']
['L1', 'L2', 'L3']
['L1', 'L2', 'L3']


🔬 Input data has 33629 cells and 36601 genes
🔗 Matching reference genes in the model
🔬 Input data has 32506 cells and 36601 genes
🔗 Matching reference genes in the model
🔬 Input data has 34052 cells and 36601 genes
🔗 Matching reference genes in the model
🧬 6147 features used for prediction
⚖️ Scaling input data
🧬 6147 features used for prediction
⚖️ Scaling input data
🧬 6147 features used for prediction
⚖️ Scaling input data
🧬 6147 features used for prediction
⚖️ Scaling input data
🧬 6147 features used for prediction
⚖️ Scaling input data
🧬 6147 features used for prediction
🧬 6147 features used for prediction
⚖️ Scaling input data
⚖️ Scaling input data
🧬 6147 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
🖋️ Predicting labels
🧬 6147 features used for prediction
⚖️ Scaling input data
✅ Prediction done!
🖋️ Predicting labels
✅ Prediction done!
🧬 6147 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
🧬 6147 features used for prediction
⚖️ Scal

['L1', 'L2', 'L3']

✅ Prediction done!
🖋️ Predicting labels
⚖️ Scaling input data





🧬 6147 features used for prediction
⚖️ Scaling input data
🔬 Input data has 9885 cells and 36601 genes
🔗 Matching reference genes in the model
🧬 6147 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!


['L1', 'L2', 'L3']


🔬 Input data has 11559 cells and 36601 genes
🔗 Matching reference genes in the model
🖋️ Predicting labels
🧬 6147 features used for prediction
⚖️ Scaling input data
🧬 6147 features used for prediction
✅ Prediction done!
⚖️ Scaling input data
✅ Prediction done!
✅ Prediction done!


['L1', 'L2', 'L3']


🔬 Input data has 12795 cells and 36601 genes
🔗 Matching reference genes in the model
✅ Prediction done!


['L1', 'L2', 'L3']


🔬 Input data has 13490 cells and 36601 genes
🔗 Matching reference genes in the model


['L1', 'L2', 'L3']


🔬 Input data has 13236 cells and 36601 genes
🔗 Matching reference genes in the model


['L1', 'L2', 'L3']
['L1', 'L2', 'L3']


🔬 Input data has 13723 cells and 36601 genes
🔗 Matching reference genes in the model
🔬 Input data has 13729 cells and 36601 genes
🔗 Matching reference genes in the model


['L1', 'L2', 'L3']


🔬 Input data has 15516 cells and 36601 genes
🔗 Matching reference genes in the model
🖋️ Predicting labels


# Doublet Scores

In [2]:
# Reload the sample metadata and derive one input .h5 file name per combined sample id
meta_data = pd.read_csv('meta_data_GEO.csv')
file_list = [sample_id + '.h5' for sample_id in meta_data['combined_sample_id']]


In [12]:
from concurrent.futures import ThreadPoolExecutor

# Score every sample on a pool of 16 worker threads.
# executor.map yields results in input order; they are collected into `results`.
results = []
with ThreadPoolExecutor(max_workers=16) as executor:
    results.extend(executor.map(process_file, file_list))

Automatically set threshold at doublet score = 0.61
Detected doublet rate = 0.2%
Estimated detectable doublet fraction = 17.0%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 1.1%
Automatically set threshold at doublet score = 0.63
Detected doublet rate = 0.2%
Estimated detectable doublet fraction = 14.2%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 1.5%
Automatically set threshold at doublet score = 0.44
Detected doublet rate = 0.7%
Estimated detectable doublet fraction = 44.6%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 1.5%
Automatically set threshold at doublet score = 0.65
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 10.7%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 1.0%
Automatically set threshold at doublet score = 0.54
Detected doublet rate = 0.2%
Estimated detectable doublet fraction = 25.4%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 1.0%
Automatically set threshold at doublet score = 0.64
Detected doub