In [34]:
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import celltypist
import gc
import anndata
from celltypist import models
import h5py
import scipy.sparse as scs
from multiprocessing import Pool

# Fetch the CellTypist model collection into the local model directory;
# per the output below, files that already exist there are skipped.
# NOTE(review): the annotation cells visible in this notebook load models from
# explicit .pkl paths, so this download may not be required - confirm.
models.download_models()

📂 Storing models in /root/.celltypist/data/models
⏩ Skipping [1/44]: Immune_All_Low.pkl (file exists)
⏩ Skipping [2/44]: Immune_All_High.pkl (file exists)
⏩ Skipping [3/44]: Adult_CynomolgusMacaque_Hippocampus.pkl (file exists)
⏩ Skipping [4/44]: Adult_Human_PancreaticIslet.pkl (file exists)
⏩ Skipping [5/44]: Adult_Human_Skin.pkl (file exists)
⏩ Skipping [6/44]: Adult_Mouse_Gut.pkl (file exists)
⏩ Skipping [7/44]: Adult_Mouse_OlfactoryBulb.pkl (file exists)
⏩ Skipping [8/44]: Adult_Pig_Hippocampus.pkl (file exists)
⏩ Skipping [9/44]: Adult_RhesusMacaque_Hippocampus.pkl (file exists)
⏩ Skipping [10/44]: Autopsy_COVID19_Lung.pkl (file exists)
⏩ Skipping [11/44]: COVID19_HumanChallenge_Blood.pkl (file exists)
⏩ Skipping [12/44]: COVID19_Immune_Landscape.pkl (file exists)
⏩ Skipping [13/44]: Cells_Fetal_Lung.pkl (file exists)
⏩ Skipping [14/44]: Cells_Intestinal_Tract.pkl (file exists)
⏩ Skipping [15/44]: Cells_Lung_Airway.pkl (file exists)
⏩ Skipping [16/44]: Developing_Human_Brain.pkl (

In [3]:
def read_mat(h5_con):
    """Build a SciPy CSC sparse matrix from the 'matrix' group of `h5_con`.

    Parameters
    ----------
    h5_con : mapping-like (e.g. an open h5py.File)
        Must expose h5_con['matrix'] with the entries 'data', 'indices',
        'indptr' and 'shape', each of which is read in full with `[:]`.

    Returns
    -------
    scipy.sparse.csc_matrix
        Matrix assembled from the three CSC component arrays, with the
        dimensions stored under 'shape'.
    """
    matrix_group = h5_con['matrix']

    count_values = matrix_group['data'][:]      # count values
    row_indices = matrix_group['indices'][:]    # row indices
    column_pointers = matrix_group['indptr'][:]  # pointers for column positions
    dimensions = tuple(matrix_group['shape'][:])  # matrix dimensions

    return scs.csc_matrix(
        (count_values, row_indices, column_pointers),
        shape=dimensions,
    )

# Define a function to read the observations (i.e. per-cell metadata)

def read_obs(h5con):
    """Read cell barcodes and per-cell metadata columns into a DataFrame.

    Parameters
    ----------
    h5con : mapping-like (e.g. an open h5py.File)
        Must expose h5con['matrix']['barcodes'] (byte strings) and
        h5con['matrix']['observations'], a group whose keys are the
        metadata column names.

    Returns
    -------
    pandas.DataFrame
        One row per barcode. The first column is 'barcodes'; one further
        column is added for every key under 'observations'. Byte-string
        values are decoded as UTF-8, other values are stored as read.
    """
    matrix_group = h5con['matrix']

    # Barcodes are stored as bytes; decode them to str.
    bc = [x.decode('UTF-8') for x in matrix_group['barcodes'][:]]

    # Initialize the DataFrame with cell barcodes
    obs_df = pd.DataFrame({'barcodes': bc})

    observations = matrix_group['observations']

    # For each available metadata column
    for col in observations.keys():
        # Read the values
        values = observations[col][:]
        # Check for byte storage. The length guard keeps an empty column
        # (a file with zero cells) from raising IndexError on values[0].
        if len(values) > 0 and isinstance(values[0], (bytes, bytearray)):
            # Decode byte strings
            values = [x.decode('UTF-8') for x in values]
        # Add column to the DataFrame
        obs_df[col] = values

    return obs_df
# define a function to construct anndata object from a h5 file
def read_h5_anndata(h5_file):
    """Construct an AnnData object from an HDF5 file.

    Parameters
    ----------
    h5_file : str or path-like
        Path of the HDF5 file to read. It is opened read-only.

    Returns
    -------
    anndata.AnnData
        X is the transpose of the matrix returned by `read_mat`, `obs` is
        the DataFrame returned by `read_obs`, and `var_names` are the
        entries of 'matrix/features/name', made unique.
    """
    # The context manager closes the file handle even if reading fails.
    # Everything needed is copied into memory (`[:]`) before the file closes.
    with h5py.File(h5_file, mode='r') as h5_con:
        # extract the expression matrix
        mat = read_mat(h5_con)
        # extract gene names
        genes = [x.decode('UTF-8') for x in h5_con['matrix']['features']['name'][:]]
        # extract metadata
        obs_df = read_obs(h5_con)

    # construct anndata; the stored matrix is transposed before use
    adata = anndata.AnnData(mat.T,
                            obs=obs_df)
    # make sure the gene names are aligned with the columns of X
    adata.var_names = genes

    adata.var_names_make_unique()
    return adata

In [None]:
# Load the metadata table from the working directory.
# NOTE(review): `meta_data` is not referenced by the cells visible here - confirm it is still needed.
meta_data = pd.read_csv('meta_data.csv')

In [26]:

# Load one labeled sample into an AnnData object.
pbmc = read_h5_anndata('cache/db99298a-23b2-488b-ae6c-4da161e1fca6/B188-P2_PB00278-05_2023-12-06T00:53:51.440865054Z_labeled.h5')

# Processing testing data: take the first sample id found and index cells by barcode.
pbmc_sample_id = pbmc.obs['pbmc_sample_id'].unique().tolist()[0]
pbmc.obs.index = pbmc.obs['barcodes']

# Normalization and log transformation (both modify `pbmc` in place).
sc.pp.normalize_total(pbmc, target_sum=1e4)
sc.pp.log1p(pbmc)

# Annotate with each reference model level.
_model_dir = '/home/jupyter/AIFI_Reference_Model_Celltypist/Models'
predictions_L1 = celltypist.annotate(pbmc, model=f'{_model_dir}/model_AIFI_L1_ovr.pkl')
predictions_L2 = celltypist.annotate(pbmc, model=f'{_model_dir}/model_AIFI_L2_ovr.pkl')
predictions_L3 = celltypist.annotate(pbmc, model=f'{_model_dir}/model_AIFI_L3_multinomial.pkl')
predictions_L3_5 = celltypist.annotate(pbmc, model=f'{_model_dir}/model_AIFI_L3.5_multinomial.pkl')

# Pair each result with the level tag used in its output file names.
_results_by_level = (
    ('L1', predictions_L1),
    ('L2', predictions_L2),
    ('L3', predictions_L3),
    ('L3.5', predictions_L3_5),
)

# write out labels
for _level, _result in _results_by_level:
    _result.predicted_labels.reset_index().to_csv(f'Labels/{pbmc_sample_id}_{_level}_predicted_labels.csv')

# write out prob matrix
for _level, _result in _results_by_level:
    _result.probability_matrix.reset_index().to_parquet(f'Labels/{pbmc_sample_id}_{_level}_probability_matrix.parquet')

# write out decision matrix
for _level, _result in _results_by_level:
    _result.decision_matrix.reset_index().to_parquet(f'Labels/{pbmc_sample_id}_{_level}_decision_matrix.parquet')


🔬 Input data has 11372 cells and 33538 genes
🔗 Matching reference genes in the model
🧬 1077 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
🔬 Input data has 11372 cells and 33538 genes
🔗 Matching reference genes in the model
🧬 1912 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
🔬 Input data has 11372 cells and 33538 genes
🔗 Matching reference genes in the model
🧬 2511 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
🔬 Input data has 11372 cells and 33538 genes
🔗 Matching reference genes in the model
🧬 2518 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!


In [32]:
def process_and_annotate_pbmc(args, model_base_path=None, output_base_path=None):
    """Annotate one sample with every AIFI reference model and write the results.

    The function accepts two calling conventions:

    * ``process_and_annotate_pbmc((data_file, model_base_path, output_base_path))``
      - a single tuple, the form ``Pool.map`` supplies;
    * ``process_and_annotate_pbmc(data_file, model_base_path, output_base_path)``
      - three separate arguments, for direct calls.

    Parameters
    ----------
    args : tuple or str
        Either the 3-tuple described above, or the path of the input HDF5
        file when the two other arguments are given.
    model_base_path : str, optional
        Directory holding the ``model_AIFI_<level>_<type>.pkl`` files.
    output_base_path : str, optional
        Directory the label CSV and the probability/decision parquet files
        are written to.

    Raises
    ------
    TypeError
        If exactly one of `model_base_path` / `output_base_path` is given.

    Returns
    -------
    None
        Results are written to disk only.
    """
    if model_base_path is None and output_base_path is None:
        # Tuple form (e.g. from Pool.map).
        data_file, model_base_path, output_base_path = args
    elif model_base_path is None or output_base_path is None:
        raise TypeError(
            'process_and_annotate_pbmc() needs either one '
            '(data_file, model_base_path, output_base_path) tuple or all '
            'three values as separate arguments'
        )
    else:
        # Three-argument form.
        data_file = args

    # Read data
    pbmc = read_h5_anndata(data_file)

    # Processing testing data
    pbmc_sample_id = pbmc.obs['pbmc_sample_id'].unique().tolist()[0]
    pbmc.obs.index = pbmc.obs['barcodes']

    # Normalization and log transformation
    sc.pp.normalize_total(pbmc, target_sum=1e4)
    sc.pp.log1p(pbmc)

    # Annotation level -> model type suffix used in the model file name.
    # Named `model_types` so the imported `celltypist.models` module is not shadowed.
    model_types = {'L1': 'ovr', 'L2': 'ovr', 'L3': 'multinomial', 'L3.5': 'multinomial'}

    for level, model_type in model_types.items():
        model_file = f'{model_base_path}/model_AIFI_{level}_{model_type}.pkl'
        prediction = celltypist.annotate(pbmc, model=model_file)

        output_prefix = f'{output_base_path}/{pbmc_sample_id}_{level}'
        # Write out labels
        prediction.predicted_labels.reset_index().to_csv(f'{output_prefix}_predicted_labels.csv')
        # Write out probability matrix
        prediction.probability_matrix.reset_index().to_parquet(f'{output_prefix}_probability_matrix.parquet')
        # Write out decision matrix
        prediction.decision_matrix.reset_index().to_parquet(f'{output_prefix}_decision_matrix.parquet')

In [30]:
# Input sample to annotate.
data_file = 'cache/db99298a-23b2-488b-ae6c-4da161e1fca6/B188-P2_PB00278-05_2023-12-06T00:53:51.440865054Z_labeled.h5'
# Directory containing the AIFI reference model .pkl files.
model_base_path = '/home/jupyter/AIFI_Reference_Model_Celltypist/Models'
# Directory that receives the label and matrix outputs.
output_base_path = 'Labels/'

In [38]:
# Settings shared by every worker.
model_base_path = '/home/jupyter/AIFI_Reference_Model_Celltypist/Models'
output_base_path = 'Labels/'

# Files to annotate; each one becomes a (file, model dir, output dir) tuple.
file_list = [data_file]
args_list = [
    (h5_path, model_base_path, output_base_path)
    for h5_path in file_list
]

# One worker process per input file.
# NOTE(review): the worker count grows with the file list and is 0 for an empty list - consider capping it.
with Pool(len(file_list)) as pool:
    pool.map(process_and_annotate_pbmc, args_list)

TypeError: process_and_annotate_pbmc() missing 2 required positional arguments: 'model_base_path' and 'output_base_path'

In [33]:
# Run the annotation for a single file in the current process.
# The three inputs are passed as one tuple, the form the function unpacks
# and the same form Pool.map hands to it.
process_and_annotate_pbmc((data_file, model_base_path, output_base_path))

🔬 Input data has 11372 cells and 33538 genes
🔗 Matching reference genes in the model
🧬 1077 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
🔬 Input data has 11372 cells and 33538 genes
🔗 Matching reference genes in the model
🧬 1912 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
🔬 Input data has 11372 cells and 33538 genes
🔗 Matching reference genes in the model
🧬 2511 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
🔬 Input data has 11372 cells and 33538 genes
🔗 Matching reference genes in the model
🧬 2518 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
