In [18]:
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import celltypist
import gc
import anndata
from celltypist import models
import h5py
import scipy.sparse as scs
from multiprocessing import Pool

# Fetch CellTypist's built-in models into its local model directory.
# NOTE(review): the annotation function later in this notebook loads model
# files from a custom base path — confirm this download is still required.
models.download_models()

📂 Storing models in /root/.celltypist/data/models
⏩ Skipping [1/44]: Immune_All_Low.pkl (file exists)
⏩ Skipping [2/44]: Immune_All_High.pkl (file exists)
⏩ Skipping [3/44]: Adult_CynomolgusMacaque_Hippocampus.pkl (file exists)
⏩ Skipping [4/44]: Adult_Human_PancreaticIslet.pkl (file exists)
⏩ Skipping [5/44]: Adult_Human_Skin.pkl (file exists)
⏩ Skipping [6/44]: Adult_Mouse_Gut.pkl (file exists)
⏩ Skipping [7/44]: Adult_Mouse_OlfactoryBulb.pkl (file exists)
⏩ Skipping [8/44]: Adult_Pig_Hippocampus.pkl (file exists)
⏩ Skipping [9/44]: Adult_RhesusMacaque_Hippocampus.pkl (file exists)
⏩ Skipping [10/44]: Autopsy_COVID19_Lung.pkl (file exists)
⏩ Skipping [11/44]: COVID19_HumanChallenge_Blood.pkl (file exists)
⏩ Skipping [12/44]: COVID19_Immune_Landscape.pkl (file exists)
⏩ Skipping [13/44]: Cells_Fetal_Lung.pkl (file exists)
⏩ Skipping [14/44]: Cells_Intestinal_Tract.pkl (file exists)
⏩ Skipping [15/44]: Cells_Lung_Airway.pkl (file exists)
⏩ Skipping [16/44]: Developing_Human_Brain.pkl (

In [19]:
def read_mat(h5_con):
    """Build the sparse matrix stored under the ``matrix`` group.

    Parameters
    ----------
    h5_con : mapping-like
        Open HDF5 handle (or an equivalent nested mapping) exposing
        ``matrix/data``, ``matrix/indices``, ``matrix/indptr`` and
        ``matrix/shape``.

    Returns
    -------
    scipy.sparse.csc_matrix
        Matrix whose dimensions are taken from ``matrix/shape``.
    """
    matrix_group = h5_con['matrix']
    counts = matrix_group['data'][:]        # stored (non-zero) values
    row_idx = matrix_group['indices'][:]    # row index of every stored value
    col_ptr = matrix_group['indptr'][:]     # offsets where each column starts
    dims = tuple(matrix_group['shape'][:])  # matrix dimensions
    return scs.csc_matrix((counts, row_idx, col_ptr), shape=dims)

# define a function to read the observations (i.e. cell metadata)

def read_obs(h5con):
    """Read the per-cell metadata stored under ``matrix`` into a DataFrame.

    Parameters
    ----------
    h5con : mapping-like
        Open HDF5 handle (or an equivalent nested mapping) exposing
        ``matrix/barcodes`` (byte strings) and the group
        ``matrix/observations`` with one dataset per metadata column.

    Returns
    -------
    pandas.DataFrame
        One row per barcode. The first column is ``barcodes``; every dataset
        in ``matrix/observations`` follows as its own column, with byte
        strings decoded as UTF-8.
    """
    matrix_group = h5con['matrix']
    barcodes = [bc.decode('UTF-8') for bc in matrix_group['barcodes'][:]]

    # Start the DataFrame with the cell barcodes
    obs_df = pd.DataFrame({'barcodes': barcodes})

    observations = matrix_group['observations']
    for col in observations.keys():
        values = observations[col][:]
        # Only the first element is inspected to detect byte storage. The
        # length guard keeps a zero-length column (a file with no cells)
        # from raising IndexError on values[0].
        if len(values) > 0 and isinstance(values[0], (bytes, bytearray)):
            values = [x.decode('UTF-8') for x in values]
        obs_df[col] = values

    return obs_df
# define a function to construct an AnnData object from an h5 file
def read_h5_anndata(h5_file):
    """Load one h5 file into an in-memory AnnData object.

    Parameters
    ----------
    h5_file : str or path-like
        Path to an HDF5 file containing a ``matrix`` group with the sparse
        matrix, ``features/name``, ``barcodes`` and ``observations``.

    Returns
    -------
    anndata.AnnData
        ``X`` is the transposed stored matrix, ``obs`` is the metadata table
        from ``read_obs`` and ``var_names`` are the (uniquified) gene names.
    """
    # Every dataset is copied into memory with ``[:]`` inside the ``with``
    # block, so the file handle can be released before AnnData is assembled
    # (the previous version never closed it).
    with h5py.File(h5_file, mode='r') as h5_con:
        # extract the expression matrix
        mat = read_mat(h5_con)
        # extract gene names
        genes = [x.decode('UTF-8') for x in h5_con['matrix']['features']['name'][:]]
        # extract metadata
        obs_df = read_obs(h5_con)

    # The stored matrix is transposed so that its rows line up with obs_df
    # and its columns with the gene names assigned below.
    adata = anndata.AnnData(mat.T, obs=obs_df)
    adata.var_names = genes
    # Duplicate gene names would otherwise make var_names ambiguous.
    adata.var_names_make_unique()
    return adata

In [20]:
# Load the sample metadata table; its 'file.path' column is read later in the
# notebook to build the list of input files.
meta_data=pd.read_csv('hise_meta_data_2024-01-23_fixed.csv')

In [21]:
def process_and_annotate_pbmc(args):
    """Annotate one sample with every AIFI CellTypist model and save results.

    Parameters
    ----------
    args : tuple
        ``(data_file, model_base_path, output_base_path)`` packed into one
        tuple so the function can be used with ``Pool.map``:

        * ``data_file`` - h5 file read with ``read_h5_anndata``.
        * ``model_base_path`` - directory containing the
          ``model_AIFI_<level>_<type>.pkl`` files.
        * ``output_base_path`` - directory the result files are written to;
          it is created if it does not exist.

    Returns
    -------
    None
        For each level, three files are written to ``output_base_path``:
        ``<sample>_<level>_predicted_labels.csv``,
        ``<sample>_<level>_probability_matrix.parquet`` and
        ``<sample>_<level>_decision_matrix.parquet``.
    """
    import os  # local import: the notebook's import cell does not provide os

    # Read data
    data_file, model_base_path, output_base_path = args
    pbmc = read_h5_anndata(data_file)

    # The first sample id found in the metadata names the output files.
    pbmc_sample_id = pbmc.obs['pbmc_sample_id'].unique().tolist()[0]
    pbmc.obs.index = pbmc.obs['barcodes']

    # Normalization and log transformation
    sc.pp.normalize_total(pbmc, target_sum=1e4)
    sc.pp.log1p(pbmc)

    # Make sure the destination exists; exist_ok keeps this safe when many
    # worker processes reach this point at the same time.
    if output_base_path:
        os.makedirs(output_base_path, exist_ok=True)

    # Annotation level -> classifier type encoded in the model file name.
    # (Named so that it no longer shadows the imported ``models`` module.)
    level_to_model_type = {
        'L1': 'ovr',
        'L2': 'ovr',
        'L3': 'multinomial',
        'L3.5': 'multinomial',
    }

    for level, model_type in level_to_model_type.items():
        model_file = f'{model_base_path}/model_AIFI_{level}_{model_type}.pkl'
        # Keep only the current result so earlier ones can be freed.
        prediction = celltypist.annotate(pbmc, model=model_file)

        out_prefix = os.path.join(output_base_path, f'{pbmc_sample_id}_{level}')
        # Write out labels
        prediction.predicted_labels.reset_index().to_csv(f'{out_prefix}_predicted_labels.csv')
        # Write out probability matrix
        prediction.probability_matrix.reset_index().to_parquet(f'{out_prefix}_probability_matrix.parquet')
        # Write out decision matrix
        prediction.decision_matrix.reset_index().to_parquet(f'{out_prefix}_decision_matrix.parquet')

In [22]:
# Input file paths, taken from the 'file.path' column of `meta_data`.
file_list = meta_data['file.path']
# Directory holding the model files, and the directory the outputs go to.
model_base_path = '/home/jupyter/AIFI_Reference_Model_Celltypist/Models'
output_base_path = 'Labels/'

# Upper bound on the number of worker processes.
MAX_PROCESSES = 60

# Pool.map passes a single argument per call, so the three arguments of
# process_and_annotate_pbmc are bundled into one tuple per input file.
args_list = [(file, model_base_path, output_base_path) for file in file_list]

# Never start more workers than there are files to process; keep at least one
# because Pool rejects processes=0.
n_processes = max(1, min(MAX_PROCESSES, len(args_list)))

with Pool(processes=n_processes) as pool:
    pool.map(process_and_annotate_pbmc, args_list)

🔬 Input data has 1137 cells and 33538 genes
🔗 Matching reference genes in the model
🔬 Input data has 14418 cells and 33538 genes
🔗 Matching reference genes in the model
🔬 Input data has 12837 cells and 33538 genes
🔗 Matching reference genes in the model
🔬 Input data has 19537 cells and 33538 genes
🔬 Input data has 18094 cells and 33538 genes
🔗 Matching reference genes in the model
🔗 Matching reference genes in the model
🔬 Input data has 14433 cells and 33538 genes
🔗 Matching reference genes in the model
🧬 1077 features used for prediction
⚖️ Scaling input data
🔬 Input data has 21940 cells and 33538 genes
🔗 Matching reference genes in the model
🖋️ Predicting labels
✅ Prediction done!
🔬 Input data has 16266 cells and 33538 genes
🔗 Matching reference genes in the model
🔬 Input data has 19585 cells and 33538 genes
🔗 Matching reference genes in the model
🔬 Input data has 20013 cells and 33538 genes
🔗 Matching reference genes in the model
🔬 Input data has 18758 cells and 33538 genes
🔗 Matchi