# Label cell types using CellTypist Models

To build our reference, we would like to start with labels that originate from published cell type references. 

One of the approaches for this cell type labeling is CellTypist, a model-based approach to cell type labeling.  

CellTypist is described [on their website](https://www.celltypist.org/), and in this publication:  

Domínguez Conde, C. et al. Cross-tissue immune cell analysis reveals tissue-specific features in humans. Science 376, eabl5197 (2022)

Here, we'll load in our cells individually, and assign labels based on our 3-level annotated PBMC reference:  

- AIFI_L1:
    - 9 types
- AIFI_L2:  
    - 29 types
- AIFI_L3:
    - 71 types

## Load Packages

`anndata`: Data structures for scRNA-seq  
`celltypist`: Model-based cell type annotation  
`concurrent.futures`: parallelization methods  
`datetime`: date and time functions  
`h5py`: HDF5 file I/O  
`hisepy`: The HISE SDK for Python  
`numpy`: Mathematical data structures and computation  
`os`: operating system calls  
`pandas`: DataFrame data structures  
`re`: Regular expressions  
`scanpy`: scRNA-seq analysis  
`scipy.sparse`: Sparse matrix data structures  
`shutil`: Shell utilities

In [1]:
import anndata
import celltypist
from celltypist import models
import concurrent.futures
from datetime import date
import h5py
import hisepy
import numpy as np
import os
import pandas as pd 
import re
import scanpy as sc
import scipy.sparse as scs
import shutil

Download a single model up front to prevent CellTypist from loading all models on each core

In [2]:
# Request a single CellTypist model (Immune_All_High.pkl) by name rather
# than the full model list.
# NOTE(review): force_update = True presumably re-fetches the model on every
# run -- confirm against the CellTypist documentation.
models.download_models(
    force_update = True,
    model = ['Immune_All_High.pkl']
)

📜 Retrieving model list from server https://celltypist.cog.sanger.ac.uk/models/models.json
📚 Total models in list: 44
📂 Storing models in /root/.celltypist/data/models
💾 Total models to download: 1
💾 Downloading model [1/1]: Immune_All_High.pkl


## Helper functions

This function allows easy reading of .csv files stored in HISE

In [3]:
def read_csv_uuid(csv_uuid):
    """Read a .csv file stored in HISE into a DataFrame.

    The local copy of the file is located with read_path_uuid(), which
    asks hisepy to cache the file when its cache directory does not exist.

    Parameters
    ----------
    csv_uuid : str
        HISE UUID of the .csv file.

    Returns
    -------
    pandas.DataFrame
        The file contents, with the first column used as the index.
    """
    # Reuse the shared cache lookup instead of duplicating it here
    csv_file = read_path_uuid(csv_uuid)
    return pd.read_csv(csv_file, index_col = 0)

This function allows easy identification of the cached file path for files retrieved from HISE

In [4]:
def read_path_uuid(file_uuid):
    """Return the path of the cached copy of a file retrieved from HISE.

    Parameters
    ----------
    file_uuid : str
        HISE UUID of the file.

    Returns
    -------
    str
        Path of the first entry found in the file's cache directory.
    """
    cache_dir = f'/home/jupyter/cache/{file_uuid}'

    # Only ask HISE to cache the file when its cache directory is missing
    if not os.path.isdir(cache_dir):
        hisepy.reader.cache_files([file_uuid])

    # The first directory entry is taken to be the cached file
    first_entry = os.listdir(cache_dir)[0]
    return f'{cache_dir}/{first_entry}'

These functions will retrieve data for a sample, assemble an AnnData object, perform normalization and log transformation, then generate predictions for each of the 3 models retrieved.

In [5]:
# define a function to read count data
def read_mat(h5_con):
    """Build a sparse CSC matrix from the 'matrix' group of an HDF5 handle.

    Parameters
    ----------
    h5_con : mapping
        Open HDF5 connection (or equivalent nested mapping) that provides
        'data', 'indices', 'indptr' and 'shape' under the 'matrix' key.

    Returns
    -------
    scipy.sparse.csc_matrix
        Matrix with the dimensions stored in 'shape'.
    """
    group = h5_con['matrix']

    counts = group['data'][:]         # Count values
    row_idx = group['indices'][:]     # Row indices
    col_ptr = group['indptr'][:]      # Pointers for column positions
    dims = tuple(group['shape'][:])   # Matrix dimensions

    return scs.csc_matrix((counts, row_idx, col_ptr), shape = dims)

# define a function to read observation metadata (i.e. cell metadata)
def read_obs(h5con):
    """Assemble the per-cell metadata table from an HDF5 handle.

    Parameters
    ----------
    h5con : mapping
        Open HDF5 connection (or equivalent nested mapping) that provides
        'barcodes' and an 'observations' group under the 'matrix' key.

    Returns
    -------
    pandas.DataFrame
        One row per barcode and one column per entry in 'observations',
        indexed by barcode. The 'barcodes' column is also kept as a column.
    """
    matrix_group = h5con['matrix']

    # Initialize the DataFrame with the decoded cell barcodes
    barcodes = [bc.decode('UTF-8') for bc in matrix_group['barcodes'][:]]
    obs_df = pd.DataFrame({ 'barcodes' : barcodes })

    observations = matrix_group['observations']

    # For each available metadata column
    for col in observations.keys():
        # Read the values
        values = observations[col][:]
        # Check for byte storage. The length check avoids indexing into a
        # column with no entries (a file with zero cells).
        if len(values) > 0 and isinstance(values[0], (bytes, bytearray)):
            # Decode byte strings
            values = [x.decode('UTF-8') for x in values]
        # Add column to the DataFrame
        obs_df[col] = values

    return obs_df.set_index('barcodes', drop = False)

# define a function to construct anndata object from a h5 file
def read_h5_anndata(h5_con):
    """Assemble an AnnData object from an open HDF5 connection.

    Parameters
    ----------
    h5_con : mapping
        Open HDF5 connection that is read by read_mat() and read_obs(), and
        that provides feature names at ['matrix']['features']['name'].

    Returns
    -------
    anndata.AnnData
        Object built from the transposed count matrix, with the cell
        metadata as `obs` and de-duplicated gene names as `var_names`.
    """
    # Expression matrix as stored in the file
    counts = read_mat(h5_con)

    # Gene names are stored as bytes and need decoding
    raw_names = h5_con['matrix']['features']['name'][:]
    gene_names = [name.decode('UTF-8') for name in raw_names]

    # Cell metadata
    cell_meta = read_obs(h5_con)

    # The matrix is transposed so its rows line up with the obs table
    # (presumably stored as genes x cells -- TODO confirm)
    adata = anndata.AnnData(counts.T, obs = cell_meta)

    # Attach gene names, then resolve any duplicated names
    adata.var_names = gene_names
    adata.var_names_make_unique()

    return adata

In [6]:
def get_adata(uuid):
    """Read a HISE .h5 file into an AnnData object.

    Parameters
    ----------
    uuid : str
        HISE UUID of the .h5 file.

    Returns
    -------
    anndata.AnnData
        Object assembled by read_h5_anndata().

    Raises
    ------
    Exception
        With the message reported by HISE when read_files() returns a list
        instead of a dictionary.
    """
    # Load the file using HISE
    res = hisepy.reader.read_files([uuid])

    # If there's an error, read_files returns a list instead of a dictionary.
    # We should raise an exception with the message when this happens.
    if isinstance(res, list):
        raise Exception(res[0]['message'])

    # Read the file to adata
    h5_con = res['values'][0]
    try:
        adata = read_h5_anndata(h5_con)
    finally:
        # Close the file whether or not reading succeeded, so a malformed
        # file does not leave the handle open
        h5_con.close()

    return adata

In [7]:
def run_prediction(adata, model, model_name, out_dir = "output"):
    """Annotate cells with one CellTypist model and write the results.

    Three files are written to `{out_dir}/{model_name}/`, each prefixed with
    the sample ID and model name: the probability matrix (.parquet), the
    decision matrix (.parquet), and the predicted labels (.csv). The label
    table gains a `{model_name}_score` column holding the probability of
    each cell's predicted label.

    Parameters
    ----------
    adata : anndata.AnnData
        Cells to annotate; `obs` must contain a 'pbmc_sample_id' column.
    model : str
        Model passed to `celltypist.annotate()`.
    model_name : str
        Name used for the output sub-directory, file names and label column.
    out_dir : str
        Parent output directory.
    """
    # Perform prediction
    predictions = celltypist.annotate(
        adata,
        model = model,
        majority_voting = True)

    # Make output directories. exist_ok avoids a FileExistsError when
    # several worker processes try to create the same directory at once.
    model_dir = "{d}/{m}".format(d = out_dir, m = model_name)
    os.makedirs(model_dir, exist_ok = True)

    # Write output
    sample_id = adata.obs['pbmc_sample_id'].unique()[0]

    prob_file = "{d}/{s}_{m}_probability_mat.parquet".format(d = model_dir, s = sample_id, m = model_name)
    prob = predictions.probability_matrix
    prob.to_parquet(prob_file)

    dec_file = "{d}/{s}_{m}_decision_mat.parquet".format(d = model_dir, s = sample_id, m = model_name)
    predictions.decision_matrix.to_parquet(dec_file)

    label_file = "{d}/{s}_{m}_labels.csv".format(d = model_dir, s = sample_id, m = model_name)
    labels = predictions.predicted_labels
    labels = labels.rename({'predicted_labels': model_name}, axis = 1)

    # Look up the probability of each cell's predicted label. Pairing index
    # and label directly avoids rebuilding the index list per cell and does
    # not rely on positional indexing of a label-indexed Series.
    labels['{m}_score'.format(m = model_name)] = [
        prob.loc[barcode, label]
        for barcode, label in zip(labels.index, labels[model_name])
    ]
    labels.to_csv(label_file)
    
    
def process_data(file_uuid):
    """Load one HISE .h5 file, normalize it, and run every model on it.

    Reads the module-level `model_paths` dictionary to find the models, and
    writes all results under the "output" directory via run_prediction().

    Parameters
    ----------
    file_uuid : str
        HISE UUID of the .h5 file to process.
    """
    out_dir = "output"

    # Load cells from HISE .h5 files
    adata = get_adata(file_uuid)

    # Normalize each cell to a total of 10,000, then log-transform
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    # Use the barcodes column as the obs index
    adata.obs.index = adata.obs['barcodes']

    # Predict cell types with each available model
    for model_name in model_paths:
        run_prediction(adata, model_paths[model_name], model_name, out_dir)

    del adata

## Obtain CellTypist Models

In [8]:
# HISE file UUIDs of the CellTypist model files, keyed by label level
model_uuids = {
    'AIFI_L1': '54d9399c-3bfd-4619-a32d-8c7f3c0911f6',
    'AIFI_L2': 'e2898db9-e121-4263-b0c6-a19eadd4217a',
    'AIFI_L3': 'd18fe8b3-b8e7-4b25-9966-0c05a1ba9d7a'
}
# HISE file UUIDs of the matching feature tables (read as .csv files),
# keyed by the same label levels
model_feature_uuids = {
    'AIFI_L1': 'd5c0707a-c566-4ddf-8c86-f9b223f9364f',
    'AIFI_L2': '9e684069-fbf5-4f95-8ae5-89d87918e571',
    'AIFI_L3': 'dd8fa995-a3ce-419f-844a-279cf69a4629'
}

In [9]:
# Resolve each model UUID to the path of its cached file
model_paths = {
    name: read_path_uuid(uuid)
    for name, uuid in model_uuids.items()
}

In [10]:
model_paths

{'AIFI_L1': '/home/jupyter/cache/54d9399c-3bfd-4619-a32d-8c7f3c0911f6/ref_pbmc_clean_celltypist_model_AIFI_L1_2024-03-09.pkl',
 'AIFI_L2': '/home/jupyter/cache/e2898db9-e121-4263-b0c6-a19eadd4217a/ref_pbmc_clean_celltypist_model_AIFI_L2_2024-03-10.pkl',
 'AIFI_L3': '/home/jupyter/cache/d18fe8b3-b8e7-4b25-9966-0c05a1ba9d7a/ref_pbmc_clean_celltypist_model_AIFI_L3_2024-03-11.pkl'}

In [11]:
# Read the feature table of each model into a DataFrame
model_features = {
    name: read_csv_uuid(uuid)
    for name, uuid in model_feature_uuids.items()
}

In [12]:
model_features['AIFI_L1'].head()

Unnamed: 0,gene
0,HES4
1,TTLL10
2,TNFRSF18
3,TNFRSF4
4,RPL22


## Read sample metadata from HISE

In [13]:
# HISE UUID of the sample metadata file
sample_meta_file_uuid = 'd82c5c42-ae5f-4e67-956e-cd3b7bf88105'
# Read the sample metadata file through hisepy
file_query = hisepy.reader.read_files(
    [sample_meta_file_uuid]
)

In [14]:
meta_data = file_query['values']

In [15]:
meta_data.shape

(868, 33)

## Apply across files

Here, we'll use `concurrent.futures` to apply the function above to our files in parallel.

In [16]:
# Create the output directory if it is not already there
out_dir = 'output'
os.makedirs(out_dir, exist_ok = True)

In [17]:
file_uuids = meta_data['file.id'].to_list()

In [18]:
# Process each subset in parallel
pool_executor = concurrent.futures.ProcessPoolExecutor(max_workers = 60)
with pool_executor as executor:

    # Map each future back to its file UUID so that a failure can be traced
    # to the file that caused it
    futures = {
        executor.submit(process_data, file_uuid): file_uuid
        for file_uuid in file_uuids
    }

    # Check for errors when parallel processes return results
    for future in concurrent.futures.as_completed(futures):
        try:
            future.result()
        except Exception as e:
            # Report and continue so one bad file does not stop the others
            print(f'Error processing {futures[future]}: {e}')

🔬 Input data has 19186 cells and 33538 genes
🔗 Matching reference genes in the model
🔬 Input data has 12699 cells and 33538 genes
🔗 Matching reference genes in the model
🧬 1102 features used for prediction
⚖️ Scaling input data
🧬 1102 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering
🔬 Input data has 14433 cells and 33538 genes
🔗 Matching reference genes in the model
🔬 Input data has 15039 cells and 33538 genes
🔗 Matching reference genes in the model
🧬 1102 features used for prediction
⚖️ Scaling input data
🔬 Input data has 15160 cells and 33538 genes
🔗 Matching reference genes in the model
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering
🔬 Input data has 24003 cel

## Assemble results

For each model, we'll assemble the results as a .csv file that we can utilize later for subclustering and analysis of major cell classes.

In [11]:
# NOTE(review): these names do not match the keys of model_uuids (AIFI_L1,
# AIFI_L2, AIFI_L3) that run_prediction() uses as output sub-directory names
# -- confirm which model set this cell is meant to assemble. This assignment
# also rebinds `models`, hiding the `celltypist.models` module imported at
# the top of the notebook.
models = ['High', 'Low', 'Covid_Healthy']

In [12]:
# Combine the per-sample label tables of each model into one .csv per model
out_files = []
for model in models:
    model_path = 'output/{m}'.format(m = model)
    # Keep only .csv files: the prediction step also writes .parquet
    # matrices into this directory, which pd.read_csv() cannot parse.
    # Sorting makes the row order of the combined table reproducible.
    model_files = sorted(
        f for f in os.listdir(model_path) if f.endswith('.csv')
    )
    model_list = []
    for model_file in model_files:
        df = pd.read_csv('output/{m}/{f}'.format(m = model, f = model_file))
        model_list.append(df)
    model_df = pd.concat(model_list)

    out_file = 'output/ref_celltypist_labels_{m}_{d}.csv'.format(m = model, d = date.today())
    out_files.append(out_file)
    
    model_df.to_csv(out_file)

## Upload assembled data to HISE

Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE to use for downstream analysis steps.

In [14]:
study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'
title = 'Ref. CellTypist Predictions {d}'.format(d = date.today())

In [15]:
in_files = [sample_meta_file_uuid] + meta_data['file.id'].to_list()

In [16]:
in_files[0:5]

['2da66a1a-17cc-498b-9129-6858cf639caf',
 'fec489f9-9a74-4635-aa91-d2bf09d1faec',
 '7c0c7979-eebd-4aba-b5b2-6e76b4643623',
 '40efd03a-cb2f-4677-af42-a056cbfe5a17',
 '68fbcd34-1d63-461d-8195-df5b8dc61b31']

In [17]:
out_files

['output/ref_celltypist_labels_High_2024-02-18.csv',
 'output/ref_celltypist_labels_Low_2024-02-18.csv',
 'output/ref_celltypist_labels_Covid_Healthy_2024-02-18.csv']

In [18]:
# Upload the assembled label files to the study space, passing the UUIDs in
# `in_files` as the input files for this upload
hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files
)

you are trying to upload file_ids... ['output/ref_celltypist_labels_High_2024-02-18.csv', 'output/ref_celltypist_labels_Low_2024-02-18.csv', 'output/ref_celltypist_labels_Covid_Healthy_2024-02-18.csv']. Do you truly want to proceed?


(y/n) y


{'trace_id': '60c20ada-f8aa-4c7f-ae24-8973a487a491',
 'files': ['output/ref_celltypist_labels_High_2024-02-18.csv',
  'output/ref_celltypist_labels_Low_2024-02-18.csv',
  'output/ref_celltypist_labels_Covid_Healthy_2024-02-18.csv']}

In [19]:
import session_info
session_info.show()