In [1]:
import scanpy as sc
import pandas as pd 
import numpy as np
import sys
import matplotlib.pyplot as plt
import celltypist
import gc
import anndata
from celltypist import models
import h5py
import scipy.sparse as scs
from multiprocessing import Pool
import os



In [2]:
# Load the full single-cell dataset from disk (path is relative to the notebook's working directory).
adata = sc.read_h5ad(
    "dataset/08984b3c-3189-4732-be22-62f1fe8f15a4.h5ad"
)

In [3]:
# Re-index the genes by the values of the 'feature_name' column of `adata.var`.
adata.var_names = adata.var['feature_name'].to_list()

In [4]:
# Keep the observation index as an explicit column so it can be written out with results later.
adata.obs['cell_barcode'] = adata.obs_names.to_list()

In [5]:
# One independent AnnData copy per donor, in order of first appearance of each donor_id.
adata_list = [
    adata[adata.obs['donor_id'] == donor_id].copy()
    for donor_id in adata.obs['donor_id'].unique()
]

# AIFI Label Transfer

In [5]:
def process_and_annotate_pbmc(dataset, output_base_path="Labels_AIFI/"):
    """Annotate one donor's cells with the AIFI CellTypist models and write the results.

    The data is normalised to a total of 1e4 per cell and log1p-transformed,
    then each AIFI model level (L1, L2, L3) is applied with
    ``celltypist.annotate``. For every level three files are written into
    ``output_base_path``:

    * ``{sample_id}_{level}_predicted_labels.csv``
    * ``{sample_id}_{level}_probability_matrix.parquet``
    * ``{sample_id}_{level}_decision_matrix.parquet``

    where ``sample_id`` is the first unique value of ``obs['donor_id']``.

    Parameters
    ----------
    dataset : AnnData
        Cells of a single donor; must have a ``donor_id`` column in ``.obs``.
        Presumably ``.X`` holds raw counts, since it is normalised here --
        TODO confirm against the input file.
    output_base_path : str, optional
        Directory receiving the output files. Created if it does not exist.

    Returns
    -------
    None
        Results are only written to disk.

    Notes
    -----
    The input object is not modified: normalisation runs on a copy, so calling
    this function twice on the same object (or running another raw-count based
    step afterwards) does not see doubly-transformed data.
    """
    # Directory holding the pre-trained AIFI CellTypist models, one file per level.
    model_dir = '/home//workspace/private/bri_figure_all_files_test/jupyter/BRI_Figures_Final_V2/Dataset/Celltypist_Models'
    model_files = {
        'L1': 'ref_pbmc_clean_celltypist_model_AIFI_L1_2024-04-18.pkl',
        'L2': 'ref_pbmc_clean_celltypist_model_AIFI_L2_2024-04-19.pkl',
        'L3': 'ref_pbmc_clean_celltypist_model_AIFI_L3_2024-04-19.pkl',
    }

    # Work on a private copy so the caller's object keeps its original values.
    pbmc = dataset.copy()
    sample_id = pbmc.obs['donor_id'].unique().tolist()[0]

    # Normalization and log transformation
    sc.pp.normalize_total(pbmc, target_sum=1e4)
    sc.pp.log1p(pbmc)

    # exist_ok=True makes this safe when several worker processes start at once.
    os.makedirs(output_base_path, exist_ok=True)

    for level, model_file in model_files.items():
        prediction = celltypist.annotate(pbmc, model=os.path.join(model_dir, model_file))
        output_prefix = os.path.join(output_base_path, f'{sample_id}_{level}')

        # Write out labels
        prediction.predicted_labels.reset_index().to_csv(f'{output_prefix}_predicted_labels.csv')
        # Write out probability matrix
        prediction.probability_matrix.reset_index().to_parquet(f'{output_prefix}_probability_matrix.parquet')
        # Write out decision matrix
        prediction.decision_matrix.reset_index().to_parquet(f'{output_prefix}_decision_matrix.parquet')

In [7]:
# Fan the per-donor subsets out over 60 worker processes; each call writes its own
# output files, so the mapped return values are not collected.
with Pool(60) as pool:
    pool.map(
        process_and_annotate_pbmc,
        adata_list,
    )

🔬 Input data has 1501 cells and 36469 genes
🔗 Matching reference genes in the model
🔬 Input data has 1278 cells and 36469 genes
🔗 Matching reference genes in the model
🔬 Input data has 1297 cells and 36469 genes
🔗 Matching reference genes in the model
🧬 1065 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
🧬 1065 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
🔬 Input data has 2216 cells and 36469 genes
🔗 Matching reference genes in the model
🧬 1065 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
🔬 Input data has 1776 cells and 36469 genes
🔗 Matching reference genes in the model
🧬 1065 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
🔗 Matching reference genes in the model
🔬 Input data has 1317 cells and 36469 genes
✅ Prediction done!
🔬 Input data has 1501 cells and 36469 genes
🔗 Matching reference genes in the model
🔬 Input data has 127

# Doublet Detection

In [6]:
def doublet_detect(input_adata, output_dir='Doublet_Score'):
    """Run Scrublet on one donor's cells and save the per-cell doublet calls as CSV.

    The output file is ``{output_dir}/{donor_id}.csv``, where ``donor_id`` is
    taken from the first row of ``obs['donor_id']``. If that file already
    exists and is non-empty the donor is skipped, so an interrupted run can be
    resumed without recomputing finished donors.

    Parameters
    ----------
    input_adata : AnnData
        Cells of a single donor. ``.obs`` must contain ``donor_id`` and
        ``cell_barcode``. Presumably ``.X`` holds raw counts -- TODO confirm.
        The object is modified by ``sc.external.pp.scrublet``, which is
        expected to add the ``predicted_doublet`` and ``doublet_score``
        columns that are written out below.
    output_dir : str, optional
        Directory receiving the CSV. Created if it does not exist.

    Returns
    -------
    None
    """
    adata = input_adata
    # .iloc[0] selects by position; a plain [0] on a Series indexed by string
    # labels relies on a deprecated positional fallback in pandas.
    donor_id = str(adata.obs['donor_id'].iloc[0])
    output_file = f'{output_dir}/{donor_id}.csv'

    if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
        print(f"File {output_file} already exists and is not empty. Skipping processing.")
        return

    # exist_ok=True makes this safe when several worker processes start at once.
    os.makedirs(output_dir, exist_ok=True)

    sc.external.pp.scrublet(adata)
    adata.obs[['cell_barcode', 'predicted_doublet', 'doublet_score']].to_csv(output_file)

In [None]:
# Score doublets for every donor subset using 10 worker processes; results are
# written to disk by each call, so the mapped return values are not collected.
with Pool(10) as pool:
    pool.map(
        doublet_detect,
        adata_list,
    )

In [None]:
# NOTE(review): bare reference to the function -- evaluating this cell only displays
# the function object's repr; it does not run doublet detection. Looks like a leftover.
doublet_detect