In [3]:
import scanpy as sc
import pandas as pd 
import numpy as np

In [4]:
import sys
import matplotlib.pyplot as plt
import celltypist
import gc
import anndata
from celltypist import models
import h5py
import scipy.sparse as scs
from multiprocessing import Pool

# Fetch the published CellTypist models into the local model cache; the cell
# output shows that files already present on disk are skipped.
# NOTE(review): the annotation function in this notebook loads its models from
# explicit .pkl paths, so this download may not be required - confirm before removing.
models.download_models()

📂 Storing models in /root/.celltypist/data/models
⏩ Skipping [1/52]: Immune_All_Low.pkl (file exists)
⏩ Skipping [2/52]: Immune_All_High.pkl (file exists)
⏩ Skipping [3/52]: Adult_COVID19_PBMC.pkl (file exists)
⏩ Skipping [4/52]: Adult_CynomolgusMacaque_Hippocampus.pkl (file exists)
⏩ Skipping [5/52]: Adult_Human_MTG.pkl (file exists)
⏩ Skipping [6/52]: Adult_Human_PancreaticIslet.pkl (file exists)
⏩ Skipping [7/52]: Adult_Human_PrefrontalCortex.pkl (file exists)
⏩ Skipping [8/52]: Adult_Human_Skin.pkl (file exists)
⏩ Skipping [9/52]: Adult_Mouse_Gut.pkl (file exists)
⏩ Skipping [10/52]: Adult_Mouse_OlfactoryBulb.pkl (file exists)
⏩ Skipping [11/52]: Adult_Pig_Hippocampus.pkl (file exists)
⏩ Skipping [12/52]: Adult_RhesusMacaque_Hippocampus.pkl (file exists)
⏩ Skipping [13/52]: Autopsy_COVID19_Lung.pkl (file exists)
⏩ Skipping [14/52]: COVID19_HumanChallenge_Blood.pkl (file exists)
⏩ Skipping [15/52]: COVID19_Immune_Landscape.pkl (file exists)
⏩ Skipping [16/52]: Cells_Adult_Breast.pkl

In [5]:
def process_and_annotate_pbmc(dataset):
    """Normalize one sample and write CellTypist predictions for each AIFI level.

    The expression values of ``dataset`` are normalized to a total of 1e4 per
    cell and log1p-transformed, then annotated once per reference model
    (L1, L2, L3). For every level three files are written under
    ``Labels_AIFI/``:

    * ``<sample_id>_<level>_predicted_labels.csv``
    * ``<sample_id>_<level>_probability_matrix.parquet``
    * ``<sample_id>_<level>_decision_matrix.parquet``

    Parameters
    ----------
    dataset : AnnData
        Cells to annotate. ``dataset.obs['sample_id']`` must exist; its first
        unique value is used to name the output files. The object is
        normalized and log-transformed in place, so the same object must not
        be passed twice.

    Returns
    -------
    None
        All results are written to disk.
    """
    # Local import keeps the function self-contained when it runs in a worker process.
    import os

    output_base_path = "Labels_AIFI/"
    # Create the output folder up front; exist_ok makes this safe when several
    # workers reach this line at the same time.
    os.makedirs(output_base_path, exist_ok=True)

    pbmc = dataset

    # The first unique sample_id names every output file of this call.
    sample_id = pbmc.obs['sample_id'].unique().tolist()[0]

    # Normalization and log transformation (results are not reassigned: both
    # calls are relied on to update `pbmc` directly).
    sc.pp.normalize_total(pbmc, target_sum=1e4)
    sc.pp.log1p(pbmc)

    # One reference model per annotation level. Named `model_paths` so the
    # imported `celltypist.models` module is not shadowed inside this function.
    model_dir = '/home/jupyter/BRI_Figures_Final_V1/Dataset/Celltypist_Models'
    model_paths = {
        'L1': f'{model_dir}/ref_pbmc_clean_celltypist_model_AIFI_L1_2024-04-18.pkl',
        'L2': f'{model_dir}/ref_pbmc_clean_celltypist_model_AIFI_L2_2024-04-19.pkl',
        'L3': f'{model_dir}/ref_pbmc_clean_celltypist_model_AIFI_L3_2024-04-19.pkl',
    }

    for level, model_file in model_paths.items():
        # Only the current level's result is kept alive; nothing is returned,
        # so there is no need to accumulate predictions across levels.
        prediction = celltypist.annotate(pbmc, model=model_file)
        out_prefix = os.path.join(output_base_path, f'{sample_id}_{level}')

        # Write out labels
        prediction.predicted_labels.reset_index().to_csv(f'{out_prefix}_predicted_labels.csv')
        # Write out probability matrix
        prediction.probability_matrix.reset_index().to_parquet(f'{out_prefix}_probability_matrix.parquet')
        # Write out decision matrix
        prediction.decision_matrix.reset_index().to_parquet(f'{out_prefix}_decision_matrix.parquet')

# adt

In [2]:
# Load the HTO table, promote its unnamed first CSV column to the row index,
# and replace "." with "-" in every column label.
# NOTE(review): presumably the column labels are cell barcodes whose "-" was
# turned into "." by an upstream export - confirm.
ADT_INFO = pd.read_csv(
    "/home/jupyter/Maxim_dataset_followup/GEX_HTO_processed/raw_counts_h5ad/pbmc_hto_raw.csv"
)
# pop() removes the column from the frame and hands back its values in one step.
ADT_INFO.index = ADT_INFO.pop("Unnamed: 0").tolist()
ADT_INFO.columns = [label.replace(".", "-") for label in ADT_INFO.columns]

# gex

In [6]:
# Load the GEX AnnData object (presumably raw counts, going by the file name).
# NOTE(review): this path is relative to the current working directory, while
# the CSV inputs in the neighbouring cells use absolute paths under
# /home/jupyter/Maxim_dataset_followup/ - confirm the notebook is run from that folder.
adata=sc.read_h5ad("GEX_HTO_processed/raw_counts_h5ad/pbmc_gex_raw_with_var_obs.h5ad")

In [7]:
# Load the per-cell metadata, index it by the unnamed first CSV column, pick
# its rows in the order of adata.obs, and install the result as adata.obs.
meta_data = pd.read_csv(
    "/home/jupyter/Maxim_dataset_followup/GEX_HTO_processed/all_pbmcs/all_pbmcs_metadata.csv"
)
# pop() removes the column from the frame and hands back its values in one step.
meta_data.index = meta_data.pop("Unnamed: 0").tolist()
# Label-based selection: rows come back in the same order as adata.obs.index.
meta_data = meta_data.loc[adata.obs.index]
adata.obs = meta_data

In [8]:
# Build the per-cell sample key "<Donor_id>_<Sex>_<Age>_<Batch>" from the
# metadata columns, each cast to string before concatenation.
_sample_key = adata.obs['Donor_id'].astype(str)
for _field in ('Sex', 'Age', 'Batch'):
    _sample_key = _sample_key + '_' + adata.obs[_field].astype(str)
adata.obs['sample_id'] = _sample_key

# AIFI Label Transfer

In [9]:
# Split the dataset into one independent AnnData copy per distinct sample_id.
_sample_ids = adata.obs['sample_id']
adata_list = [adata[_sample_ids == sid].copy() for sid in _sample_ids.unique()]

In [None]:
# Annotate every per-sample AnnData in parallel. The worker function writes its
# results to disk and returns nothing, so the list returned by map() is not kept.
# The pool is capped at the number of samples (upper limit 60) so that no idle
# worker processes are started when there are fewer samples than workers.
n_workers = max(1, min(60, len(adata_list)))
with Pool(processes=n_workers) as pool:
    pool.map(process_and_annotate_pbmc, adata_list)

🔬 Input data has 3405 cells and 36601 genes
🔗 Matching reference genes in the model
🧬 1099 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
🔬 Input data has 3405 cells and 36601 genes
🔗 Matching reference genes in the model
🔬 Input data has 3581 cells and 36601 genes
🔗 Matching reference genes in the model
🧬 1099 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
🧬 1916 features used for prediction
⚖️ Scaling input data
🔬 Input data has 3581 cells and 36601 genes
🔗 Matching reference genes in the model
🖋️ Predicting labels
✅ Prediction done!
🔬 Input data has 3405 cells and 36601 genes
🔗 Matching reference genes in the model
🔬 Input data has 3224 cells and 36601 genes
🔗 Matching reference genes in the model
🔬 Input data has 4897 cells and 36601 genes
🔗 Matching reference genes in the model
🧬 1916 features used for prediction
⚖️ Scaling input data
🧬 1099 features used for prediction
⚖️ Scaling input data
🖋️

# Doublet Detection