In [None]:
import scanpy as sc
import pandas as pd 
import numpy as np
import sys
import matplotlib.pyplot as plt
import celltypist
import gc
import anndata
from celltypist import models
import h5py
import scipy.sparse as scs
from multiprocessing import Pool
import os
# Download CellTypist's built-in models via celltypist.models.
# NOTE(review): the label-transfer cell further down loads its models from explicit
# .pkl paths, so confirm this download is still required.
models.download_models()

# ADT

In [7]:
# Read the table and promote its "Unnamed: 0" column to an unnamed row index.
ADT_INFO = (
    pd.read_csv("/home/jupyter/Maxim_dataset_followup/GEX_HTO_processed/raw_counts_h5ad/pbmc_hto_raw.csv")
    .set_index("Unnamed: 0")
    .rename_axis(None)
)
# Replace every literal "." in the column labels with "-".
ADT_INFO.columns = ADT_INFO.columns.str.replace(".", "-", regex=False)

# GEX

In [3]:
# Load the GEX AnnData object (raw counts, going by the file name).
# NOTE(review): this path is relative to the working directory, whereas the CSV cells
# use absolute /home/jupyter/Maxim_dataset_followup/... paths — confirm the notebook
# is always run from that project folder.
adata=sc.read_h5ad("GEX_HTO_processed/raw_counts_h5ad/pbmc_gex_raw_with_var_obs.h5ad")

In [4]:
# Read the per-cell metadata, index it by its "Unnamed: 0" column, and reorder the
# rows to follow adata's observation order (.loc raises if a cell is missing).
meta_data = (
    pd.read_csv("/home/jupyter/Maxim_dataset_followup/GEX_HTO_processed/all_pbmcs/all_pbmcs_metadata.csv")
    .set_index("Unnamed: 0")
    .rename_axis(None)
    .loc[adata.obs.index, :]
)
# Replace the AnnData observation table with the aligned metadata.
adata.obs = meta_data

In [5]:
# Build sample_id as "<Donor_id>_<Sex>_<Age>_<Batch>" from the stringified obs columns.
adata.obs['sample_id'] = adata.obs['Donor_id'].astype(str).str.cat(
    [adata.obs[field].astype(str) for field in ('Sex', 'Age', 'Batch')],
    sep='_',
)

In [6]:
# Keep the observation index as a regular column so it survives obs-only exports.
adata.obs['cell_barcode'] = list(adata.obs_names)

In [9]:
# Save the AnnData object to "adata.h5ad" in the current working directory.
# Writing stores the string-valued obs columns as categoricals (see the
# "... storing '<column>' as categorical" messages in this cell's output).
adata.write_h5ad("adata.h5ad")

... storing 'orig.ident' as categorical
... storing 'Donor_id' as categorical
... storing 'Age_group' as categorical
... storing 'Sex' as categorical
... storing 'Tube_id' as categorical
... storing 'Batch' as categorical
... storing 'File_name' as categorical
... storing 'Cluster_names' as categorical
... storing 'sample_id' as categorical


# AIFI Label Transfer

In [8]:
def process_and_annotate_pbmc(dataset):
    """Normalize one sample and write CellTypist predictions for each AIFI level.

    For every model level (L1, L2, L3) three files are written under
    ``Labels_AIFI/``, each prefixed with ``<sample_id>_<level>``:
    ``_predicted_labels.csv``, ``_probability_matrix.parquet`` and
    ``_decision_matrix.parquet``.

    Parameters
    ----------
    dataset
        AnnData-like object with an ``obs['sample_id']`` column. Only the first
        unique ``sample_id`` is used to name the output files, so the object is
        expected to hold a single sample. The input is left unmodified.

    Returns
    -------
    None
    """
    output_base_path = "Labels_AIFI/"
    # CellTypist model file for each AIFI annotation level. Named `model_paths`
    # so it does not shadow the imported `celltypist.models` module.
    model_paths = {
        'L1': '/home/jupyter/BRI_Figures_Final_V1/Dataset/Celltypist_Models/ref_pbmc_clean_celltypist_model_AIFI_L1_2024-04-18.pkl',
        'L2': '/home/jupyter/BRI_Figures_Final_V1/Dataset/Celltypist_Models/ref_pbmc_clean_celltypist_model_AIFI_L2_2024-04-19.pkl',
        'L3': '/home/jupyter/BRI_Figures_Final_V1/Dataset/Celltypist_Models/ref_pbmc_clean_celltypist_model_AIFI_L3_2024-04-19.pkl',
    }

    # Make sure the output directory exists before any prediction work is done;
    # otherwise the first write fails only after the model has already run.
    os.makedirs(output_base_path, exist_ok=True)

    # Work on a copy: normalize_total/log1p modify the object in place, and the
    # same per-sample objects are reused later for doublet detection.
    pbmc = dataset.copy()

    # File-name prefix: first unique sample_id found in this object.
    sample_id = pbmc.obs['sample_id'].unique().tolist()[0]

    # Normalize each cell to 10,000 total counts, then log1p-transform.
    sc.pp.normalize_total(pbmc, target_sum=1e4)
    sc.pp.log1p(pbmc)

    for level, model_file in model_paths.items():
        prediction = celltypist.annotate(pbmc, model=model_file)
        prefix = os.path.join(output_base_path, f'{sample_id}_{level}')

        # Write out labels
        prediction.predicted_labels.reset_index().to_csv(f'{prefix}_predicted_labels.csv')
        # Write out probability matrix
        prediction.probability_matrix.reset_index().to_parquet(f'{prefix}_probability_matrix.parquet')
        # Write out decision matrix
        prediction.decision_matrix.reset_index().to_parquet(f'{prefix}_decision_matrix.parquet')

In [9]:
# One independent AnnData copy per sample_id, in order of first appearance.
adata_list = [
    adata[adata.obs['sample_id'] == sample_label].copy()
    for sample_label in adata.obs['sample_id'].unique()
]

In [10]:
# Annotate every sample in parallel. Use at most 60 workers, but never more than
# there are samples or CPUs (and always at least one, which Pool requires).
with Pool(processes=max(1, min(60, len(adata_list), os.cpu_count() or 1))) as pool:
    pool.map(process_and_annotate_pbmc, adata_list)

🔬 Input data has 3405 cells and 36601 genes
🔗 Matching reference genes in the model
🧬 1099 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
🔬 Input data has 3581 cells and 36601 genes
🔗 Matching reference genes in the model
🔬 Input data has 3405 cells and 36601 genes
🔗 Matching reference genes in the model
🧬 1099 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
🔬 Input data has 3581 cells and 36601 genes
🔗 Matching reference genes in the model
🧬 1916 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
🔬 Input data has 3405 cells and 36601 genes
🔗 Matching reference genes in the model
🔬 Input data has 3224 cells and 36601 genes
🔗 Matching reference genes in the model
🧬 1916 features used for prediction
⚖️ Scaling input data
🔬 Input data has 4897 cells and 36601 genes
🔗 Matching reference genes in the model
🧬 1099 features used for prediction
⚖️ Scaling input data
🖋️

# Doublet Detection

In [11]:
def doublet_detect(input_adata):
    """Run Scrublet on one sample and save its per-cell doublet calls as CSV.

    The result is written to ``Doublet_Scores/<sample_id>.csv`` with the columns
    ``cell_barcode``, ``predicted_doublet`` and ``doublet_score``. If that file
    already exists and is non-empty, the sample is skipped.

    Parameters
    ----------
    input_adata
        AnnData-like object with ``obs['sample_id']`` and ``obs['cell_barcode']``
        columns. The ``sample_id`` of the first row names the output file.
        Scrublet is called on this object directly (no copy is made).

    Returns
    -------
    None
    """
    adata = input_adata
    # Use .iloc[0] for the first row: plain [0] on a Series is a label lookup and
    # only reaches the first row through pandas' deprecated positional fallback.
    output_file = 'Doublet_Scores/' + adata.obs['sample_id'].iloc[0] + '.csv'
    if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
        print(f"File {output_file} already exists and is not empty. Skipping processing.")
        return
    # Create the output directory before the expensive step so a missing folder
    # cannot make to_csv fail after Scrublet has already run.
    os.makedirs('Doublet_Scores', exist_ok=True)
    sc.external.pp.scrublet(adata)
    adata.obs[['cell_barcode', 'predicted_doublet', 'doublet_score']].to_csv(output_file)

In [12]:
# Score doublets for every sample in parallel. Use at most 60 workers, but never
# more than there are samples or CPUs (and always at least one, which Pool requires).
with Pool(processes=max(1, min(60, len(adata_list), os.cpu_count() or 1))) as pool:
    pool.map(doublet_detect, adata_list)

Automatically set threshold at doublet score = 0.48
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 26.6%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 0.6%
Automatically set threshold at doublet score = 0.48
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 29.3%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 0.1%
Automatically set threshold at doublet score = 0.45
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 33.6%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 0.1%
Automatically set threshold at doublet score = 0.35
Detected doublet rate = 0.3%
Estimated detectable doublet fraction = 47.0%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 0.7%
Automatically set threshold at doublet score = 0.40
Detected doublet rate = 0.2%
Estimated detectable doublet fraction = 36.5%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 0.5%
Automatically set threshold at doublet score = 0.47
Detected doub