In [2]:
import scanpy as sc
import pandas as pd 
import numpy as np
import sys
import matplotlib.pyplot as plt
import gc
import anndata
import glob
from multiprocessing import Pool
import os
from sklearn import metrics
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
import os
import warnings
warnings.filterwarnings("ignore")



In [3]:
def grouped_obs_sum_raw(adata_filt, group_key, layer=None, gene_symbols=None):
    """Sum values per variable within each group of observations.

    Parameters
    ----------
    adata_filt
        AnnData-like object exposing ``obs``, ``var_names``, ``X`` and
        ``layers``, and supporting positional row indexing.
    group_key
        Column of ``adata_filt.obs`` used to group the observations.
    layer
        Name of the layer to read values from; ``None`` reads ``X``.
    gene_symbols
        Optional collection of variable names to keep. Names that are not in
        ``var_names`` are ignored. ``None`` keeps every variable.

    Returns
    -------
    pandas.DataFrame
        One row per selected variable, one column per group, float64 sums.
        Columns for groups that have no rows keep their initial value of 0.
    """
    if gene_symbols is not None:
        gene_mask = adata_filt.var_names.isin(gene_symbols)
        new_idx = adata_filt.var_names[gene_mask]
    else:
        gene_mask = None
        new_idx = adata_filt.var_names

    grouped = adata_filt.obs.groupby(group_key)
    group_names = list(grouped.groups.keys())
    # Size the zero frame from the column list itself so the two cannot disagree.
    out = pd.DataFrame(
        np.zeros((len(new_idx), len(group_names)), dtype=np.float64),
        columns=group_names,
        index=new_idx,
    )
    for group, idx in grouped.indices.items():
        subset = adata_filt[idx]
        X = subset.X if layer is None else subset.layers[layer]
        if gene_mask is not None:
            # Keep only the requested variables; without this the per-group
            # vector has one entry per variable and cannot be assigned into
            # the reduced index of `out`.
            X = X[:, gene_mask]
        out[group] = np.ravel(X.sum(axis=0, dtype=np.float64))
    return out

def grouped_obs_mean(adata_filt, group_key, layer=None, gene_symbols=None):
    """Average values per variable within each group of observations.

    Parameters
    ----------
    adata_filt
        AnnData-like object exposing ``obs``, ``var_names``, ``X`` and
        ``layers``, and supporting positional row indexing.
    group_key
        Column of ``adata_filt.obs`` used to group the observations.
    layer
        Name of the layer to read values from; ``None`` reads ``X``.
    gene_symbols
        Optional collection of variable names to keep. Names that are not in
        ``var_names`` are ignored. ``None`` keeps every variable.

    Returns
    -------
    pandas.DataFrame
        One row per selected variable, one column per group, float64 means.
        Columns for groups that have no rows keep their initial value of 0.
    """
    if gene_symbols is not None:
        gene_mask = adata_filt.var_names.isin(gene_symbols)
        new_idx = adata_filt.var_names[gene_mask]
    else:
        gene_mask = None
        new_idx = adata_filt.var_names

    grouped = adata_filt.obs.groupby(group_key)
    group_names = list(grouped.groups.keys())
    # Size the zero frame from the column list itself so the two cannot disagree.
    out = pd.DataFrame(
        np.zeros((len(new_idx), len(group_names)), dtype=np.float64),
        columns=group_names,
        index=new_idx,
    )
    for group, idx in grouped.indices.items():
        subset = adata_filt[idx]
        X = subset.X if layer is None else subset.layers[layer]
        if gene_mask is not None:
            # Keep only the requested variables; without this the per-group
            # vector has one entry per variable and cannot be assigned into
            # the reduced index of `out`.
            X = X[:, gene_mask]
        out[group] = np.ravel(X.mean(axis=0, dtype=np.float64))
    return out

In [4]:
# Load the combined AnnData object from the .h5ad file in the working directory.
adata=sc.read_h5ad("ALTRA_certPro_scRNA_141_samples_combined_adata.h5ad")

In [5]:
# Display the object's summary (dimensions and annotation column names).
adata

AnnData object with n_obs × n_vars = 2059581 × 33538
    obs: 'barcodes', 'batch_id', 'cell_name', 'cell_uuid', 'chip_id', 'hto_barcode', 'hto_category', 'n_genes', 'n_mito_umis', 'n_reads', 'n_umis', 'original_barcodes', 'pbmc_sample_id', 'pool_id', 'well_id', 'sample.sampleKitGuid', 'cohort.cohortGuid', 'subject.subjectGuid', 'subject.biologicalSex', 'subject.race', 'subject.ethnicity', 'subject.birthYear', 'sample.visitName', 'sample.drawDate', 'sample.diseaseStatesRecordedAtVisit', 'sample.daysSinceFirstVisit', 'file.id', 'subset_grp', 'predicted_doublet', 'doublet_score', 'AIFI_L1', 'AIFI_L1_score', 'AIFI_L2', 'AIFI_L2_score', 'AIFI_L3', 'AIFI_L3_score', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'leiden_harmony_2', 'doublets_manual', 'AIFI_L3_new', 'S

In [7]:
# Split the combined object into one independent copy per distinct
# 'sample.sampleKitGuid' value, in order of first appearance.
adata_list = [
    adata[adata.obs['sample.sampleKitGuid'] == kit_guid].copy()
    for kit_guid in adata.obs['sample.sampleKitGuid'].unique()
]

In [8]:
# Sanity check: number of per-sample objects produced by the split.
len(adata_list)

141

In [9]:
def process(adata_subset):
    """Write one sample's AnnData and its per-``AIFI_L3`` summary tables to disk.

    The sample id is taken from the first row of
    ``adata_subset.obs['sample.sampleKitGuid']`` and used as the file stem for:

    - ``sample_adata/<id>.h5ad``: the object as received.
    - ``sample_raw_count_sum/<id>.csv``: per-variable sums of ``X`` within
      each ``AIFI_L3`` group, computed before any normalisation.
    - ``sample_normalized_count_average/<id>.csv``: per-variable means of
      ``X`` within each ``AIFI_L3`` group after ``normalize_total`` (target
      sum 1e4) followed by ``log1p``.

    Output directories are created if they do not exist.

    Note: the normalisation and log transform are applied to ``adata_subset``
    itself, so the caller's object is modified when this is called directly
    (not via a worker process, which receives its own pickled copy).
    """
    # Use .iloc for positional access: obs is label-indexed, and integer keys
    # on a non-integer index via [] are a deprecated positional fallback.
    sample_id = adata_subset.obs['sample.sampleKitGuid'].iloc[0]

    adata_dir = "sample_adata"
    raw_sum_dir = "sample_raw_count_sum"
    norm_mean_dir = "sample_normalized_count_average"
    for out_dir in (adata_dir, raw_sum_dir, norm_mean_dir):
        # exist_ok avoids a race when several workers create the same directory.
        os.makedirs(out_dir, exist_ok=True)

    adata_subset.write_h5ad(os.path.join(adata_dir, f"{sample_id}.h5ad"))

    # Aggregate before normalising so the sums reflect the values as loaded.
    raw_count_sum = grouped_obs_sum_raw(adata_subset, "AIFI_L3")
    raw_count_sum.to_csv(os.path.join(raw_sum_dir, f"{sample_id}.csv"))

    sc.pp.normalize_total(adata_subset, target_sum=1e4)
    sc.pp.log1p(adata_subset)

    normalized_count_average = grouped_obs_mean(adata_subset, "AIFI_L3")
    normalized_count_average.to_csv(os.path.join(norm_mean_dir, f"{sample_id}.csv"))
    
# Run `process` over every per-sample object in a pool of 15 worker processes.
# executor.map only re-raises a worker's exception when its result is pulled
# from the returned iterator, so the iterator is drained here; otherwise a
# failed sample would go unnoticed. tqdm reports progress as results arrive.
with ProcessPoolExecutor(max_workers=15) as executor:
    list(tqdm(executor.map(process, adata_list), total=len(adata_list)))