In [1]:
import scanpy as sc
import pandas as pd 
import numpy as np
import sys
import matplotlib.pyplot as plt
import gc
import anndata
import glob
from multiprocessing import Pool
import os
from sklearn import metrics
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
import os
import warnings
warnings.filterwarnings("ignore")



In [2]:
def grouped_obs_sum_raw(adata_filt, group_key, layer=None, gene_symbols=None):
    """Sum values per variable within each group of observations.

    Parameters
    ----------
    adata_filt
        AnnData-like object exposing ``obs``, ``var_names``, ``X`` and
        ``layers``, and supporting row selection through ``adata_filt[idx]``.
    group_key
        Column of ``adata_filt.obs`` used to group the observations.
    layer
        Name of the layer to read values from. ``None`` reads ``X``.
    gene_symbols
        Optional collection of variable names to keep in the result. Names
        absent from ``var_names`` are ignored. ``None`` keeps every variable.

    Returns
    -------
    pandas.DataFrame
        float64 frame indexed by the retained variable names (in ``var_names``
        order) with one column per group.
    """
    if layer is not None:
        getX = lambda x: x.layers[layer]
    else:
        getX = lambda x: x.X
    if gene_symbols is not None:
        gene_mask = np.asarray(adata_filt.var_names.isin(gene_symbols))
        new_idx = adata_filt.var_names[gene_mask]
    else:
        # A full slice keeps every variable without building a mask.
        gene_mask = slice(None)
        new_idx = adata_filt.var_names
    grouped = adata_filt.obs.groupby(group_key)
    out = pd.DataFrame(
        np.zeros((len(new_idx), len(grouped)), dtype=np.float64),
        columns=list(grouped.groups.keys()),
        index=new_idx
    )
    for group, idx in grouped.indices.items():
        X = getX(adata_filt[idx])
        # The reduction covers every variable, so apply the mask afterwards;
        # otherwise the vector is longer than `out` when gene_symbols is set.
        out[group] = np.ravel(X.sum(axis=0, dtype=np.float64))[gene_mask]
    return out

def grouped_obs_mean(adata_filt, group_key, layer=None, gene_symbols=None):
    """Average values per variable within each group of observations.

    Parameters
    ----------
    adata_filt
        AnnData-like object exposing ``obs``, ``var_names``, ``X`` and
        ``layers``, and supporting row selection through ``adata_filt[idx]``.
    group_key
        Column of ``adata_filt.obs`` used to group the observations.
    layer
        Name of the layer to read values from. ``None`` reads ``X``.
    gene_symbols
        Optional collection of variable names to keep in the result. Names
        absent from ``var_names`` are ignored. ``None`` keeps every variable.

    Returns
    -------
    pandas.DataFrame
        float64 frame indexed by the retained variable names (in ``var_names``
        order) with one column per group.
    """
    if layer is not None:
        getX = lambda x: x.layers[layer]
    else:
        getX = lambda x: x.X
    if gene_symbols is not None:
        gene_mask = np.asarray(adata_filt.var_names.isin(gene_symbols))
        new_idx = adata_filt.var_names[gene_mask]
    else:
        # A full slice keeps every variable without building a mask.
        gene_mask = slice(None)
        new_idx = adata_filt.var_names
    grouped = adata_filt.obs.groupby(group_key)
    out = pd.DataFrame(
        np.zeros((len(new_idx), len(grouped)), dtype=np.float64),
        columns=list(grouped.groups.keys()),
        index=new_idx
    )
    for group, idx in grouped.indices.items():
        X = getX(adata_filt[idx])
        # The reduction covers every variable, so apply the mask afterwards;
        # otherwise the vector is longer than `out` when gene_symbols is set.
        out[group] = np.ravel(X.mean(axis=0, dtype=np.float64))[gene_mask]
    return out

In [None]:
# Load the full dataset from its .h5ad file into memory.
adata = sc.read_h5ad(
    '/home/workspace/private/bri_figure_all_files_test/jupyter/Onek1k_dataset_followup/dataset/08984b3c-3189-4732-be22-62f1fe8f15a4.h5ad'
)

In [None]:
# Label table; its first CSV column becomes the index used for the join below.
labels = pd.read_csv(
    "labels.csv",
    index_col=0,
)

In [None]:
# Index-on-index inner join: only observations that also appear in `labels`
# survive, and the label columns are appended to the obs columns.
df = adata.obs.merge(
    labels,
    left_index=True,
    right_index=True,
    how='inner',
)

In [None]:
# Keep only the observations that survived the inner join with the label table.
adata=adata[df.index]

In [None]:
# Replace obs with the merged frame so the columns joined from `labels` live on adata.
adata.obs=df

In [None]:
# Index variables by the values in var['feature_name'] instead of the original var index.
adata.var_names = adata.var['feature_name'].tolist()
# feature_name values are not guaranteed to be unique; duplicated names would
# produce a non-unique index in every per-gene table built from var_names.
# This call is a no-op when the names are already unique.
adata.var_names_make_unique()

In [9]:
# Display the AnnData summary to confirm dimensions and the obs/var fields.
adata

AnnData object with n_obs × n_vars = 1248980 × 36469
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'donor_id', 'pool_number', 'predicted.celltype.l2', 'predicted.celltype.l2.score', 'age', 'organism_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'cell_type_ontology_term_id_colors', 'citation', 'default_embedding', 'schema_reference', 'schema_v

In [10]:
# One independent in-memory copy of the data per distinct donor_id; these are
# the work items handed to the process pool.
adata_list = [
    adata[adata.obs['donor_id'] == donor].copy()
    for donor in adata.obs['donor_id'].unique()
]

def process(adata_subset):
    """Write one donor's data and its per-AIFI_L3 summary tables to disk.

    Three files are written, each named after the donor id taken from the
    first row of ``adata_subset.obs['donor_id']``:

    * the subset itself as ``.h5ad``,
    * per-group sums of the values as they are on entry (``.csv``),
    * per-group means after ``normalize_total(target_sum=1e4)`` and ``log1p``
      (``.csv``).

    Parameters
    ----------
    adata_subset
        AnnData holding the observations of a single donor. It is normalised
        and log-transformed in place.
    """
    adata_dir = "/home/workspace/sample_adata/"
    raw_sum_dir = "/home/workspace/sample_raw_count_sum/"
    norm_mean_dir = "/home/workspace/sample_normalized_count_average/"
    # Create the targets up front so a fresh workspace does not fail on write.
    for out_dir in (adata_dir, raw_sum_dir, norm_mean_dir):
        os.makedirs(out_dir, exist_ok=True)

    # Use positional access: `[0]` on a Series whose index is not integer
    # labels relies on a deprecated positional fallback.
    donor = adata_subset.obs['donor_id'].iloc[0]

    adata_subset.write_h5ad(adata_dir + donor + ".h5ad")

    # Sums must be taken before normalisation rewrites the values.
    raw_count_sum = grouped_obs_sum_raw(adata_subset, "AIFI_L3")
    raw_count_sum.to_csv(raw_sum_dir + donor + ".csv")

    sc.pp.normalize_total(adata_subset, target_sum=1e4)
    sc.pp.log1p(adata_subset)

    normalized_count_average = grouped_obs_mean(adata_subset, "AIFI_L3")
    normalized_count_average.to_csv(norm_mean_dir + donor + ".csv")
    
with ProcessPoolExecutor(max_workers=15) as executor:
    # executor.map only re-raises a worker's exception when its result is
    # retrieved from the iterator. Drain it so a failed donor surfaces here
    # instead of being silently dropped.
    list(executor.map(process, adata_list))


In [11]:
# Create the output folders up front so a fresh working directory does not
# fail on the first write.
for out_dir in ("sample_adata/", "sample_raw_count_sum/", "sample_normalized_count_average/"):
    os.makedirs(out_dir, exist_ok=True)

for i in tqdm(np.unique(adata.obs["donor_id"]), desc="Processing donors"):
    # Take an explicit copy: the subset is normalised in place below, which
    # should never depend on a view being converted implicitly.
    adata_subset = adata[adata.obs["donor_id"] == i].copy()
    adata_subset.write_h5ad("sample_adata/" + i + ".h5ad")

    # Sums must be taken before normalisation rewrites the values.
    raw_count_sum = grouped_obs_sum_raw(adata_subset, "AIFI_L3")
    raw_count_sum.to_csv("sample_raw_count_sum/" + i + ".csv")

    sc.pp.normalize_total(adata_subset, target_sum=1e4)
    sc.pp.log1p(adata_subset)

    normalized_count_average = grouped_obs_mean(adata_subset, "AIFI_L3")
    normalized_count_average.to_csv("sample_normalized_count_average/" + i + ".csv")

Processing donors: 100%|██████████| 981/981 [1:05:11<00:00,  3.99s/it]


In [12]:
# Inspect the subset left bound by the donor loop: the last donor processed,
# already normalised and log-transformed in place.
adata_subset

AnnData object with n_obs × n_vars = 1049 × 36469
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'donor_id', 'pool_number', 'predicted.celltype.l2', 'predicted.celltype.l2.score', 'age', 'organism_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'cell_type_ontology_term_id_colors', 'citation', 'default_embedding', 'schema_reference', 'schema_vers

In [13]:
# One row per distinct combination of the selected obs fields. reset_index()
# moves the former obs index into a regular column.
meta_data = (
    adata.obs[["donor_id", 'age', 'sex', 'organism', 'tissue', 'pool_number']]
    .drop_duplicates()
    .reset_index()
)

In [14]:
# Drop the 'index' column that reset_index() created from the former obs index.
del meta_data['index']

In [15]:
# meta_data is already a DataFrame, so write it directly. The row index is
# kept as the first CSV column.
meta_data.to_csv("sample_meta_data.csv")