In [1]:

import scanpy as sc
import pandas as pd 
import numpy as np
import sys
import matplotlib.pyplot as plt
import gc
import anndata
import glob
from multiprocessing import Pool
import os
from sklearn import metrics

from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm
import os




In [2]:
def grouped_obs_sum_raw(adata_filt, group_key, layer=None, gene_symbols=None):
    if layer is not None:
        getX = lambda x: x.layers[layer]
    else:
        getX = lambda x: x.X
    if gene_symbols is not None:
        idx = adata_filt.var_names.isin(gene_symbols)
        new_idx = adata_filt.var_names[idx]
    else:
        new_idx = adata_filt.var_names
    grouped = adata_filt.obs.groupby(group_key)
    out = pd.DataFrame(
        np.zeros((len(new_idx), len(grouped)), dtype=np.float64),
        columns=list(grouped.groups.keys()),
        index=new_idx
    )
    for group, idx in grouped.indices.items():
        X = getX(adata_filt[idx])
        out[group] = np.ravel(X.sum(axis=0, dtype=np.float64))
    return out

def grouped_obs_mean(adata_filt, group_key, layer=None, gene_symbols=None):
    if layer is not None:
        getX = lambda x: x.layers[layer]
    else:
        getX = lambda x: x.X
    if gene_symbols is not None:
        idx = adata_filt.var_names.isin(gene_symbols)
        new_idx = adata_filt.var_names[idx]
    else:
        new_idx = adata_filt.var_names
    grouped = adata_filt.obs.groupby(group_key)
    out = pd.DataFrame(
        np.zeros((len(new_idx), len(grouped)), dtype=np.float64),
        columns=list(grouped.groups.keys()),
        index=new_idx
    )
    for group, idx in grouped.indices.items():
        X = getX(adata_filt[idx])
        out[group] = np.ravel(X.mean(axis=0, dtype=np.float64))
    return out

In [3]:
meta_data=pd.read_csv("meta_data.csv")

In [4]:
def process_sample(i):
    raw_path = f"sample_raw_count_sum/{i}.csv"
    norm_path = f"sample_normalized_count_average/{i}.csv"
    
    if os.path.exists(raw_path) and os.path.exists(norm_path):
        return i  # skip if both files already exist

    adata = sc.read_h5ad(f"sample_h5ad/{i}.h5ad")
    adata.var_names = [s.split("_")[0] for s in adata.var["feature_name"].tolist()]
    raw_count_sum = grouped_obs_sum_raw(adata, "cell_type")
    raw_count_sum.to_csv(raw_path)
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    normalized_count_average = grouped_obs_mean(adata, "cell_type")
    normalized_count_average.to_csv(norm_path)
    return i

sample_ids = meta_data["sample_id"].tolist()


num_workers = 10

with ProcessPoolExecutor(max_workers=num_workers) as executor:
    futures = {executor.submit(process_sample, i): i for i in sample_ids}
    for _ in tqdm(as_completed(futures), total=len(futures)):
        pass

  grouped = adata_filt.obs.groupby(group_key)
  grouped = adata_filt.obs.groupby(group_key)
  grouped = adata_filt.obs.groupby(group_key)
  grouped = adata_filt.obs.groupby(group_key)
  grouped = adata_filt.obs.groupby(group_key)
  grouped = adata_filt.obs.groupby(group_key)
  grouped = adata_filt.obs.groupby(group_key)
  grouped = adata_filt.obs.groupby(group_key)
  grouped = adata_filt.obs.groupby(group_key)
  grouped = adata_filt.obs.groupby(group_key)
  grouped = adata_filt.obs.groupby(group_key)
  grouped = adata_filt.obs.groupby(group_key)
  grouped = adata_filt.obs.groupby(group_key)
  grouped = adata_filt.obs.groupby(group_key)
  grouped = adata_filt.obs.groupby(group_key)
  grouped = adata_filt.obs.groupby(group_key)
  grouped = adata_filt.obs.groupby(group_key)
  grouped = adata_filt.obs.groupby(group_key)
  grouped = adata_filt.obs.groupby(group_key)
  grouped = adata_filt.obs.groupby(group_key)
  grouped = adata_filt.obs.groupby(group_key)
  grouped = adata_filt.obs.groupby