In [1]:
import scanpy as sc
from pathlib import Path
import os

sc.settings.verbosity = 0

In [2]:
### Quality control
def filter_pp_qc(sample):
    # Filtering for cell and genes
    sample.var_names_make_unique()
    sc.pp.filter_cells(sample, min_genes=200)
    sc.pp.filter_genes(sample, min_cells=3)
    # mitochondrial genes
    sample.var["mt"] = sample.var_names.str.startswith("MT-")
    # ribosomal genes
    sample.var["ribo"] = sample.var_names.str.startswith(("RPS", "RPL"))
    sc.pp.calculate_qc_metrics(sample, qc_vars=["mt", "ribo"], inplace=True)
    # Remove cells with high mitochondrial gene percentage
    sample = sample[sample.obs.pct_counts_mt < 15, :]
    # Optional: filter based on percent mitochondria or number of genes
    #sc.pp.normalize_total(sample, target_sum=1e4)
    #sc.pp.log1p(sample)
    # sc.pp.highly_variable_genes(sample, flavor='seurat', n_top_genes=2000)

    return sample

In [None]:
# dictionary to map sample identifiers to sample properties
files = {
    "GSM4446535": "week8_001",
    "GSM4446536": "week9_063",
    "GSM4446537": "week6_088",
    "GSM4446538": "week14_123",
    "GSM4446539": "week12_124",
    "GSM4446540": "week8_125",
    "GSM4446541": "week9_005",
    "GSM4446542": "week11_006",
    "GSM4446543": "week9_007",
    "GSM4734601": "week8_016",
    "GSM4734602": "week9_031_paraganglia",
    "GSM4734603": "week12_035",
    "GSM4734604": "week12_036_extraadrenal",
}

In [4]:
def concat_h5_files(
    files: dict, raw_dir="data/raw/GSE147821_RAW", out_dir="data/processed", overwrite=False, log=False,
):
    concat_file = Path(out_dir) / "concatenated.h5ad"
    Path(out_dir).mkdir(exist_ok=True, parents=True)
    if not Path(concat_file).exists() or overwrite:
        samples = []
        for key in files:
            # find the file
            h5_file = [key for key in os.listdir(raw_dir) if key.startswith(key)][0]
            # find the matching info
            info = files[key]
            # extract the information from the file name
            week_str = info.split("_")[0]  # gets the week as string
            week = week_str.split("week")[1]  # converts to integer
            sample_name = info.split("_")[1]  # gets the sample name

            # assign the full path
            full_path = Path(raw_dir) / h5_file

            # read the file
            sample = filter_pp_qc(sc.read_10x_h5(full_path))  # filter the sample
            sample.obs_names = [f"{info}_{cell}" for cell in sample.obs_names]

            # Add metadata
            sample.obs["sample_id"] = sample_name
            sample.obs["week"] = week
            sample.obs["batch"] = info

            # append to the list
            samples.append(sample)
        # concatenate the samples
        adt = sc.concat(samples)
        if log:
            # save the raw data
            adt.raw = adt
            # normalize the data
            sc.pp.log1p(adt)

        adt.layers["counts"] = adt.X.copy()  # Preserve original counts
        # Select highly variable genes globally
        sc.pp.normalize_total(adt, target_sum=1e4)
        # log transform the data
        sc.pp.log1p(adt)
        # save the file
        adt.write_h5ad(concat_file)
    else:
        adt = sc.read_h5ad(concat_file)

    return adt


In [5]:
adata = concat_h5_files(files, overwrite=True, log=False)

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_na

In [6]:
adata.write_h5ad("data/processed/concatenated_std.h5ad")

In [7]:
adata

AnnData object with n_obs × n_vars = 99073 × 22639
    obs: 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'sample_id', 'week', 'batch'
    uns: 'log1p'
    layers: 'counts'