Create paths for input and output data:

In [2]:
import os

# Directory holding the raw input data, and directory receiving analysis results.
path_liver = "data/liver/raw"
path_results = "data/liver/results"

# Create each directory (including missing parents) unless the path already exists.
for directory in (path_liver, path_results):
    if os.path.exists(directory):
        continue
    os.makedirs(directory, exist_ok=True)

Move the downloaded raw data file into the appropriate subfolder of the working directory, keeping its original name and .h5ad format so that it can be read by scanpy:

In [3]:
import shutil

# The raw data must be downloaded manually from:
# https://www.kaggle.com/datasets/dschettler8845/cellxgene-10k?resource=download
filename="raw_scRNAseq_10X_precision_toxicology_combined_annotated_sep21.h5ad"
# Folder the download is expected in; as a relative path it is resolved
# against the current working directory.
path_download = "Downloads/"

# Final location of the data file inside the raw-data directory.
path_final = os.path.join(path_liver, filename)
if not os.path.exists(path_final):
    file = os.path.join(path_download, filename)
    # Fail early with an actionable message instead of a bare errno from shutil.
    if not os.path.isfile(file):
        raise FileNotFoundError(
            f"Expected the downloaded dataset at '{file}'. Download it from the "
            "Kaggle page referenced in this cell and place it in that folder "
            "before running this cell again."
        )
    # Move to the full target file path rather than to the directory: if the
    # directory were missing, moving to `path_liver` would silently rename the
    # data file to a plain file called "raw".
    shutil.move(file, path_final)

Import necessary modules and set basic settings:

In [4]:
import numpy as np
import pandas as pd
import scanpy as sc
import warnings
import gc

# Suppress every warning category (Warning is the base class of all warnings).
warnings.simplefilter("ignore", category=Warning)
# scanpy logging level 3 and figure resolution of 80 dpi for plots.
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=80)

Read in the data from the .h5ad file:

In [5]:
# Load the dataset stored at `path_final` (the .h5ad file in the raw-data directory).
adata = sc.read_h5ad(path_final)
# Bare expression: the notebook displays the object's summary as the cell output.
adata

AnnData object with n_obs × n_vars = 63527 × 19971
    obs: 'Age', 'Treatment', 'batch', 'sample_id', 'sample_name', 'n_counts', 'log_counts', 'n_genes', 'mt_fraction', 'replicate', 'subtype', 'subtype2', 'shared_clusters', 'donor_louvain', 'shared_clusters_sanity_check'
    var: 'gene_name'

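Remove uninformative columns from `adata.obs`, i.e. columns that contain only a single unique value (the name of each removed column is printed):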

In [6]:
# Collect the obs columns that hold exactly one unique value; such columns
# cannot distinguish between observations.
constant_columns = [
    column
    for column in adata.obs.columns
    if len(adata.obs[column].unique()) == 1
]

# Report each column being removed, then drop them all in a single call.
for column in constant_columns:
    print(column)
adata.obs.drop(columns=constant_columns, inplace=True)
adata

AnnData object with n_obs × n_vars = 63527 × 19971
    obs: 'Age', 'Treatment', 'batch', 'sample_id', 'sample_name', 'n_counts', 'log_counts', 'n_genes', 'mt_fraction', 'replicate', 'subtype', 'subtype2', 'shared_clusters', 'donor_louvain', 'shared_clusters_sanity_check'
    var: 'gene_name'