# Basic Pre-processing of 10X scRNA-seq data (Part 1)

Load the processed dataset from the snakemake pipeline

In [None]:
# Load params
import os

# Pipeline parameters are passed in through environment variables.
# os.getenv returns "" (not None) for a variable that is exported but blank,
# so empty values are treated the same as unset ones throughout this cell.

# Required: path of the input AnnData (.h5ad) file. Fail fast here rather than
# letting a blank path surface later as a confusing file-read error.
h5ad_file = os.getenv("SNAKEMAKE_H5AD_FILE")
if not h5ad_file:
    raise ValueError("SNAKEMAKE_H5AD_FILE environment variable is not set or is empty.")

# Optional: MAD multiplier used for outlier detection (integer, default 5).
outlier_threshold = os.getenv("SNAKEMAKE_OUTLIER_THRESHOLD") or "5"
outlier_threshold = int(outlier_threshold)

# Optional: output path for the filtered AnnData object.
processed_filename = os.getenv("SNAKEMAKE_PROCESSED_FILENAME") or "processed_adata.h5ad"

print("Scanpy anndata file:", h5ad_file)
print("Outlier threshold:", outlier_threshold)
print("Processed filename:", processed_filename)

Read the `.h5ad` file into an `AnnData` object using `scanpy`

In [None]:
import scanpy as sc
# Read the AnnData object from the path supplied in the parameters cell.
adata = sc.read_h5ad(filename=h5ad_file)

# Report success and the dimensions of the loaded object.
loaded_shape = adata.shape
print("Loaded AnnData object successfully.")
print("Shape: ", loaded_shape)

# Perform basic QC analysis and filtering

First, we will evaluate low-quality cells. Much of this is based on the [single-cell best practices workflow](https://www.sc-best-practices.org/preprocessing_visualization/quality_control.html).

In [None]:
# Gene-level boolean flags, assigned from case-sensitive matches on gene names.
gene_symbols = adata.var_names

# mitochondrial genes
adata.var["mt"] = gene_symbols.str.startswith("MT-")
# ribosomal genes
adata.var["ribo"] = gene_symbols.str.startswith(("RPS", "RPL"))
# hemoglobin genes: names starting with "HB" whose next character is not one
# of "(", "P" or ")" -- that is exactly what the character class [^(P)] excludes.
adata.var["hb"] = gene_symbols.str.contains("^HB[^(P)]")

# Compute QC metrics in place (inplace=True), using the three flags above as
# qc_vars; these produce the pct_counts_* columns used by the later cells.
qc_flag_columns = ["mt", "ribo", "hb"]
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=qc_flag_columns,
    percent_top=[20],
    log1p=True,
    inplace=True,
)
adata

Evaluate and filter cells

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram of total counts per cell (100 bins, no density estimate).
counts_per_cell = adata.obs["total_counts"]
sns.displot(counts_per_cell, bins=100, kde=False)

# Label the figure that displot just created.
plt.title("Distribution of total counts per cell")
plt.xlabel("Total counts per cell")
plt.ylabel("Number of cells")
plt.show()

In [None]:
# Violin plot of the three percentage QC metrics on a shared axis.
qc_pct_columns = ["pct_counts_mt", "pct_counts_ribo", "pct_counts_hb"]

fig, ax = plt.subplots()
ax = sc.pl.violin(adata, qc_pct_columns, jitter=0.4, ax=ax, show=False)
ax.set(
    ylabel="Percentage of counts",
    title="Percentage of counts in mitochondrial, ribosomal, and hemoglobin genes",
)
plt.show()

In [None]:
# Scatter of library size against detected genes, coloured by pct_counts_mt.
fig, ax = plt.subplots()
ax = sc.pl.scatter(
    adata,
    "total_counts",
    "n_genes_by_counts",
    color="pct_counts_mt",
    show=False,
    ax=ax,
)
ax.set(
    xlabel="Total counts per cell",
    ylabel="Number of genes per cell",
    title="Total counts vs. number of genes per cell",
)
plt.show()

Compute the median absolute deviation (MAD), $\text{MAD} = \text{median}(|x_i - \text{median}(x)|)$, and use it to automatically threshold outliers

In [None]:
from scipy.stats import median_abs_deviation
import numpy as np

# Snapshot the current AnnData object into .raw before any cells are
# filtered out by the cells that follow.
adata.raw = adata.copy()

def evaluate_outlier(observations: np.ndarray, mad_threshold: int) -> np.ndarray:
    """
    Flag observations that lie too far from the median, measured in MADs.

    An observation is an outlier when it is strictly below
    ``median - mad_threshold * MAD`` or strictly above
    ``median + mad_threshold * MAD``, where MAD is the median absolute
    deviation returned by ``scipy.stats.median_abs_deviation`` called with its
    default arguments. Values exactly on a bound are not flagged.

    :param observations: The observations to evaluate (a 1-D NumPy array; a
        pandas Series also works and yields a boolean Series).
    :param mad_threshold: The number of MADs away from the median beyond which
        an observation is considered an outlier.
    :return: A boolean array (same container type as the input) indicating
        which observations are outliers.
    """
    center = np.median(observations)
    # Compute the MAD once and reuse it for both the lower and upper bound.
    spread = mad_threshold * median_abs_deviation(observations)
    lower_bound = center - spread
    upper_bound = center + spread
    return (observations < lower_bound) | (upper_bound < observations)

# Evaluate outliers: a cell is flagged when ANY of these QC metrics falls
# outside the MAD-based bounds computed by evaluate_outlier.
mad_checked_metrics = (
    "log1p_total_counts",
    "log1p_n_genes_by_counts",
    "pct_counts_in_top_20_genes",
)
combined_flags = None
for metric_name in mad_checked_metrics:
    metric_flags = evaluate_outlier(adata.obs[metric_name], outlier_threshold)
    combined_flags = metric_flags if combined_flags is None else (combined_flags | metric_flags)

adata.obs["outlier"] = combined_flags
adata.obs.outlier.value_counts()

In [None]:
# MT-outliers are slightly different: the MAD multiplier is reduced by 2, and
# any cell with pct_counts_mt above 10 is flagged regardless of the MAD test.
# NOTE(review): with outlier_threshold <= 2 the reduced multiplier is <= 0,
# which would flag (almost) every cell -- confirm the intended lower limit.
mt_pct = adata.obs["pct_counts_mt"]
mt_mad_flags = evaluate_outlier(mt_pct, outlier_threshold - 2)
mt_over_cap = mt_pct > 10
adata.obs["mt_outlier"] = mt_mad_flags | mt_over_cap
adata.obs.mt_outlier.value_counts()

In [None]:
# A cell is dropped when it is flagged by either the general or the MT test.
general_flags = adata.obs["outlier"]
mt_flags = adata.obs["mt_outlier"]
to_filter_out = general_flags | mt_flags
print("Number of cells to filter out:", to_filter_out.sum())
print("Out of total cells:", adata.shape[0])

# Keep only the unflagged cells; .copy() turns the view into a standalone object.
keep_mask = ~to_filter_out
adata = adata[keep_mask].copy()
adata

In [None]:
# Same scatter as before filtering, redrawn on the retained cells.
fig, ax = plt.subplots()
ax = sc.pl.scatter(
    adata,
    "total_counts",
    "n_genes_by_counts",
    color="pct_counts_mt",
    show=False,
    ax=ax,
)
ax.set(
    xlabel="Total counts per cell",
    ylabel="Number of genes per cell",
    title="Total counts vs. number of genes per cell (after filtering)",
)
plt.show()

# Saving the processed data

In [None]:
# Persist the filtered AnnData object to the configured path, gzip-compressed.
adata.write(filename=processed_filename, compression="gzip")