# Basic Pre-processing of 10X scRNA-seq data (Part 2)

Load the processed dataset from the snakemake pipeline

In [None]:
# Load params
# All inputs arrive through SNAKEMAKE_* environment variables; the two input
# paths are mandatory, the remaining settings fall back to defaults.
import os


def _require_env(name):
    """Return the value of environment variable `name`; raise ValueError if it is unset."""
    value = os.environ.get(name)
    if value is None:
        raise ValueError(f"{name} environment variable is not set.")
    return value


h5ad_file = _require_env("SNAKEMAKE_H5AD_FILE")
doublet_table = _require_env("SNAKEMAKE_DOUBLET_FILE")
n_hvgs = int(os.environ.get("SNAKEMAKE_N_HVGS", "5000"))
processed_filename = os.environ.get("SNAKEMAKE_PROCESSED_FILENAME", "processed_adata.h5ad")

for description, setting in (
    ("Scanpy anndata file:", h5ad_file),
    ("Doublet table file:", doublet_table),
    ("Number of highly variable genes to compute:", n_hvgs),
    ("Processed filename:", processed_filename),
):
    print(description, setting)

Load the data into an `AnnData` object with `Scanpy`

In [None]:
import scanpy as sc

# Read the AnnData object written by the upstream pipeline step.
adata = sc.read_h5ad(h5ad_file)

# Reset `.raw`: drop whatever was stored there, then store a copy of the
# object exactly as it was loaded.
adata.raw = None
adata.raw = adata.copy()

print("Loaded AnnData object successfully.")
n_rows, n_cols = adata.shape
print("Shape: ", (n_rows, n_cols))

Read the doublet table and add it to the AnnData object

In [None]:
import pandas as pd

# The first CSV column becomes the index and is named "cell".
doublet_df = pd.read_csv(doublet_table, index_col=0).rename_axis("cell")

# Show how many rows fall into each scDblFinder class.
class_column = doublet_df['scDblFinder.class']
class_column.value_counts()

Plot doublet scores, then remove doublets from the AnnData object

In [None]:
import matplotlib.pyplot as plt

# Histogram of doublet scores, drawn through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(doublet_df['scDblFinder.score'], bins=50, color='blue', alpha=0.7)
ax.set_xlabel('Doublet Score')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Doublet Scores')

In [None]:
# Add doublet info to the AnnData object by joining on the cell index.
# Only columns that are not already in `adata.obs` are joined, so re-running
# this cell (or a name clash with existing metadata) does not trigger pandas'
# "columns overlap but no suffix specified" error.
new_cols = [col for col in doublet_df.columns if col not in adata.obs.columns]
skipped_cols = [col for col in doublet_df.columns if col in adata.obs.columns]
if skipped_cols:
    print("Doublet columns already present in adata.obs (not re-joined):", skipped_cols)
adata.obs = adata.obs.join(doublet_df[new_cols], how='left')

# A left join leaves NaN for every cell that has no row in the doublet table.
# Such cells are not labelled 'doublet', so a `!= 'doublet'` filter keeps them
# silently; report them here so an index mismatch does not go unnoticed.
n_unmatched = int(adata.obs['scDblFinder.class'].isna().sum())
if n_unmatched:
    print(
        f"Warning: {n_unmatched} of {adata.n_obs} cells have no entry in the "
        "doublet table and will not be treated as doublets."
    )
adata

In [None]:
# Remove doublets from the AnnData object: keep every cell whose scDblFinder
# class is anything other than 'doublet', and materialise the subset as a copy.
keep_cell = adata.obs['scDblFinder.class'] != 'doublet'
adata = adata[keep_cell].copy()
adata

# Basic Pre-processing analysis

We will call highly variable genes (HVGs) using the Pearson residuals method, which operates on raw counts.

In [None]:
# Flag the top `n_hvgs` genes with the Pearson-residuals flavour, reading
# from `adata.X` (layer=None) and using 'sample' as the batch key.
hvg_options = dict(
    n_top_genes=n_hvgs,
    flavor="pearson_residuals",
    layer=None,
    batch_key='sample',
)
sc.experimental.pp.highly_variable_genes(adata, **hvg_options)

n_flagged = adata.var["highly_variable"].sum()
print("Number of highly variable genes:", n_flagged)

Compute the number of PCs needed using the kneedle method

In [None]:
# Number of principal components to compute. The ARPACK solver needs
# n_comps to be strictly smaller than both the number of cells and the number
# of genes entering the PCA (here: the highly variable genes), so the target
# of 100 is clamped for small datasets or a small HVG setting.
max_pcs = 100
n_hvg_selected = int(adata.var["highly_variable"].sum())
n_pcs = min(max_pcs, adata.n_obs - 1, n_hvg_selected - 1)
if n_pcs < max_pcs:
    print(f"Reducing number of PCs from {max_pcs} to {n_pcs} to fit the data dimensions.")

# Create a logX layer: copy of X, normalised to 1e4 counts per cell, then log1p.
# X itself is left untouched.
adata.layers["logX"] = adata.X.copy()
sc.pp.normalize_total(adata, target_sum=1e4, layer="logX", inplace=True)
sc.pp.log1p(adata, layer="logX", copy=False)

# PCA on the logX layer, restricted to the highly variable genes.
sc.pp.pca(adata, mask_var='highly_variable', layer="logX", n_comps=n_pcs, svd_solver="arpack")
adata

In [None]:
import numpy as np

# Per-component explained-variance ratio stored by the PCA step, and its running total.
pca_results = adata.uns["pca"]
var_ratio = pca_results["variance_ratio"]
cum_var = np.cumsum(var_ratio)
cum_var

In [None]:
from kneed import KneeLocator

# Search for the knee of the cumulative variance curve; components are
# numbered from 1 so the knee value is directly a PC count.
component_numbers = range(1, len(cum_var) + 1)
kl = KneeLocator(
    component_numbers,
    cum_var,
    curve="concave",
    direction="increasing",
    S=1.0,
)
if kl.knee:
    n_pcs = int(kl.knee)
    print(f"Number of PCs to use: {n_pcs}")
else:
    # No knee found: n_pcs keeps the value it had before this cell.
    print("Warning: no knee point. Using all PCs.")

In [None]:
import matplotlib.pyplot as plt

# Plot the knee.
# `pca_variance_ratio` shows only the top 30 PCs by default; widen the range
# when the selected number of PCs is larger so the marker stays on the axis.
n_shown = max(30, n_pcs)
sc.pl.pca_variance_ratio(adata, n_pcs=n_shown, log=True, show=False)
ax = plt.gca()

# scanpy's ranking plot puts the k-th ranked PC at x = k - 1 (PC1 sits at
# x = 0), so the marker for the n_pcs-th component goes at n_pcs - 1.
ax.axvline(n_pcs - 1, color="red", linestyle="--", label=f"n_pcs={n_pcs}")
ax.set_xlabel("Number of PCs")
ax.set_ylabel("Variance explained (log scale)")
ax.set_title("Variance explained by PCs (with knee point)")
# Without a legend the axvline label is never displayed.
ax.legend()
plt.show()

# Finally, some basic clustering and visualization

In [None]:
# Neighbour graph restricted to the selected number of PCs, followed by the
# UMAP embedding and Leiden clustering.
sc.pp.neighbors(adata, n_pcs=n_pcs)
sc.tl.umap(adata, min_dist=0.5, spread=1.0)
sc.tl.leiden(adata, resolution=0.5)

# One UMAP panel per annotation, two panels per row.
umap_color_keys = [
    "leiden",
    "n_genes_by_counts",
    "total_counts",
    "pct_counts_mt",
    "sample",
]
sc.pl.umap(adata, color=umap_color_keys, frameon=False, ncols=2)

# Saving the processed data

In [None]:
# Write the processed object to the configured output path, gzip-compressed.
adata.write(filename=processed_filename, compression="gzip")