In [7]:
import numpy as np
import scanpy as sc

# 1. Single-Cell RNA-seq Dataset Exploration

In [8]:
adata = sc.read(
    filename="s4d8_quality_control.h5ad",
    backup_url="https://figshare.com/ndownloader/files/40014331",
)



In [9]:
# Print summary of the dataset
adata

AnnData object with n_obs × n_vars = 14814 × 20171
    obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'scDblFinder_score', 'scDblFinder_class'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells'
    layers: 'counts', 'soupX_counts'

In [10]:
# Inspect the Dataset Shape
adata.shape

(14814, 20171)

- Shape: `(14814, 20171)`
  - 14814 cells (rows in `adata.obs`)
  - 20171 genes (columns in `adata.var`)

This tells that there are roughly **14814** single cells and **20171** measured features (genes) in the dataset.

## Observations (Cells)

Each row in `adata.obs` corresponds to a single cell.  
We'll inspect the first few rows to see what metadata fields are present.

In [11]:
# List all columns (metadata fields) in adata.obs
print("Cell metadata fields:", adata.obs_keys())

# Show the first five rows of the cell metadata
adata.obs.head()

Cell metadata fields: ['n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'scDblFinder_score', 'scDblFinder_class']


Unnamed: 0,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_20_genes,total_counts_mt,log1p_total_counts_mt,pct_counts_mt,total_counts_ribo,log1p_total_counts_ribo,pct_counts_ribo,total_counts_hb,log1p_total_counts_hb,pct_counts_hb,outlier,mt_outlier,scDblFinder_score,scDblFinder_class
AAACAGCCAAGCTTAT-1,1362,7.217443,2243.0,7.716015,9.719126,18.0,2.944439,0.802497,125.0,4.836282,5.572894,3.0,1.386294,0.133749,False,False,0.036574,singlet
AAACAGCCATAGCTTG-1,2126,7.662468,4085.0,8.315322,10.820073,63.0,4.158883,1.542228,69.0,4.248495,1.689106,10.0,2.397895,0.244798,False,False,0.718944,doublet
AAACAGCCATGTTTGG-1,1340,7.201171,2227.0,7.708859,12.977099,30.0,3.433987,1.347104,42.0,3.7612,1.885945,1.0,0.693147,0.044903,False,False,0.214918,singlet
AAACATGCAACGTGCT-1,828,6.72022,1211.0,7.100027,14.037985,20.0,3.044523,1.651528,44.0,3.806663,3.633361,6.0,1.94591,0.495458,False,False,0.201699,singlet
AAACATGCAATATAGG-1,551,6.313548,719.0,6.579251,15.020862,8.0,2.197225,1.112656,22.0,3.135494,3.059805,1.0,0.693147,0.139082,False,False,0.034913,singlet


## Variables (Genes)

Each row in `adata.var` corresponds to a gene (or feature).  
We'll inspect the first few rows to see what annotations (like gene IDs, chromosome info, etc.) are available.

In [12]:
# List all columns (metadata fields) in adata.var
print("Gene metadata fields:", adata.var_keys())

# Show the first five rows of the gene metadata
adata.var.head()

Gene metadata fields: ['gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells']


Unnamed: 0,gene_ids,feature_types,genome,mt,ribo,hb,n_cells_by_counts,mean_counts,log1p_mean_counts,pct_dropout_by_counts,total_counts,log1p_total_counts,n_cells
AL627309.1,ENSG00000238009,Gene Expression,GRCh38,False,False,False,30,0.001831,0.001829,99.822842,31.0,3.465736,27
AL627309.5,ENSG00000241860,Gene Expression,GRCh38,False,False,False,146,0.008976,0.008936,99.137829,152.0,5.030438,126
LINC01409,ENSG00000237491,Gene Expression,GRCh38,False,False,False,883,0.063009,0.061104,94.785638,1067.0,6.973543,758
LINC01128,ENSG00000228794,Gene Expression,GRCh38,False,False,False,533,0.03614,0.035503,96.852486,612.0,6.418365,437
LINC00115,ENSG00000225880,Gene Expression,GRCh38,False,False,False,72,0.00437,0.00436,99.57482,74.0,4.317488,64


## Layers

Sometimes, an `AnnData` object contains multiple expression layers.  
We can take a look of these layer.

In [13]:
print("Available layers:", list(adata.layers.keys()))

Available layers: ['counts', 'soupX_counts']


In [14]:
# Inspect a small slice of 'counts' layer 
print("First 5 cells x 5 genes in 'counts' layer:\n", adata.layers["counts"][:5, :5].toarray())

First 5 cells x 5 genes in 'counts' layer:
 [[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]


In [15]:
# Inspect a small slice of 'soupX_counts' layer 
print("First 5 cells x 5 genes in 'soupX_counts' layer:\n", adata.layers["soupX_counts"][:5, :5].toarray())

First 5 cells x 5 genes in 'soupX_counts' layer:
 [[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]


# 2. Data Preprocessing

## Quality Control

In this section, we will:

1. **Calculate QC metrics**: Total counts, number of detected genes, mitochondrial/ribosomal content, etc.
2. **Filter cells**: Remove cells with too few genes or too many genes (potential doublets), and cells with high mitochondrial gene percentages.
3. **Visualize** the effect of filtering.

### 1. Calculate QC Metrics

We'll use `sc.pp.calculate_qc_metrics`, which can automatically compute:
- `n_genes_by_counts` (number of genes per cell)
- `total_counts` (sum of UMIs per cell)
- `adata.var` as `mt`(Mitochondrial percentages) 

In [16]:
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=['mt', 'ribo', 'hb'],  # adjust to what you have in your var
    inplace=True
)

print("QC metrics added to adata.obs:")
print(adata.obs_keys())
adata.obs.head()


QC metrics added to adata.obs:
['n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'scDblFinder_score', 'scDblFinder_class', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes']


Unnamed: 0,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_20_genes,total_counts_mt,log1p_total_counts_mt,pct_counts_mt,total_counts_ribo,log1p_total_counts_ribo,...,log1p_total_counts_hb,pct_counts_hb,outlier,mt_outlier,scDblFinder_score,scDblFinder_class,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes
AAACAGCCAAGCTTAT-1,1272,7.149132,2091.0,7.645876,9.719126,17.0,2.890372,0.813008,123.0,4.820282,...,1.386294,0.143472,False,False,0.036574,singlet,16.834051,25.2989,37.58967,63.079866
AAACAGCCATAGCTTG-1,1783,7.486613,3494.0,8.159089,10.820073,56.0,4.043051,1.602748,61.0,4.127134,...,2.302585,0.257584,False,False,0.718944,doublet,18.717802,26.617058,37.893532,58.643389
AAACAGCCATGTTTGG-1,1217,7.104965,2016.0,7.609367,12.977099,28.0,3.367296,1.388889,35.0,3.583519,...,0.693147,0.049603,False,False,0.214918,singlet,20.882937,29.464286,41.468254,64.434524
AAACATGCAACGTGCT-1,778,6.658011,1141.0,7.040536,14.037985,19.0,2.995732,1.665206,42.0,3.7612,...,1.94591,0.525855,False,False,0.201699,singlet,22.874671,32.60298,49.342682,75.635408
AAACATGCAATATAGG-1,528,6.270988,689.0,6.536692,15.020862,8.0,2.197225,1.161103,19.0,2.995732,...,0.0,0.0,False,False,0.034913,singlet,24.818578,37.880987,52.394775,95.936139
