In [1]:
import numpy as np
import scanpy as sc

# 1. Single-Cell RNA-seq Dataset Exploration

In [2]:
# Load the quality-controlled AnnData file from the working directory.
# If the file is not present on disk, scanpy retrieves it from `backup_url`.
adata = sc.read(
    "s4d8_quality_control.h5ad",
    backup_url="https://figshare.com/ndownloader/files/40014331",
)

  0%|          | 0.00/501M [00:00<?, ?B/s]

In [5]:
# Display the AnnData summary: its dimensions (n_obs × n_vars) and the names of
# the obs columns, var columns and layers it holds.
adata

AnnData object with n_obs × n_vars = 14814 × 20171
    obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'scDblFinder_score', 'scDblFinder_class'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells'
    layers: 'counts', 'soupX_counts'

In [4]:
# Inspect the dataset shape, reported as (n_obs, n_vars)
adata.shape

(14814, 20171)

- Shape: `(14814, 20171)`
  - 14814 cells (rows of the expression matrix; one row each in `adata.obs`)
  - 20171 genes (columns of the expression matrix; one row each in `adata.var`)

This tells you that the dataset contains exactly **14814** cells and **20171** measured features (genes).

## Observations (Cells)

Each row in `adata.obs` corresponds to a single cell.  
We'll inspect the first few rows to see what metadata fields are present.

In [6]:
# Names of the per-cell annotation columns held in adata.obs
print("Cell metadata fields:", adata.obs_keys())

# Preview the annotation table: one row per cell, first five cells only
adata.obs.head(5)

Cell metadata fields: ['n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'scDblFinder_score', 'scDblFinder_class']


Unnamed: 0,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_20_genes,total_counts_mt,log1p_total_counts_mt,pct_counts_mt,total_counts_ribo,log1p_total_counts_ribo,pct_counts_ribo,total_counts_hb,log1p_total_counts_hb,pct_counts_hb,outlier,mt_outlier,scDblFinder_score,scDblFinder_class
AAACAGCCAAGCTTAT-1,1362,7.217443,2243.0,7.716015,9.719126,18.0,2.944439,0.802497,125.0,4.836282,5.572894,3.0,1.386294,0.133749,False,False,0.036574,singlet
AAACAGCCATAGCTTG-1,2126,7.662468,4085.0,8.315322,10.820073,63.0,4.158883,1.542228,69.0,4.248495,1.689106,10.0,2.397895,0.244798,False,False,0.718944,doublet
AAACAGCCATGTTTGG-1,1340,7.201171,2227.0,7.708859,12.977099,30.0,3.433987,1.347104,42.0,3.7612,1.885945,1.0,0.693147,0.044903,False,False,0.214918,singlet
AAACATGCAACGTGCT-1,828,6.72022,1211.0,7.100027,14.037985,20.0,3.044523,1.651528,44.0,3.806663,3.633361,6.0,1.94591,0.495458,False,False,0.201699,singlet
AAACATGCAATATAGG-1,551,6.313548,719.0,6.579251,15.020862,8.0,2.197225,1.112656,22.0,3.135494,3.059805,1.0,0.693147,0.139082,False,False,0.034913,singlet


## Variables (Genes)

Each row in `adata.var` corresponds to a gene (or feature).  
We'll inspect the first few rows to see what annotations (like gene IDs, feature types, the reference genome, etc.) are available.

In [7]:
# Names of the per-gene annotation columns held in adata.var
print("Gene metadata fields:", adata.var_keys())

# Preview the annotation table: one row per gene, first five genes only
adata.var.head(5)

Gene metadata fields: ['gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells']


Unnamed: 0,gene_ids,feature_types,genome,mt,ribo,hb,n_cells_by_counts,mean_counts,log1p_mean_counts,pct_dropout_by_counts,total_counts,log1p_total_counts,n_cells
AL627309.1,ENSG00000238009,Gene Expression,GRCh38,False,False,False,30,0.001831,0.001829,99.822842,31.0,3.465736,27
AL627309.5,ENSG00000241860,Gene Expression,GRCh38,False,False,False,146,0.008976,0.008936,99.137829,152.0,5.030438,126
LINC01409,ENSG00000237491,Gene Expression,GRCh38,False,False,False,883,0.063009,0.061104,94.785638,1067.0,6.973543,758
LINC01128,ENSG00000228794,Gene Expression,GRCh38,False,False,False,533,0.03614,0.035503,96.852486,612.0,6.418365,437
LINC00115,ENSG00000225880,Gene Expression,GRCh38,False,False,False,72,0.00437,0.00436,99.57482,74.0,4.317488,64


## Layers

Sometimes, an `AnnData` object contains multiple expression layers (e.g., raw counts, corrected counts).  
We can check which layers are available and decide which one to use for downstream analysis.

In [9]:
# Show the name of every expression layer stored on the object
print(f"Available layers: {list(adata.layers.keys())}")

Available layers: ['counts', 'soupX_counts']


In [10]:
# Peek at the top-left 5 × 5 corner of the 'counts' layer.
# Only the slice is converted with .toarray(), so the full matrix is never densified.
print(
    "First 5 cells x 5 genes in 'counts' layer:\n",
    adata.layers["counts"][:5, :5].toarray(),
)

First 5 cells x 5 genes in 'counts' layer:
 [[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]


In [11]:
# Peek at the top-left 5 × 5 corner of the 'soupX_counts' layer.
# Only the slice is converted with .toarray(), so the full matrix is never densified.
print(
    "First 5 cells x 5 genes in 'soupX_counts' layer:\n",
    adata.layers["soupX_counts"][:5, :5].toarray(),
)

First 5 cells x 5 genes in 'soupX_counts' layer:
 [[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
