# 01 - Merge Datasets

This notebook merges multiple h5ad files into a single AnnData object for integration.

## Inputs
- List of h5ad file paths
- Batch key name
- Optional metadata mapping

## Outputs
- `merged.h5ad` - Combined dataset
- `merge_summary.txt` - Plain-text merge summary (total cells and genes, cells per batch, output path)

In [1]:
import scanpy as sc
import anndata as ad
import pandas as pd
import numpy as np
from pathlib import Path
import yaml

# For nice progress bars
from tqdm.auto import tqdm

# Scanpy logging verbosity for this session; level 2 presumably adds
# "info"-level messages on top of errors/warnings — confirm against the scanpy docs.
sc.settings.verbosity = 2

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


  from .autonotebook import tqdm as notebook_tqdm


## Configuration

Edit the parameters below or load from a config file.

In [2]:
# Parameters can come from a YAML file or be written inline; either way the
# result is a nested dict with "input" (files, batch_key) and "output" (dir).
#
# Variant A - read from a config file:
# config_path = "../config/my_config.yaml"
# with open(config_path) as f:
#     config = yaml.safe_load(f)
#
# Variant B - inline definition (active):
config = dict(
    input=dict(
        # One entry per dataset: where to read it from and what to call it
        files=[
            dict(
                path="/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/4_integration/2025_11_30/rna/integrate/round_1/annotation/annotated.h5ad",
                name="igvf_sc-islet_10X-Multiome",
            ),
            dict(
                path="/cellar/users/aklie/data/datasets/Augsornworawat2023_sc-islet_10X-Multiome/results/4_cell_annotation/rna/integrate/round_1/annotation/annotated.h5ad",
                name="Augsornworawat2023_sc-islet_10X-Multiome",
            ),
        ],
        # obs column that records the dataset of origin after merging
        batch_key="dataset",
    ),
    output=dict(
        dir="/cellar/users/aklie/projects/igvf/beta_cell_networks/scratch/2026_01_11/integration/results",
    ),
)

In [3]:
# Unpack the settings that the remaining cells refer to by name
input_section = config["input"]
input_files, batch_key = input_section["files"], input_section["batch_key"]
output_dir = Path(config["output"]["dir"])

# Ensure the destination folder (and any missing parents) exists
output_dir.mkdir(parents=True, exist_ok=True)

# Echo the effective settings, one per line
print(
    f"Input files: {len(input_files)}",
    f"Batch key: {batch_key}",
    f"Output directory: {output_dir}",
    sep="\n",
)

Input files: 2
Batch key: dataset
Output directory: /cellar/users/aklie/projects/igvf/beta_cell_networks/scratch/2026_01_11/integration/results


## Load datasets

In [4]:
# Validate the file list before reading anything: loading is slow, so config
# mistakes should surface immediately rather than after earlier files were read.
dataset_names = [file_info["name"] for file_info in input_files]
repeated_names = sorted({n for n in dataset_names if dataset_names.count(n) > 1})
if repeated_names:
    # Names are used as dict keys below; a repeated name would silently
    # replace the dataset stored earlier under the same key.
    raise ValueError(f"Duplicate dataset names in config: {repeated_names}")

missing_paths = [
    file_info["path"]
    for file_info in input_files
    if not Path(file_info["path"]).is_file()
]
if missing_paths:
    raise FileNotFoundError(f"Input h5ad files not found: {missing_paths}")

# Loaded AnnData objects keyed by dataset name, in config order
adatas = {}

for file_info in tqdm(input_files, desc="Loading datasets"):
    path = file_info["path"]
    name = file_info["name"]

    print(f"\nLoading {name} from {path}")
    adata = sc.read_h5ad(path)

    # Tag every cell with the dataset it came from
    adata.obs[batch_key] = name

    # Store
    adatas[name] = adata

    # Brief per-dataset report: dimensions and the first ten obs columns
    print(f"  Shape: {adata.shape}")
    print(f"  Obs columns: {list(adata.obs.columns)[:10]}...")

Loading datasets:   0%|          | 0/2 [00:00<?, ?it/s]


Loading igvf_sc-islet_10X-Multiome from /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/4_integration/2025_11_30/rna/integrate/round_1/annotation/annotated.h5ad


Loading datasets:  50%|█████     | 1/2 [02:44<02:44, 164.55s/it]

  Shape: (328420, 36601)
  Obs columns: ['gex_barcode_cellranger', 'atac_barcode_cellranger', 'is_cell_cellranger', 'excluded_reason_cellranger', 'gex_raw_reads_cellranger', 'gex_mapped_reads_cellranger', 'gex_conf_intergenic_reads_cellranger', 'gex_conf_exonic_reads_cellranger', 'gex_conf_intronic_reads_cellranger', 'gex_conf_exonic_unique_reads_cellranger']...

Loading Augsornworawat2023_sc-islet_10X-Multiome from /cellar/users/aklie/data/datasets/Augsornworawat2023_sc-islet_10X-Multiome/results/4_cell_annotation/rna/integrate/round_1/annotation/annotated.h5ad


Loading datasets: 100%|██████████| 2/2 [04:23<00:00, 131.74s/it]

  Shape: (44060, 36601)
  Obs columns: ['gex_barcode_cellranger', 'atac_barcode_cellranger', 'is_cell_cellranger', 'excluded_reason_cellranger', 'gex_raw_reads_cellranger', 'gex_mapped_reads_cellranger', 'gex_conf_intergenic_reads_cellranger', 'gex_conf_exonic_reads_cellranger', 'gex_conf_intronic_reads_cellranger', 'gex_conf_exonic_unique_reads_cellranger']...





## Inspect datasets before merging

In [5]:
# Build one record per loaded dataset: size, whether .raw is set, layer names
summary = [
    {
        "dataset": dataset_name,
        "n_cells": dataset.n_obs,
        "n_genes": dataset.n_vars,
        "has_raw": dataset.raw is not None,
        "layers": list(dataset.layers.keys()) if dataset.layers else [],
    }
    for dataset_name, dataset in adatas.items()
]

# Render the records as a table
summary_df = pd.DataFrame(summary)
display(summary_df)

Unnamed: 0,dataset,n_cells,n_genes,has_raw,layers
0,igvf_sc-islet_10X-Multiome,328420,36601,True,[counts]
1,Augsornworawat2023_sc-islet_10X-Multiome,44060,36601,True,[counts]


In [6]:
# Collect each dataset's gene (var) names as a set
gene_sets = []
for dataset in adatas.values():
    gene_sets.append(set(dataset.var_names))

# Genes shared by every dataset vs. genes seen in at least one
common_genes = set.intersection(*gene_sets)
all_genes = set.union(*gene_sets)

# Report the counts and the shared fraction as a percentage
print(f"Common genes: {len(common_genes)}")
print(f"Total unique genes: {len(all_genes)}")
print(f"Overlap: {len(common_genes) / len(all_genes) * 100:.1f}%")

Common genes: 36601
Total unique genes: 36601
Overlap: 100.0%


## Merge datasets

In [7]:
# Gene-axis join strategy: "inner" keeps only genes present in every dataset,
# "outer" keeps the union of genes across datasets.
join_type = "inner"  # or "outer"

print(f"Concatenating with join={join_type}...")
adata_merged = ad.concat(
    list(adatas.values()),
    join=join_type,
    # Keep gene-level annotations (.var / .varm) that are identical in every
    # dataset. With the default merge=None, concat keeps none of them, so the
    # merged object would carry no gene annotation columns at all.
    merge="same",
    # Record each cell's dataset of origin in obs[batch_key], using the dict keys
    label=batch_key,
    keys=list(adatas.keys()),
    # Joins every obs name with its dataset key using "_" (not only duplicated
    # names), so that barcodes shared between datasets remain distinguishable.
    index_unique="_",
)

# Report the merged dimensions and how many cells each dataset contributed
print(f"\nMerged shape: {adata_merged.shape}")
print(f"Cells per batch:")
display(adata_merged.obs[batch_key].value_counts())

Concatenating with join=inner...

Merged shape: (372480, 36601)
Cells per batch:


dataset
igvf_sc-islet_10X-Multiome                  328420
Augsornworawat2023_sc-islet_10X-Multiome     44060
Name: count, dtype: int64

## Verify merge and clean up

In [9]:
# Store the batch labels with pandas' categorical dtype
batch_labels = adata_merged.obs[batch_key]
adata_merged.obs[batch_key] = batch_labels.astype("category")

# If .var ended up with repeated column names, keep only the first of each
is_repeated_col = adata_merged.var.columns.duplicated()
if is_repeated_col.any():
    adata_merged.var = adata_merged.var.loc[:, ~is_repeated_col]

# List the obs columns present in the merged object
print("Merged obs columns:")
print(adata_merged.obs.columns.tolist())

Merged obs columns:
['gex_barcode_cellranger', 'atac_barcode_cellranger', 'is_cell_cellranger', 'excluded_reason_cellranger', 'gex_raw_reads_cellranger', 'gex_mapped_reads_cellranger', 'gex_conf_intergenic_reads_cellranger', 'gex_conf_exonic_reads_cellranger', 'gex_conf_intronic_reads_cellranger', 'gex_conf_exonic_unique_reads_cellranger', 'gex_conf_exonic_antisense_reads_cellranger', 'gex_conf_exonic_dup_reads_cellranger', 'gex_exonic_umis_cellranger', 'gex_conf_intronic_unique_reads_cellranger', 'gex_conf_intronic_antisense_reads_cellranger', 'gex_conf_intronic_dup_reads_cellranger', 'gex_intronic_umis_cellranger', 'gex_conf_txomic_unique_reads_cellranger', 'gex_umis_count_cellranger', 'gex_genes_count_cellranger', 'atac_raw_reads_cellranger', 'atac_unmapped_reads_cellranger', 'atac_lowmapq_cellranger', 'atac_dup_reads_cellranger', 'atac_chimeric_reads_cellranger', 'atac_mitochondrial_reads_cellranger', 'atac_fragments_cellranger', 'atac_TSS_fragments_cellranger', 'atac_peak_region_f

In [10]:
# Count nulls in each obs column and show only the columns that contain any
print("\nMissing values per column:")
null_counts = adata_merged.obs.isnull().sum()
missing = null_counts[null_counts > 0]
if missing.empty:
    print("No missing values in obs.")
else:
    display(missing)


Missing values per column:
No missing values in obs.


## Save merged dataset

In [11]:
# Write the complete merged AnnData object into the output directory
output_path = output_dir.joinpath("merged.h5ad")
print(f"Saving to {output_path}...")
adata_merged.write_h5ad(output_path)
print("Done!")

Saving to /cellar/users/aklie/projects/igvf/beta_cell_networks/scratch/2026_01_11/integration/results/merged.h5ad...
Done!


In [12]:
# Write a short plain-text report: totals, batch key, per-batch cell counts,
# and the location of the merged h5ad file
summary_path = output_dir / "merge_summary.txt"
with open(summary_path, "w") as f:
    header_lines = [
        "=== Merge Summary ===\n\n",
        f"Total cells: {adata_merged.n_obs}\n",
        f"Total genes: {adata_merged.n_vars}\n",
        f"Batch key: {batch_key}\n\n",
        "Cells per batch:\n",
    ]
    batch_lines = [
        f"  {batch}: {count}\n"
        for batch, count in adata_merged.obs[batch_key].value_counts().items()
    ]
    footer_lines = [f"\nOutput: {output_path}\n"]
    f.writelines(header_lines + batch_lines + footer_lines)

print(f"Summary saved to {summary_path}")

Summary saved to /cellar/users/aklie/projects/igvf/beta_cell_networks/scratch/2026_01_11/integration/results/merge_summary.txt


## Optional: Filter by cell type

Uncomment and modify to create a filtered version with specific cell types.

In [None]:
# # Filter to specific cell types
# celltype_col = "celltype"  # Adjust to your column name
# keep_types = ["alpha", "beta", "delta"]  # Adjust to your cell types

# if celltype_col in adata_merged.obs.columns:
#     mask = adata_merged.obs[celltype_col].isin(keep_types)
#     adata_filt = adata_merged[mask].copy()
#     print(f"Filtered to {adata_filt.n_obs} cells")
#     
#     # Save filtered
#     filt_path = output_dir / "merged_celltype_filt.h5ad"
#     adata_filt.write_h5ad(filt_path)
#     print(f"Saved to {filt_path}")

## Next steps

The merged dataset is now ready for integration. Proceed to one of:

- `02_harmony_integration.ipynb` - Harmony (via R/rpy2)
- `03_seurat_integration.ipynb` - Seurat methods (CCA, RPCA, FastMNN)
- `04_python_integration.ipynb` - scVI and Scanorama (pure Python)