# 01 - Merge Datasets

This notebook merges multiple h5ad files into a single AnnData object for integration.

## Inputs
- List of h5ad file paths
- Batch key name
- Optional downsampling configuration

## Outputs
- `merged.h5ad` - Full combined dataset (always saved)
- `merged_random_{n}.h5ad` - Random downsampled versions (optional)
- `merged_celltype_{n}pertype.h5ad` - Cell type aware downsampled versions (optional)
- `*.summary.txt` - Summary reports for each output

In [1]:
import scanpy as sc
import anndata as ad
import pandas as pd
import numpy as np
from pathlib import Path
import yaml

# For nice progress bars
from tqdm.auto import tqdm

sc.settings.verbosity = 2

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


  from .autonotebook import tqdm as notebook_tqdm


## Configuration

Edit the parameters below or load from a config file.

In [None]:
# Option 1: Load from config file
# config_path = "../config/sc-islet_integration_test.yaml"
# with open(config_path) as f:
#     config = yaml.safe_load(f)

# Option 2: Define parameters directly
# NOTE(review): the paths below are absolute and machine-specific; edit them
# (or switch to Option 1) before running this notebook somewhere else.
config = {
    "input": {
        # One entry per dataset: "path" is the h5ad file to read, "name" is the
        # label stored in the batch column for every cell of that file.
        "files": [
            {"path": "/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/4_integration/2025_11_30/rna/integrate/round_1/annotation/annotated.h5ad", "name": "igvf_sc-islet_10X-Multiome"},
            {"path": "/cellar/users/aklie/data/datasets/Augsornworawat2023_sc-islet_10X-Multiome/results/4_cell_annotation/rna/integrate/round_1/annotation/annotated.h5ad", "name": "Augsornworawat2023_sc-islet_10X-Multiome"},
        ],
        # Name of the obs column that records which dataset a cell came from
        "batch_key": "dataset",
    },
    "output": {
        # Directory that receives merged.h5ad and every downsampled variant
        "dir": "/cellar/users/aklie/projects/igvf/beta_cell_networks/scratch/2026_01_11/integration/results",
    },
    # Downsampling configuration (optional)
    # The full merged dataset is ALWAYS saved
    # Downsampled versions are saved as additional files
    "downsampling": {
        # Seed passed to both downsampling functions
        "seed": 42,
        # Random downsampling - list of target cell counts (or single value)
        # Each value creates: merged_random_{n}.h5ad
        # An empty list disables random downsampling
        "random": {
            "n_cells": [],  # e.g., [10000, 25000, 50000] or 50000
        },
        # Cell type aware downsampling
        # Each value creates: merged_celltype_{n}pertype.h5ad
        # An empty list disables cell type aware downsampling
        "celltype_aware": {
            # obs column holding the cell type label of each cell
            "celltype_column": "cell_type",
            "n_cells_per_type": [],  # e.g., [500, 1000] or 1000
            # False: keep only cell types present in every dataset
            # True: keep every cell type found in any dataset
            "keep_unshared": False,
            # A cell type with fewer cells than this in a dataset is skipped
            # for that dataset
            "min_cells_per_type": 10,
        },
    },
}

In [3]:
# Extract config values
input_files = config["input"]["files"]
batch_key = config["input"]["batch_key"]
output_dir = Path(config["output"]["dir"])

# Fail fast on configuration problems: loading takes minutes per file, and
# datasets are later stored in a dict keyed by name, so a repeated name would
# silently replace an earlier dataset.
if not input_files:
    raise ValueError("config['input']['files'] is empty; nothing to merge.")

dataset_names = [file_info["name"] for file_info in input_files]
duplicate_names = sorted({n for n in dataset_names if dataset_names.count(n) > 1})
if duplicate_names:
    raise ValueError(f"Duplicate dataset names in config: {duplicate_names}")

missing_paths = [
    file_info["path"]
    for file_info in input_files
    if not Path(file_info["path"]).is_file()
]
if missing_paths:
    raise FileNotFoundError(f"Input files not found: {missing_paths}")

# Create output directory
output_dir.mkdir(parents=True, exist_ok=True)

print(f"Input files: {len(input_files)}")
print(f"Batch key: {batch_key}")
print(f"Output directory: {output_dir}")

Input files: 2
Batch key: dataset
Output directory: /cellar/users/aklie/projects/igvf/beta_cell_networks/scratch/2026_01_11/integration/results


## Load datasets

In [4]:
# Dataset name -> AnnData, in the order the files are listed in the config
adatas = {}

for file_info in tqdm(input_files, desc="Loading datasets"):
    path, name = file_info["path"], file_info["name"]
    print(f"\nLoading {name} from {path}")

    # Read the file, tag every cell with its dataset of origin, then register it
    adata = sc.read_h5ad(path)
    adata.obs[batch_key] = name
    adatas[name] = adata

    # Report the size and a preview of the first ten obs columns
    print(f"  Shape: {adata.shape}")
    print(f"  Obs columns: {list(adata.obs.columns)[:10]}...")

Loading datasets:   0%|          | 0/2 [00:00<?, ?it/s]


Loading igvf_sc-islet_10X-Multiome from /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/4_integration/2025_11_30/rna/integrate/round_1/annotation/annotated.h5ad


Loading datasets:  50%|█████     | 1/2 [02:44<02:44, 164.55s/it]

  Shape: (328420, 36601)
  Obs columns: ['gex_barcode_cellranger', 'atac_barcode_cellranger', 'is_cell_cellranger', 'excluded_reason_cellranger', 'gex_raw_reads_cellranger', 'gex_mapped_reads_cellranger', 'gex_conf_intergenic_reads_cellranger', 'gex_conf_exonic_reads_cellranger', 'gex_conf_intronic_reads_cellranger', 'gex_conf_exonic_unique_reads_cellranger']...

Loading Augsornworawat2023_sc-islet_10X-Multiome from /cellar/users/aklie/data/datasets/Augsornworawat2023_sc-islet_10X-Multiome/results/4_cell_annotation/rna/integrate/round_1/annotation/annotated.h5ad


Loading datasets: 100%|██████████| 2/2 [04:23<00:00, 131.74s/it]

  Shape: (44060, 36601)
  Obs columns: ['gex_barcode_cellranger', 'atac_barcode_cellranger', 'is_cell_cellranger', 'excluded_reason_cellranger', 'gex_raw_reads_cellranger', 'gex_mapped_reads_cellranger', 'gex_conf_intergenic_reads_cellranger', 'gex_conf_exonic_reads_cellranger', 'gex_conf_intronic_reads_cellranger', 'gex_conf_exonic_unique_reads_cellranger']...





## Inspect datasets before merging

In [5]:
# Summary table: one record per loaded dataset with its size and which
# optional slots (raw, layers) are populated
summary = [
    {
        "dataset": name,
        "n_cells": adata.n_obs,
        "n_genes": adata.n_vars,
        "has_raw": adata.raw is not None,
        "layers": list(adata.layers.keys()) if adata.layers else [],
    }
    for name, adata in adatas.items()
]

summary_df = pd.DataFrame(summary)
display(summary_df)

Unnamed: 0,dataset,n_cells,n_genes,has_raw,layers
0,igvf_sc-islet_10X-Multiome,328420,36601,True,[counts]
1,Augsornworawat2023_sc-islet_10X-Multiome,44060,36601,True,[counts]


In [6]:
# Check gene overlap: genes present in every dataset vs. genes present in any
gene_sets = [set(adata.var_names) for adata in adatas.values()]
common_genes = set.intersection(*gene_sets)
all_genes = set.union(*gene_sets)

n_common, n_total = len(common_genes), len(all_genes)
print(f"Common genes: {n_common}")
print(f"Total unique genes: {n_total}")
print(f"Overlap: {n_common / n_total * 100:.1f}%")

Common genes: 36601
Total unique genes: 36601
Overlap: 100.0%


## Merge datasets

In [7]:
# Use outer join to keep all genes, or inner to keep only common
join_type = "inner"  # or "outer"

# Concatenate along cells. `keys` supplies one label per dataset and `label`
# names the obs column those labels are written to.
# NOTE(review): ad.concat's default merge=None does not carry over .var
# columns (gene annotations) — confirm whether merge="same" is wanted here.
print(f"Concatenating with join={join_type}...")
adata_merged = ad.concat(
    list(adatas.values()),
    join=join_type,
    label=batch_key,
    keys=list(adatas.keys()),
    index_unique="_",  # Appends "_<dataset name>" to every obs name so the merged index is unique
)

# Report the merged size and how many cells each dataset contributed
print(f"\nMerged shape: {adata_merged.shape}")
print(f"Cells per batch:")
display(adata_merged.obs[batch_key].value_counts())

Concatenating with join=inner...

Merged shape: (372480, 36601)
Cells per batch:


dataset
igvf_sc-islet_10X-Multiome                  328420
Augsornworawat2023_sc-islet_10X-Multiome     44060
Name: count, dtype: int64

## Verify merge and clean up

In [9]:
# Store the batch label as a categorical column
adata_merged.obs[batch_key] = adata_merged.obs[batch_key].astype("category")

# Drop repeated var column names, keeping the first occurrence of each
duplicated_var_cols = adata_merged.var.columns.duplicated()
if duplicated_var_cols.any():
    adata_merged.var = adata_merged.var.loc[:, ~duplicated_var_cols]

# List the obs columns that survived the merge
print("Merged obs columns:")
print(adata_merged.obs.columns.tolist())

Merged obs columns:
['gex_barcode_cellranger', 'atac_barcode_cellranger', 'is_cell_cellranger', 'excluded_reason_cellranger', 'gex_raw_reads_cellranger', 'gex_mapped_reads_cellranger', 'gex_conf_intergenic_reads_cellranger', 'gex_conf_exonic_reads_cellranger', 'gex_conf_intronic_reads_cellranger', 'gex_conf_exonic_unique_reads_cellranger', 'gex_conf_exonic_antisense_reads_cellranger', 'gex_conf_exonic_dup_reads_cellranger', 'gex_exonic_umis_cellranger', 'gex_conf_intronic_unique_reads_cellranger', 'gex_conf_intronic_antisense_reads_cellranger', 'gex_conf_intronic_dup_reads_cellranger', 'gex_intronic_umis_cellranger', 'gex_conf_txomic_unique_reads_cellranger', 'gex_umis_count_cellranger', 'gex_genes_count_cellranger', 'atac_raw_reads_cellranger', 'atac_unmapped_reads_cellranger', 'atac_lowmapq_cellranger', 'atac_dup_reads_cellranger', 'atac_chimeric_reads_cellranger', 'atac_mitochondrial_reads_cellranger', 'atac_fragments_cellranger', 'atac_TSS_fragments_cellranger', 'atac_peak_region_f

In [10]:
# Quick QC check: count nulls per obs column and show only affected columns
print("\nMissing values per column:")
null_counts = adata_merged.obs.isnull().sum()
missing = null_counts[null_counts > 0]
if missing.empty:
    print("No missing values in obs.")
else:
    display(missing)


Missing values per column:
No missing values in obs.


## Save full merged dataset

In [None]:
# Save full merged dataset (always)
output_path = output_dir / "merged.h5ad"
print(f"Saving full dataset to {output_path}...")
adata_merged.write_h5ad(output_path)

# Assemble the plain-text report first, then write it out in one call
summary_path = output_dir / "merged.summary.txt"
batch_counts = adata_merged.obs[batch_key].value_counts()
summary_lines = [
    "=== Merge Summary ===\n\n",
    f"Total cells: {adata_merged.n_obs}\n",
    f"Total genes: {adata_merged.n_vars}\n",
    f"Batch key: {batch_key}\n\n",
    "Cells per batch:\n",
]
summary_lines.extend(
    f"  {batch}: {count}\n" for batch, count in batch_counts.items()
)
with open(summary_path, "w") as f:
    f.writelines(summary_lines)

print(f"Full dataset saved: {output_path}")
print(f"Summary saved: {summary_path}")

## Downsampling (Optional)

Two downsampling methods are available for creating smaller test datasets:

1. **Random**: Simple random downsampling to a target cell count
2. **Cell type aware**: Sample up to `n_cells_per_type` cells of each cell type within each dataset; cell types not shared by all datasets are dropped unless `keep_unshared` is enabled

Downsampled versions are saved as additional files:
- Random: `merged_random_{n_cells}.h5ad`
- Cell type aware: `merged_celltype_{n_per_type}pertype.h5ad`

In [None]:
def ensure_list(value):
    """Normalize a config value into a list.

    ``None`` becomes an empty list, a list or tuple becomes a new list with the
    same items, and any other value is wrapped in a one-element list.
    """
    if isinstance(value, (list, tuple)):
        return [*value]
    return [] if value is None else [value]


def downsample_random(adatas, n_cells, per_dataset=None, seed=42):
    """Randomly downsample datasets to a target total cell count.

    Parameters
    ----------
    adatas : dict
        Mapping of dataset name -> AnnData-like object (needs ``n_obs``,
        positional row indexing and ``copy()``).
    n_cells : int
        Target total number of cells across all datasets. Ignored when
        ``per_dataset`` is given.
    per_dataset : dict, optional
        Explicit target per dataset name. Datasets missing from the mapping are
        kept whole. When omitted, ``n_cells`` is split across datasets in
        proportion to their size.
    seed : int
        Seed for the sampling.

    Returns
    -------
    dict
        Dataset name -> downsampled copy. A dataset whose target is at least
        its size is copied unchanged.

    Raises
    ------
    ValueError
        If ``n_cells`` is negative.
    """
    # A dedicated legacy generator yields the same draws as
    # np.random.seed(seed) + np.random.choice, but leaves the global NumPy
    # random state untouched for the rest of the notebook.
    rng = np.random.RandomState(seed)

    if per_dataset is None:
        n_cells = int(n_cells)
        if n_cells < 0:
            raise ValueError(f"n_cells must be non-negative, got {n_cells}")

        total = sum(adata.n_obs for adata in adatas.values())
        if total == 0:
            # Nothing to sample from; avoid dividing by zero below.
            per_dataset = {name: 0 for name in adatas}
        else:
            # Largest-remainder allocation: start from the floored proportional
            # share, then hand the cells lost to flooring to the datasets with
            # the largest remainders so the shares add up to n_cells.
            scaled = {name: n_cells * adata.n_obs for name, adata in adatas.items()}
            per_dataset = {name: value // total for name, value in scaled.items()}
            shortfall = n_cells - sum(per_dataset.values())
            # Stable sort: ties go to the dataset listed first.
            by_remainder = sorted(
                scaled, key=lambda name: scaled[name] % total, reverse=True
            )
            for name in by_remainder[:shortfall]:
                per_dataset[name] += 1

    downsampled = {}
    for name, adata in adatas.items():
        # Never ask for more cells than the dataset has.
        target = min(per_dataset.get(name, adata.n_obs), adata.n_obs)

        if target < adata.n_obs:
            indices = rng.choice(adata.n_obs, size=target, replace=False)
            # Sorted positions keep the cells in their original order.
            indices = np.sort(indices)
            downsampled[name] = adata[indices].copy()
            print(f"  {name}: {adata.n_obs} -> {target} cells")
        else:
            downsampled[name] = adata.copy()
            print(f"  {name}: {adata.n_obs} cells (no downsampling needed)")

    return downsampled


def downsample_celltype_aware(adatas, celltype_column, n_cells_per_type=1000,
                               keep_unshared=False, min_cells_per_type=10, seed=42):
    """Downsample each dataset to at most ``n_cells_per_type`` cells per cell type.

    Parameters
    ----------
    adatas : dict
        Mapping of dataset name -> AnnData-like object (needs ``obs``,
        ``n_obs``, positional row indexing and ``copy()``).
    celltype_column : str
        Column of ``obs`` holding the cell type labels. Missing labels are
        ignored.
    n_cells_per_type : int
        Maximum number of cells kept per cell type within each dataset.
    keep_unshared : bool
        If False, only cell types present in every dataset are kept. If True,
        every cell type found in any dataset is kept.
    min_cells_per_type : int
        A cell type with fewer cells than this in a given dataset is dropped
        from that dataset only, so it can remain in the other datasets.
    seed : int
        Seed for the sampling.

    Returns
    -------
    dict
        Dataset name -> downsampled copy, with cells in their original order.
    """
    # A dedicated legacy generator yields the same stream as
    # np.random.seed(seed) but leaves the global NumPy random state untouched.
    rng = np.random.RandomState(seed)

    # Find cell types in each dataset
    celltype_sets = {
        name: set(adata.obs[celltype_column].dropna().unique())
        for name, adata in adatas.items()
    }

    shared_celltypes = set.intersection(*celltype_sets.values())
    all_celltypes = set.union(*celltype_sets.values())

    print(f"\n  Cell types per dataset:")
    for name, ct_set in celltype_sets.items():
        print(f"    {name}: {len(ct_set)} types")
    print(f"  Shared cell types: {len(shared_celltypes)}")
    print(f"  Total unique cell types: {len(all_celltypes)}")

    if not keep_unshared:
        print(f"  Keeping only shared cell types: {sorted(shared_celltypes)}")
        celltypes_to_keep = shared_celltypes
    else:
        celltypes_to_keep = all_celltypes

    # Set iteration order for strings changes between interpreter sessions
    # (hash randomization). Visiting the types in a fixed order is what makes
    # the seeded draws reproducible across kernel restarts.
    ordered_celltypes = sorted(celltypes_to_keep, key=str)

    downsampled = {}
    for name, adata in adatas.items():
        celltype_labels = adata.obs[celltype_column]
        kept_chunks = []

        for celltype in ordered_celltypes:
            ct_indices = np.where(celltype_labels == celltype)[0]

            # Too few cells of this type in this dataset: skip it here.
            if len(ct_indices) < min_cells_per_type:
                continue

            if len(ct_indices) > n_cells_per_type:
                sampled = rng.choice(ct_indices, size=n_cells_per_type, replace=False)
            else:
                sampled = ct_indices

            kept_chunks.append(sampled)

        if kept_chunks:
            # Sorted positions keep the cells in their original order.
            indices_to_keep = np.sort(np.concatenate(kept_chunks))
        else:
            # Keep an integer dtype so an empty selection is still a valid
            # positional indexer (np.sort([]) would be float64).
            indices_to_keep = np.empty(0, dtype=np.intp)
        downsampled[name] = adata[indices_to_keep].copy()

        orig_types = adata.obs[celltype_column].nunique()
        new_types = downsampled[name].obs[celltype_column].nunique()
        print(f"  {name}: {adata.n_obs} -> {len(indices_to_keep)} cells "
              f"({orig_types} -> {new_types} cell types)")

    return downsampled


def merge_adatas(adatas, batch_key, join_type="inner"):
    """Concatenate a name -> AnnData mapping into a single AnnData object.

    The dataset names are used as concatenation keys: they are written to the
    ``batch_key`` obs column (returned as a categorical) and joined onto the obs
    names with ``"_"``. ``join_type`` is passed through as the ``join`` argument
    of ``ad.concat``.
    """
    dataset_names = list(adatas.keys())
    merged = ad.concat(
        [adatas[name] for name in dataset_names],
        join=join_type,
        label=batch_key,
        keys=dataset_names,
        index_unique="_",
    )
    batch_labels = merged.obs[batch_key]
    merged.obs[batch_key] = batch_labels.astype("category")
    return merged


print("Downsampling functions defined.")

In [None]:
# Apply downsampling and save based on config.
# `or {}` also covers sections that are present but empty (None), which is what
# a bare `downsampling:` / `random:` key produces when loading from YAML.
ds_config = config.get("downsampling") or {}
seed = ds_config.get("seed", 42)
saved_files = [str(output_path)]


def _merge_and_save(adatas_ds, ds_path, description):
    """Merge downsampled datasets, then write the h5ad and its summary report.

    The summary is written next to the h5ad as ``<stem>.summary.txt``, using
    the same layout as the report for the full merged dataset, plus one line
    describing how the file was downsampled.
    """
    adata_ds = merge_adatas(adatas_ds, batch_key)

    print(f"\nSaving to {ds_path}...")
    adata_ds.write_h5ad(ds_path)
    saved_files.append(str(ds_path))
    print(f"Shape: {adata_ds.shape}")

    summary_path = ds_path.parent / f"{ds_path.stem}.summary.txt"
    with open(summary_path, "w") as f:
        f.write("=== Merge Summary ===\n\n")
        f.write(f"Downsampling: {description}\n")
        f.write(f"Total cells: {adata_ds.n_obs}\n")
        f.write(f"Total genes: {adata_ds.n_vars}\n")
        f.write(f"Batch key: {batch_key}\n\n")
        f.write("Cells per batch:\n")
        for batch, count in adata_ds.obs[batch_key].value_counts().items():
            f.write(f"  {batch}: {count}\n")
    print(f"Summary saved: {summary_path}")


# Random downsampling
random_config = ds_config.get("random") or {}
random_n_cells = ensure_list(random_config.get("n_cells", []))

for n_cells in random_n_cells:
    print(f"\n{'='*50}")
    print(f"Random downsampling: {n_cells} total cells")
    print('='*50)
    adatas_ds = downsample_random(adatas, n_cells=n_cells, seed=seed)
    _merge_and_save(
        adatas_ds,
        output_dir / f"merged_random_{n_cells}.h5ad",
        f"random, {n_cells} total cells (seed={seed})",
    )

    # Release the per-dataset copies before the next, possibly larger, round.
    del adatas_ds

# Cell type aware downsampling
ct_config = ds_config.get("celltype_aware") or {}
ct_n_per_type = ensure_list(ct_config.get("n_cells_per_type", []))
ct_column = ct_config.get("celltype_column", "cell_type")
keep_unshared = ct_config.get("keep_unshared", False)
min_cells = ct_config.get("min_cells_per_type", 10)

for n_per_type in ct_n_per_type:
    print(f"\n{'='*50}")
    print(f"Cell type aware downsampling: {n_per_type} cells per type")
    print('='*50)
    adatas_ds = downsample_celltype_aware(
        adatas,
        celltype_column=ct_column,
        n_cells_per_type=n_per_type,
        keep_unshared=keep_unshared,
        min_cells_per_type=min_cells,
        seed=seed,
    )
    _merge_and_save(
        adatas_ds,
        output_dir / f"merged_celltype_{n_per_type}pertype.h5ad",
        f"cell type aware, {n_per_type} cells per type (seed={seed})",
    )

    # Release the per-dataset copies before the next round.
    del adatas_ds

# Summary
print("\n" + "="*50)
print("All saved files:")
print("="*50)
for f in saved_files:
    print(f"  {f}")
print("\nDone!")

## Next steps

The merged dataset(s) are now ready for integration. Proceed to one of:

- `02_harmony_integration.ipynb` - Harmony (via R/rpy2)
- `03_seurat_integration.ipynb` - Seurat methods (CCA, RPCA, FastMNN)  
- `04_python_integration.ipynb` - scVI and Scanorama (pure Python)
- `04_seurat_integrationR.ipynb` - Pure R Seurat integration

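For testing, use the downsampled files (e.g., `merged_celltype_1000pertype.h5ad`). These are only written when the corresponding downsampling lists in the configuration are non-empty.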