# 04 - Python Integration (scVI & Scanorama)

Pure Python integration methods without R dependency.

## Methods
- **scVI**: Deep learning VAE approach, learns latent representation
- **Scanorama**: Panoramic stitching algorithm, fast and memory-efficient

## Workflow
1. Load merged h5ad
2. Preprocess (store raw counts, normalize, select HVGs, compute an uncorrected PCA/UMAP baseline)
3. Run scVI integration
4. Run Scanorama integration
5. Compare results

## Outputs
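- `integrated_python.h5ad` - Object with scVI and Scanorama embeddings
- `scvi_model/` - Saved scVI model
- `figures/` - Comparison plots
- `metrics/integration_metrics.csv` - Integration metrics for each method
- `cell_metadata.tsv` - Per-cell metadata table (`adata.obs`)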

In [None]:
import sys
sys.path.insert(0, "..")

import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import yaml

# Local utilities
from utils.preprocessing import store_raw_counts, normalize_and_log, find_hvgs
from utils.integration import run_scvi, run_scanorama, compute_neighbors_and_umap, run_leiden_clustering
from utils.evaluation import compare_integration_methods
from utils.visualization import plot_method_comparison, plot_umap_grid, plot_metrics_comparison

# Scanpy logging level for the whole notebook (2 is presumably "info" on
# scanpy's verbosity scale — TODO confirm against the installed version).
sc.settings.verbosity = 2
# Figure defaults applied through scanpy: 100 dpi and a white figure background.
sc.settings.set_figure_params(dpi=100, facecolor="white")

## Configuration

In [None]:
# Notebook-wide settings, grouped by pipeline stage.
config = dict(
    # Merged object to read, and the obs column used as an extra plot colour.
    input=dict(
        h5ad_path="./results/merged.h5ad",
        batch_key="dataset",
    ),
    # Number of highly variable genes requested from find_hvgs.
    preprocessing=dict(
        n_top_genes=3000,
    ),
    # `key` is the obs column treated as the batch for both methods; the
    # nested sections are forwarded to run_scvi and scanorama respectively.
    integration=dict(
        key="sample_id",
        scvi=dict(
            n_latent=30,
            n_layers=2,
            n_epochs=400,
            early_stopping=True,
        ),
        scanorama=dict(
            knn=20,
        ),
    ),
    # Leiden resolutions to sweep and the neighbourhood size for the kNN graph.
    clustering=dict(
        resolutions=[0.2, 0.5, 0.8, 1.0],
        n_neighbors=30,
    ),
    # Root output directory and whether a scVI model directory is requested.
    output=dict(
        dir="./results/python/",
        save_model=True,
    ),
)

In [None]:
# Pull the settings used throughout the notebook into plain names.
input_path = Path(config["input"]["h5ad_path"])
batch_key = config["input"]["batch_key"]
integration_key = config["integration"]["key"]
output_dir = Path(config["output"]["dir"])

# Make sure the output root exists, then its figures/ and metrics/ children.
output_dir.mkdir(parents=True, exist_ok=True)
for _subdir in ("figures", "metrics"):
    (output_dir / _subdir).mkdir(exist_ok=True)

# Echo the resolved settings so the run is self-describing.
for _line in (
    f"Input: {input_path}",
    f"Integration key: {integration_key}",
    f"Output: {output_dir}",
):
    print(_line)

## Load data

In [None]:
# Read the merged object from disk and report its dimensions.
print(f"Loading {input_path}...")
adata = sc.read_h5ad(input_path)
print(f"Shape: {adata.shape}")

# Count the distinct values of the integration column.
n_batches = adata.obs[integration_key].nunique()
print(f"Batches ({integration_key}): {n_batches}")

In [None]:
# scVI needs raw counts; keep them in a dedicated layer unless one is
# already present on the loaded object.
counts_layer = "counts"
if counts_layer not in adata.layers:
    print("Storing raw counts in layers['counts']...")
    store_raw_counts(adata, layer_name=counts_layer)

## Preprocessing

In [None]:
# Scanorama works from normalized expression and an HVG selection;
# scVI reads the raw counts instead.
print("Normalizing...")
normalize_and_log(adata)

print("Finding HVGs...")
n_top_genes = config["preprocessing"]["n_top_genes"]
find_hvgs(adata, n_top_genes=n_top_genes)

# Report how many genes were flagged in adata.var["highly_variable"].
n_hvgs = adata.var["highly_variable"].sum()
print(f"HVGs: {n_hvgs}")

In [None]:
# Run PCA on uncorrected data for comparison
print("Running PCA on uncorrected data...")

# Scale a temporary HVG-only copy instead of `adata` itself. Scaling `adata`
# in place would overwrite the log-normalized values in adata.X, which the
# Scanorama cell reads later and which are written to the output h5ad.
# NOTE(review): scanpy's PCA is expected to restrict itself to the
# `highly_variable` genes when that column exists, so the components should
# match the previous in-place computation — confirm for the installed version.
hvg_mask = adata.var["highly_variable"].to_numpy(dtype=bool)
adata_scaled = adata[:, hvg_mask].copy()
sc.pp.scale(adata_scaled, max_value=10)
sc.tl.pca(adata_scaled, n_comps=50)

# Copy the PCA results back onto the full object.
adata.obsm["X_pca"] = adata_scaled.obsm["X_pca"]
adata.uns["pca"] = adata_scaled.uns["pca"]
# Loadings only exist for the HVGs; all other genes get zero rows.
hvg_loadings = np.asarray(adata_scaled.varm["PCs"])
all_loadings = np.zeros((adata.n_vars, hvg_loadings.shape[1]), dtype=hvg_loadings.dtype)
all_loadings[hvg_mask] = hvg_loadings
adata.varm["PCs"] = all_loadings
del adata_scaled

# Compute UMAP for uncorrected, using the same neighbourhood size as the
# integrated embeddings so the comparison is like-for-like.
sc.pp.neighbors(adata, use_rep="X_pca", n_neighbors=config["clustering"]["n_neighbors"])
sc.tl.umap(adata)
# Keep a copy: later cells overwrite adata.obsm["X_umap"].
adata.obsm["X_umap_uncorrected"] = adata.obsm["X_umap"].copy()

In [None]:
# UMAP of the uncorrected data, one panel per colour key, saved under figures/.
uncorrected_png = output_dir / "figures" / "umap_uncorrected.png"
uncorrected_colors = [batch_key, integration_key]
fig = plot_umap_grid(
    adata,
    basis="X_umap_uncorrected",
    color_keys=uncorrected_colors,
    title_prefix="Uncorrected - ",
    save_path=uncorrected_png,
)
plt.show()

## scVI Integration

In [None]:
# Probe for scvi-tools; later cells are gated on SCVI_AVAILABLE.
try:
    import scvi
except ImportError:
    print("scvi-tools not installed. Skipping scVI integration.")
    print("Install with: pip install scvi-tools")
    SCVI_AVAILABLE = False
else:
    print(f"scvi-tools version: {scvi.__version__}")
    SCVI_AVAILABLE = True

In [None]:
if SCVI_AVAILABLE:
    print(f"Running scVI with batch key: {integration_key}...")

    scvi_config = config["integration"]["scvi"]

    # A model directory is only handed to run_scvi when saving is enabled.
    if config["output"]["save_model"]:
        model_dir = output_dir / "scvi_model"
    else:
        model_dir = None

    # run_scvi returns the latent matrix; the embedding key it is asked to
    # use is "X_scVI", which the following cells look up in adata.obsm.
    latent = run_scvi(
        adata,
        batch_key=integration_key,
        key_added="X_scVI",
        use_gpu=True,
        n_latent=scvi_config["n_latent"],
        n_layers=scvi_config["n_layers"],
        # n_epochs is optional in the config; None is passed when absent.
        n_epochs=scvi_config.get("n_epochs"),
        early_stopping=scvi_config["early_stopping"],
        model_dir=None if model_dir is None else str(model_dir),
    )

    print(f"scVI latent shape: {latent.shape}")
    if model_dir is not None:
        print(f"Model saved to {model_dir}")

In [None]:
if SCVI_AVAILABLE and "X_scVI" in adata.obsm:
    clustering_cfg = config["clustering"]

    # kNN graph and UMAP built on the scVI latent space.
    print("Computing neighbors and UMAP on scVI latent...")
    sc.pp.neighbors(
        adata,
        use_rep="X_scVI",
        n_neighbors=clustering_cfg["n_neighbors"],
    )
    sc.tl.umap(adata)
    # Copy, because adata.obsm["X_umap"] is overwritten by later cells.
    adata.obsm["X_umap_scvi"] = adata.obsm["X_umap"].copy()

    # One Leiden clustering per configured resolution, each in its own column.
    for res in clustering_cfg["resolutions"]:
        key = f"leiden_scvi_{res}"
        sc.tl.leiden(adata, resolution=res, key_added=key)
        n_clusters = adata.obs[key].nunique()
        print(f"  Resolution {res}: {n_clusters} clusters")

In [None]:
if SCVI_AVAILABLE and "X_umap_scvi" in adata.obsm:
    # scVI UMAP coloured by each batch-like key, saved under figures/.
    scvi_png = output_dir / "figures" / "umap_scvi_batch.png"
    fig = plot_umap_grid(
        adata,
        basis="X_umap_scvi",
        color_keys=[batch_key, integration_key],
        title_prefix="scVI - ",
        save_path=scvi_png,
    )
    plt.show()

## Scanorama Integration

In [None]:
# Probe for Scanorama; later cells are gated on SCANORAMA_AVAILABLE.
try:
    import scanorama
except ImportError:
    print("Scanorama not installed. Skipping Scanorama integration.")
    print("Install with: pip install scanorama")
    SCANORAMA_AVAILABLE = False
else:
    SCANORAMA_AVAILABLE = True
    print("Scanorama available")

In [None]:
if SCANORAMA_AVAILABLE:
    print(f"Running Scanorama with batch key: {integration_key}...")

    scanorama_config = config["integration"]["scanorama"]

    # Scanorama needs log-normalized data with HVGs.
    # NOTE(review): this reads whatever adata.X currently holds — confirm that
    # no earlier cell has rescaled adata.X in place before this point.
    adata_hvg = adata[:, adata.var["highly_variable"]].copy()

    # Split by batch. A missing label would yield an empty batch and leave
    # cells without an embedding, so reject it up front.
    batch_labels = adata_hvg.obs[integration_key]
    if batch_labels.isna().any():
        raise ValueError(
            f"adata.obs[{integration_key!r}] contains missing values; "
            "every cell needs a batch label for Scanorama."
        )
    batches = batch_labels.unique()
    batch_masks = [(batch_labels == b).to_numpy() for b in batches]
    adatas_batch = [adata_hvg[mask].copy() for mask in batch_masks]

    print(f"Split into {len(adatas_batch)} batches")

    # Run Scanorama; it adds obsm["X_scanorama"] to each per-batch object.
    scanorama.integrate_scanpy(
        adatas_batch,
        knn=scanorama_config["knn"],
    )

    # Write each batch's embedding back into the original cell order via its
    # boolean mask. Mask subsetting keeps the within-batch row order, so this
    # replaces the deprecated AnnData.concatenate plus a reorder by obs_names
    # (which also required unique cell names).
    first_embedding = np.asarray(adatas_batch[0].obsm["X_scanorama"])
    embedding = np.empty((adata.n_obs, first_embedding.shape[1]), dtype=first_embedding.dtype)
    for mask, batch_adata in zip(batch_masks, adatas_batch):
        embedding[mask] = batch_adata.obsm["X_scanorama"]

    # Store in original adata
    adata.obsm["X_scanorama"] = embedding

    print(f"Scanorama embedding shape: {adata.obsm['X_scanorama'].shape}")

In [None]:
if SCANORAMA_AVAILABLE and "X_scanorama" in adata.obsm:
    # Compute neighbors and UMAP on Scanorama
    print("Computing neighbors and UMAP on Scanorama...")

    # Run PCA on the Scanorama-corrected embedding. sc.pp.pca has no
    # `use_rep` argument, so the embedding array is passed directly; for array
    # input the reduced matrix is returned rather than written to
    # adata.obsm["X_pca"], which therefore keeps the uncorrected PCA that the
    # method comparison relies on.
    adata.obsm["X_pca_scanorama"] = sc.pp.pca(np.asarray(adata.obsm["X_scanorama"]))

    sc.pp.neighbors(adata, use_rep="X_pca_scanorama", n_neighbors=config["clustering"]["n_neighbors"])
    sc.tl.umap(adata)
    # Copy, because adata.obsm["X_umap"] is shared between methods.
    adata.obsm["X_umap_scanorama"] = adata.obsm["X_umap"].copy()

    # Clustering: one Leiden run per configured resolution.
    for res in config["clustering"]["resolutions"]:
        key = f"leiden_scanorama_{res}"
        sc.tl.leiden(adata, resolution=res, key_added=key)
        print(f"  Resolution {res}: {adata.obs[key].nunique()} clusters")

In [None]:
if SCANORAMA_AVAILABLE and "X_umap_scanorama" in adata.obsm:
    # Scanorama UMAP coloured by each batch-like key, saved under figures/.
    scanorama_png = output_dir / "figures" / "umap_scanorama_batch.png"
    fig = plot_umap_grid(
        adata,
        basis="X_umap_scanorama",
        color_keys=[batch_key, integration_key],
        title_prefix="Scanorama - ",
        save_path=scanorama_png,
    )
    plt.show()

## Compare Methods

In [None]:
# Map method name -> obsm key, once for UMAP coordinates and once for the
# latent space. The uncorrected baseline is always present.
umap_embeddings = {"Uncorrected": "X_umap_uncorrected"}
latent_embeddings = {"Uncorrected": "X_pca"}

# (method label, UMAP key, latent key); a method is listed only when its
# UMAP is actually stored on the object.
optional_methods = (
    ("scVI", "X_umap_scvi", "X_scVI"),
    ("Scanorama", "X_umap_scanorama", "X_pca_scanorama"),
)
for method_name, umap_key, latent_key in optional_methods:
    if umap_key in adata.obsm:
        umap_embeddings[method_name] = umap_key
        latent_embeddings[method_name] = latent_key

print(f"Available methods: {list(umap_embeddings.keys())}")

In [None]:
# Side-by-side UMAPs of every available method, coloured by the integration key.
batch_comparison_png = output_dir / "figures" / "comparison_by_batch.png"
fig = plot_method_comparison(
    adata,
    color_by=integration_key,
    embeddings=umap_embeddings,
    save_path=batch_comparison_png,
)
plt.show()

In [None]:
# The same comparison coloured by cell type, only when that annotation exists.
celltype_column = "celltype"
if celltype_column in adata.obs.columns:
    celltype_comparison_png = output_dir / "figures" / "comparison_by_celltype.png"
    fig = plot_method_comparison(
        adata,
        color_by=celltype_column,
        embeddings=umap_embeddings,
        save_path=celltype_comparison_png,
    )
    plt.show()

In [None]:
# Score every available latent space against the integration key and show
# the resulting table inline.
metrics_df = compare_integration_methods(
    adata,
    embeddings=latent_embeddings,
    batch_key=integration_key,
)

display(metrics_df)

In [None]:
# Write the metrics table as CSV under metrics/.
metrics_csv = output_dir / "metrics" / "integration_metrics.csv"
metrics_df.to_csv(metrics_csv)
print(f"Metrics saved to {metrics_csv}")

In [None]:
# Plot of the metrics table, saved under figures/.
metrics_png = output_dir / "figures" / "metrics_comparison.png"
fig = plot_metrics_comparison(metrics_df, save_path=metrics_png)
plt.show()

## Save Results

In [None]:
# Write the object, with every embedding and clustering added above, to disk.
output_path = output_dir.joinpath("integrated_python.h5ad")
print(f"Saving to {output_path}...")
adata.write_h5ad(output_path)
print("Done!")

In [None]:
# Export the per-cell table (adata.obs) as a tab-separated file.
metadata_path = output_dir.joinpath("cell_metadata.tsv")
cell_table = adata.obs
cell_table.to_csv(metadata_path, sep="\t")
print(f"Saved metadata to {metadata_path}")

## Summary

Python integration complete. Available embeddings:

- `X_pca` - Uncorrected PCA
- `X_scVI` - scVI latent representation (if scvi-tools installed)
- `X_scanorama` / `X_pca_scanorama` - Scanorama corrected (if scanorama installed)

Each embedding has a corresponding UMAP for visualization: `X_umap_uncorrected`, `X_umap_scvi`, and `X_umap_scanorama`.

### Notes on methods:

**scVI**:
- Deep learning approach, best for complex batch effects
- Saves model for transfer learning / imputation
- Can take longer to train, benefits from GPU

**Scanorama**:
- Fast panoramic stitching algorithm
- Memory-efficient, good for quick integration
- May be less effective for severe batch effects