In [None]:
# Load the reference and query AnnData objects from their .h5ad files.
ref_h5ad = '/ocean/projects/cis240075p/asachan/datasets/TA_muscle/ERCC1_KO_mice/aging_all_2024/objects/major_celltype_updated_ref.h5ad'
query_h5ad = '/ocean/projects/cis240075p/asachan/datasets/TA_muscle/ERCC1_KO_mice/samples_2025/cellranger_aggr/ERCC1_KO_mice_aggr_updated.h5ad'

adata_ref = sc.read_h5ad(ref_h5ad)
adata_query = sc.read_h5ad(query_h5ad)

# Disabled Seurat-to-scanpy compatibility workaround (intentionally left
# commented out): it would write PCA params into `uns` and rebuild the
# neighbor graph on the reference.
# adata_ref.uns['pca']['params'] = {
#     'zero_center': True,
#     'use_highly_variable': True,
#     'n_comps': adata_ref.obsm['X_pca'].shape[1]
# }
# sc.pp.neighbors(adata_ref, n_neighbors=10, n_pcs=40)

In [None]:
# Display the summary repr of the reference AnnData as the cell output.
adata_ref

In [None]:
# Display the summary repr of the query AnnData as the cell output.
adata_query

In [None]:
# UMAP of the reference, colored by its 'cell_type' annotation.
# NOTE(review): relies on a UMAP embedding already stored in adata_ref — confirm.
sc.pl.umap(adata_ref, color=['cell_type'])

In [None]:
# Keep only the genes present in both objects so the two var axes match.
# (Author's note: about 99% of genes overlap.)
ref_genes = adata_ref.var_names
query_genes = adata_query.var_names
common_genes = ref_genes.intersection(query_genes)

# .copy() turns the sliced views into independent AnnData objects.
adata_ref_subset = adata_ref[:, common_genes].copy()
adata_query_subset = adata_query[:, common_genes].copy()

In [None]:
# Step 4: Add batch information
# Tag every cell with the dataset it came from; 'batch' is later used as the
# key for HVG selection and Harmony, 'dataset' for plotting.
for adata_part, batch_label, dataset_label in (
    (adata_ref_subset, 'reference', 'ref'),
    (adata_query_subset, 'query', 'query'),
):
    adata_part.obs['batch'] = batch_label
    adata_part.obs['dataset'] = dataset_label

In [None]:
# Give the query a placeholder 'cell_type' column when it has none, so both
# objects expose that annotation.
query_has_cell_type = 'cell_type' in adata_query_subset.obs.columns
if not query_has_cell_type:
    adata_query_subset.obs['cell_type'] = 'unknown'

In [None]:
# Step 5: Combine datasets
# Stack reference cells followed by query cells into a single AnnData.
datasets_to_merge = [adata_ref_subset, adata_query_subset]
adata_combined = ad.concat(
    datasets_to_merge,
    join='outer',
    index_unique='-',
)
print(f"Combined: {adata_combined.n_obs} cells × {adata_combined.n_vars} genes")

In [None]:
# Display the summary repr of the combined AnnData right after concatenation.
adata_combined

In [None]:
# Step 6: Preprocessing (count normalization and feature selection)
#
# NOTE: this cell replaces `adata_combined.X` and then rebinds `adata_combined`
# to its HVG subset, so it is not idempotent. Re-create `adata_combined` with
# the concat cell before running it again.
print("\n6. Preprocessing combined dataset...")

# Use raw counts if available.
# The inputs were concatenated with join='outer', which may carry over a layer
# that exists in only one input and pad the other input's cells with fill
# values. Only trust 'counts' when both inputs actually provide it.
counts_in_all_inputs = all(
    'counts' in part.layers for part in (adata_ref_subset, adata_query_subset)
)
if 'counts' in adata_combined.layers:
    if counts_in_all_inputs:
        adata_combined.X = adata_combined.layers['counts'].copy()
    else:
        print("WARNING: 'counts' layer is not present in both inputs; "
              "keeping adata_combined.X unchanged")

# Normalize and log transform
sc.pp.normalize_total(adata_combined, target_sum=1e4)
sc.pp.log1p(adata_combined)
# Snapshot of the log-normalized, all-gene data taken before HVG filtering.
adata_combined.raw = adata_combined

# Find highly variable genes (batch-aware)
sc.pp.highly_variable_genes(adata_combined,
                            min_mean=0.0125, max_mean=3, min_disp=0.5,
                            batch_key='batch')

# Filter to HVGs. Take an explicit copy so the steps below modify a real
# AnnData rather than a view of the unfiltered object.
adata_combined = adata_combined[:, adata_combined.var['highly_variable']].copy()
print(f"Highly variable genes: {adata_combined.n_vars}")

# Scale data (values are clipped at 10 via max_value)
sc.pp.scale(adata_combined, max_value=10)

In [None]:
# Step 7: PCA
# 50 components on the scaled HVG matrix; the result lands in obsm['X_pca'],
# which the Harmony cell reads.
print("\n7. Computing PCA...")
sc.tl.pca(
    adata_combined,
    n_comps=50,
    svd_solver='arpack',
)

In [None]:
# Step 8: Harmony integration
# Corrects the PCA embedding for the 'batch' covariate and records which
# representation downstream cells should use in `use_rep`. If harmonypy is not
# installed, the uncorrected PCA is used instead.
print("\n8. Running Harmony integration...")
try:
    import harmonypy as hm
    harmony_out = hm.run_harmony(adata_combined.obsm['X_pca'],
                                 adata_combined.obs,
                                 vars_use=['batch'])
    # obsm entries must have one row per cell. The original code transposed
    # Z_corr unconditionally, i.e. assumed a (n_pcs, n_cells) layout.
    # NOTE(review): orient by shape instead so the assignment does not depend
    # on that layout — confirm against the installed harmonypy version.
    harmony_embedding = harmony_out.Z_corr
    if harmony_embedding.shape[0] != adata_combined.n_obs:
        harmony_embedding = harmony_embedding.T
    adata_combined.obsm['X_harmony'] = harmony_embedding
    use_rep = 'X_harmony'
    print("Harmony integration completed!")
except ImportError:
    print("Harmony not available, using PCA for downstream analysis")
    use_rep = 'X_pca'

In [None]:
# Display the combined AnnData after the Harmony cell (check obsm for the new embedding).
adata_combined

In [None]:
# Step 9: Downstream analysis
print("\n9. Downstream analysis...")

# Neighbor graph on the representation chosen by the Harmony cell (`use_rep`),
# followed by the UMAP embedding and Leiden clustering.
sc.pp.neighbors(
    adata_combined,
    n_neighbors=15,
    use_rep=use_rep,
)
sc.tl.umap(adata_combined)
sc.tl.leiden(adata_combined, resolution=0.5)

In [None]:
# Step 10: Visualization
print("\n10. Creating visualizations...")

# 2x3 grid: PCA panels on the top row, UMAP panels on the bottom row.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
pca_axes, umap_axes = axes[0], axes[1]

# Top row: PCA embedding (computed before the Harmony step).
for panel_ax, obs_key, panel_title in zip(
    pca_axes,
    ('batch', 'dataset'),
    ('PCA - Batch Effect', 'PCA - Dataset'),
):
    sc.pl.pca(adata_combined, color=obs_key, ax=panel_ax, show=False)
    panel_ax.set_title(panel_title)

# The third PCA panel is drawn only when a 'condition' annotation exists;
# otherwise its axes are hidden.
if 'condition' not in adata_combined.obs.columns:
    pca_axes[2].axis('off')
else:
    sc.pl.pca(adata_combined, color='condition', ax=pca_axes[2], show=False)
    pca_axes[2].set_title('PCA - Condition')

# Bottom row: UMAP built from the representation selected in `use_rep`.
for panel_ax, obs_key, panel_title in zip(
    umap_axes,
    ('batch', 'dataset', 'leiden'),
    ('UMAP - After Integration', 'UMAP - Dataset', 'UMAP - Clusters'),
):
    sc.pl.umap(adata_combined, color=obs_key, ax=panel_ax, show=False)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Step 11: Quality assessment
# Tabulates how many cells of each batch fall into each Leiden cluster, then
# writes the integrated object and the table to disk.
import os

print("\n11. Integration quality assessment...")

# Batch mixing in clusters (rows: Leiden cluster, columns: batch)
mixing_df = pd.crosstab(adata_combined.obs['leiden'], adata_combined.obs['batch'])
print("\nBatch mixing in clusters:")
print(mixing_df)

# Save results
output_dir = '/ocean/projects/cis240075p/asachan/datasets/TA_muscle/ERCC1_KO_mice/integrated_12_samples'
# Create the target directory first; the writes below do not create it and
# would fail on a fresh run if it is missing.
os.makedirs(output_dir, exist_ok=True)
adata_combined.write(f'{output_dir}/integrated_dataset.h5ad')
mixing_df.to_csv(f'{output_dir}/batch_mixing_analysis.csv')

print(f"\nIntegration completed! Files saved to {output_dir}")

In [None]:
# Display the final integrated AnnData as the cell output.
adata_combined

In [None]:
# UMAP of the integrated object colored by 'cell_type' (query cells carry the
# 'unknown' placeholder if they had no annotation of their own).
sc.pl.umap(adata_combined, color='cell_type', show=False)