# TRANS_017 - Sample 2 Preprocessing

This notebook processes Sample 2 using the same pipeline as Sample 1. See sample1_preprocessing.ipynb for detailed explanations of each step.

In [None]:
# Import required libraries
import scanpy as sc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides library warnings that could
# flag real problems further down the pipeline — consider narrowing it to
# specific categories/modules.
warnings.filterwarnings('ignore')

# Scanpy logging level and default figure appearance. The seaborn style is
# applied after scanpy's figure params; keep this order if either is changed.
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=100, facecolor='white', frameon=False)
sns.set_style('whitegrid')

# Record the scanpy version in the notebook output for provenance.
print(f"Scanpy version: {sc.__version__}")

In [None]:
# Define sample information
SAMPLE_NAME = "sample2"

# Define paths.
# Both locations are derived from SAMPLE_NAME so that changing the sample in
# one place can never leave the notebook reading or writing another sample's
# directories (this notebook is a per-sample copy of the same pipeline).
DATA_DIR = Path("../../data")
CELLRANGER_OUTPUT = DATA_DIR / "cellranger_outputs" / SAMPLE_NAME / "outs"
OUTPUT_DIR = DATA_DIR / "processed" / SAMPLE_NAME
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# QC thresholds (all consumed by the filtering cell)
QC_PARAMS = {
    'min_genes': 200,          # lower bound on n_genes_by_counts per cell (inclusive)
    'min_cells': 3,            # passed to sc.pp.filter_genes(min_cells=...)
    'max_genes': 6000,         # upper bound on n_genes_by_counts per cell (inclusive)
    'max_counts': 30000,       # upper bound on total_counts per cell (inclusive)
    'max_mito_pct': 20,        # cells must have pct_counts_mt strictly below this
    'mouse_content_max': 5,    # cells must have pct_counts_mouse strictly below this
    'adt_min_counts': 100,     # lower bound on ADT total_counts per cell (inclusive)
}

print(f"Processing sample: {SAMPLE_NAME}")

In [None]:
# Load CellRanger output
# gex_only=False is essential here: read_10x_mtx defaults to gex_only=True,
# which keeps only 'Gene Expression' features and discards 'Antibody Capture'.
# With the default, the ADT object built in the next cell would contain zero
# proteins and the ADT count filter would later remove every cell.
adata = sc.read_10x_mtx(
    CELLRANGER_OUTPUT / "filtered_feature_bc_matrix",
    var_names='gene_symbols',
    cache=True,
    gex_only=False,
)

print(f"Loaded data shape: {adata.shape}")
# Sanity check: both 'Gene Expression' and 'Antibody Capture' should be listed.
print(adata.var['feature_types'].value_counts())

In [None]:
# Split the combined matrix by feature type: RNA (GEX) and protein (ADT)
feature_types = adata.var['feature_types']
is_gex = feature_types == 'Gene Expression'
is_adt = feature_types == 'Antibody Capture'

# Independent copies, so later edits to one modality never touch `adata`.
adata_gex = adata[:, is_gex].copy()
adata_adt = adata[:, is_adt].copy()

# Tag every cell of both modalities with the sample identifier.
for modality in (adata_gex, adata_adt):
    modality.obs['sample'] = SAMPLE_NAME

print(f"GEX: {adata_gex.shape}, ADT: {adata_adt.shape}")

In [None]:
# Calculate QC metrics
gene_names = adata_gex.var_names

# Gene-level flags consumed by calculate_qc_metrics via `qc_vars`.
adata_gex.var['mt'] = gene_names.str.startswith('MT-')
# NOTE(review): "mouse" is defined as any feature name beginning with a
# lowercase letter. Whether that captures all mouse genes depends on how the
# reference names them (e.g. a lowercase species prefix) — confirm against the
# reference used for this run.
adata_gex.var['mouse'] = gene_names.str.match(r'^[a-z]')

# Same options for both modalities; only GEX carries the mt/mouse flags.
shared_qc_options = dict(percent_top=None, log1p=False, inplace=True)
sc.pp.calculate_qc_metrics(adata_gex, qc_vars=['mt', 'mouse'], **shared_qc_options)
sc.pp.calculate_qc_metrics(adata_adt, **shared_qc_options)

print("QC metrics calculated")

In [None]:
# QC visualizations (pre-filter): five histograms plus a genes-vs-counts scatter
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

gex_obs = adata_gex.obs

# (axis, values, x-axis label, panel title) for each histogram panel
histogram_panels = [
    (axes[0, 0], gex_obs['total_counts'], 'Total counts', 'Total RNA counts'),
    (axes[0, 1], gex_obs['n_genes_by_counts'], 'Number of genes', 'Genes per cell'),
    (axes[0, 2], gex_obs['pct_counts_mt'], 'Mitochondrial %', 'Mitochondrial %'),
    (axes[1, 0], gex_obs['pct_counts_mouse'], 'Mouse %', 'Mouse contamination'),
    (axes[1, 1], adata_adt.obs['total_counts'], 'ADT counts', 'Protein counts'),
]
for ax, values, xlabel, title in histogram_panels:
    ax.hist(values, bins=100)
    ax.set_xlabel(xlabel)
    ax.set_title(title)

# Last panel: per-cell relationship between RNA depth and detected genes
scatter_ax = axes[1, 2]
scatter_ax.scatter(gex_obs['total_counts'], gex_obs['n_genes_by_counts'], alpha=0.3, s=1)
scatter_ax.set_xlabel('Total counts')
scatter_ax.set_ylabel('Genes')
scatter_ax.set_title('Genes vs Counts')

plt.tight_layout()
qc_figure_path = OUTPUT_DIR / f"{SAMPLE_NAME}_qc_before.png"
plt.savefig(qc_figure_path, dpi=300)
plt.show()

In [None]:
# Apply filters
# Cell filters are applied in sequence (mouse content -> RNA QC -> ADT depth),
# then genes are filtered. Intermediate subsets stay as lightweight views; the
# final cell subset and the final gene subset are materialised with .copy() so
# that later in-place steps (filter_genes, layer assignment, normalisation)
# operate on real AnnData objects rather than on views.
n_cells_initial = adata_gex.n_obs
print(f"Cells before filtering: {n_cells_initial}")

# 1) Drop cells whose mouse-flagged count percentage reaches the threshold.
adata_gex = adata_gex[adata_gex.obs['pct_counts_mouse'] < QC_PARAMS['mouse_content_max'], :]
print(f"After mouse filter: {adata_gex.n_obs}")

# 2) RNA QC: gene count within [min_genes, max_genes], total counts capped,
#    mitochondrial percentage strictly below the threshold.
adata_gex = adata_gex[
    (adata_gex.obs['n_genes_by_counts'] >= QC_PARAMS['min_genes']) &
    (adata_gex.obs['n_genes_by_counts'] <= QC_PARAMS['max_genes']) &
    (adata_gex.obs['total_counts'] <= QC_PARAMS['max_counts']) &
    (adata_gex.obs['pct_counts_mt'] < QC_PARAMS['max_mito_pct']),
    :
]
print(f"After QC filters: {adata_gex.n_obs}")

# 3) Restrict ADT to the surviving GEX barcodes (same order), then require a
#    minimum ADT depth. The mask is converted to a plain boolean array so it is
#    applied positionally to both modalities, which share row order here.
adata_adt = adata_adt[adata_gex.obs_names, :]
adt_pass = (adata_adt.obs['total_counts'] >= QC_PARAMS['adt_min_counts']).to_numpy()
adata_gex = adata_gex[adt_pass, :].copy()
adata_adt = adata_adt[adt_pass, :].copy()

# 4) Gene filters: drop genes seen in fewer than min_cells cells, then drop
#    every gene flagged as mouse.
sc.pp.filter_genes(adata_gex, min_cells=QC_PARAMS['min_cells'])
adata_gex = adata_gex[:, ~adata_gex.var['mouse']].copy()

print(f"Final: {adata_gex.n_obs} cells, {adata_gex.n_vars} genes")

In [None]:
# Normalize RNA
# Statement order matters: each layer snapshots adata_gex.X at that moment.
# Snapshot X before any normalization as layers['counts'].
adata_gex.layers['counts'] = adata_gex.X.copy()
# Library-size normalization with target_sum=1e4, followed by log1p; both
# calls rewrite adata_gex.X in place.
sc.pp.normalize_total(adata_gex, target_sum=1e4)
sc.pp.log1p(adata_gex)
# Snapshot the log-normalized matrix as layers['log_normalized'].
adata_gex.layers['log_normalized'] = adata_gex.X.copy()

print("RNA normalization completed")

In [None]:
# Highly variable genes
# Select the top 2000 genes with the 'seurat_v3' flavor, reading from
# layers['counts'] rather than from the current (log-normalized) X.
sc.pp.highly_variable_genes(adata_gex, n_top_genes=2000, flavor='seurat_v3', layer='counts')
# Number of genes flagged in var['highly_variable'].
print(f"HVG: {adata_gex.var['highly_variable'].sum()}")
# NOTE(review): `save=` hands file naming/location to scanpy (its configured
# figure directory by default), so this plot is presumably not written under
# OUTPUT_DIR — confirm where it lands.
sc.pl.highly_variable_genes(adata_gex, save=f'_{SAMPLE_NAME}_hvg.png')

In [None]:
# Scale and PCA
# Scaling rewrites adata_gex.X in place with max_value=10 as the cap; the
# unscaled log-normalized values are only available afterwards via
# layers['log_normalized'].
sc.pp.scale(adata_gex, max_value=10)
# 50 principal components with the ARPACK solver.
sc.tl.pca(adata_gex, svd_solver='arpack', n_comps=50)
# Variance-ratio plot over all 50 PCs (log scale), used to judge how many PCs
# to carry forward; the figure file is named by scanpy via `save=`.
sc.pl.pca_variance_ratio(adata_gex, log=True, n_pcs=50, save=f'_{SAMPLE_NAME}_variance.png')

In [None]:
# Neighbors and UMAP
# Neighbor graph with 15 neighbors on the first 30 PCs (of the 50 computed in
# the PCA cell), then the UMAP embedding of the RNA data.
sc.pp.neighbors(adata_gex, n_neighbors=15, n_pcs=30)
sc.tl.umap(adata_gex)
print("UMAP completed")

In [None]:
# Clustering
# Run Leiden at several resolutions; each result is stored in its own obs
# column named after the resolution (e.g. 'leiden_r0.6').
leiden_resolutions = (0.4, 0.6, 0.8, 1.0)
for res in leiden_resolutions:
    sc.tl.leiden(adata_gex, resolution=res, key_added=f'leiden_r{res}')

# The 0.6 resolution is used as the working clustering under the name 'leiden'.
adata_gex.obs['leiden'] = adata_gex.obs['leiden_r0.6']

# Compare the working clustering with a coarser and a finer alternative.
umap_colour_keys = ['leiden', 'leiden_r0.4', 'leiden_r0.8']
sc.pl.umap(
    adata_gex,
    color=umap_colour_keys,
    ncols=3,
    save=f'_{SAMPLE_NAME}_clustering.png',
)

In [None]:
# Process ADT
# Snapshot the current ADT matrix as layers['counts'] before X is overwritten
# by the normalization further down in this cell.
adata_adt.layers['counts'] = adata_adt.X.copy()

def clr_normalize(adata):
    """Centered log-ratio (CLR) transform of ``adata.X``, computed per row.

    A pseudocount of 1 is added to every entry, and each value is expressed as
    the log of its ratio to the geometric mean of its own row:
    ``log((x + 1) / gmean(row + 1))``. This is evaluated directly in log space
    as ``log1p(x) - mean(log1p(row))``, which is the same quantity without
    forming the intermediate ratio.

    Parameters
    ----------
    adata
        Object whose ``.X`` is a 2-D dense array or a sparse matrix (anything
        exposing ``.toarray()``).

    Returns
    -------
    numpy.ndarray
        Dense array with the same shape as ``adata.X``. Neither ``adata`` nor
        ``adata.X`` is modified.
    """
    X = adata.X
    # Densify sparse input; duck-typing on `.toarray` avoids importing
    # scipy inside the function body.
    dense = X.toarray() if hasattr(X, "toarray") else np.asarray(X)
    log_counts = np.log1p(dense)  # new array, so the input is left untouched
    # Subtracting the row mean of the logs == dividing by the geometric mean.
    return log_counts - log_counts.mean(axis=1, keepdims=True)

# Replace X with the CLR-transformed values and keep a copy of them as
# layers['clr_normalized'] before scaling rewrites X again.
adata_adt.X = clr_normalize(adata_adt)
adata_adt.layers['clr_normalized'] = adata_adt.X.copy()

# Scale, then embed the protein data with its own PCA / neighbors / UMAP.
sc.pp.scale(adata_adt)
# Antibody panels are small: cap the number of PCs at (number of proteins - 1),
# and never ask the neighbor graph for more PCs than were computed.
n_comps = min(20, adata_adt.n_vars - 1)
sc.tl.pca(adata_adt, n_comps=n_comps)
sc.pp.neighbors(adata_adt, n_neighbors=15, n_pcs=min(10, n_comps))
sc.tl.umap(adata_adt)

print("ADT processing completed")

In [None]:
# Transfer clustering
# Bring the RNA-derived annotations (cluster labels, RNA-only QC columns) onto
# the ADT object WITHOUT replacing the columns ADT already has. Overwriting
# the whole obs table would silently swap ADT's own 'total_counts' /
# 'n_genes_by_counts' for the RNA values stored under the same names.
if not adata_adt.obs_names.equals(adata_gex.obs_names):
    raise ValueError(
        "GEX and ADT barcodes are not identical and identically ordered; "
        "cannot transfer clustering between modalities."
    )

# Only columns that exist on GEX but not yet on ADT are added (order kept).
gex_only_columns = [
    column for column in adata_gex.obs.columns
    if column not in adata_adt.obs.columns
]
adata_adt.obs = adata_adt.obs.join(adata_gex.obs[gex_only_columns])

sc.pl.umap(adata_adt, color='leiden', save=f'_{SAMPLE_NAME}_adt_leiden.png')

In [None]:
# Combine modalities
# Attach the protein data to the RNA object as per-cell matrices in .obsm:
# target obsm key -> matrix taken from the ADT object.
protein_payload = {
    'protein_counts': adata_adt.layers['counts'],
    'protein_clr': adata_adt.layers['clr_normalized'],
    'X_pca_protein': adata_adt.obsm['X_pca'],
    'X_umap_protein': adata_adt.obsm['X_umap'],
}
for obsm_key, matrix in protein_payload.items():
    adata_gex.obsm[obsm_key] = matrix

# Column labels for the protein matrices are kept alongside in .uns.
adata_gex.uns['protein_names'] = adata_adt.var_names.tolist()

print(f"Combined object: {adata_gex.shape}, {len(adata_gex.uns['protein_names'])} proteins")

In [None]:
# Save
# The combined object (RNA in X/layers, protein data in obsm/uns) is written
# as a single .h5ad file in the sample's output directory.
output_file = OUTPUT_DIR / f"{SAMPLE_NAME}_processed.h5ad"
adata_gex.write(output_file)
print(f"Saved to: {output_file}")

# Summary
# One-row CSV with the headline numbers for this sample.
n_cells_final = adata_gex.n_obs
# Guard against an empty input so the summary is still written instead of
# failing with ZeroDivisionError after the .h5ad has already been saved.
if n_cells_initial:
    retention_rate = f"{n_cells_final/n_cells_initial*100:.1f}%"
else:
    retention_rate = "n/a"

summary = {
    'sample': SAMPLE_NAME,
    'n_cells_raw': n_cells_initial,
    'n_cells_filtered': n_cells_final,
    'retention_rate': retention_rate,
    'n_genes': adata_gex.n_vars,
    'n_proteins': len(adata_gex.uns['protein_names']),
    'n_clusters': adata_gex.obs['leiden'].nunique(),
}

pd.DataFrame([summary]).to_csv(OUTPUT_DIR / f"{SAMPLE_NAME}_summary.csv", index=False)
# The check mark was stored as mis-decoded UTF-8 ("âœ“"); restored here.
print(f"\n✓ Processing complete for {SAMPLE_NAME}!")