# TRANS_017 - Sample 3 Preprocessing

This notebook processes Sample 3 using the same pipeline as Sample 1. See sample1_preprocessing.ipynb for detailed explanations.

In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# NOTE(review): this silences every warning category for the whole notebook,
# including deprecation and AnnData view warnings raised by later cells.
warnings.filterwarnings('ignore')

# Scanpy logging level 3, plus shared figure defaults for all plots below.
sc.settings.verbosity = 3
sc.settings.set_figure_params(
    dpi=100,
    facecolor='white',
    frameon=False,
)
# Applied after the scanpy figure params, so seaborn's style settings win
# wherever the two touch the same matplotlib options.
sns.set_style('whitegrid')

In [None]:
# Sample identity and I/O locations (relative to this notebook's directory).
SAMPLE_NAME = "sample3"
CELLRANGER_OUTPUT = Path("../../data/cellranger_outputs/sample3/outs/")
OUTPUT_DIR = Path("../../data/processed/sample3/")
# Create the output directory up front; a no-op if it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Thresholds consumed by the QC filtering cell further down.
QC_PARAMS = dict(
    min_genes=200,         # keep cells with n_genes_by_counts >= this
    min_cells=3,           # passed to sc.pp.filter_genes(min_cells=...)
    max_genes=6000,        # keep cells with n_genes_by_counts <= this
    max_counts=30000,      # keep cells with total_counts <= this
    max_mito_pct=20,       # keep cells with pct_counts_mt < this
    mouse_content_max=5,   # keep cells with pct_counts_mouse < this
    adt_min_counts=100,    # keep cells with ADT total_counts >= this
)

In [None]:
# Load the Cell Ranger filtered matrix with ALL feature types.
# gex_only=False is required here: sc.read_10x_mtx defaults to gex_only=True,
# which keeps only 'Gene Expression' features, so the 'Antibody Capture'
# subset below would otherwise be empty.
adata = sc.read_10x_mtx(
    CELLRANGER_OUTPUT / "filtered_feature_bc_matrix",
    var_names='gene_symbols',
    cache=True,
    gex_only=False,
)

# Split the combined matrix into independent RNA (GEX) and protein (ADT) objects.
feature_types = adata.var['feature_types']
adata_gex = adata[:, feature_types == 'Gene Expression'].copy()
adata_adt = adata[:, feature_types == 'Antibody Capture'].copy()

# Fail early with a clear message rather than letting the ADT depth filter
# silently discard every cell later on.
if adata_adt.n_vars == 0:
    raise ValueError(
        f"No 'Antibody Capture' features found for {SAMPLE_NAME} in "
        f"{CELLRANGER_OUTPUT / 'filtered_feature_bc_matrix'}"
    )

# Tag both modalities with the sample name.
adata_gex.obs['sample'] = SAMPLE_NAME
adata_adt.obs['sample'] = SAMPLE_NAME

In [None]:
# Boolean gene flags used as qc_vars below.
gene_names = adata_gex.var_names
# 'mt': var names that begin with the literal prefix 'MT-'.
adata_gex.var['mt'] = gene_names.str.startswith('MT-')
# 'mouse': var names whose first character is a lowercase ASCII letter.
# NOTE(review): presumably this targets a lowercase genome prefix from a
# mixed-species reference; plain mouse symbols (e.g. 'Actb') start with an
# uppercase letter and would NOT match, and a prefixed reference would also
# defeat the 'MT-' test above. Confirm against the reference actually used.
adata_gex.var['mouse'] = gene_names.str.match(r'^[a-z]')

# Per-cell QC metrics for both modalities, computed with the same options.
# The filtering cell reads total_counts, n_genes_by_counts, pct_counts_mt
# and pct_counts_mouse from the resulting .obs columns.
qc_kwargs = dict(percent_top=None, log1p=False, inplace=True)
sc.pp.calculate_qc_metrics(adata_gex, qc_vars=['mt', 'mouse'], **qc_kwargs)
sc.pp.calculate_qc_metrics(adata_adt, **qc_kwargs)

In [None]:
# Cell count before any filtering, reported in the summary at the end.
n_cells_initial = adata_gex.n_obs

# --- RNA-level cell QC: every condition must hold for a cell to be kept ---
adata_gex = adata_gex[
    (adata_gex.obs['pct_counts_mouse'] < QC_PARAMS['mouse_content_max']) &
    (adata_gex.obs['n_genes_by_counts'] >= QC_PARAMS['min_genes']) &
    (adata_gex.obs['n_genes_by_counts'] <= QC_PARAMS['max_genes']) &
    (adata_gex.obs['total_counts'] <= QC_PARAMS['max_counts']) &
    (adata_gex.obs['pct_counts_mt'] < QC_PARAMS['max_mito_pct']), :
]

# --- ADT-level cell QC on the RNA-passing barcodes ---
# Restrict the protein object to the same barcodes (same order), then require
# a minimum ADT depth and apply that mask to both modalities.
adata_adt = adata_adt[adata_gex.obs_names, :]
adt_pass = adata_adt.obs['total_counts'] >= QC_PARAMS['adt_min_counts']
# .copy() turns the views into real AnnData objects so the in-place
# operations below (filter_genes, layer assignment, scaling) do not depend on
# implicit copy-on-write of a view.
adata_gex = adata_gex[adt_pass, :].copy()
adata_adt = adata_adt[adt_pass, :].copy()

# --- Gene-level filtering ---
# Drop genes detected in fewer than min_cells cells, then drop the genes
# flagged in var['mouse'].
sc.pp.filter_genes(adata_gex, min_cells=QC_PARAMS['min_cells'])
adata_gex = adata_gex[:, ~adata_gex.var['mouse']].copy()

# Protein data is later attached to adata_gex positionally, so both objects
# must hold the same barcodes in the same order.
assert adata_gex.obs_names.equals(adata_adt.obs_names), \
    "GEX and ADT barcodes are out of sync after filtering"

print(f"Filtered: {adata_gex.n_obs}/{n_cells_initial} cells")

In [None]:
# Keep an untouched copy of the count matrix before X is normalised.
adata_gex.layers['counts'] = adata_gex.X.copy()

# Depth-normalise each cell to 10,000 total counts, then log1p-transform.
sc.pp.normalize_total(adata_gex, target_sum=1e4)
sc.pp.log1p(adata_gex)
adata_gex.layers['log_normalized'] = adata_gex.X.copy()

# Select 2,000 highly variable genes. The 'seurat_v3' flavour works on count
# data, so it is pointed at the 'counts' layer rather than the log-normalised X.
hvg_settings = {'n_top_genes': 2000, 'flavor': 'seurat_v3', 'layer': 'counts'}
sc.pp.highly_variable_genes(adata_gex, **hvg_settings)

# Scale X (values clipped at 10), then PCA -> neighbour graph -> UMAP.
# The graph is built from the first 30 of the 50 computed components.
sc.pp.scale(adata_gex, max_value=10)
sc.tl.pca(adata_gex, svd_solver='arpack', n_comps=50)
sc.pp.neighbors(adata_gex, n_neighbors=15, n_pcs=30)
sc.tl.umap(adata_gex)

In [None]:
# Run Leiden clustering at several resolutions; each result is stored in its
# own obs column named 'leiden_r<resolution>'.
leiden_resolutions = (0.4, 0.6, 0.8, 1.0)
for resolution in leiden_resolutions:
    sc.tl.leiden(
        adata_gex,
        resolution=resolution,
        key_added=f'leiden_r{resolution}',
    )

# Resolution 0.6 is used as the default 'leiden' labelling.
adata_gex.obs['leiden'] = adata_gex.obs['leiden_r0.6']
sc.pl.umap(adata_gex, color='leiden', save=f'_{SAMPLE_NAME}_leiden.png')

In [None]:
# Keep a copy of the ADT matrix as currently held in X under the 'counts'
# layer, because X is overwritten with CLR-normalised values later in this cell.
adata_adt.layers['counts'] = adata_adt.X.copy()

def clr_normalize(adata):
    """Return a centred log-ratio (CLR) transform of ``adata.X``.

    A pseudocount of 1 is added to every entry, each row is divided by its
    own geometric mean, and the natural log of the ratio is returned.

    Parameters
    ----------
    adata
        Object exposing a 2-D ``.X`` matrix (dense array or scipy sparse).
        ``.X`` itself is not modified.

    Returns
    -------
    numpy.ndarray
        Dense array with the same shape as ``adata.X``.
    """
    from scipy.stats import gmean
    import scipy.sparse as sp

    # Work on a dense copy so the caller's matrix is left untouched.
    if sp.issparse(adata.X):
        dense = adata.X.toarray()
    else:
        dense = adata.X.copy()

    shifted = dense + 1  # pseudocount keeps zeros finite under the log
    row_gmean = gmean(shifted, axis=1)
    return np.log(shifted / row_gmean[:, np.newaxis])

# Replace X with the CLR-normalised values and keep a copy of them in a layer
# before scaling changes X again.
adata_adt.X = clr_normalize(adata_adt)
adata_adt.layers['clr_normalized'] = adata_adt.X.copy()
sc.pp.scale(adata_adt)

# Cap the number of components by the panel size: at most 20 PCs (and fewer
# than the number of ADT features), with at most 10 of them used for the graph.
n_comps = min(20, adata_adt.n_vars - 1)
n_neighbor_pcs = min(10, n_comps)
sc.tl.pca(adata_adt, n_comps=n_comps)
sc.pp.neighbors(adata_adt, n_neighbors=15, n_pcs=n_neighbor_pcs)
sc.tl.umap(adata_adt)

# Replace the ADT per-cell annotations with a copy of the GEX ones.
# NOTE(review): this discards the ADT-specific columns previously held in
# adata_adt.obs - confirm that is intended.
adata_adt.obs = adata_gex.obs.copy()

In [None]:
# The protein matrices are attached to adata_gex by row position, so both
# objects must hold the same barcodes in the same order; otherwise protein
# values would be assigned to the wrong cells without any error.
if not adata_gex.obs_names.equals(adata_adt.obs_names):
    raise ValueError(
        f"{SAMPLE_NAME}: GEX and ADT barcodes differ; cannot attach protein data"
    )

# Attach the protein data and embeddings to the GEX object so one file
# carries both modalities.
adata_gex.obsm['protein_counts'] = adata_adt.layers['counts']
adata_gex.obsm['protein_clr'] = adata_adt.layers['clr_normalized']
adata_gex.obsm['X_pca_protein'] = adata_adt.obsm['X_pca']
adata_gex.obsm['X_umap_protein'] = adata_adt.obsm['X_umap']
# Column order of the protein matrices above.
adata_gex.uns['protein_names'] = adata_adt.var_names.tolist()

# Write the combined object.
output_file = OUTPUT_DIR / f"{SAMPLE_NAME}_processed.h5ad"
adata_gex.write(output_file)

# One-row CSV summary of this sample's processing.
summary = {
    'sample': SAMPLE_NAME,
    'n_cells_raw': n_cells_initial,
    'n_cells_filtered': adata_gex.n_obs,
    'n_genes': adata_gex.n_vars,
    'n_proteins': len(adata_gex.uns['protein_names']),
}
pd.DataFrame([summary]).to_csv(OUTPUT_DIR / f"{SAMPLE_NAME}_summary.csv", index=False)
# Status message: the check mark was previously mis-encoded (mojibake).
print(f"✓ {SAMPLE_NAME} complete!")