# Analysis of Human Kidney Immune Cells (Lupus Nephritis vs. Healthy Control)

## Data source: E-MTAB-13596

## **ScRNA-seq of human kidney immune cells of patients with ANCA-associated glomerulonephritis, Lupus Nephritis against a healthy nephrectomy control**

### "ANCA-associated glomerulonephritis (AGN) associates with a high risk of end-stage kidney disease. The role of kidney immune cells in local inflammation remains unclear. Here we investigate kidney immune cell diversity and function. Kidney tissue from AGN patients (n=5) and a lupus nephritis (LN) patient (n=1) were acquired during a biopsy procedure for a clinical indication. Needle-core biopsies were obtained for histopathological examination, and an additional pass was performed to retrieve kidney tissue for scRNA-seq. Healthy kidney tissue (n=1) was obtained from a kidney that was surgically removed due to a (non-invasive) papillary urothelial carcinoma. Immediately after collection, kidney tissue was processed into a single-cell suspension and sorted using a 4-color flow cytometry panel to isolate living, CD45+ immune cells. To aid in the multi-omic characterization, surface markers and T and B cell repertoires were sequenced in 2 samples (1 AGN patient and the nephrectomy control). These samples were incubated with an oligo-antibody TotalSeq-C cocktail containing 130 unique cell surface antigens."

In [None]:
# Project root directory. Every input and output path in this notebook is
# built by appending a relative path to this value.
PDIR = "/Users/aumchampaneri/VSCode Projects/complement-receptor-blockade/"

# Convert raw data to AnnData object

## Parse metadata

In [None]:
# --- Parse SDRF metadata ---
# Reads the SDRF table of E-MTAB-13596 and builds `samples`: a list with one
# dict per sample holding its annotations (disease, sorted cell type, disease
# stage) and the names of its matrix / barcode / feature files.
import pandas as pd
import os

# Use the flat file location, which exists
sdrf_path = f"{PDIR}/lupus-emtab/raw-data/E-MTAB-13596.sdrf.txt"
print("Parsing SDRF metadata from:", sdrf_path)
sdrf = pd.read_csv(sdrf_path, sep="\t")


def _file_name_or_default(value, suffix, default):
    """Return `value` if it is a string ending in `suffix`, otherwise `default`."""
    if isinstance(value, str) and value.endswith(suffix):
        return value
    return default


# Extract relevant columns and sample file info
samples = []
seen_sample_ids = set()
skipped_rows = 0
for _, row in sdrf.iterrows():
    sample_id = row["Source Name"]

    # A source can appear on more than one SDRF row. Keep only the first row
    # per sample so each count matrix is loaded (and each cell counted) once.
    if sample_id in seen_sample_ids:
        skipped_rows += 1
        continue
    seen_sample_ids.add(sample_id)

    # An empty cell is read by pandas as NaN; store it as an empty string so
    # the annotation column has a single type.
    disease_stage = row.get("Characteristics[disease staging]", "")
    if pd.isna(disease_stage):
        disease_stage = ""

    # NOTE(review): "Protocol REF.*" columns usually hold protocol accessions,
    # not file names, so the per-sample default names below are probably what
    # is used in practice - confirm against the SDRF.
    samples.append({
        "sample_id": sample_id,
        "disease": row["Characteristics[disease]"],
        "cell_type": row["Characteristics[cell type]"],
        "disease_stage": disease_stage,
        "matrix_file": _file_name_or_default(
            row.get("Protocol REF.44", None), ".mtx", f"{sample_id}_matrix.mtx"
        ),
        "barcode_file": _file_name_or_default(
            row.get("Protocol REF.42", None), ".tsv", f"{sample_id}_barcodes.tsv"
        ),
        "feature_file": _file_name_or_default(
            row.get("Protocol REF.43", None), ".tsv", f"{sample_id}_features.tsv"
        ),
    })

if skipped_rows:
    print(f"Skipped {skipped_rows} SDRF rows that repeat an already-seen Source Name.")
print(f"Found {len(samples)} samples in SDRF.")

## Load and annotate all samples

In [None]:
import scipy.io
from anndata.utils import make_index_unique
import anndata
import pandas as pd
import scanpy as sc

# Build one AnnData (cells x genes) per sample from its Matrix Market file,
# barcode list and feature table, annotate it with the SDRF metadata collected
# in `samples`, then merge everything into a single `adata`.
raw_dir = f"{PDIR}/lupus-emtab/raw-data"

adatas = []
for sample in samples:
    sample_id = sample["sample_id"]
    print(f"Loading sample {sample_id} ...")
    matrix_path = f"{raw_dir}/{sample['matrix_file']}"
    barcode_path = f"{raw_dir}/{sample['barcode_file']}"
    feature_path = f"{raw_dir}/{sample['feature_file']}"
    print(f"Loading matrix from: {matrix_path}")

    matrix = scipy.io.mmread(matrix_path)
    barcodes = pd.read_csv(barcode_path, header=None)[0].tolist()
    features = pd.read_csv(feature_path, sep="\t", header=None)
    gene_names = features[1].tolist()  # column 1 is usually gene symbol

    # Orient the matrix as cells x genes. Both axes are checked so that a
    # feature table that does not belong to this matrix is reported here
    # instead of failing later when var_names is assigned.
    n_cells, n_genes = len(barcodes), len(gene_names)
    if matrix.shape == (n_genes, n_cells):
        matrix = matrix.T.tocsc()
    elif matrix.shape == (n_cells, n_genes):
        matrix = matrix.tocsc()
    else:
        raise ValueError(
            f"Matrix dimensions {matrix.shape} do not match barcodes length {n_cells} "
            f"and features length {n_genes} for sample {sample_id}."
        )

    # Prefix barcodes with the sample id so they stay unique across samples.
    prefixed_barcodes = [f"{sample_id}_" + bc for bc in barcodes]
    unique_barcodes = set(prefixed_barcodes)
    print(f"Sample {sample_id} - Total barcodes: {len(barcodes)}, Unique barcodes: {len(unique_barcodes)}")
    if len(unique_barcodes) != len(prefixed_barcodes):
        print(f"WARNING: sample {sample_id} contains duplicated barcodes.")

    sample_adata = sc.AnnData(matrix)
    sample_adata.obs_names = prefixed_barcodes
    # Gene symbols can repeat; make_index_unique appends a suffix to repeats.
    sample_adata.var_names = make_index_unique(pd.Index(gene_names))

    # Annotate sample metadata
    sample_adata.obs["sample_id"] = sample_id
    sample_adata.obs["disease"] = sample["disease"]
    sample_adata.obs["cell_type_sort"] = sample["cell_type"]
    sample_adata.obs["disease_stage"] = sample["disease_stage"]
    sample_adata.obs["batch"] = sample_id
    adatas.append(sample_adata)

print("Merging all samples into a single AnnData object...")
# anndata.concat replaces the deprecated AnnData.concatenate. The arguments
# reproduce its defaults: genes shared by all samples (inner join), a "batch"
# column filled from the sample ids, and "-<sample id>" appended to cell names.
adata = anndata.concat(
    adatas,
    join="inner",
    label="batch",
    keys=[s["sample_id"] for s in samples],
    index_unique="-",
)

print(f"Merged AnnData: {adata.n_obs} cells, {adata.n_vars} genes.")
print("AnnData .obs columns:", adata.obs.columns.tolist())
print("Preview of .obs (first 5 rows):")
print(adata.obs.head())

## Save AnnData object

In [None]:
# Preserve raw counts for downstream normalization.
# A copy of the merged object is stored in adata.raw so that later in-place
# changes to adata.X do not alter the stored counts; the CellTypist section
# reads them back from adata.raw.
adata.raw = adata.copy()

In [None]:
# Write the merged AnnData to lupus-emtab/input-data/lupus-emtab_merged.h5ad,
# creating the output folder first if it does not exist yet.
import os
output_dir = f"{PDIR}/lupus-emtab/input-data"
merged_h5ad_path = os.path.join(output_dir, "lupus-emtab_merged.h5ad")
os.makedirs(output_dir, exist_ok=True)
adata.write_h5ad(merged_h5ad_path)
print(f"Saved AnnData to {merged_h5ad_path}")

# Data processing (Quality Control)

1. Load adata
2. scanpy zheng17 recipe
3. Harmony batch correction
4. UMAP
5. Save adata

In [None]:
import scanpy as sc

# Start the processing section from the merged file on disk, so this section
# can be run without re-parsing the raw matrices.
merged_h5ad = f"{PDIR}/lupus-emtab/input-data/lupus-emtab_merged.h5ad"
adata = sc.read_h5ad(merged_h5ad)

In [None]:
# Zheng et al. (2017) preprocessing recipe: gene filtering, per-cell
# normalization, selection of the 2000 most variable genes, log1p and scaling.
#
# The recipe is run in place (copy=False). With copy=True the processed object
# is only returned; because the return value was not assigned, `adata` stayed
# untouched and PCA / Harmony / UMAP ran on unprocessed counts.
#
# After this call adata holds only the selected genes, while adata.raw still
# holds every gene. Code that restores counts from adata.raw must therefore
# rebuild the object (e.g. adata.raw.to_adata()) rather than assign raw.X to
# adata.X.
sc.pp.recipe_zheng17(
    adata,
    n_top_genes=2000,
    log=True,
    plot=True,
    copy=False,
)

## Harmony batch correction

In [None]:
import scanpy.external as sce
import numpy as np

# PCA: 50 components, stored by scanpy in adata.obsm["X_pca"].
sc.pp.pca(adata, n_comps=50, svd_solver='arpack')

# Harmony integration across the "batch" column; the corrected embedding is
# looked up under adata.obsm["X_pca_harmony"] below.
print("Harmony will use 'batch' column for integration. Unique batch values:", adata.obs["batch"].unique())
sce.pp.harmony_integrate(adata, "batch")

harmony_key = "X_pca_harmony"
if harmony_key not in adata.obsm:
    print("WARNING: Harmony-corrected PCA not found. Using default PCA.")
else:
    corrected_pcs = adata.obsm[harmony_key]
    print("Harmony-corrected PCA found. Using for downstream analysis.")
    # Sanity checks on the corrected embedding.
    print("Shape of X_pca_harmony:", corrected_pcs.shape)
    print("Any NaNs in X_pca_harmony?", np.isnan(corrected_pcs).any())
    print("Min/Max in X_pca_harmony:", corrected_pcs.min(), corrected_pcs.max())
    # Overwrite X_pca so later steps that read X_pca use the corrected values.
    # The uncorrected PCA is not kept.
    adata.obsm["X_pca"] = corrected_pcs

## Neighbors and UMAP on Harmony-corrected PCA

In [None]:
# Neighbour graph (15 neighbours, 20 components), then UMAP and Leiden
# clustering on that graph. No use_rep is passed, so the graph is expected to
# be built from adata.obsm["X_pca"], which holds the Harmony-corrected values
# when the integration step replaced it.
sc.pp.neighbors(adata, n_pcs=20, n_neighbors=15)
sc.tl.umap(adata)
sc.tl.leiden(adata)

# Plots
umap_color_keys = ["batch", "disease", "disease_stage", "leiden"]
sc.pl.pca(adata, color="batch")
sc.pl.umap(adata, color=umap_color_keys)

## Save AnnData Object

In [None]:
# Persist the processed object (embeddings and clustering included) so the
# following sections can start from this file.
processed_h5ad_path = f"{PDIR}/lupus-emtab/input-data/lupus-emtab_processed.h5ad"
adata.write_h5ad(processed_h5ad_path)

# Cell type annotation - CellTypist

In [None]:
import scanpy as sc

# Load the processed object as the starting point for cell type annotation.
processed_h5ad = f"{PDIR}/lupus-emtab/input-data/lupus-emtab_processed.h5ad"
adata = sc.read_h5ad(processed_h5ad)

In [None]:
import scanpy as sc
import numpy as np
import scipy.sparse

# Reset adata.X from adata.raw (raw counts), then normalize each cell to
# 10,000 counts and log1p-transform.
if adata.raw is None:
    raise ValueError("AnnData.raw is missing; cannot reset adata.X for CellTypist.")

if not adata.raw.var_names.equals(adata.var_names):
    # adata was reduced to a subset of genes while .raw still holds all of
    # them, so raw.X no longer has the shape of adata.X. Rebuild a full-gene
    # object from .raw (obs, obsm, obsp and uns are carried over) and keep the
    # counts attached as .raw.
    full_adata = adata.raw.to_adata()
    full_adata.raw = full_adata.copy()
    adata = full_adata

# Always give adata.X a fresh dense array so the normalization below cannot
# modify the counts stored in .raw.
raw_counts = adata.raw.X
if scipy.sparse.issparse(raw_counts):
    adata.X = raw_counts.toarray()
else:
    adata.X = np.array(raw_counts, copy=True)

# X holds raw counts again, so any earlier "already log-transformed" marker in
# uns is stale; drop it before log1p records a new one.
adata.uns.pop("log1p", None)

sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata.X = np.nan_to_num(adata.X)

In [None]:
# CellTypist annotation pipeline (refactored for new data processing)
#
# Steps: map gene names to official symbols with mygene, sum columns that end
# up with the same symbol, load the Immune_All_Low model (downloading it if it
# is missing), annotate with majority voting, and write the labels to CSV.
import celltypist
import mygene
from celltypist.models import Model
import numpy as np
import os
import pandas as pd
import scipy.sparse
from anndata import AnnData

# adata.X is expected to hold log1p-normalized expression at this point.
# Map gene symbols to official gene symbols using mygene
mg = mygene.MyGeneInfo()
result = mg.querymany(list(adata.var.index), scopes="symbol", fields="symbol", species="human")
symbol_map = {r["query"]: r.get("symbol") for r in result if "symbol" in r}
valid_genes = [g for g in adata.var.index if symbol_map.get(g) is not None]

# Take a real copy before renaming: assigning var_names on a view forces an
# implicit copy and a warning.
adata = adata[:, valid_genes].copy()
adata.var_names = [symbol_map[g] for g in valid_genes]

# Aggregate duplicated gene symbols by sum
if not adata.var_names.is_unique:
    # codes[i] is the output column of input gene i; symbols come out sorted.
    codes, unique_symbols = pd.factorize(adata.var_names, sort=True)
    n_in, n_out = len(codes), len(unique_symbols)
    # Multiplying by this 0/1 matrix sums all columns that share a symbol and
    # works whether X is dense or sparse.
    indicator = scipy.sparse.csr_matrix(
        (np.ones(n_in, dtype=adata.X.dtype), (np.arange(n_in), codes)),
        shape=(n_in, n_out),
    )
    summed = adata.X @ indicator
    summed = summed.toarray() if scipy.sparse.issparse(summed) else np.asarray(summed)
    adata = AnnData(
        summed,
        obs=adata.obs.copy(),
        var=pd.DataFrame(index=unique_symbols),
        obsm=dict(adata.obsm),  # keep embeddings such as the UMAP coordinates
    )

# Fill NaNs in .X with zeros
if scipy.sparse.issparse(adata.X):
    if np.isnan(adata.X.data).any():
        adata.X.data = np.nan_to_num(adata.X.data)
elif np.isnan(adata.X).any():
    adata.X = np.nan_to_num(adata.X)

# CellTypist expects log1p-normalized data, so no further normalization needed
if adata.n_obs == 0 or adata.n_vars == 0:
    raise ValueError("AnnData is empty after gene mapping/aggregation.")

# Load or download CellTypist model.
# download_models() returns nothing and takes the model name as the `model`
# keyword (its first positional parameter is `force_update`), so the model is
# downloaded first and then loaded explicitly.
model_name = "Immune_All_Low.pkl"
model_dir = os.path.expanduser("~/.celltypist/data/models")
model_path = os.path.join(model_dir, model_name)
if not os.path.exists(model_path):
    celltypist.models.download_models(model=model_name)
# If the download went to a non-default folder, fall back to loading by name.
model = Model.load(model_path if os.path.exists(model_path) else model_name)

# Run CellTypist annotation
predictions = celltypist.annotate(adata, model=model, majority_voting=True)
if "majority_voting" in predictions.predicted_labels.columns:
    adata.obs["celltypist_label"] = predictions.predicted_labels["majority_voting"]
else:
    adata.obs["celltypist_label"] = predictions.predicted_labels.iloc[:, 0]

# Save cell type labels to CSV
os.makedirs(f"{PDIR}/lupus-emtab/outputs/", exist_ok=True)
adata.obs["celltypist_label"].to_csv(f"{PDIR}/lupus-emtab/outputs/celltypist_labels.csv")

## Check annotations

In [None]:
# Check CellTypist annotations
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Number of cells per CellTypist label, most frequent first.
cell_labels = adata.obs["celltypist_label"]
label_counts = cell_labels.value_counts()
print("CellTypist label value counts:")
print(label_counts)

# Horizontal bar chart of the same counts, in the same order.
plt.figure(figsize=(10, 5))
sns.countplot(y=cell_labels, order=label_counts.index)
plt.title("CellTypist Cell Type Distribution")
plt.xlabel("Number of Cells")
plt.ylabel("Cell Type")
plt.tight_layout()
plt.show()

# Label x disease contingency table, printed and drawn as stacked bars.
# Skipped when the "disease" annotation is absent.
if "disease" in adata.obs.columns:
    label_by_disease = pd.crosstab(cell_labels, adata.obs["disease"])
    print("\nCell type by disease:")
    print(label_by_disease)
    label_by_disease.plot(kind="bar", stacked=True, figsize=(12, 6))
    plt.title("Cell Type Distribution by Disease Status")
    plt.xlabel("Cell Type")
    plt.ylabel("Number of Cells")
    plt.tight_layout()
    plt.show()

## Save annotated AnnData object

In [None]:
# Write the annotated AnnData (with the celltypist_label column) to disk;
# this file is the input prepared for Geneformer tokenization.
annotated_h5ad_path = f"{PDIR}/lupus-emtab/input-data/lupus-emtab_annotated.h5ad"
adata.write_h5ad(annotated_h5ad_path)

# Data exploration

In [None]:
import scanpy as sc

# Exploration currently inspects the processed (not yet annotated) object.
# To inspect the annotated one instead, switch to the commented line.
# adata = sc.read_h5ad(f"{PDIR}/lupus-emtab/input-data/lupus-emtab_annotated.h5ad")

exploration_h5ad = f"{PDIR}/lupus-emtab/input-data/lupus-emtab_processed.h5ad"
adata = sc.read_h5ad(exploration_h5ad)

In [None]:
# Display the AnnData summary (the notebook renders the value of the last expression).
adata

In [None]:
# Display the per-cell annotation table.
adata.obs

In [None]:
# Display the per-gene annotation table.
adata.var