In [25]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, adjusted_rand_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

import umap
import scanpy as sc
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

In [26]:
# ---------------------------
# Data loading & preprocessing
# ---------------------------
def load_and_preprocess(ref_path="SmallData/xenium_reference_common_genes.h5ad",
                        spat_path="SmallData/xenium_spatial_common_genes.h5ad"):
    """Load the reference and spatial AnnData files and return model-ready arrays.

    Both expression matrices are normalised with ``sc.pp.normalize_total``
    (``target_sum=1e4``) followed by ``sc.pp.log1p``, converted to dense arrays
    and cast to ``float32``. String labels from both datasets are mapped to
    integer ids through one shared vocabulary (the sorted unique labels).

    Parameters
    ----------
    ref_path : str
        Path to the reference ``.h5ad`` file. Its ``.obs`` must contain one of
        ``broad_cell_type``, ``celltype_major``, ``celltype`` or ``cell_type``.
    spat_path : str
        Path to the spatial ``.h5ad`` file. Its ``.obsm`` must contain
        ``"spatial"``; a label column in ``.obs`` is optional.

    Returns
    -------
    tuple
        ``(X_ref, y_ref_id, ref_obs_names, label_to_id,
        X_spat, y_spat_id, spat_obs_names, coords, all_labels)``.
        ``y_spat_id`` is ``None`` when the spatial data has no label column.

    Raises
    ------
    RuntimeError
        If no reference label column is found, or if the spatial data has no
        ``"spatial"`` entry in ``.obsm``.
    """
    # The objects are read fresh from disk and never leave this function, so
    # they can be normalised in place without making defensive copies.
    ref = sc.read_h5ad(ref_path)
    spat = sc.read_h5ad(spat_path)

    # Prefer obs column with broad labels (user should have created 'broad_cell_type' earlier)
    possible_ref_cols = ["broad_cell_type", "celltype_major", "celltype", "cell_type"]
    ref_label_col = next((c for c in possible_ref_cols if c in ref.obs.columns), None)
    possible_spat_cols = ["broad_cell_type", "cell_type", "predicted_label"]
    spat_label_col = next((c for c in possible_spat_cols if c in spat.obs.columns), None)

    # Validate everything up front so a malformed file fails before any
    # normalisation work is done.
    if ref_label_col is None:
        raise RuntimeError("Reference label column not found. Set 'broad_cell_type' or similar in B.obs.")
    if spat_label_col is None:
        print("Warning: spatial label column not found. Spatial labels will be None (can't compute test acc).")
    if "spatial" not in spat.obsm:
        raise RuntimeError("Spatial coords key 'spatial' not found in A.obsm.")

    # Normalize total and log1p, identically for both datasets.
    for adata in (ref, spat):
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)

    # Convert to dense arrays (small 2k dataset assumed)
    X_ref = ref.X.toarray() if hasattr(ref.X, "toarray") else ref.X
    X_spat = spat.X.toarray() if hasattr(spat.X, "toarray") else spat.X

    # labels (spatial labels are optional)
    y_ref = ref.obs[ref_label_col].astype(str).values
    y_spat = spat.obs[spat_label_col].astype(str).values if spat_label_col is not None else None

    # Map labels to ints through a vocabulary shared by both datasets, so the
    # same string always gets the same id on either side.
    label_pool = [y_ref] if y_spat is None else [y_ref, y_spat]
    all_labels = np.unique(np.concatenate(label_pool))
    label_to_id = {lab: i for i, lab in enumerate(all_labels)}
    y_ref_id = np.array([label_to_id[lab] for lab in y_ref])
    y_spat_id = np.array([label_to_id[lab] for lab in y_spat]) if y_spat is not None else None

    coords = spat.obsm["spatial"]

    return (X_ref.astype(np.float32), y_ref_id, ref.obs_names.values, label_to_id,
            X_spat.astype(np.float32), y_spat_id, spat.obs_names.values, coords, all_labels)

In [27]:
# Run the loader once and unpack its 9-tuple into notebook-level names.
(
    X_ref, y_ref, ref_ids, label_to_id,
    X_spat, y_spat, spat_ids, coords, all_labels,
) = load_and_preprocess(
    ref_path="SmallData/xenium_reference_common_genes.h5ad",
    spat_path="SmallData/xenium_spatial_common_genes.h5ad",
)

# Quick sanity report of the array shapes (None if an object has no .shape).
print("Loaded datasets:")
print("X_ref shape:", getattr(X_ref, "shape", None))
print("X_spat shape:", getattr(X_spat, "shape", None))
print("coords shape:", getattr(coords, "shape", None))

Loaded datasets:
X_ref shape: (2000, 308)
X_spat shape: (2000, 308)
coords shape: (2000, 2)


  return fn(*args_all, **kw)


In [28]:
# ----------------------------------------------------------
# 1) Read the precomputed latent embedding matrix from disk
#    (expected layout: (N_ref + N_spat, latent_dim) -- the
#    split cell below relies on reference rows coming first)
# ----------------------------------------------------------
latent = np.load(file="SCModal_embeddings/latent.npy")


In [29]:
# ----------------------------------------------------------
# 2) Split the stacked latent matrix into reference / spatial parts
#    Rows are assumed to be ordered [reference cells, spatial cells]
#    -- TODO confirm against whatever wrote latent.npy.
# ----------------------------------------------------------
N_ref = len(y_ref)
# Count spatial cells from the expression matrix: y_spat is None when the
# spatial data has no label column, and len(None) would raise a TypeError.
N_spat = X_spat.shape[0]

# Fail loudly on a mismatched embedding file instead of silently mis-slicing.
if latent.shape[0] != N_ref + N_spat:
    raise ValueError(
        f"latent has {latent.shape[0]} rows but N_ref + N_spat = {N_ref + N_spat}; "
        "the embeddings do not match the loaded datasets."
    )

Z_ref = latent[:N_ref, :]                  # reference embeddings
Z_spat = latent[N_ref:N_ref + N_spat, :]   # spatial embeddings

print("Z_ref:", Z_ref.shape)
print("Z_spat:", Z_spat.shape)

# Standardize (important for KNN). The scaler is fitted on the reference
# embeddings only and then re-applied unchanged to the spatial embeddings.
scaler = StandardScaler()
Z_ref_scaled = scaler.fit_transform(Z_ref)
Z_spat_scaled = scaler.transform(Z_spat)


Z_ref: (2000, 16)
Z_spat: (2000, 16)


In [30]:
# ----------------------------------------------------------
# 3 a) Train logistic regression classifier
# ----------------------------------------------------------
# `multi_class` is deprecated since scikit-learn 1.5 and slated for removal.
# With the lbfgs solver and more than two classes the default already fits
# the multinomial (softmax) model, so the argument is simply dropped.
clf = LogisticRegression(
    solver="lbfgs",
    max_iter=2000,
)

# Train on the reference latent + labels. Note this uses the *unscaled*
# embeddings, so predictions must also be made on unscaled inputs.
clf.fit(Z_ref, y_ref)




0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,2000


In [31]:
# ----------------------------------------------------------
# 3 b) Accuracy of the logistic regression on the data it
#      was fitted on (reference embeddings)
# ----------------------------------------------------------
y_ref_pred = clf.predict(Z_ref)
train_acc = accuracy_score(y_true=y_ref, y_pred=y_ref_pred)

print("\n=== TRAINING ACCURACY ===")
print("Train Accuracy:", train_acc)


=== TRAINING ACCURACY ===
Train Accuracy: 0.3575


In [32]:
# ----------------------------------------------------------
# 4) Predict spatial labels
# ----------------------------------------------------------
y_spat_pred = clf.predict(Z_spat)

# ----------------------------------------------------------
# 5) Evaluate Accuracy and ARI
#    The loader returns y_spat=None when the spatial data has
#    no label column; in that case there is nothing to score.
# ----------------------------------------------------------
if y_spat is None:
    print("Spatial labels unavailable: skipping accuracy, ARI and confusion matrix.")
else:
    acc = accuracy_score(y_spat, y_spat_pred)
    ari = adjusted_rand_score(y_spat, y_spat_pred)

    print("Label Transfer Accuracy:", acc)
    print("ARI:", ari)

    # Pin the label set so row/column i always corresponds to all_labels[i],
    # even when a class is missing from both the truth and the predictions.
    print("Confusion Matrix:")
    print(confusion_matrix(y_spat, y_spat_pred, labels=np.arange(len(all_labels))))

Label Transfer Accuracy: 0.0885
ARI: 0.0
Confusion Matrix:
[[  0   0   0   0   0   0  62   0]
 [  0   0   0   0   0   0 496   0]
 [  0   0   0   0   0   0 752   0]
 [  0   0   0   0   0   0 124   0]
 [  0   0   0   0   0   0 175   0]
 [  0   0   0   0   0   0 125   0]
 [  0   0   0   0   0   0 177   0]
 [  0   0   0   0   0   0  89   0]]


In [33]:
# Sweep the neighbourhood size of a KNN classifier fitted on the standardised
# reference embeddings and scored on the standardised spatial embeddings.
for k in (3, 5, 8, 10, 20):
    knn = KNeighborsClassifier(n_neighbors=k).fit(Z_ref_scaled, y_ref)

    # Predictions on the fitting data and on the spatial data
    y_ref_pred = knn.predict(Z_ref_scaled)
    y_spat_pred = knn.predict(Z_spat_scaled)

    # Metrics: accuracy on both sets, ARI on the spatial set only
    train_acc = accuracy_score(y_ref, y_ref_pred)
    test_acc = accuracy_score(y_spat, y_spat_pred)
    ari = adjusted_rand_score(y_spat, y_spat_pred)

    print(f"\n=== KNN (k={k}) ===")
    print("Train Acc:", train_acc)
    print("Test Acc:", test_acc)
    print("ARI:", ari)



=== KNN (k=3) ===
Train Acc: 0.5205
Test Acc: 0.221
ARI: -0.0008516336374123659

=== KNN (k=5) ===
Train Acc: 0.4695
Test Acc: 0.211
ARI: 0.00419699629339395

=== KNN (k=8) ===
Train Acc: 0.434
Test Acc: 0.207
ARI: -0.0010342653240111618

=== KNN (k=10) ===
Train Acc: 0.43
Test Acc: 0.212
ARI: -0.003669650741549867

=== KNN (k=20) ===
Train Acc: 0.3815
Test Acc: 0.195
ARI: -0.009850575115679073


In [34]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline on the unscaled latent embeddings; fit() returns the
# estimator itself, so `rf` is the fitted model.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    n_jobs=-1,
    random_state=42,
).fit(Z_ref, y_ref)

# Predictions on the fitting data (reference) and on the spatial data
y_ref_pred = rf.predict(Z_ref)
y_spat_pred = rf.predict(Z_spat)

# Accuracy on both sets, ARI on the spatial set
train_acc = accuracy_score(y_ref, y_ref_pred)
test_acc = accuracy_score(y_spat, y_spat_pred)
ari = adjusted_rand_score(y_spat, y_spat_pred)

print("\n=== Random Forest ===")
print("Train Acc:", train_acc)
print("Test Acc:", test_acc)
print("ARI:", ari)



=== Random Forest ===
Train Acc: 1.0
Test Acc: 0.1865
ARI: -0.0006720871230209599


In [35]:
from sklearn.svm import SVC

# RBF-kernel SVM baseline on the unscaled latent embeddings; fit() returns the
# estimator itself, so `svm` is the fitted model.
svm = SVC(kernel="rbf", C=5, gamma="scale").fit(Z_ref, y_ref)

# Predictions on the fitting data (reference) and on the spatial data
y_ref_pred = svm.predict(Z_ref)
y_spat_pred = svm.predict(Z_spat)

# Accuracy on both sets, ARI on the spatial set
train_acc = accuracy_score(y_ref, y_ref_pred)
test_acc = accuracy_score(y_spat, y_spat_pred)
ari = adjusted_rand_score(y_spat, y_spat_pred)

print("\n=== SVM (RBF) ===")
print("Train Acc:", train_acc)
print("Test Acc:", test_acc)
print("ARI:", ari)



=== SVM (RBF) ===
Train Acc: 0.369
Test Acc: 0.0885
ARI: 0.0
