In [None]:
## VDX 3 gene data

In [1]:
from pathlib import Path
import sys
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# The parent of the working directory is treated as the repository root and
# put on sys.path so that the local `apclust` package can be imported.
REPO_ROOT = Path("..").resolve()
repo_root_str = str(REPO_ROOT)
if repo_root_str not in sys.path:
    sys.path.insert(0, repo_root_str)

from apclust import run_mcss_ap

DATA_ROOT = REPO_ROOT / "Data"
OUTPUT_BASE = REPO_ROOT / "results" / "mcss" / "VDX_3G"

# VDX 3-gene matrix; the CSV has no header row, so every line is data.
vdx_3g = pd.read_csv(DATA_ROOT / "VDX_3_SV.csv", header=None)
X = vdx_3g.to_numpy()

# Affinity-propagation settings shared by every run; only damping is swept.
ap_fixed = {
    "max_iter": 400,
    "convergence_iter": 50,
    "random_state": 0,  # AP is deterministic with these settings
    "affinity": "euclidean",
}

# One MCSS run per damping value; each run is written to
# OUTPUT_BASE / "damping<value>".
for damping in (0.5, 0.85):
    run_mcss_ap(
        X,
        dataset_name=f"damping{damping:.2f}",
        out_dir=OUTPUT_BASE,
        b=200,
        train_frac=0.8,
        random_seed=0,  # reproducible 80/20 splits
        ap_params={"damping": damping, **ap_fixed},
        metrics=("ari", "ami"),
    )

In [2]:
# Report the distribution of the training-set cluster count K and the
# convergence rate for each damping value, read from the saved summary CSVs.
for damping in (0.5, 0.85):
    summary_path = OUTPUT_BASE / f"damping{damping:.2f}" / "mcss_summary.csv"
    summary = pd.read_csv(summary_path)
    k_train = summary["train_n_clusters"]

    mean_k = k_train.mean()
    var_k = k_train.var()
    mode_k = k_train.mode().iloc[0]  # first value when several modes tie
    mode_pct = k_train.eq(mode_k).mean() * 100
    conv_pct = summary["train_converged"].mean() * 100

    report_lines = [
        f"damping={damping:.2f}",
        f"  E[K]      : {mean_k:.3f}",
        f"  Var[K]    : {var_k:.3f}",
        f"  Mode[K]   : {mode_k} ({mode_pct:.1f} %)",
        f"  %Converged: {conv_pct:.1f} %\n",
    ]
    print("\n".join(report_lines))

damping=0.50
  E[K]      : 13.690
  Var[K]    : 1.039
  Mode[K]   : 14 (34.5 %)
  %Converged: 100.0 %

damping=0.85
  E[K]      : 12.290
  Var[K]    : 0.910
  Mode[K]   : 12 (39.5 %)
  %Converged: 100.0 %



In [None]:
## VDX 3-gene data, z-scored

In [1]:
from pathlib import Path
import sys
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# The parent of the working directory is treated as the repository root and
# put on sys.path so that the local `apclust` package can be imported.
REPO_ROOT = Path("..").resolve()
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

from apclust import run_mcss_ap

DATA_ROOT = REPO_ROOT / "Data"

# VDX 3-gene matrix (no header in the file)
vdx_3g = pd.read_csv(DATA_ROOT / "VDX_3_SV.csv", header=None)
X = vdx_3g.to_numpy()

# Standardise each gene (column) to zero mean and unit variance.
# NOTE(review): the scaler is fit on the whole matrix before run_mcss_ap makes
# its train/test splits — confirm that is the intended protocol.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

# Write to a dedicated directory. The dataset_name values below are the same
# as in the unscaled run, so reusing ".../VDX_3G" would overwrite that run's
# mcss_summary.csv files.
OUTPUT_BASE = REPO_ROOT / "results" / "mcss" / "VDX_3G_zscored"

# One MCSS run per damping value; each run is written to
# OUTPUT_BASE / "damping<value>".
for damping in (0.5, 0.85):
    run_mcss_ap(
        X_std,
        dataset_name=f"damping{damping:.2f}",
        out_dir=OUTPUT_BASE,
        b=200,
        train_frac=0.8,
        random_seed=0,          # reproducible 80/20 splits
        ap_params={
            "damping": damping,
            "max_iter": 400,
            "convergence_iter": 50,
            "random_state": 0,  # AP is deterministic with these settings
            "affinity": "euclidean",
        },
        metrics=("ari", "ami"),
    )

In [3]:
# Report the distribution of the training-set cluster count K and the
# convergence rate for each damping value, read from the summary CSVs under
# the current OUTPUT_BASE.
for damping in (0.5, 0.85):
    summary_path = OUTPUT_BASE / f"damping{damping:.2f}" / "mcss_summary.csv"
    summary = pd.read_csv(summary_path)
    k_train = summary["train_n_clusters"]

    mean_k = k_train.mean()
    var_k = k_train.var()
    mode_k = k_train.mode().iloc[0]  # first value when several modes tie
    mode_pct = k_train.eq(mode_k).mean() * 100
    conv_pct = summary["train_converged"].mean() * 100

    report_lines = [
        f"damping={damping:.2f}",
        f"  E[K]      : {mean_k:.3f}",
        f"  Var[K]    : {var_k:.3f}",
        f"  Mode[K]   : {mode_k} ({mode_pct:.1f} %)",
        f"  %Converged: {conv_pct:.1f} %\n",
    ]
    print("\n".join(report_lines))

damping=0.50
  E[K]      : 14.790
  Var[K]    : 1.192
  Mode[K]   : 15 (39.5 %)
  %Converged: 100.0 %

damping=0.85
  E[K]      : 14.780
  Var[K]    : 0.826
  Mode[K]   : 15 (38.5 %)
  %Converged: 100.0 %



In [None]:
## VDX full gene data

In [3]:
from apclust import load_matrix
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the full VDX expression panel. This cell relies on REPO_ROOT having
# been defined by an earlier cell in this notebook.
DATA_ROOT = REPO_ROOT / "Data"
# NOTE(review): load_matrix is a project helper; it presumably returns a
# DataFrame-like object (it exposes `.values`) — confirm against apclust.
vdx_full = load_matrix(DATA_ROOT / "vdx_dict.npy", key="all")   # full expression panel
X_full = vdx_full.values
print("Full VDX shape:", X_full.shape)  # (rows = samples, cols = features)

Full VDX shape: (344, 22283)


In [None]:
# Quick sanity check: print the first row of X_full.
first_row = X_full[0, :]             # row 0, all columns
print(first_row)

[11.96513493  7.89542381  8.25927249 ...  1.43295941  2.45943162
  1.5849625 ]


In [4]:
# MCSS affinity-propagation runs on the unscaled full VDX matrix; each damping
# value is written to OUTPUT_BASE / "damping<value>".
OUTPUT_BASE = REPO_ROOT / "results" / "mcss" / "VDX_full"

# Affinity-propagation settings shared by every run; only damping is swept.
ap_fixed = {
    "max_iter": 400,
    "convergence_iter": 50,
    "random_state": 0,
    "affinity": "euclidean",
}

for damping in (0.50, 0.85):
    run_mcss_ap(
        X_full,
        dataset_name=f"damping{damping:.2f}",
        out_dir=OUTPUT_BASE,
        b=200,
        train_frac=0.8,
        random_seed=0,
        ap_params={"damping": damping, **ap_fixed},
        metrics=("ari", "ami"),
    )

In [5]:
# One-line report per damping value: mean, variance and mode of the
# training-set cluster count K, plus the share of converged runs.
for damping in (0.50, 0.85):
    summary_path = OUTPUT_BASE / f"damping{damping:.2f}" / "mcss_summary.csv"
    summary = pd.read_csv(summary_path)
    k_train = summary["train_n_clusters"]

    mean_k = k_train.mean()
    var_k = k_train.var()
    mode_k = k_train.mode().iloc[0]  # first value when several modes tie
    mode_pct = k_train.eq(mode_k).mean() * 100
    conv_pct = summary["train_converged"].mean() * 100

    print(f"VDX full, damping={damping:.2f}: E[K]={mean_k:.2f}, Var[K]={var_k:.2f}, Mode[K]={mode_k} ({mode_pct:.1f}%), %Converge={conv_pct:.1f}%")

VDX full, damping=0.50: E[K]=19.82, Var[K]=2.30, Mode[K]=20 (26.0%), %Converge=100.0%
VDX full, damping=0.85: E[K]=19.77, Var[K]=2.29, Mode[K]=20 (27.5%), %Converge=100.0%


In [None]:
## VDX full gene data, z-scored

In [5]:
# Standardise every feature (column) of the full VDX matrix to zero mean and
# unit variance.
# NOTE(review): the scaler is fit on the whole matrix before run_mcss_ap makes
# its train/test splits — confirm that is the intended protocol.
scaler = StandardScaler()
X_full_std = scaler.fit_transform(X_full)

# Write to a dedicated directory. The dataset_name values below are the same
# as in the unscaled run, so reusing ".../VDX_full" would overwrite that run's
# mcss_summary.csv files.
OUTPUT_BASE = REPO_ROOT / "results" / "mcss" / "VDX_full_zscored"
for damping in (0.50, 0.85):
    run_mcss_ap(
        X_full_std,
        dataset_name=f"damping{damping:.2f}",
        out_dir=OUTPUT_BASE,
        b=200,
        train_frac=0.8,
        random_seed=0,
        ap_params={
            "damping": damping,
            "max_iter": 400,
            "convergence_iter": 50,
            "random_state": 0,
            "affinity": "euclidean",
        },
        metrics=("ari", "ami"),
    )



In [6]:
# One-line report per damping value, read from the summary CSVs under the
# current OUTPUT_BASE: mean, variance and mode of the training-set cluster
# count K, plus the share of converged runs.
for damping in (0.50, 0.85):
    summary_path = OUTPUT_BASE / f"damping{damping:.2f}" / "mcss_summary.csv"
    summary = pd.read_csv(summary_path)
    k_train = summary["train_n_clusters"]

    mean_k = k_train.mean()
    var_k = k_train.var()
    mode_k = k_train.mode().iloc[0]  # first value when several modes tie
    mode_pct = k_train.eq(mode_k).mean() * 100
    conv_pct = summary["train_converged"].mean() * 100

    print(f"VDX full, damping={damping:.2f}: E[K]={mean_k:.2f}, Var[K]={var_k:.2f}, Mode[K]={mode_k} ({mode_pct:.1f}%), %Converge={conv_pct:.1f}%")

VDX full, damping=0.50: E[K]=22.38, Var[K]=2.50, Mode[K]=22 (25.5%), %Converge=99.5%
VDX full, damping=0.85: E[K]=22.17, Var[K]=2.63, Mode[K]=22 (23.5%), %Converge=99.5%
