In [None]:
## case t > p

In [1]:
import sys
import subprocess
from pathlib import Path

# Where the repository lives and which driver script the cells below launch.
# NOTE(review): the repo root is an absolute, machine-specific path; adjust it
# when running this notebook on another machine.
REPO_ROOT = Path("/Users/aqibsyed/Documents/Research/Clustering/AP code implementations")
SCRIPT = REPO_ROOT.joinpath("scripts", "run_synthetic_mcss.py")
# Launch the script with the interpreter that is running this kernel, so the
# child process uses the same environment as the notebook.
PYTHON = sys.executable

In [2]:
# Cell B – identity covariance, p=100, t=1k
# Assemble the argument vector piece by piece: interpreter + script, the
# scenario id, the `--b` value, and the bare `--overwrite-mcss` flag.
iso_cmd = [PYTHON, str(SCRIPT)]
iso_cmd += ["--scenario", "tgt_gt_iso_p100_t1000"]
iso_cmd += ["--b", "50"]
iso_cmd += ["--overwrite-mcss"]
# check=True raises CalledProcessError on a non-zero exit status. The returned
# CompletedProcess is the cell's last expression, so the notebook displays it.
subprocess.run(iso_cmd, check=True)

[scenario] tgt_gt_iso_p100_t1000  (t>p, identity_var1)
  [run]  tgt_gt_iso_p100_t1000 @ damping0.50 → b=50, train_frac=0.8




  [run]  tgt_gt_iso_p100_t1000 @ damping0.85 → b=50, train_frac=0.8


CompletedProcess(args=['/opt/anaconda3/bin/python', '/Users/aqibsyed/Documents/Research/Clustering/AP code implementations/scripts/run_synthetic_mcss.py', '--scenario', 'tgt_gt_iso_p100_t1000', '--b', '50', '--overwrite-mcss'], returncode=0)

In [None]:
# Cell 3 – run every t > p case (identity, Toeplitz, high-variance; b=200; damping 0.50 & 0.85)
# NOTE(review): the scenario list, b and damping values quoted above are chosen
# by the script for `--regime t>p`, not by this cell — confirm against the script.
regime_cmd = [PYTHON, str(SCRIPT), "--regime", "t>p"]
# check=True turns a non-zero exit status into CalledProcessError.
subprocess.run(regime_cmd, check=True)

In [3]:
# Cell B – identity covariance, p=100, t=1k
# NOTE(review): this cell repeats the earlier identity-covariance run with
# identical arguments.
iso_cmd = [PYTHON, str(SCRIPT)]
iso_cmd += ["--scenario", "tgt_gt_iso_p100_t1000"]
iso_cmd += ["--b", "50"]
iso_cmd += ["--overwrite-mcss"]
# check=True raises CalledProcessError on a non-zero exit status. The returned
# CompletedProcess is the cell's last expression, so the notebook displays it.
subprocess.run(iso_cmd, check=True)

[scenario] tgt_gt_iso_p100_t1000  (t>p, identity_var1)
  [run]  tgt_gt_iso_p100_t1000 @ damping0.50 → b=50, train_frac=0.8




  [run]  tgt_gt_iso_p100_t1000 @ damping0.85 → b=50, train_frac=0.8


CompletedProcess(args=['/opt/anaconda3/bin/python', '/Users/aqibsyed/Documents/Research/Clustering/AP code implementations/scripts/run_synthetic_mcss.py', '--scenario', 'tgt_gt_iso_p100_t1000', '--b', '50', '--overwrite-mcss'], returncode=0)

In [4]:
# Cell C – Toeplitz covariance, p=100, t=1k
# Same invocation shape as the identity-covariance cell; only the scenario id
# differs.
toeplitz_cmd = [PYTHON, str(SCRIPT)]
toeplitz_cmd += ["--scenario", "tgt_gt_toeplitz_p100_t1000"]
toeplitz_cmd += ["--b", "50"]
toeplitz_cmd += ["--overwrite-mcss"]
# check=True raises CalledProcessError on a non-zero exit status. The returned
# CompletedProcess is the cell's last expression, so the notebook displays it.
subprocess.run(toeplitz_cmd, check=True)

[scenario] tgt_gt_toeplitz_p100_t1000  (t>p, toeplitz)
  [run]  tgt_gt_toeplitz_p100_t1000 @ damping0.50 → b=50, train_frac=0.8
  [run]  tgt_gt_toeplitz_p100_t1000 @ damping0.85 → b=50, train_frac=0.8


CompletedProcess(args=['/opt/anaconda3/bin/python', '/Users/aqibsyed/Documents/Research/Clustering/AP code implementations/scripts/run_synthetic_mcss.py', '--scenario', 'tgt_gt_toeplitz_p100_t1000', '--b', '50', '--overwrite-mcss'], returncode=0)

In [5]:
# Cell D – high-variance diagonal, p=100, t=1k
# Same invocation shape as the other scenario cells; only the scenario id
# differs.
var10_cmd = [PYTHON, str(SCRIPT)]
var10_cmd += ["--scenario", "tgt_gt_var10_p100_t1000"]
var10_cmd += ["--b", "50"]
var10_cmd += ["--overwrite-mcss"]
# check=True raises CalledProcessError on a non-zero exit status. The returned
# CompletedProcess is the cell's last expression, so the notebook displays it.
subprocess.run(var10_cmd, check=True)

[scenario] tgt_gt_var10_p100_t1000  (t>p, identity_var10)
  [run]  tgt_gt_var10_p100_t1000 @ damping0.50 → b=50, train_frac=0.8




  [run]  tgt_gt_var10_p100_t1000 @ damping0.85 → b=50, train_frac=0.8


CompletedProcess(args=['/opt/anaconda3/bin/python', '/Users/aqibsyed/Documents/Research/Clustering/AP code implementations/scripts/run_synthetic_mcss.py', '--scenario', 'tgt_gt_var10_p100_t1000', '--b', '50', '--overwrite-mcss'], returncode=0)

In [8]:
import pandas as pd
from pathlib import Path

# Repository root (absolute, machine-specific path) and the directory the
# per-scenario summary CSVs are read from further down in this cell.
REPO_ROOT = Path("/Users/aqibsyed/Documents/Research/Clustering/AP code implementations")
# NOTE(review): "Results" is capitalised here, while another cell in this file
# writes under "results" — confirm the casing on case-sensitive filesystems.
MCSS_ROOT = REPO_ROOT.joinpath("Results", "mcss")

def summarize(csv_path: Path) -> dict:
    """Summarise the test-split cluster counts stored in one summary CSV.

    Parameters
    ----------
    csv_path : Path
        CSV file with at least the columns ``test_n_clusters`` and
        ``test_converged``.

    Returns
    -------
    dict
        ``"E[K]"``       – mean of ``test_n_clusters``.
        ``"Var[K]"``     – sample variance (``ddof=1``; NaN for a single row).
        ``"Mode[K]"``    – most frequent cluster count; ties resolve to the
                           smallest value.
        ``"Mode freq"``  – fraction of rows (0–1) equal to the mode.
        ``"% Converge"`` – mean of ``test_converged`` expressed as a percentage.

    Raises
    ------
    ValueError
        If ``test_n_clusters`` has no non-missing values.
    """
    df = pd.read_csv(csv_path)
    k_series = df["test_n_clusters"]

    # Series.mode() returns the modal values in ascending order, so taking the
    # first entry gives a deterministic tie-break (the smallest K). This matches
    # the `.mode()[0]` convention used by the other summary cell in this notebook.
    modes = k_series.mode()
    if modes.empty:
        raise ValueError(f"{csv_path}: no 'test_n_clusters' values to summarise")
    mode_val = modes.iloc[0]

    # Share of all rows (missing values included in the denominator) at the mode.
    mode_freq = (k_series == mode_val).mean()

    return {
        "E[K]": float(k_series.mean()),
        "Var[K]": float(k_series.var(ddof=1)),
        "Mode[K]": int(mode_val),
        "Mode freq": float(mode_freq),
        "% Converge": float(df["test_converged"].mean() * 100.0),
    }

# Collect one row per (scenario, damping) pair. Each row carries the two
# identifying fields followed by the statistics returned by `summarize`.
records = []
for scenario in (
    "tgt_gt_iso_p100_t1000",
    "tgt_gt_toeplitz_p100_t1000",
    "tgt_gt_var10_p100_t1000",
):
    scenario_dir = MCSS_ROOT / scenario
    for damping in (0.50, 0.85):
        # Damping is formatted with two decimals to form the sub-directory name.
        csv_path = scenario_dir / f"damping{damping:0.2f}" / "mcss_summary.csv"
        row = {"scenario": scenario, "damping": damping}
        row.update(summarize(csv_path))
        records.append(row)

# Build the frame once from the list of records, then render it.
summary_df = pd.DataFrame(records)
display(summary_df)

Unnamed: 0,scenario,damping,E[K],Var[K],Mode[K],Mode freq,% Converge
0,tgt_gt_iso_p100_t1000,0.5,4.0,0.0,4,1.0,98.0
1,tgt_gt_iso_p100_t1000,0.85,3.98,0.02,4,0.98,100.0
2,tgt_gt_toeplitz_p100_t1000,0.5,4.78,0.583265,4,0.42,100.0
3,tgt_gt_toeplitz_p100_t1000,0.85,4.92,0.687347,5,0.44,100.0
4,tgt_gt_var10_p100_t1000,0.5,11.26,1.094286,11,0.44,100.0
5,tgt_gt_var10_p100_t1000,0.85,11.3,1.357143,11,0.4,100.0


## Initial runs with Siamak's data

In [1]:
from pathlib import Path
import sys
import numpy as np
import pandas as pd

# Resolve the parent of the current working directory to an absolute path and
# make sure it sits on the import search path (front position, added once).
REPO_ROOT = Path("..").resolve()
_repo_entry = str(REPO_ROOT)
if sys.path.count(_repo_entry) == 0:
    sys.path.insert(0, _repo_entry)

from apclust import run_mcss_ap

# Read the headerless sample CSV and hand it on as a float64 array.
DATA_ROOT = REPO_ROOT.joinpath("Data")
synthetic_path = DATA_ROOT.joinpath(
    "samples_original_1_for_GaussMix4_DiagCovar_highVar_2Kdim_n1K.csv"
)
synthetic = pd.read_csv(synthetic_path, header=None)
X = synthetic.to_numpy(dtype=np.float64)

# Output directory shared by both damping runs; created if it does not exist.
# NOTE(review): "results" is lower-case here, while another cell in this file
# reads from "Results" — confirm the casing on case-sensitive filesystems.
OUTPUT_BASE = REPO_ROOT.joinpath("results", "mcss", "synthetic_gaussmix4_highvar")
OUTPUT_BASE.mkdir(parents=True, exist_ok=True)

# One run per damping value. The damping is also encoded in `dataset_name`,
# which a later cell uses as the sub-directory name under OUTPUT_BASE.
for damping in (0.50, 0.85):
    ap_params = dict(
        damping=damping,
        max_iter=400,
        convergence_iter=50,
        random_state=0,
        affinity="euclidean",
    )
    run_mcss_ap(
        X,
        dataset_name=f"damping{damping:.2f}",
        out_dir=OUTPUT_BASE,
        b=200,
        train_frac=0.80,
        random_seed=0,
        ap_params=ap_params,
        metrics=("ari", "ami"),
    )



In [2]:
import pandas as pd

# Print a short text report of the train-split cluster counts for each damping.
for damping in (0.50, 0.85):
    summary = pd.read_csv(OUTPUT_BASE / f"damping{damping:.2f}" / "mcss_summary.csv")
    k_train = summary["train_n_clusters"]

    mean_k = k_train.mean()
    var_k = k_train.var()  # pandas default: sample variance (ddof=1)
    mode_k = k_train.mode()[0]  # first (smallest) modal value
    mode_pct = k_train.eq(mode_k).mean() * 100
    conv_pct = summary["train_converged"].mean() * 100

    # Emit the report in a single print; the trailing "\n" on the last line
    # leaves a blank line between the two damping blocks.
    report_lines = [
        f"damping={damping:.2f}",
        f"  E[K]      : {mean_k:.3f}",
        f"  Var[K]    : {var_k:.3f}",
        f"  Mode[K]   : {mode_k} ({mode_pct:.1f} %)",
        f"  %Converged: {conv_pct:.1f} %\n",
    ]
    print("\n".join(report_lines))

damping=0.50
  E[K]      : 14.095
  Var[K]    : 0.840
  Mode[K]   : 14 (46.0 %)
  %Converged: 99.0 %

damping=0.85
  E[K]      : 14.140
  Var[K]    : 0.935
  Mode[K]   : 14 (46.0 %)
  %Converged: 100.0 %

