In [1]:
## Check whether the data we generate matches the synthetic data we were given.

In [1]:
from pathlib import Path
import json
import sys
import numpy as np
import pandas as pd

REPO_ROOT = Path("..").resolve()
if str(REPO_ROOT) not in sys.path:
    sys.path.append(str(REPO_ROOT))

from apclust.synthetic import generate_gaussian_mixture

In [2]:
# Scenario: 4-component diagonal Gaussian with high variance.
n_samp = 1000
n_feat = 2000
comp_means = np.array([1.0, 4.0, 7.0, 10.0], dtype=np.float64)
diag_var = 10.0

noise_seed = 123        # controls Gaussian noise draws (NumPy RNG)
label_seed = 456        # independent RNG for component memberships

# One scalar variance per component. The length is taken from `comp_means`
# rather than a literal 4, so adding or removing a component mean cannot
# leave the covariance list out of sync.
covariance_spec = [diag_var] * len(comp_means)

data_matrix, component_labels = generate_gaussian_mixture(
    n_samples=n_samp,
    n_features=n_feat,
    means=comp_means,
    covariance=covariance_spec,
    noise_seed=noise_seed,
    label_seed=label_seed,
    labels=None,        # let the generator sample memberships
    label_probs=None,   # uniform 1/4 weights
)

# DataFrame view of the generated matrix, used for display and CSV export below.
data_df = pd.DataFrame(data_matrix)

In [3]:
# Preview the regenerated matrix, then write it out as a header-less CSV.
display(data_df.head())

# Build the output path from REPO_ROOT (defined in the import cell) instead of a
# machine-specific absolute path, so the notebook runs from any checkout.
# NOTE(review): assumes the repository's "Data" folder is the directory the
# previous absolute path pointed to — confirm.
out_path = REPO_ROOT / "Data" / "generated_gaussmix4_highvar.csv"
out_path.parent.mkdir(parents=True, exist_ok=True)  # make sure the target folder exists
data_df.to_csv(out_path, index=False, header=False)
print(f"Wrote {out_path}")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,3.566935,10.153883,7.894857,2.236678,5.170305,12.222301,-0.673834,5.643659,11.003242,4.259126,...,3.583434,7.508776,5.09449,9.211471,12.842161,9.674342,3.458494,5.863804,1.909697,7.042912
1,4.389413,6.200911,13.466668,12.722838,5.192177,8.58507,11.465675,11.241173,4.854446,10.822223,...,9.673835,4.430239,9.414451,11.594031,14.283516,7.128803,4.250838,9.611885,15.674663,8.98646
2,2.509696,7.428138,2.800791,2.854389,1.846265,4.364319,3.944424,4.514298,4.325277,-0.925186,...,6.298109,3.662084,8.076713,8.609882,7.090236,3.931976,9.40684,8.189756,2.574563,4.76139
3,10.717542,2.415108,8.583749,10.624184,11.983903,13.495673,11.345695,6.646214,9.523853,15.783607,...,14.074797,16.813833,13.219443,7.911068,15.576534,12.398729,11.841915,4.91811,8.764305,13.618658
4,5.978837,8.584808,7.429854,8.255467,6.155714,10.108626,12.503091,2.053408,6.115354,9.349437,...,3.645647,2.936224,10.387606,13.0429,9.052861,5.109219,2.579352,5.506562,8.840333,10.069347


Wrote /Users/aqibsyed/Documents/Research/Clustering/AP code implementations/Data/generated_gaussmix4_highvar.csv


In [4]:
# Load the reference CSV we were given and wrap it in a DataFrame so it can be
# inspected next to the regenerated matrix.
# The path is built from REPO_ROOT (defined in the import cell) rather than a
# machine-specific absolute path.
# NOTE(review): assumes the reference file lives in the repository's "Data"
# folder, as the previous absolute path suggested — confirm.
data_csv = REPO_ROOT / "Data" / "samples_original_1_for_GaussMix4_DiagCovar_highVar_2Kdim_n1K.csv"
original = pd.read_csv(data_csv, header=None).to_numpy()
original_df = pd.DataFrame(original)

display(
    original_df.head(),   # first few rows of the CSV version
    data_df.head(),       # first few rows of the regenerated version
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,-2.433065,4.153883,1.894857,-3.763322,-0.829695,6.222301,-6.673834,-0.356341,5.003242,-1.740874,...,-2.416566,1.508776,-0.90551,3.211471,6.842161,3.674342,-2.541506,-0.136196,-4.090303,1.042912
1,-4.610587,-2.799089,4.466668,3.722838,-3.807823,-0.41493,2.465675,2.241173,-4.145554,1.822223,...,0.673835,-4.569761,0.414451,2.594031,5.283516,-1.871197,-4.749162,0.611885,6.674663,-0.01354
2,2.509696,7.428138,2.800791,2.854389,1.846265,4.364319,3.944424,4.514298,4.325277,-0.925186,...,6.298109,3.662084,8.076713,8.609882,7.090236,3.931976,9.40684,8.189756,2.574563,4.76139
3,1.717542,-6.584892,-0.416251,1.624184,2.983903,4.495673,2.345695,-2.353786,0.523853,6.783607,...,5.074797,7.813833,4.219443,-1.088932,6.576534,3.398729,2.841915,-4.08189,-0.235695,4.618658
4,8.978837,11.584808,10.429854,11.255467,9.155714,13.108626,15.503091,5.053408,9.115354,12.349437,...,6.645647,5.936224,13.387606,16.0429,12.052861,8.109219,5.579352,8.506562,11.840333,13.069347


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,3.566935,10.153883,7.894857,2.236678,5.170305,12.222301,-0.673834,5.643659,11.003242,4.259126,...,3.583434,7.508776,5.09449,9.211471,12.842161,9.674342,3.458494,5.863804,1.909697,7.042912
1,4.389413,6.200911,13.466668,12.722838,5.192177,8.58507,11.465675,11.241173,4.854446,10.822223,...,9.673835,4.430239,9.414451,11.594031,14.283516,7.128803,4.250838,9.611885,15.674663,8.98646
2,2.509696,7.428138,2.800791,2.854389,1.846265,4.364319,3.944424,4.514298,4.325277,-0.925186,...,6.298109,3.662084,8.076713,8.609882,7.090236,3.931976,9.40684,8.189756,2.574563,4.76139
3,10.717542,2.415108,8.583749,10.624184,11.983903,13.495673,11.345695,6.646214,9.523853,15.783607,...,14.074797,16.813833,13.219443,7.911068,15.576534,12.398729,11.841915,4.91811,8.764305,13.618658
4,5.978837,8.584808,7.429854,8.255467,6.155714,10.108626,12.503091,2.053408,6.115354,9.349437,...,3.645647,2.936224,10.387606,13.0429,9.052861,5.109219,2.579352,5.506562,8.840333,10.069347


In [6]:
# Element-wise agreement between the reference and regenerated matrices.
# np.allclose uses |a - b| <= atol + rtol * |b| with rtol defaulting to 1e-5, which
# would swamp the 1e-12 absolute tolerance named in the label. Setting rtol to zero
# makes the check a pure absolute comparison.
print("Allclose (1e-12):", np.allclose(original, data_matrix, rtol=0.0, atol=1e-12))
print("Max abs diff:", np.abs(original - data_matrix).max())

Allclose (1e-12): False
Max abs diff: 9.00000000000004


In [7]:
# scipy is only needed for the distributional comparisons below; fail with an
# actionable message if it is missing from the environment.
try:
    from scipy.stats import energy_distance, ks_2samp
except ImportError as exc:
    raise ImportError("Please `pip install scipy` in this environment to run the comparison code.") from exc

# The column-wise comparisons index both matrices with the same indices,
# so their shapes must agree before going any further.
shapes_agree = original.shape == data_matrix.shape
if not shapes_agree:
    raise ValueError(f"Shape mismatch: original {original.shape}, regenerated {data_matrix.shape}")

n_samples, n_features = original.shape

# Fixed-seed generator used to pick the features/rows that get spot-checked.
rng = np.random.default_rng(0)

In [8]:
# Spot-check ten distinct, randomly chosen feature columns: compare the sample
# mean and variance of each column and run a two-sample KS test between the
# reference column and the regenerated column.
feature_ids = rng.choice(n_features, size=10, replace=False)
records = []

for j in feature_ids:
    x, y = original[:, j], data_matrix[:, j]
    stat, pval = ks_2samp(x, y)

    # Compute each moment once and reuse it for the difference columns.
    mean_x, mean_y = x.mean(), y.mean()
    var_x, var_y = x.var(ddof=1), y.var(ddof=1)  # ddof=1 -> sample variance

    row = dict(
        feature=int(j),
        mean_original=mean_x,
        mean_regenerated=mean_y,
        mean_diff=mean_x - mean_y,
        var_original=var_x,
        var_regenerated=var_y,
        var_diff=var_x - var_y,
        ks_stat=stat,
        ks_pvalue=pval,
    )
    records.append(row)

# One row per sampled feature, ordered by feature index for readability.
feature_summary = pd.DataFrame(records)
feature_summary = feature_summary.set_index("feature").sort_index()
display(feature_summary)

Unnamed: 0_level_0,mean_original,mean_regenerated,mean_diff,var_original,var_regenerated,var_diff,ks_stat,ks_pvalue
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
33,5.378637,5.414637,-0.036,22.53676,22.528031,0.008729,0.021,0.980263
81,5.17389,5.20989,-0.036,21.763075,19.555841,2.207234,0.038,0.46596
150,5.299081,5.335081,-0.036,20.968827,21.507187,-0.538359,0.025,0.913689
350,5.351427,5.387427,-0.036,20.757008,19.882643,0.874366,0.022,0.969001
537,5.353282,5.389282,-0.036,22.362565,20.871023,1.491542,0.028,0.828219
614,5.234721,5.270721,-0.036,20.159098,20.236994,-0.077896,0.017,0.998723
1018,5.370819,5.406819,-0.036,21.769612,19.691478,2.078134,0.041,0.37012
1268,5.433197,5.469197,-0.036,20.108618,20.331562,-0.222944,0.018,0.99696
1626,5.340529,5.376529,-0.036,20.374334,22.31035,-1.936016,0.031,0.722825
1693,5.491836,5.527836,-0.036,20.704099,19.691046,1.013054,0.024,0.93577


In [10]:
# Pick a random sub-block (at most 200 rows × 50 columns) and compare the
# flattened values of both matrices on that same block via the energy distance.
row_idx = rng.choice(n_samples, size=min(200, n_samples), replace=False)
col_idx = rng.choice(n_features, size=min(50, n_features), replace=False)

block_index = np.ix_(row_idx, col_idx)  # open mesh selecting rows × columns
orig_sub = original[block_index].ravel()
regen_sub = data_matrix[block_index].ravel()
energy_dist = energy_distance(orig_sub, regen_sub)

print(f"Energy distance on flattened {len(row_idx)}×{len(col_idx)} subsample: {energy_dist:.4f}")

# Report how many regenerated samples landed in each component, when the
# labels from the generation cell are still available in this kernel.
if "component_labels" not in globals():
    print("`component_labels` not in scope; skipping regenerated mixture proportion check.")
else:
    unique, counts = np.unique(component_labels, return_counts=True)
    comp_df = pd.DataFrame(
        {
            "component": unique,
            "count": counts,
            "fraction": counts / counts.sum(),
        }
    )
    display(comp_df)

Energy distance on flattened 200×50 subsample: 0.0398


Unnamed: 0,component,count,fraction
0,0,261,0.261
1,1,266,0.266
2,2,230,0.23
3,3,243,0.243


In [None]:
# from pathlib import Path
# import sys
# import numpy as np
# import pandas as pd

# REPO_ROOT = Path("..").resolve()
# if str(REPO_ROOT) not in sys.path:
#     sys.path.append(str(REPO_ROOT))

# from apclust import load_matrix, run_mcss_ap

# from apclust import generate_samples_original_1

# DATA_ROOT = REPO_ROOT / "Data"

# expected = pd.read_csv(DATA_ROOT / "samples_original_1.csv", header=None).to_numpy()
# generated, labels = generate_samples_original_1()

# print("Shapes match?", generated.shape == expected.shape)
# print("Entries match exactly?", np.array_equal(generated, expected))
# print("Label counts:", np.bincount(labels))

Shapes match? True
Entries match exactly? False
Label counts: [31 30 19 20]


In [None]:
from pathlib import Path
import json
import sys

import numpy as np
import pandas as pd

REPO_ROOT = Path("..").resolve()
if str(REPO_ROOT) not in sys.path:
    sys.path.append(str(REPO_ROOT))

from apclust import generate_gaussian_mixture, generate_samples_original_1, toeplitz_covariance

# Output locations: every synthetic case is written under <repo>/Data/Synthetic.
DATA_ROOT = REPO_ROOT.joinpath("Data")
SYNTHETIC_ROOT = DATA_ROOT.joinpath("Synthetic")
# Create the folder (and any missing parents) up front; harmless if it already exists.
SYNTHETIC_ROOT.mkdir(parents=True, exist_ok=True)

In [None]:
def render_covariance(spec: dict, n_features: int):
    """Turn a declarative covariance spec into the value handed to the generator.

    Parameters
    ----------
    spec : dict
        Must contain ``"kind"``. For ``"identity"`` an optional ``"variance"``
        (default 1.0) is read; for ``"toeplitz"`` an optional ``"profile"`` is
        forwarded to ``toeplitz_covariance``.
    n_features : int
        Number of features; only used by the Toeplitz branch.

    Returns
    -------
    The scalar variance for ``"identity"``, or whatever
    ``toeplitz_covariance(n_features, profile=...)`` returns for ``"toeplitz"``.

    Raises
    ------
    KeyError
        If ``spec`` has no ``"kind"`` entry.
    ValueError
        If the identity variance is not positive, or the kind is not supported.
    """
    kind = spec["kind"]

    if kind == "toeplitz":
        profile = spec.get("profile")
        return toeplitz_covariance(n_features, profile=profile)

    if kind != "identity":
        raise ValueError(f"Unsupported covariance spec: {kind!r}")

    # Identity case: a single scalar variance shared by every feature.
    variance = spec.get("variance", 1.0)
    if variance <= 0:
        raise ValueError("Identity variance must be positive.")
    return variance


def save_synthetic_case(
    *,
    name: str,
    values: np.ndarray,
    labels: np.ndarray,
    meta: dict,
    root: Path = SYNTHETIC_ROOT,
):
    """Persist one synthetic case as ``<root>/<name>/{matrix.csv, labels.npy, metadata.json}``.

    Parameters
    ----------
    name : str
        Sub-directory name for this case (created if missing).
    values : np.ndarray
        Data matrix written to ``matrix.csv`` as comma-separated text; its first
        two shape entries are recorded as ``n_samples`` and ``n_features``.
    labels : np.ndarray
        Label array written to ``labels.npy`` (pickling disabled) and summarised
        as per-label counts in the metadata.
    meta : dict
        Extra metadata merged on top of the computed fields; its keys win on clash.
    root : Path
        Parent directory for all cases.
    """
    target_dir = root / name
    target_dir.mkdir(parents=True, exist_ok=True)

    # Raw artefacts first: the matrix as text, the labels as a binary array.
    np.savetxt(target_dir / "matrix.csv", values, delimiter=",")
    np.save(target_dir / "labels.npy", labels, allow_pickle=False)

    # Cast numpy scalars to plain ints so the summary is JSON-serialisable.
    label_ids, label_freqs = np.unique(labels, return_counts=True)
    label_counts = {}
    for label_id, freq in zip(label_ids, label_freqs):
        label_counts[int(label_id)] = int(freq)

    computed = {
        "n_samples": int(values.shape[0]),
        "n_features": int(values.shape[1]),
        "label_counts": label_counts,
    }
    metadata = {**computed, **meta}

    with open(target_dir / "metadata.json", "w") as fh:
        json.dump(metadata, fh, indent=2)

In [None]:
MEAN_LEVELS = [1.0, 4.0, 7.0, 10.0]

# Baseline: reproduce the legacy CSV
SCENARIOS = [
    {"name": "legacy_samples_original_1", "generator": "legacy_helper"},
]

# Every remaining scenario is a (p, t) grid point inside one of the families below.
# Each row: (name prefix, feature counts p, sample counts t,
#            covariance spec, noise-seed offset, label-seed offset)
_SCENARIO_FAMILIES = [
    # t > p, iid variance = 1
    ("tgt_gt_iso", (100, 200, 400), (10_000, 5_000, 1_000), {"kind": "identity", "variance": 1.0}, 100, 200),
    # High-variance iid (σ² = 10)
    ("tgt_gt_var10", (100, 200, 400), (10_000, 5_000, 1_000), {"kind": "identity", "variance": 10.0}, 300, 400),
    # Toeplitz correlation
    ("tgt_gt_toeplitz", (100, 200, 400), (10_000, 5_000, 1_000), {"kind": "toeplitz"}, 500, 600),
    # t ≤ p families (you can extend these as needed)
    ("t_leq_iso", (1_000, 2_000, 4_000), (100, 200, 1_000), {"kind": "identity", "variance": 1.0}, 700, 800),
]

for prefix, feature_counts, sample_counts, cov_spec, noise_base, label_base in _SCENARIO_FAMILIES:
    for p in feature_counts:
        for t in sample_counts:
            SCENARIOS.append(
                {
                    "name": f"{prefix}_p{p}_t{t}",
                    "n_samples": t,
                    "n_features": p,
                    "means": MEAN_LEVELS,
                    # fresh copy so each scenario owns its covariance dict
                    "covariance": dict(cov_spec),
                    # seeds are the family offset plus p + t
                    "noise_seed": noise_base + p + t,
                    "label_seed": label_base + p + t,
                }
            )

In [None]:
# Materialise every scenario: generate the matrix and labels, then write them
# (plus a metadata record of how they were produced) under SYNTHETIC_ROOT.
for cfg in SCENARIOS:
    name = cfg["name"]
    uses_legacy_helper = cfg.get("generator") == "legacy_helper"

    if not uses_legacy_helper:
        n_samples = cfg["n_samples"]
        n_features = cfg["n_features"]
        cov = render_covariance(cfg["covariance"], n_features)

        # Optional keys fall back to None (or the shared mean levels) when absent.
        mixture_kwargs = dict(
            n_samples=n_samples,
            n_features=n_features,
            means=cfg.get("means", MEAN_LEVELS),
            covariance=cov,
            noise_seed=cfg.get("noise_seed"),
            label_seed=cfg.get("label_seed"),
            labels=cfg.get("labels"),
            label_probs=cfg.get("label_probs"),
        )
        values, labels = generate_gaussian_mixture(**mixture_kwargs)

        # Record the seeds and covariance settings alongside the data.
        meta = {
            "generator": "generate_gaussian_mixture",
            "noise_seed": cfg.get("noise_seed"),
            "label_seed": cfg.get("label_seed"),
            "covariance_kind": cfg["covariance"]["kind"],
            "variance": cfg["covariance"].get("variance"),
        }
    else:
        # The legacy baseline takes no configuration.
        values, labels = generate_samples_original_1()
        meta = {"generator": "generate_samples_original_1"}

    save_synthetic_case(name=name, values=values, labels=labels, meta=meta)
    print(f"Wrote {name} → {values.shape[0]}×{values.shape[1]}")