---
#### Data generator

Parameters:
- N: Total number of data points
- V: Number of dimensions/features
- k: Number of clusters
- alpha: Controls cluster center spread (each center coordinate is drawn uniformly between alpha-1 and 1-alpha; alpha must not equal 1)
- nmin: Minimum points per cluster (requires N >= k * nmin)
- seed: Random seed for reproducibility
- sig_range: Tuple (min, max); each cluster's per-feature standard deviations are drawn uniformly from this interval

Returns:
- Nk: Array of cluster sizes
- R: List of `range` objects, one per cluster, giving the row indices of that cluster's points in X and y
- y: Cluster labels for each point
- X: Generated data (N x V array)
- cen: Cluster centers (k x V array)

In [None]:
def generdat(N, V, k, alpha, nmin, seed=None, sig_range=(0.05, 0.1)):
    """Generate N points in V dimensions grouped into k Gaussian clusters.

    Every cluster receives at least ``nmin`` points; whatever is left of N
    after that guarantee is spread over the clusters with an equal-probability
    multinomial draw (a single cluster simply takes all N points). Rows of the
    output are laid out cluster by cluster, in label order.

    Parameters
    ----------
    N : total number of points.
    V : number of features per point.
    k : number of clusters.
    alpha : spread control; each center coordinate is drawn uniformly between
        ``alpha - 1`` and ``1 - alpha``.
    nmin : minimum number of points per cluster.
    seed : when not None, passed to ``np.random.seed`` (this reseeds NumPy's
        global random state).
    sig_range : ``(min, max)`` interval from which each cluster's per-feature
        standard deviations are drawn uniformly.

    Returns
    -------
    Nk : array with the size of each cluster.
    R : list of ``range`` objects with the row indices of each cluster.
    y : integer label of every point.
    X : (N, V) array of generated points.
    cen : (k, V) array of cluster centers.

    Raises
    ------
    ValueError
        If ``N < k * nmin``, if ``k < 1``, or if ``alpha == 1``.
    """
    # Argument checks, performed in this order.
    if N < k * nmin:
        raise ValueError(f"N must be >= k * nmin. Got N={N}, k={k}, nmin={nmin}")
    if k < 1:
        raise ValueError("k must be at least 1")
    if alpha == 1:
        raise ValueError("alpha cannot be 1")

    if seed is not None:
        np.random.seed(seed)

    # Cluster sizes: the guaranteed floor plus a random share of the surplus.
    surplus = N - k * nmin
    if k == 1:
        Nk = np.array([N])
    else:
        Nk = np.ones(k, dtype=int) * nmin
        if surplus > 0:
            Nk = Nk + np.random.multinomial(surplus, np.ones(k) / k)

    # Centers are drawn before any per-cluster quantity so that the sequence
    # of random draws is: sizes, centers, then (sigma, noise) per cluster.
    unit_draws = np.random.rand(k, V)
    cen = (alpha - 1) + 2 * (1 - alpha) * unit_draws

    X = np.zeros((N, V))
    y = np.zeros(N, dtype=int)
    R = []

    sig_lo, sig_hi = sig_range
    offset = 0

    for label, (size, center) in enumerate(zip(Nk, cen)):
        # Contiguous block of rows owned by this cluster.
        rows = range(offset, offset + size)
        R.append(rows)
        block = slice(rows.start, rows.stop)
        y[block] = label

        # Per-feature standard deviations, then scaled noise around the center.
        spread = sig_lo + (sig_hi - sig_lo) * np.random.rand(V)
        X[block] = np.random.randn(size, V) * spread + center

        offset = rows.stop

    return Nk, R, y, X, cen

# Demo dataset: 500 points with 3 features in 4 clusters of at least 50 points
# each; alpha = 0.5 keeps every center coordinate between -0.5 and 0.5.
N, V, k = 500, 3, 4
alpha, nmin = 0.5, 50
Nk, R, y, X, cen = generdat(N, V, k, alpha=alpha, nmin=nmin, seed=101)
