In [19]:
from pyod.utils.data import generate_data_clusters
import numpy as np
from joblib import dump, load

# Total number of samples and the feature dimensionalities to generate
n_samples = 5000
n_features = [100, 500, 1000, 2000, 5000]

# 60/40 train/test split. The test size is the remainder rather than a second
# truncated float product, so the two parts always add up to n_samples.
n_train = int(n_samples * 0.6)  # 60% of samples for training
n_test = n_samples - n_train  # remaining 40% of samples for testing

def generate_groups(n_feature, low=6, high=20):
    """Randomly split ``n_feature`` features into consecutive group sizes.

    Sizes are drawn one at a time with ``np.random.randint(low, high)`` (the
    global NumPy random state) until every feature has been assigned. The
    final size is clipped so that the sizes sum exactly to ``n_feature``.

    Parameters
    ----------
    n_feature : int
        Total number of features to distribute. Values <= 0 give ``[]``.
    low : int, optional
        Smallest size that can be drawn (inclusive). Must be at least 1.
    high : int, optional
        Upper bound of the draw. ``np.random.randint`` excludes this value,
        so the defaults produce sizes in 6..19.

    Returns
    -------
    list of int
        Group sizes in the order they were drawn. Every size except possibly
        the last lies in ``[low, high)``; the last one only holds the
        leftover features and may therefore be smaller than ``low``.

    Raises
    ------
    ValueError
        If ``low < 1`` or ``high <= low``.
    """
    # A non-positive draw would never consume features, so the loop below
    # relies on every drawn size being at least 1.
    if low < 1 or high <= low:
        raise ValueError(
            f"expected 1 <= low < high, got low={low}, high={high}"
        )

    groups = []
    remaining = n_feature
    while remaining > 0:
        # Clip the draw so the last group never overshoots the total.
        size = min(int(np.random.randint(low, high)), remaining)
        groups.append(size)
        remaining -= size

    return groups

# Random seed for shuffling features
random_seed = 42  # You can choose any seed value

# Generate and store data for different feature dimensions.
# For every dimensionality the feature columns are split into random-sized
# groups, each group is filled with its own clustered data set, and the
# result is written to Train{n}_6_20.dat / Test{n}_6_20.dat as a tuple
# (X, y, y_label).
for n_feature in n_features:
    print(f"Processing {n_feature} features...")

    # Feature matrices, filled one group of columns at a time below
    X_train = np.zeros((n_train, n_feature))
    X_test = np.zeros((n_test, n_feature))

    # Per-sample count of groups in which the sample is an outlier;
    # turned into a 0/1 label after the loop
    y_train = np.zeros((n_train, 1))
    y_test = np.zeros((n_test, 1))

    # Generate feature combinations (sizes of consecutive column groups)
    groups = generate_groups(n_feature)

    # Label matrices with one column per group
    y_train_label = np.zeros((n_train, len(groups)))
    y_test_label = np.zeros((n_test, len(groups)))

    # Every group gets an equal share of the 0.18 contamination value
    group_contamination = 0.18 / len(groups)

    i = 0  # first column of the current group
    for k, dimension in enumerate(groups):
        # Generate clustered data (2 clusters) for this group's columns
        X_train_group, X_test_group, y_train_group, y_test_group = (
            generate_data_clusters(
                n_train, n_test, 2, dimension,
                contamination=group_contamination,
            )
        )

        # Fill feature data
        X_train[:, i:i + dimension] = X_train_group
        X_test[:, i:i + dimension] = X_test_group

        # Label for each group: store this group's own labels. (Previously
        # the running total was stored here, so column k accumulated the
        # labels of groups 0..k instead of describing group k alone.)
        y_train_label[:, k] = y_train_group.reshape(n_train)
        y_test_label[:, k] = y_test_group.reshape(n_test)

        # Overall label: accumulate across groups
        y_train = y_train + y_train_group.reshape(n_train, 1)
        y_test = y_test + y_test_group.reshape(n_test, 1)

        i = i + dimension

    # Shuffle features using the same random seed, with one permutation
    # applied to both train and test columns.
    # NOTE(review): the seed is set only here, after generate_groups() has
    # already drawn from np.random for this iteration, so it fixes the column
    # permutation but not the group sizes drawn above. It does reseed the
    # global state used by the next iteration. Presumably
    # generate_data_clusters draws from the same global state - confirm.
    np.random.seed(random_seed)
    permutation = np.random.permutation(n_feature)
    X_train = X_train[:, permutation]
    X_test = X_test[:, permutation]

    # Convert labels to binary (1 if outlier in any group, 0 otherwise)
    y_train = (y_train != 0) * 1
    y_test = (y_test != 0) * 1

    print('Outlier Perc', (np.sum(y_train) + np.sum(y_test)) / n_samples)

    # Store data
    Traindata = (X_train, y_train, y_train_label)
    Testdata = (X_test, y_test, y_test_label)

    joblibfile = f'Train{n_feature}_6_20.dat'
    dump(Traindata, joblibfile)

    joblibfile = f'Test{n_feature}_6_20.dat'
    dump(Testdata, joblibfile)

print("Data generation and storage completed.")


Processing 100 features...
Outlier Perc 0.1672
Processing 500 features...
Outlier Perc 0.1564
Processing 1000 features...
Outlier Perc 0.1526
Processing 2000 features...
Outlier Perc 0.1484
Processing 5000 features...
Outlier Perc 0.1496
Data generation and storage completed.
