# Dataset Generation
# Normal

In [None]:
from sklearn.datasets import make_classification


def Normal_generate(n_samples, n_classes, feature_list):
    """Generate clean classification datasets and save them as ``.npy`` files.

    One dataset is produced per entry of ``feature_list`` using
    ``make_classification`` with ``random_state=0``. The feature matrix and
    the labels are written to ``./datasets/PCA_X_...`` and
    ``./datasets/PCA_Y_...`` (``np.save`` appends the ``.npy`` extension).

    Args:
        n_samples: Number of samples in each dataset.
        n_classes: Number of target classes.
        feature_list: Iterable of requested informative-feature counts. The
            saved matrix is wider than the requested count because extra
            columns are added for the redundant features.

    Returns:
        None. A failure for one feature count is printed and the loop
        continues with the next one.
    """
    import os

    import numpy as np  # the cell defining this function does not import numpy

    for num_features in feature_list:
        # make_classification requires n_classes * n_clusters_per_class
        # (2 by default) <= 2 ** n_informative. When the requested count is
        # too small for that, fall back to n_classes + 1 informative features.
        if n_classes * 2 > 2**num_features:
            num_informative = n_classes + 1
        else:
            num_informative = num_features

        # make_classification adds 2 redundant features by default, so at
        # least 2 extra columns are needed; with only 1 extra column (the
        # previous minimum) every num_informative <= 3 case raised ValueError.
        min_features = num_informative + max(2, num_informative // 2)

        try:
            X, y = make_classification(
                n_samples=n_samples,
                n_features=min_features,
                n_classes=n_classes,
                n_informative=num_informative,
                random_state=0,
            )
            Xfile_name = f"./datasets/PCA_X_sample_{n_samples}_class_{n_classes}_feature_{min_features}"
            Yfile_name = f"./datasets/PCA_Y_sample_{n_samples}_class_{n_classes}_feature_{min_features}"

            # np.save does not create missing directories.
            os.makedirs("./datasets", exist_ok=True)
            np.save(Xfile_name, X)
            np.save(Yfile_name, y)

            # Report what was created; X.shape shows the actual column count.
            print(f"Dataset with {num_features} features and {n_classes} classes:")
            print(f"X shape: {X.shape}, y shape: {y.shape}")
        except Exception as e:
            # Best effort: report the failure and move on to the next count.
            print(f"Error generating dataset with {num_features} features and {n_classes} classes: {e}")


# Noisy
This code incorporates the following aspects:

Feature and Class Generation: It uses make_classification to generate informative features for the requested number of classes (n_classes).
Noise Addition: Gaussian noise with a specific scale (noise_scale) is introduced to each feature using np.random.normal. The noise is then added to the original features using element-wise addition (X + noise).
Information Printing: It prints the shapes of the original (X) and noisy (X_noisy) datasets for verification.
Explanation:

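The loop iterates through the feature counts passed in feature_list (for example 3, 5, 10, 15, 20).
The noise_scale parameter controls the amount of noise added to the features. It is given as a percentage: the function divides it by 100 and uses the result as the standard deviation of the Gaussian noise. Higher values introduce stronger noise.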
Additional Notes:

You can explore different noise distributions like uniform noise using np.random.uniform for a wider range of noise characteristics.
Consider using libraries like matplotlib or seaborn to visualize these datasets with noise. Techniques like histograms or density plots can help visualize the distribution of noise in each feature.


In [None]:
from sklearn.datasets import make_classification
import numpy as np


# noise_scale should be an int percentage (e.g. 50); it is divided by 100 inside the function
def Noisy_generate(n_samples, n_classes, feature_list, noise_scale):
    """Generate classification datasets with additive Gaussian noise.

    One dataset is produced per entry of ``feature_list`` using
    ``make_classification`` with ``random_state=0``. Zero-mean Gaussian
    noise is then added to every feature value. The noisy matrix and the
    labels are written to ``./datasets/PCA_Noisy_X_...`` and
    ``./datasets/PCA_Noisy_Y_...`` (``np.save`` appends ``.npy``).

    Args:
        n_samples: Number of samples in each dataset.
        n_classes: Number of target classes.
        feature_list: Iterable of requested informative-feature counts. The
            saved matrix is wider than the requested count because extra
            columns are added for the redundant features.
        noise_scale: Noise level as a percentage; it is divided by 100 and
            used as the standard deviation of the noise.

    Returns:
        None. A failure for one feature count is printed and the loop
        continues with the next one.
    """
    import os

    noise_scale = noise_scale / 100

    for num_features in feature_list:
        # make_classification requires n_classes * n_clusters_per_class
        # (2 by default) <= 2 ** n_informative. When the requested count is
        # too small for that, fall back to n_classes + 1 informative features.
        if n_classes * 2 > 2**num_features:
            num_informative = n_classes + 1
        else:
            num_informative = num_features

        # make_classification adds 2 redundant features by default, so at
        # least 2 extra columns are needed; with only 1 extra column (the
        # previous minimum) every num_informative <= 3 case raised ValueError.
        min_features = num_informative + max(2, num_informative // 2)

        try:
            X, y = make_classification(
                n_samples=n_samples,
                n_features=min_features,
                n_classes=n_classes,
                n_informative=num_informative,
                random_state=0,
            )

            # The noise comes from NumPy's global RNG, so it is not fixed by
            # random_state; seed np.random beforehand for reproducible files.
            noise = np.random.normal(scale=noise_scale, size=X.shape)
            X_noisy = X + noise

            # NOTE(review): the name embeds the divided value (e.g. 0.5) but
            # labels it "_percent". Kept unchanged so existing loaders still
            # find the files.
            Xfile_name = f"./datasets/PCA_Noisy_X_sample_{n_samples}_class_{n_classes}_feature_{min_features}_noise_scale_{noise_scale}_percent"
            Yfile_name = f"./datasets/PCA_Noisy_Y_sample_{n_samples}_class_{n_classes}_feature_{min_features}_noise_scale_{noise_scale}_percent"

            # np.save does not create missing directories.
            os.makedirs("./datasets", exist_ok=True)
            np.save(Xfile_name, X_noisy)
            np.save(Yfile_name, y)

            # Report what was created; X.shape shows the actual column count.
            print(f"Dataset with {num_features} features and {n_classes} classes:")
            print(f"X shape: {X.shape}, X_noisy shape: {X_noisy.shape}")
        except Exception as e:
            # Best effort: report the failure and move on to the next count.
            print(f"Error generating dataset with {num_features} features and {n_classes} classes: {e}")


# Outlier
Feature Creation: It uses make_classification to generate informative features for the requested number of classes (n_classes).
Outlier Introduction:
It calculates the number of outliers per class based on outlier_fraction (given as a percentage of the samples). Then, it randomly selects that many sample indices from each class (the classes are enumerated with np.unique(y)) to introduce outliers.
Outlier Scaling: The features at outlier indices are scaled by outlier_scale to create extreme values.
Information Printing: It prints the shapes of the feature data (X) and class labels (y) for each dataset.
Explanation:

make_classification is better suited for datasets with multiple classes compared to make_blobs.
The loop iterates through the feature counts passed in feature_list (for example 3, 5, 10, 15, 20).
np.unique(y) ensures outliers are introduced for every class present in the labels.
Consider using libraries like matplotlib or seaborn for visualizing these datasets with outliers.
This code provides a foundation for creating datasets with controlled features, classes, and outliers for your machine learning experiments.



In [None]:
#make outlier
from sklearn.datasets import make_classification
import numpy as np

# # Define parameters
# n_samples = 500
# outlier_fraction should be an int percentage (e.g. 10); it is divided by 100 inside the function
# outlier_scale = 5
def Outlier_generate(n_samples, n_classes, feature_list, outlier_fraction, outlier_scale):
    """Generate classification datasets in which some samples are outliers.

    One dataset is produced per entry of ``feature_list`` using
    ``make_classification`` with ``random_state=0``. The same number of
    samples is then picked at random from every class and their feature
    rows are multiplied by ``outlier_scale``. The matrix and the labels are
    written to ``./datasets/PCA_Outlier_X_...`` and
    ``./datasets/PCA_Outlier_Y_...`` (``np.save`` appends ``.npy``).

    Args:
        n_samples: Number of samples in each dataset.
        n_classes: Number of target classes.
        feature_list: Iterable of requested informative-feature counts. The
            saved matrix is wider than the requested count because extra
            columns are added for the redundant features.
        outlier_fraction: Share of samples to turn into outliers, as a
            percentage; it is divided by 100 inside the function.
        outlier_scale: Factor applied to the feature rows of the outliers.

    Returns:
        None. A failure for one feature count is printed and the loop
        continues with the next one.
    """
    import os

    outlier_fraction = outlier_fraction / 100
    # Does not depend on the feature count, so compute it once.
    num_outliers_per_class = int(n_samples * outlier_fraction / n_classes)

    for num_features in feature_list:
        # make_classification requires n_classes * n_clusters_per_class
        # (2 by default) <= 2 ** n_informative. When the requested count is
        # too small for that, fall back to n_classes + 1 informative features.
        if n_classes * 2 > 2**num_features:
            num_informative = n_classes + 1
        else:
            num_informative = num_features

        # make_classification adds 2 redundant features by default, so at
        # least 2 extra columns are needed; with only 1 extra column (the
        # previous minimum) every num_informative <= 3 case raised ValueError.
        min_features = num_informative + max(2, num_informative // 2)

        try:
            X, y = make_classification(
                n_samples=n_samples,
                n_features=min_features,
                n_classes=n_classes,
                n_informative=num_informative,
                random_state=0,
            )

            # Pick outlier rows class by class so every class gets some.
            # The draw uses NumPy's global RNG, so it is not fixed by
            # random_state.
            outlier_indices = []
            for class_label in np.unique(y):
                class_indices = np.where(y == class_label)[0]
                # Sampling without replacement cannot draw more rows than
                # the class holds; clamp instead of failing the dataset.
                n_pick = min(num_outliers_per_class, len(class_indices))
                outlier_indices.extend(np.random.choice(class_indices, n_pick, replace=False))

            # Scale the selected rows to create extreme values.
            X[outlier_indices] = X[outlier_indices] * outlier_scale

            # NOTE(review): the name embeds the divided value (e.g. 0.1) but
            # labels it "_percent". Kept unchanged so existing loaders still
            # find the files.
            Xfile_name = f"./datasets/PCA_Outlier_X_sample_{n_samples}_class_{n_classes}_feature_{min_features}_outlierfraction_{outlier_fraction}_percent_outlier_scale{outlier_scale}"
            Yfile_name = f"./datasets/PCA_Outlier_Y_sample_{n_samples}_class_{n_classes}_feature_{min_features}_outlierfraction_{outlier_fraction}_percent_outlier_scale{outlier_scale}"

            # np.save does not create missing directories.
            os.makedirs("./datasets", exist_ok=True)
            np.save(Xfile_name, X)
            np.save(Yfile_name, y)

            # Report what was created; X.shape shows the actual column count.
            print(f"Dataset with {num_features} features and {n_classes} classes:")
            print(f"X shape: {X.shape}, y shape: {y.shape}")
        except Exception as e:
            # Best effort: report the failure and move on to the next count.
            print(f"Error generating dataset with {num_features} features and {n_classes} classes: {e}")



# Missing values
This code incorporates the following aspects:

Feature and Class Generation: It uses make_classification to generate informative features for the requested number of classes (n_classes).
Missing Value Introduction: A boolean mask (missing_indices) is created using np.random.choice to randomly select elements where missing values will be introduced. The missing_ratio parameter controls the proportion of missing values.
Missing Value Assignment: Elements at those indices in X are set to np.nan to represent missing values.
Information Printing: It prints the shapes of the feature data (X) and class labels (y) for each dataset.
Explanation:

The loop iterates through the feature counts passed in feature_list (for example 3, 5, 10, 15, 20).
missing_ratio defines the percentage of missing values you want to introduce in each dataset (for example, 10 means 10%). Adjust this value as needed.
Additional Notes:

This code assumes you want missing values to be randomly distributed across all features. You can modify the code to introduce them in specific features or patterns if desired.
Consider using libraries like pandas.DataFrame to represent the data with missing values. This allows for easier handling and exploration of missing data using pandas functionalities.


In [None]:
from sklearn.datasets import make_classification
import numpy as np

# Define parameters
# n_samples = 500
# missing_ratio = 10  # Percentage of values to replace with NaN (divided by 100 inside the function)
def Missingvalue_generate(n_samples, n_classes, feature_list, missing_ratio, save=False):
    """Generate classification datasets with randomly missing values.

    One dataset is produced per entry of ``feature_list`` using
    ``make_classification`` with ``random_state=0``. Each entry of the
    feature matrix is then independently replaced by ``np.nan`` with
    probability ``missing_ratio / 100``. The matrix and the labels are
    written to ``./datasets/PCA_Missing_X_...`` and
    ``./datasets/PCA_Missing_Y_...`` (``np.save`` appends ``.npy``).

    Args:
        n_samples: Number of samples in each dataset.
        n_classes: Number of target classes.
        feature_list: Iterable of requested informative-feature counts. The
            saved matrix is wider than the requested count because extra
            columns are added for the redundant features.
        missing_ratio: Share of entries to blank out, as a percentage.
        save: Accepted for backward compatibility but not used; the files
            are always written.

    Returns:
        None. A failure for one feature count is printed and the loop
        continues with the next one.
    """
    import os

    # Keep the value as given (a percentage) for the file names.
    missing_ratio_str = missing_ratio
    missing_ratio = missing_ratio / 100

    for num_features in feature_list:
        # make_classification requires n_classes * n_clusters_per_class
        # (2 by default) <= 2 ** n_informative. When the requested count is
        # too small for that, fall back to n_classes + 1 informative features.
        if n_classes * 2 > 2**num_features:
            num_informative = n_classes + 1
        else:
            num_informative = num_features

        # make_classification adds 2 redundant features by default, so at
        # least 2 extra columns are needed; with only 1 extra column (the
        # previous minimum) every num_informative <= 3 case raised ValueError.
        min_features = num_informative + max(2, num_informative // 2)

        try:
            X, y = make_classification(
                n_samples=n_samples,
                n_features=min_features,
                n_classes=n_classes,
                n_informative=num_informative,
                random_state=0,
            )

            # Boolean mask, True where a value is dropped. It is drawn from
            # NumPy's global RNG, so it is not fixed by random_state.
            missing_indices = np.random.choice(
                [True, False],
                size=(n_samples, min_features),
                p=[missing_ratio, 1 - missing_ratio],
            )
            X[missing_indices] = np.nan

            Xfile_name = f"./datasets/PCA_Missing_X_sample_{n_samples}_class_{n_classes}_feature_{min_features}_missratio_{missing_ratio_str}_percent"
            Yfile_name = f"./datasets/PCA_Missing_Y_sample_{n_samples}_class_{n_classes}_feature_{min_features}_missratio_{missing_ratio_str}_percent"

            # np.save does not create missing directories.
            os.makedirs("./datasets", exist_ok=True)
            np.save(Xfile_name, X)
            np.save(Yfile_name, y)

            # Report what was created; X.shape shows the actual column count.
            print(f"Dataset with {num_features} features and {n_classes} classes:")
            print(f"X shape: {X.shape}, y shape: {y.shape}")
        except Exception as e:
            # Best effort: report the failure and move on to the next count.
            print(f"Error generating dataset with {num_features} features and {n_classes} classes: {e}")


In [None]:
# Usage example: build the clean datasets for every combination of sample
# count and class count. Uncomment one of the other calls to produce the
# noisy, outlier or missing-value variants as well.
feature_counts = [3, 5, 10, 15, 20]
for nsample in (100, 500):  # number of samples
    for nclass in (3, 5):  # number of classes
        Normal_generate(nsample, nclass, feature_list=feature_counts)
        # Noisy_generate(nsample, nclass, feature_list=feature_counts, noise_scale=50)
        # Outlier_generate(nsample, nclass, feature_list=feature_counts, outlier_fraction=10, outlier_scale=5)
        # Missingvalue_generate(n_samples=nsample, n_classes=nclass, feature_list=feature_counts, missing_ratio=10, save=False)