In [None]:
#output file balanced

In [1]:
import sys

# Add the path of the desired numpy installation at the beginning of sys.path
sys.path.insert(0, '/vsc-hard-mounts/leuven-data/352/vsc35276/env_in_conda/lib/python3.9/site-packages')

# Now import numpy
import numpy as np

print("NumPy version:", np.__version__)

NumPy version: 1.23.0


In [12]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import os

# Function to generate synthetic data for two Gaussian distributions
def generate_data(mean, cov, n_samples, labels):
    data = []
    y = []
    for label, mu in zip(labels, mean):
        samples = np.random.multivariate_normal(mu, cov, n_samples)
        data.append(samples)
        y.append(np.full(n_samples, label))
    return np.vstack(data), np.hstack(y)

# Parameters for the Gaussian distributions
mean_true = [np.array([0, 0]), np.array([1, 1])]  # Means for classes -1 and +1
cov_true = np.array([[1, 0.5], [0.5, 1]])  # Shared covariance
n_samples_per_class = 100  # Samples per class

# Generate synthetic data
data, labels = generate_data(mean_true, cov_true, n_samples_per_class, labels=[-1, 1])

# Directory for saving plots
output_dir = "Paper_Output_balanced"
os.makedirs(output_dir, exist_ok=True)

# Iterate over different numbers of duplicates
for n_duplicates in range(0, 201, 20):
    # Duplicate random examples from both classes
    random_indices = np.random.choice(np.arange(len(labels)), size=n_duplicates, replace=True)
    data_with_duplicates = np.vstack([data, data[random_indices]])
    labels_with_duplicates = np.hstack([labels, labels[random_indices]])

    # Count how many duplicated samples are from each class
    count_class_neg_duplicates = np.sum(labels[random_indices] == -1)
    count_class_pos_duplicates = np.sum(labels[random_indices] == 1)

    # Train non-linear SVM classifier for original dataset
    svm_original = SVC(kernel='rbf', gamma='auto')
    svm_original.fit(data, labels)

    # Train non-linear SVM classifier for dataset with duplicated samples
    svm_with_duplicates = SVC(kernel='rbf', gamma='auto')
    svm_with_duplicates.fit(data_with_duplicates, labels_with_duplicates)

    # Compute accuracy for each class in the original dataset
    predictions_original = svm_original.predict(data)
    accuracy_class_neg_original = accuracy_score(labels[labels == -1], predictions_original[labels == -1])
    accuracy_class_pos_original = accuracy_score(labels[labels == 1], predictions_original[labels == 1])

    # Compute accuracy for each class in the dataset with duplicated samples
    predictions_with_duplicates = svm_with_duplicates.predict(data_with_duplicates)
    accuracy_class_neg_with_duplicates = accuracy_score(labels_with_duplicates[labels_with_duplicates == -1], predictions_with_duplicates[labels_with_duplicates == -1])
    accuracy_class_pos_with_duplicates = accuracy_score(labels_with_duplicates[labels_with_duplicates == 1], predictions_with_duplicates[labels_with_duplicates == 1])

    # Compute overall accuracies
    overall_accuracy_original = (accuracy_class_neg_original + accuracy_class_pos_original) / 2
    overall_accuracy_with_duplicates = (accuracy_class_neg_with_duplicates + accuracy_class_pos_with_duplicates) / 2

    # Generate grid for decision boundaries
    x_min, x_max = data[:, 0].min() - 1, data[:, 0].max() + 1
    y_min, y_max = data[:, 1].min() - 1, data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))

    # Predict boundary values for original dataset
    Z_original = svm_original.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    # Predict boundary values for dataset with duplicated samples
    Z_with_duplicates = svm_with_duplicates.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    # Plot the datasets, decision boundaries, and duplicated samples
    plt.figure(figsize=(8, 6), dpi=300)

    # Original data
    plt.scatter(data[labels == -1][:, 0], data[labels == -1][:, 1], label="Class -1 (Original)", alpha=0.6)
    plt.scatter(data[labels == 1][:, 0], data[labels == 1][:, 1], label="Class +1 (Original)", alpha=0.6)

    # Duplicated random examples
    plt.scatter(data[random_indices][:, 0], data[random_indices][:, 1], label="Duplicated Samples", alpha=0.6, color='orange', edgecolor='k', marker='o')

    # Decision boundary for original data
    plt.contour(xx, yy, Z_original, levels=[0], colors='blue', linewidths=2)

    # Decision boundary for dataset with duplicated samples
    plt.contour(xx, yy, Z_with_duplicates, levels=[0], colors='red', linewidths=2, linestyles='dashed')

    plt.title(f"Decision Boundaries with {n_duplicates} Duplicates", fontsize=12)
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.legend()
    plt.grid(True)

    # Annotate results below the plot
    #results_text = (
     #   f"n_duplicates: {n_duplicates}\n"
      #  f"Accuracy for Class -1 (Original): {accuracy_class_neg_original:.2f}\n"
       # f"Accuracy for Class +1 (Original): {accuracy_class_pos_original:.2f}\n"
       # f"Accuracy for Class -1 (With Duplicates): {accuracy_class_neg_with_duplicates:.2f}\n"
       # f"Accuracy for Class +1 (With Duplicates): {accuracy_class_pos_with_duplicates:.2f}\n"
       # f"Overall Accuracy (Original): {overall_accuracy_original:.2f}\n"
       # f"Overall Accuracy (With Duplicates): {overall_accuracy_with_duplicates:.2f}\n"
       # f"Number of duplicated samples from Class -1: {count_class_neg_duplicates}\n"
       # f"Number of duplicated samples from Class +1: {count_class_pos_duplicates}"
    #)
    #plt.figtext(0, -0.22, results_text, wrap=True, horizontalalignment='left', fontsize=10)
    
    # Save each plot as a separate image
    plt.tight_layout()
    plot_filename = os.path.join(output_dir, f"Decision_Boundaries_n{n_duplicates}.pdf")
    plt.savefig(plot_filename, bbox_inches="tight", dpi=150, transparent=False)

    plt.close()


    print(f"n_duplicates: {n_duplicates/2}")
    print(f"Accuracy for Class -1 (Original): {accuracy_class_neg_original:.2f}")
    print(f"Accuracy for Class +1 (Original): {accuracy_class_pos_original:.2f}")
    print(f"Accuracy for Class -1 (With Duplicated Samples): {accuracy_class_neg_with_duplicates:.2f}")
    print(f"Accuracy for Class +1 (With Duplicated Samples): {accuracy_class_pos_with_duplicates:.2f}")

    print(f"Overall Accuracy (Original): {overall_accuracy_original:.2f}")
    print(f"Overall Accuracy (With Duplicated Samples): {overall_accuracy_with_duplicates:.2f}")

    # Print counts of duplicated samples by class
    #print(f"Number of duplicated samples from Class -1: {count_class_neg_duplicates}")
    #print(f"Number of duplicated samples from Class +1: {count_class_pos_duplicates}")
    print(f"Ratio of duplicated samples from Class +1 to total duplicated samples: {count_class_pos_duplicates / (count_class_neg_duplicates + count_class_pos_duplicates)}")  
    print("-" * 50)
print(f"Plots saved to {output_dir}/ as PNG files.")

n_duplicates: 0.0
Accuracy for Class -1 (Original): 0.87
Accuracy for Class +1 (Original): 0.54
Accuracy for Class -1 (With Duplicated Samples): 0.87
Accuracy for Class +1 (With Duplicated Samples): 0.54
Overall Accuracy (Original): 0.71
Overall Accuracy (With Duplicated Samples): 0.71
Ratio of duplicated samples from Class +1 to total duplicated samples: nan
--------------------------------------------------


  print(f"Ratio of duplicated samples from Class +1 to total duplicated samples: {count_class_pos_duplicates / (count_class_neg_duplicates + count_class_pos_duplicates)}")


n_duplicates: 10.0
Accuracy for Class -1 (Original): 0.87
Accuracy for Class +1 (Original): 0.54
Accuracy for Class -1 (With Duplicated Samples): 0.83
Accuracy for Class +1 (With Duplicated Samples): 0.56
Overall Accuracy (Original): 0.71
Overall Accuracy (With Duplicated Samples): 0.69
Ratio of duplicated samples from Class +1 to total duplicated samples: 0.45
--------------------------------------------------
n_duplicates: 20.0
Accuracy for Class -1 (Original): 0.87
Accuracy for Class +1 (Original): 0.54
Accuracy for Class -1 (With Duplicated Samples): 0.89
Accuracy for Class +1 (With Duplicated Samples): 0.51
Overall Accuracy (Original): 0.71
Overall Accuracy (With Duplicated Samples): 0.70
Ratio of duplicated samples from Class +1 to total duplicated samples: 0.475
--------------------------------------------------
n_duplicates: 30.0
Accuracy for Class -1 (Original): 0.87
Accuracy for Class +1 (Original): 0.54
Accuracy for Class -1 (With Duplicated Samples): 0.77
Accuracy for Class

In [7]:
#biased unbalanced

In [17]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import os

# Function to generate synthetic data for two Gaussian distributions
def generate_data(mean, cov, n_samples, labels):
    data = []
    y = []
    for label, mu in zip(labels, mean):
        samples = np.random.multivariate_normal(mu, cov, n_samples)
        data.append(samples)
        y.append(np.full(n_samples, label))
    return np.vstack(data), np.hstack(y)

# Parameters for the Gaussian distributions
mean_true = [np.array([0, 0]), np.array([1, 1])]  # Means for classes -1 and +1
cov_true = np.array([[1, 0.5], [0.5, 1]])  # Shared covariance
n_samples_per_class = 100 # Samples per class

# Generate synthetic data
data, labels = generate_data(mean_true, cov_true, n_samples_per_class, labels=[-1, 1])

# Directory for saving plots
output_dir = "Paper_Output"
os.makedirs(output_dir, exist_ok=True)

# Iterate over different numbers of duplicates
for n_duplicates in range(0, 401, 40):
    # Weighted sampling: 70% from class +1, 30% from class -1
    weights = np.where(labels == 1, 0.8, 0.2)
    weights /= weights.sum()  # Normalize weights

    random_indices = np.random.choice(np.arange(len(labels)), size=n_duplicates, replace=True, p=weights)
    data_with_duplicates = np.vstack([data, data[random_indices]])
    labels_with_duplicates = np.hstack([labels, labels[random_indices]])

    # Count how many duplicated samples are from each class
    count_class_neg_duplicates = np.sum(labels[random_indices] == -1)
    count_class_pos_duplicates = np.sum(labels[random_indices] == 1)

    # Train non-linear SVM classifier for original dataset
    svm_original = SVC(kernel='rbf', gamma='auto')
    svm_original.fit(data, labels)

    # Train non-linear SVM classifier for dataset with duplicated samples
    svm_with_duplicates = SVC(kernel='rbf', gamma='auto')
    svm_with_duplicates.fit(data_with_duplicates, labels_with_duplicates)

    # Compute accuracy for each class in the original dataset
    predictions_original = svm_original.predict(data)
    accuracy_class_neg_original = accuracy_score(labels[labels == -1], predictions_original[labels == -1])
    accuracy_class_pos_original = accuracy_score(labels[labels == 1], predictions_original[labels == 1])

    # Compute accuracy for each class in the dataset with duplicated samples
    predictions_with_duplicates = svm_with_duplicates.predict(data_with_duplicates)
    accuracy_class_neg_with_duplicates = accuracy_score(labels_with_duplicates[labels_with_duplicates == -1], predictions_with_duplicates[labels_with_duplicates == -1])
    accuracy_class_pos_with_duplicates = accuracy_score(labels_with_duplicates[labels_with_duplicates == 1], predictions_with_duplicates[labels_with_duplicates == 1])

    # Compute overall accuracies
    overall_accuracy_original = (accuracy_class_neg_original + accuracy_class_pos_original) / 2
    overall_accuracy_with_duplicates = (accuracy_class_neg_with_duplicates + accuracy_class_pos_with_duplicates) / 2

    # Generate grid for decision boundaries
    x_min, x_max = data[:, 0].min() - 1, data[:, 0].max() + 1
    y_min, y_max = data[:, 1].min() - 1, data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))

    # Predict boundary values for original dataset
    Z_original = svm_original.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    # Predict boundary values for dataset with duplicated samples
    Z_with_duplicates = svm_with_duplicates.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    # Plot the datasets, decision boundaries, and duplicated samples
    plt.figure(figsize=(8, 6), dpi=300)

    # Original data
    plt.scatter(data[labels == -1][:, 0], data[labels == -1][:, 1], label="Class -1 (Original)", alpha=0.7)
    plt.scatter(data[labels == 1][:, 0], data[labels == 1][:, 1], label="Class +1 (Original)", alpha=0.7)

    # Duplicated random examples
    plt.scatter(data[random_indices][:, 0], data[random_indices][:, 1], label="Duplicated Samples", alpha=0.7, color='orange', edgecolor='k', marker='o')

    # Decision boundary for original data
    plt.contour(xx, yy, Z_original, levels=[0], colors='blue', linewidths=2)

    # Decision boundary for dataset with duplicated samples
    plt.contour(xx, yy, Z_with_duplicates, levels=[0], colors='red', linewidths=2, linestyles='dashed')

    plt.title(f"Decision Boundaries with {n_duplicates} Duplicates", fontsize=12)
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.legend()
    plt.grid(True)

    # Annotate results below the plot
#    results_text = (
 #       f"n_duplicates: {n_duplicates}\n"
  #      f"Accuracy for Class -1 (Original): {accuracy_class_neg_original:.2f}\n"
   #     f"Accuracy for Class +1 (Original): {accuracy_class_pos_original:.2f}\n"
    #    f"Accuracy for Class -1 (With Duplicates): {accuracy_class_neg_with_duplicates:.2f}\n"
     #   f"Accuracy for Class +1 (With Duplicates): {accuracy_class_pos_with_duplicates:.2f}\n"
      #  f"Overall Accuracy (Original): {overall_accuracy_original:.2f}\n"
       # f"Overall Accuracy (With Duplicates): {overall_accuracy_with_duplicates:.2f}\n"
        #f"Number of duplicated samples from Class -1: {count_class_neg_duplicates}\n"
        #f"Number of duplicated samples from Class +1: {count_class_pos_duplicates}"
    #)
  #  plt.figtext(0, -0.22, results_text, wrap=True, horizontalalignment='left', fontsize=10)

    # Save each plot as a separate image
    plt.tight_layout()
    plot_filename = os.path.join(output_dir, f"Decision_Boundaries_n{n_duplicates}.pdf")
    plt.savefig(plot_filename, bbox_inches="tight", dpi=150, transparent=False)
    plt.close()
        # Print accuracy results
    print(f"n_duplicates: {n_duplicates/4}")
    print(f"Accuracy for Class -1 (Original): {accuracy_class_neg_original:.2f}")
    print(f"Accuracy for Class +1 (Original): {accuracy_class_pos_original:.2f}")
    print(f"Accuracy for Class -1 (With Duplicated Samples): {accuracy_class_neg_with_duplicates:.2f}")
    print(f"Accuracy for Class +1 (With Duplicated Samples): {accuracy_class_pos_with_duplicates:.2f}")

    # Compute and print overall accuracies
    overall_accuracy_original = (accuracy_class_neg_original + accuracy_class_pos_original) / 2
    overall_accuracy_with_duplicates = (accuracy_class_neg_with_duplicates + accuracy_class_pos_with_duplicates) / 2
    print(f"Overall Accuracy (Original): {overall_accuracy_original:.2f}")
    print(f"Overall Accuracy (With Duplicated Samples): {overall_accuracy_with_duplicates:.2f}")

    # Print counts of duplicated samples by class
    #print(f"Number of duplicated samples from Class -1: {count_class_neg_duplicates}")
    #print(f"Number of duplicated samples from Class +1: {count_class_pos_duplicates}")
    print(f"Ratio of duplicated samples from Class -1 to total duplicated samples: {count_class_pos_duplicates / (count_class_neg_duplicates + count_class_pos_duplicates)}")  
    print("-" * 50)
print(f"Plots saved to {output_dir}/ as PNG files.")


n_duplicates: 0.0
Accuracy for Class -1 (Original): 0.85
Accuracy for Class +1 (Original): 0.67
Accuracy for Class -1 (With Duplicated Samples): 0.85
Accuracy for Class +1 (With Duplicated Samples): 0.67
Overall Accuracy (Original): 0.76
Overall Accuracy (With Duplicated Samples): 0.76
Ratio of duplicated samples from Class -1 to total duplicated samples: nan
--------------------------------------------------


  print(f"Ratio of duplicated samples from Class -1 to total duplicated samples: {count_class_pos_duplicates / (count_class_neg_duplicates + count_class_pos_duplicates)}")


n_duplicates: 10.0
Accuracy for Class -1 (Original): 0.85
Accuracy for Class +1 (Original): 0.67
Accuracy for Class -1 (With Duplicated Samples): 0.75
Accuracy for Class +1 (With Duplicated Samples): 0.78
Overall Accuracy (Original): 0.76
Overall Accuracy (With Duplicated Samples): 0.76
Ratio of duplicated samples from Class -1 to total duplicated samples: 0.725
--------------------------------------------------
n_duplicates: 20.0
Accuracy for Class -1 (Original): 0.85
Accuracy for Class +1 (Original): 0.67
Accuracy for Class -1 (With Duplicated Samples): 0.61
Accuracy for Class +1 (With Duplicated Samples): 0.80
Overall Accuracy (Original): 0.76
Overall Accuracy (With Duplicated Samples): 0.71
Ratio of duplicated samples from Class -1 to total duplicated samples: 0.8625
--------------------------------------------------
n_duplicates: 30.0
Accuracy for Class -1 (Original): 0.85
Accuracy for Class +1 (Original): 0.67
Accuracy for Class -1 (With Duplicated Samples): 0.56
Accuracy for Cla