# Kaggle setup

In [None]:
# Install (or upgrade) the Kaggle command-line client quietly.
!pip install -U -q kaggle
# Create the directory in the user's home that will hold kaggle.json.
!mkdir -p ~/.kaggle
# Also create a .kaggle directory in the current working directory.
# NOTE(review): nothing visible in this notebook uses it -- confirm it is needed.
!mkdir -p .kaggle

In [None]:
import json
import zipfile
import os
import pandas as pd

# Placeholder credentials: replace them with your own Kaggle API token and
# never commit a real key to version control.
token = {"username":"asdasdaf","key":"asdasdaf"}

# Resolve the token path from the current user's home directory so it matches
# the `~/.kaggle` location used by the shell cells, instead of assuming that
# the notebook runs as the `jovyan` user.
kaggle_json_path = os.path.expanduser('~/.kaggle/kaggle.json')
os.makedirs(os.path.dirname(kaggle_json_path), exist_ok=True)

with open(kaggle_json_path, 'w') as file:
    json.dump(token, file)

In [None]:
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json
# List the directory recursively to check the file and its permissions.
!ls -laR ~/.kaggle

## Auxiliary functions

In [None]:
def extract_and_remove_zip(zip_files):
    """Unpack every archive next to itself, then delete the archive.

    Args:
        zip_files: Iterable of paths to ZIP archives. Each archive is
            extracted into the directory that contains it.

    An archive is removed only after its extraction has finished; if the
    extraction raises, the exception propagates and the archive is kept.
    """
    for archive_path in zip_files:
        # The first element of os.path.split is the containing directory.
        destination, _ = os.path.split(archive_path)

        archive = zipfile.ZipFile(archive_path, 'r')
        with archive:
            archive.extractall(destination)

        os.remove(archive_path)

In [None]:
def combine_and_delete_chunked(x_test_file, x_train_file, y_test_file, y_train_file, output_prefix):
    """Merge the test/train CSV pairs into combined files and delete the inputs.

    The X files are appended (test first, then train) to
    ``<dir>/<output_prefix>_X_combined.csv`` and the y files to
    ``<dir>/<output_prefix>_y_combined.csv``, where ``<dir>`` is the directory
    of ``x_test_file``. The inputs are read in chunks so that a whole file
    never has to be held in memory at once.

    Args:
        x_test_file: Path to the CSV with the test features.
        x_train_file: Path to the CSV with the training features.
        y_test_file: Path to the CSV with the test labels.
        y_train_file: Path to the CSV with the training labels.
        output_prefix: Prefix used to build the combined file names.

    The four input files are removed only after all of them have been
    appended. Output files are opened in append mode, so pre-existing
    combined files are extended rather than overwritten.

    NOTE(review): ``pd.read_csv`` treats the first line of each input as a
    header. If the inputs have no header row, the first data row of every
    file appended after the first one is dropped -- confirm the input format.
    """
    chunk_size = 10000  # Rows read per chunk; adjust as needed

    # All outputs go next to the first input file.
    output_dir = os.path.dirname(x_test_file)
    x_combined = os.path.join(output_dir, f'{output_prefix}_X_combined.csv')
    y_combined = os.path.join(output_dir, f'{output_prefix}_y_combined.csv')

    def combine_and_save_chunks(input_file, output_file):
        """Append ``input_file`` to ``output_file`` one chunk at a time."""
        with open(output_file, 'a', newline='') as f:
            for chunk in pd.read_csv(input_file, chunksize=chunk_size):
                # Write the header only when the output file is still empty.
                chunk.to_csv(f, index=False, header=f.tell() == 0)

    # Test and train parts of the same kind share one combined file.
    for source, target in (
        (x_test_file, x_combined),
        (x_train_file, x_combined),
        (y_test_file, y_combined),
        (y_train_file, y_combined),
    ):
        combine_and_save_chunks(source, target)

    # Delete the original test and train files.
    for source in (x_test_file, x_train_file, y_test_file, y_train_file):
        os.remove(source)

# Deepsat

In [None]:
# Create one target directory per DeepSat variant.
!mkdir -p datasets/deepsat-sat6
!mkdir -p datasets/deepsat-sat4
# Download each dataset archive into its directory, calling the Kaggle CLI
# by its path under ~/.local/bin.
!~/.local/bin/kaggle datasets download -d crawford/deepsat-sat6 -p ./datasets/deepsat-sat6
!~/.local/bin/kaggle datasets download -d crawford/deepsat-sat4 -p ./datasets/deepsat-sat4

In [None]:
# One downloaded archive per DeepSat variant, each inside its own directory.
zip_files = [
    f'./datasets/deepsat-{variant}/deepsat-{variant}.zip'
    for variant in ('sat6', 'sat4')
]

# Unpack each archive in place and delete it afterwards.
extract_and_remove_zip(zip_files)

In [None]:
# List both dataset directories to inspect their contents.
!ls -la datasets/deepsat-sat6
!ls -la datasets/deepsat-sat4

In [None]:
# Combine and delete the test/train CSV files of each DeepSat variant
for variant in ('sat6', 'sat4'):
    variant_dir = f'./datasets/deepsat-{variant}'
    combine_and_delete_chunked(
        f'{variant_dir}/X_test_{variant}.csv',
        f'{variant_dir}/X_train_{variant}.csv',
        f'{variant_dir}/y_test_{variant}.csv',
        f'{variant_dir}/y_train_{variant}.csv',
        variant,
    )

In [None]:
# Delete the full .mat file from each DeepSat dataset directory.
# NOTE(review): presumably unneeded because only the CSV files are used -- confirm.
!rm datasets/deepsat-sat6/sat-6-full.mat
!rm datasets/deepsat-sat4/sat-4-full.mat

In [None]:
# List both dataset directories again to check the remaining files.
!ls -la datasets/deepsat-sat6
!ls -la datasets/deepsat-sat4

# Eurosat

In [None]:
# Create the target directory for EuroSAT.
!mkdir -p datasets/eurosat
# Download the dataset archive into it, calling the Kaggle CLI by its path
# under ~/.local/bin.
!~/.local/bin/kaggle datasets download -d apollo2506/eurosat-dataset -p ./datasets/eurosat

In [None]:
# The archive was downloaded into ./datasets/eurosat, so the path must carry
# the `datasets/` prefix; without it the archive cannot be found.
zip_files = ['./datasets/eurosat/eurosat-dataset.zip']

# Unpack the archive in place and delete it afterwards.
extract_and_remove_zip(zip_files)

In [None]:
# List the EuroSAT directory to inspect its contents.
!ls -la datasets/eurosat

In [None]:
# List the EuroSAT subdirectory inside it.
!ls -la datasets/eurosat/EuroSAT

# Remove token

In [None]:
# Delete the API token file now that it is no longer used by this notebook.
!rm ~/.kaggle/kaggle.json
# List the directory recursively to confirm the file is gone.
!ls -laR ~/.kaggle

# Generate datasets

In [None]:
"""Generate datasets for the BARS & STRIPES benchmark."""

import os
import numpy as np
from qic.data import generate_bars_and_stripes

# Every BARS & STRIPES file is written into this directory.
os.makedirs("datasets/bars_and_stripes", exist_ok=True)

# Number of samples per split and the noise level handed to the generator.
n_samples_train = 1000
n_samples_test = 200
noise_std = 0.5

for size in [4, 8, 16, 32]:

    # Re-seed on every iteration so each image size starts from the same RNG state.
    np.random.seed(42)

    # The images are square.
    height = width = size

    X_train, y_train = generate_bars_and_stripes(n_samples_train, height, width, noise_std)
    X_test, y_test = generate_bars_and_stripes(n_samples_test, height, width, noise_std)

    # Flatten each sample into one row and append its label as the last column.
    for split, features, labels, n_rows in (
        ("train", X_train, y_train, n_samples_train),
        ("test", X_test, y_test, n_samples_test),
    ):
        path = f"datasets/bars_and_stripes/bars_and_stripes_{height}_x_{width}_{noise_std}noise_{split}.csv"
        table = np.c_[np.reshape(features, [n_rows, -1]), labels]
        np.savetxt(path, table, delimiter=",")

In [None]:
"""Generate datasets for the HIDDEN MANIFOLD and HIDDEN MANIFOLD DIFF benchmarks."""

import os
import numpy as np
from sklearn.model_selection import train_test_split
from qic.data import generate_hidden_manifold_model

# A single seed covers both benchmarks generated in this cell.
np.random.seed(3)

# HIDDEN MANIFOLD: fixed manifold dimension, varying number of features.
os.makedirs("datasets/hidden_manifold", exist_ok=True)

manifold_dimension = 6
n_samples = 300

for n_features in range(2, 21):
    X, y = generate_hidden_manifold_model(n_samples, n_features, manifold_dimension)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # One CSV per split; the label is stored as the last column.
    for split, features, labels in (("train", X_train, y_train), ("test", X_test, y_test)):
        target = f"datasets/hidden_manifold/hidden_manifold-{manifold_dimension}manifold-{n_features}d_{split}.csv"
        np.savetxt(target, np.c_[features, labels], delimiter=",")


# HIDDEN MANIFOLD DIFF: fixed number of features, varying manifold dimension.
os.makedirs("datasets/hidden_manifold_diff", exist_ok=True)

n_features = 10
n_samples = 300

for manifold_dimension in range(2, 21):
    X, y = generate_hidden_manifold_model(n_samples, n_features, manifold_dimension)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    for split, features, labels in (("train", X_train, y_train), ("test", X_test, y_test)):
        target = f"datasets/hidden_manifold_diff/hidden_manifold-10d-{manifold_dimension}manifold_{split}.csv"
        np.savetxt(target, np.c_[features, labels], delimiter=",")


In [None]:
"""Generate datasets for the HYPERPLANES DIFF benchmark."""

import os
import numpy as np
from sklearn.model_selection import train_test_split
from qic.data import generate_hyperplanes_parity

np.random.seed(1)

os.makedirs("datasets/hyperplanes_diff", exist_ok=True)

# Fixed settings; only the number of hyperplanes changes between datasets.
n_features = 10
dim_hyperplanes = 3
n_samples = 300

for n_hyperplanes in range(2, 21):
    X, y = generate_hyperplanes_parity(n_samples, n_features, n_hyperplanes, dim_hyperplanes)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # One CSV per split; the label is stored as the last column.
    for split, features, labels in (("train", X_train, y_train), ("test", X_test, y_test)):
        target = f"datasets/hyperplanes_diff/hyperplanes-10d-from{dim_hyperplanes}d-{n_hyperplanes}n_{split}.csv"
        np.savetxt(target, np.c_[features, labels], delimiter=",")


In [None]:
"""Generate datasets for the  LINEARLY SEPARABLE benchmark."""

import os
import numpy as np
from sklearn.model_selection import train_test_split
from qic.data import generate_linearly_separable

np.random.seed(42)

os.makedirs("datasets/linearly_separable", exist_ok=True)

n_samples = 300

for n_features in range(2, 21):
    # The margin grows linearly with the number of features.
    margin = 0.02 * n_features

    X, y = generate_linearly_separable(n_samples, n_features, margin)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # One CSV per split; the label is stored as the last column.
    for split, features, labels in (("train", X_train, y_train), ("test", X_test, y_test)):
        target = f"datasets/linearly_separable/linearly_separable_{n_features}d_{split}.csv"
        np.savetxt(target, np.c_[features, labels], delimiter=",")


In [None]:
"""Generate datasets for the MNIST benchmarks. Note that these can be large."""

import os
import torch
import numpy as np
# we import explicitly from data.mnist here because some dependencies of
# mnist generation are large and should not be imported by default
from qic.data.mnist import generate_mnist

# The same pair of digits is used by all three MNIST benchmarks.
digitA = 3
digitB = 5

# generate the MNIST PCA benchmark
np.random.seed(42)
os.makedirs("datasets/mnist-pca", exist_ok=True)

for n_features in range(2, 21):
    X_train, X_test, y_train, y_test = generate_mnist(
        digitA, digitB, preprocessing="pca", n_features=n_features
    )

    # One CSV per split; the label is stored as the last column.
    for split, features, labels in (("train", X_train, y_train), ("test", X_test, y_test)):
        np.savetxt(
            f"datasets/mnist-pca/mnist-pca_{digitA}-{digitB}_{n_features}d_{split}.csv",
            np.c_[features, labels],
            delimiter=",",
        )

# generate the MNIST PCA- benchmark
# The seed is reset so this benchmark starts from the same NumPy RNG state.
np.random.seed(42)
os.makedirs("datasets/mnist-pca-", exist_ok=True)

for n_features in range(2, 21):
    X_train, X_test, y_train, y_test = generate_mnist(
        digitA, digitB, preprocessing="pca-", n_features=n_features, n_samples=250
    )

    for split, features, labels in (("train", X_train, y_train), ("test", X_test, y_test)):
        np.savetxt(
            f"datasets/mnist-pca-/mnist-pca-_{digitA}-{digitB}_{n_features}d_{split}.csv",
            np.c_[features, labels],
            delimiter=",",
        )

# generate the MNIST CG benchmark
# This section seeds torch instead of NumPy.
torch.manual_seed(42)
os.makedirs("datasets/mnist-cg", exist_ok=True)

for height in [4, 8, 16, 32]:
    X_train, X_test, y_train, y_test = generate_mnist(
        digitA, digitB, preprocessing="cg", height=height
    )

    for split, features, labels in (("train", X_train, y_train), ("test", X_test, y_test)):
        np.savetxt(
            f"datasets/mnist-cg/mnist-cg_{digitA}-{digitB}_{height}x{height}_{split}.csv",
            np.c_[features, labels],
            delimiter=",",
        )


In [None]:
"""Generate datasets for the TWO CURVES and TWO CURVES DIFF benchmarks."""

import os
import numpy as np
from sklearn.model_selection import train_test_split
from qic.data import generate_two_curves

np.random.seed(3)

# generate the TWO CURVES benchmark
# These datasets (fixed degree, varying number of features) get their own
# directory so they are not mixed with the TWO CURVES DIFF datasets.
os.makedirs("datasets/two_curves", exist_ok=True)

n_samples = 300
degree = 5
offset = 0.1
noise = 0.01

for n_features in range(2, 21):
    X, y = generate_two_curves(n_samples, n_features, degree, offset, noise)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # The label is stored as the last column of each CSV.
    name_train = f"datasets/two_curves/two_curves-5degree-0.1offset-{n_features}d_train.csv"
    data_train = np.c_[X_train, y_train]
    np.savetxt(name_train, data_train, delimiter=",")

    name_test = f"datasets/two_curves/two_curves-5degree-0.1offset-{n_features}d_test.csv"
    data_test = np.c_[X_test, y_test]
    np.savetxt(name_test, data_test, delimiter=",")

# generate the TWO CURVES DIFF benchmark
# Fixed number of features; the polynomial degree changes between datasets.
os.makedirs("datasets/two_curves_diff", exist_ok=True)

n_samples = 300
n_features = 10
noise = 0.01

for degree in range(2, 21):
    # The offset shrinks as the degree grows.
    offset = 1 / (2 * degree)

    X, y = generate_two_curves(n_samples, n_features, degree, offset, noise)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # One CSV per split; the label is stored as the last column.
    for split, features, labels in (("train", X_train, y_train), ("test", X_test, y_test)):
        target = f"datasets/two_curves_diff/two_curves-10d-{degree}degree_{split}.csv"
        np.savetxt(target, np.c_[features, labels], delimiter=",")


In [2]:
import os
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

def generate_classification_dataset(n_samples=500, n_features=10, n_informative=2, n_redundant=2, n_clusters_per_class=2, class_sep=0.5):
    """Create a synthetic classification dataset with a fixed random seed.

    All arguments are forwarded unchanged to ``make_classification``;
    ``random_state`` is pinned to 42 so repeated calls with the same
    arguments use the same seed.

    Returns:
        The ``(X, y)`` pair produced by ``make_classification``.
    """
    settings = {
        "n_samples": n_samples,
        "n_features": n_features,
        "n_informative": n_informative,
        "n_redundant": n_redundant,
        "n_clusters_per_class": n_clusters_per_class,
        "class_sep": class_sep,
        "random_state": 42,
    }
    features, labels = make_classification(**settings)
    return features, labels

def save_dataset_to_csv(X, y, n_features, dataset_name, informative_ratio, directory="datasets/classification-dataset"):
    """Split a dataset 80/20 and write each part to its own CSV file.

    Args:
        X: Feature matrix.
        y: Labels, stored as the last column of every CSV.
        n_features: Feature count, used only in the file name.
        dataset_name: Leading part of the file name.
        informative_ratio: Label for the informative/redundant ratio, used
            only in the file name.
        directory: Output directory; created if it does not exist.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as produced by the split.
    """
    os.makedirs(directory, exist_ok=True)

    # A fixed random_state makes the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    stem = f"{dataset_name}_{n_features}d_{informative_ratio}"
    for suffix, features, labels in (("train", X_train, y_train), ("test", X_test, y_test)):
        destination = os.path.join(directory, f"{stem}_{suffix}.csv")
        np.savetxt(destination, np.c_[features, labels], delimiter=",")

    return X_train, X_test, y_train, y_test

# Main script
dataset_name = "classification-dataset"
n_samples = 500
n_clusters_per_class = 2
class_sep = 0.5

# Smallest number of informative features that still leaves room for
# 2 classes * n_clusters_per_class clusters (2 ** n_informative must be at
# least that product). It does not depend on the loop variables, so it is
# computed once.
min_informative = max(2, int(np.ceil(np.log2(2 * n_clusters_per_class))))

for n_features in range(2, 21):
    for informative_ratio, ratio in {
        "50-50": 0.5,
        "20-80": 0.2,
        "80-20": 0.8
    }.items():
        # Informative share of the features, raised to the minimum if needed.
        n_informative = max(int(n_features * ratio), min_informative)
        # Every remaining feature is redundant, so the two counts always add
        # up to n_features and no further check is required before generating.
        n_redundant = n_features - n_informative

        X, y = generate_classification_dataset(n_samples=n_samples, n_features=n_features, n_informative=n_informative,
                                        n_redundant=n_redundant, n_clusters_per_class=n_clusters_per_class,
                                        class_sep=class_sep)
        save_dataset_to_csv(X, y, n_features, dataset_name, informative_ratio)


In [2]:
import os
import numpy as np
from sklearn.model_selection import train_test_split

def generate_circular_dataset(num_samples=100, radius=1.0, noise=0.1):
    """Build a noisy two-ring dataset for binary classification.

    Half of the samples lie on a circle of radius ``radius`` (label 0) and
    the other half on a circle of radius ``2 * radius`` (label 1). Gaussian
    noise scaled by ``noise`` is added to every coordinate. The global NumPy
    RNG is re-seeded with 42 on each call, so the output is deterministic.

    Args:
        num_samples: Requested total; each ring gets ``num_samples // 2``
            points, so an odd value yields one sample fewer.
        radius: Radius of the inner ring.
        noise: Scale applied to the standard-normal noise.

    Returns:
        ``(X, y)``: the stacked 2-D points (inner ring first) and their labels.
    """
    np.random.seed(42)
    per_ring = num_samples // 2
    theta = np.linspace(0, 2 * np.pi, per_ring)
    cos_theta, sin_theta = np.cos(theta), np.sin(theta)

    # The draw order fixes the output: inner x, inner y, outer x, outer y.
    jitter = [noise * np.random.randn(per_ring) for _ in range(4)]

    inner_ring = np.column_stack(
        (radius * cos_theta + jitter[0], radius * sin_theta + jitter[1])
    )
    outer_ring = np.column_stack(
        (2 * radius * cos_theta + jitter[2], 2 * radius * sin_theta + jitter[3])
    )

    points = np.concatenate((inner_ring, outer_ring), axis=0)
    labels = np.concatenate((np.zeros(per_ring), np.ones(per_ring)))
    return points, labels

def save_dataset_to_csv(X, y, dataset_name, directory="datasets/circular-dataset"):
    """Write an 80/20 train/test split of the dataset as two CSV files.

    The files are named ``<dataset_name>_train.csv`` and
    ``<dataset_name>_test.csv`` inside ``directory``, which is created when
    missing. Each row holds the features followed by the label.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as produced by the split.
    """
    os.makedirs(directory, exist_ok=True)

    # A fixed random_state makes the split reproducible.
    parts = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = parts

    outputs = {
        "train": np.c_[X_train, y_train],
        "test": np.c_[X_test, y_test],
    }
    for suffix, table in outputs.items():
        np.savetxt(os.path.join(directory, f"{dataset_name}_{suffix}.csv"), table, delimiter=",")

    return X_train, X_test, y_train, y_test

# Main script: settings for the two-ring dataset
dataset_name = "circular-dataset"
n_samples = 500
radius = 1.0
noise = 0.1

# Build the two-feature circular dataset from the settings above
X, y = generate_circular_dataset(n_samples, radius, noise)

# Split it and write both parts to CSV
save_dataset_to_csv(X, y, dataset_name=dataset_name)


(array([[ 1.04049817e+00, -1.38279973e-01],
        [-1.40032808e-01, -2.21093314e+00],
        [ 7.46023082e-01,  4.74356304e-01],
        [-4.54414918e-01,  2.16966458e+00],
        [-8.92702090e-01,  1.65631633e+00],
        [ 7.31357567e-02,  1.11535595e+00],
        [ 5.44594985e-01,  1.86834806e+00],
        [ 9.43292157e-01, -3.71564195e-01],
        [-9.52880668e-01,  1.68842629e+00],
        [-1.08837549e+00, -4.00514435e-01],
        [-9.82872029e-01, -1.22599597e-01],
        [-7.17230881e-01, -1.95083157e+00],
        [-1.06127965e+00,  1.70282109e+00],
        [ 8.18623481e-01,  6.36035173e-01],
        [ 1.26563256e+00, -1.55449410e+00],
        [ 1.25251524e+00,  1.27621137e+00],
        [ 9.33393870e-01, -6.94797994e-02],
        [ 9.73149494e-01, -4.06352711e-01],
        [-1.09901292e+00,  1.89384102e-01],
        [ 4.77469164e-01,  8.13328252e-01],
        [ 9.75454322e-01, -6.95697630e-01],
        [-2.66647329e-01, -1.05992586e+00],
        [-3.86791884e-01,  1.910