In [4]:
# 1st Example: Integer hyperparameters and float hyperparameters
from ConfigSpace import ConfigurationSpace

cs = ConfigurationSpace(
    space={
        "C": (-1.0, 1.0),  # Note the decimal to make it a float
        "max_iter": (10, 100),
    },
    seed=1234,
)

In [None]:
# 2nd Example: Categorical hyperparameters and conditions#
from ConfigSpace import ConfigurationSpace, Categorical, Float, Integer

kernel_type = Categorical('kernel_type', ['linear', 'poly', 'rbf', 'sigmoid'])
degree = Integer('degree', bounds=(2, 4), default=2)
coef0 = Float('coef0', bounds=(0, 1), default=0.0)
gamma = Float('gamma', bounds=(1e-5, 1e2), default=1, log=True)

cs = ConfigurationSpace()
cs.add([kernel_type, degree, coef0, gamma])

from ConfigSpace import EqualsCondition, InCondition, OrConjunction

# read as: "degree is active if kernel_type == 'poly'"
cond_1 = EqualsCondition(degree, kernel_type, 'poly')

# read as: "coef0 is active if (kernel_type == 'poly' or kernel_type == 'sigmoid')"
# You could also define this using an InCondition as shown below
cond_2 = OrConjunction(
    EqualsCondition(coef0, kernel_type, 'poly'),
    EqualsCondition(coef0, kernel_type, 'sigmoid')
)

# read as: "gamma is active if kernel_type in ['rbf', 'poly', 'sigmoid']"
cond_3 = InCondition(gamma, kernel_type, ['rbf', 'poly','sigmoid'])



Configuration space object:
  Hyperparameters:
    coef0, Type: UniformFloat, Range: [0.0, 1.0], Default: 0.0
    degree, Type: UniformInteger, Range: [2, 4], Default: 2
    gamma, Type: UniformFloat, Range: [1e-05, 100.0], Default: 1.0, on log-scale
    kernel_type, Type: Categorical, Choices: {linear, poly, rbf, sigmoid}, Default: linear



In [None]:
# 3rd Example: Forbidden clauses#
from ConfigSpace import ConfigurationSpace, Categorical, Constant

cs = ConfigurationSpace()

penalty = Categorical("penalty", ["l1", "l2"], default="l2")
loss = Categorical("loss", ["hinge", "squared_hinge"], default="squared_hinge")
dual = Constant("dual", "False")
cs.add([penalty, loss, dual])

from ConfigSpace import ForbiddenEqualsClause, ForbiddenAndConjunction

penalty_and_loss = ForbiddenAndConjunction(
    ForbiddenEqualsClause(penalty, "l1"),
    ForbiddenEqualsClause(loss, "hinge")
)
constant_penalty_and_loss = ForbiddenAndConjunction(
    ForbiddenEqualsClause(dual, "False"),
    ForbiddenEqualsClause(penalty, "l2"),
    ForbiddenEqualsClause(loss, "hinge")
)
penalty_and_dual = ForbiddenAndConjunction(
    ForbiddenEqualsClause(dual, "False"),
    ForbiddenEqualsClause(penalty, "l1")
)

In [None]:
# 4th Example Serialization#
from pathlib import Path
from ConfigSpace import ConfigurationSpace

path = Path("configspace.yaml")
cs = ConfigurationSpace(
    space={
        "C": (-1.0, 1.0),  # Note the decimal to make it a float
        "max_iter": (10, 100),
    },
    seed=1234,
)
cs.to_yaml(path)
loaded_cs = ConfigurationSpace.from_yaml(path)

with path.open() as f:
    print(f.read())

In [3]:
# 5th Example: Placing priors on the hyperparameters#
import numpy as np
from ConfigSpace import ConfigurationSpace, Float, Categorical, Beta, Normal

cs = ConfigurationSpace(
    space={
        "lr": Float(
            'lr',
            bounds=(1e-5, 1e-1),
            default=1e-3,
            log=True,
            distribution=Normal(1e-3, 1e-1)
        ),
        "dropout": Float(
            'dropout',
            bounds=(0, 0.99),
            default=0.25,
            distribution=Beta(alpha=2, beta=4)
        ),
        "activation": Categorical(
            'activation',
            items=['tanh', 'relu'],
            weights=[0.2, 0.8]
        ),
    },
    seed=1234,
)
print(cs)


Configuration space object:
  Hyperparameters:
    activation, Type: Categorical, Choices: {tanh, relu}, Default: relu, Probabilities: [0.2 0.8]
    dropout, Type: BetaFloat, Alpha: 2.0, Beta: 4.0, Range: [0.0, 0.99], Default: 0.25
    lr, Type: NormalFloat, Mu: 0.001, Sigma: 0.1, Range: [1e-05, 0.1], Default: 0.001, on log-scale



In [None]:
from ConfigSpace import Configuration


In [4]:
from smac import Scenario

def generate_scenario(cs):
    scenario = Scenario({
        'run_obj': 'quality',
        'output_dir': './automl_results',
        'shared_model': False,
        'multi_objectives': ['validation_loss'],
        'overall_obj': 'validation_loss',
        'deterministic': False,
        'wallclock_limit': 3600.0,
        'abort_on_first_run_crash': True,
        'limit_resources': True,
        'memory_limit': 16000.0,
        'cutoff': 300.0
    })
    return scenario

scenario = generate_scenario(cs)

In [6]:
import numpy as np
from types import SimpleNamespace
X = np.random.rand(10, 5)
y = np.random.rand(10, 1)


mydict = dict(X=X, y=y)


mydict = SimpleNamespace(**mydict)
mydict.X

array([[0.98525351, 0.781331  , 0.90316584, 0.42289328, 0.0836624 ],
       [0.09098289, 0.45530062, 0.8913916 , 0.55417661, 0.19638563],
       [0.34897899, 0.89023083, 0.46529547, 0.62919728, 0.78103712],
       [0.21140623, 0.78827005, 0.41219888, 0.18284856, 0.95699989],
       [0.4160139 , 0.39566742, 0.68413273, 0.9207471 , 0.81198847],
       [0.1554572 , 0.28036314, 0.04413612, 0.39658158, 0.05237825],
       [0.56122639, 0.55022277, 0.39825877, 0.58396533, 0.46107929],
       [0.69087577, 0.2232384 , 0.04952998, 0.64866486, 0.50059245],
       [0.69802054, 0.18925913, 0.25659784, 0.16636338, 0.98976883],
       [0.40259054, 0.33274224, 0.01593082, 0.50959103, 0.43491217]])

In [10]:
from keras.datasets import mnist

ModuleNotFoundError: No module named 'keras'

In [3]:
# load mnist dataset
mnist = fetch_openml('mnist_784', version=1)
X, y = mnist.data, mnist.target


In [42]:
from ConfigSpace import ConfigurationSpace, Categorical, Float, Integer, ForbiddenAndConjunction, ForbiddenEqualsClause, EqualsCondition

def get_configspace():
    cs = ConfigurationSpace(seed=1234)

    learning_rate = Categorical('learning_rate', ['constant', 'invscaling', 'adaptive'])
    eta0 = Float('eta0', bounds=(0.0001, 1.0), default=0.01, log=True)
    max_iter = Integer('max_iter', bounds=(100, 1000), default=200)
    tol = Float('tol', bounds=(1e-6, 1e-2), default=1e-4, log=True)
    early_stopping = Categorical('early_stopping', ['True', 'False'], default='False')
    validation_fraction = Float('validation_fraction', bounds=(0.01, 0.5), default=0.1)
    n_jobs = Integer('n_jobs', bounds=(1, 10), default=1)
    random_state = Integer('random_state', bounds=(0, 100), default=42)

    cs.add([learning_rate, eta0, max_iter, tol, early_stopping, validation_fraction, n_jobs, random_state])

    # cond_eta0 = EqualsCondition(eta0, learning_rate, 'constant')
    # cs.add(cond_eta0)

    # forbidden_eta0_and_max_iter = ForbiddenAndConjunction(
    #     ForbiddenEqualsClause(eta0, 0.01),
    #     ForbiddenEqualsClause(max_iter, 100)
    # )
    # cs.add(forbidden_eta0_and_max_iter)

    return cs

cs = get_configspace()
print(cs)

Configuration space object:
  Hyperparameters:
    early_stopping, Type: Categorical, Choices: {True, False}, Default: False
    eta0, Type: UniformFloat, Range: [0.0001, 1.0], Default: 0.01, on log-scale
    learning_rate, Type: Categorical, Choices: {constant, invscaling, adaptive}, Default: constant
    max_iter, Type: UniformInteger, Range: [100, 1000], Default: 200
    n_jobs, Type: UniformInteger, Range: [1, 10], Default: 1
    random_state, Type: UniformInteger, Range: [0, 100], Default: 42
    tol, Type: UniformFloat, Range: [1e-06, 0.01], Default: 0.0001, on log-scale
    validation_fraction, Type: UniformFloat, Range: [0.01, 0.5], Default: 0.1



In [44]:
from smac.scenario import Scenario

def generate_scenario(cs):
    scenario = Scenario(
        configspace=cs,
        deterministic=True,
        n_trials=10,
    )
    return scenario
scenario = generate_scenario(cs)
print(scenario)

Scenario(configspace=Configuration space object:
  Hyperparameters:
    early_stopping, Type: Categorical, Choices: {True, False}, Default: False
    eta0, Type: UniformFloat, Range: [0.0001, 1.0], Default: 0.01, on log-scale
    learning_rate, Type: Categorical, Choices: {constant, invscaling, adaptive}, Default: constant
    max_iter, Type: UniformInteger, Range: [100, 1000], Default: 200
    n_jobs, Type: UniformInteger, Range: [1, 10], Default: 1
    random_state, Type: UniformInteger, Range: [0, 100], Default: 42
    tol, Type: UniformFloat, Range: [1e-06, 0.01], Default: 0.0001, on log-scale
    validation_fraction, Type: UniformFloat, Range: [0.01, 0.5], Default: 0.1
, name=None, output_directory=PosixPath('smac3_output'), deterministic=True, objectives='cost', crash_cost=inf, termination_cost_threshold=inf, walltime_limit=inf, cputime_limit=inf, trial_walltime_limit=None, trial_memory_limit=None, n_trials=10, use_default_config=False, instances=None, instance_features=None, min

In [32]:
print([p.name for p in list(cs.values())])

['early_stopping', 'eta0', 'learning_rate', 'max_iter', 'n_jobs', 'random_state', 'tol', 'validation_fraction']


In [45]:
scenario.configspace

Configuration space object:
  Hyperparameters:
    early_stopping, Type: Categorical, Choices: {True, False}, Default: False
    eta0, Type: UniformFloat, Range: [0.0001, 1.0], Default: 0.01, on log-scale
    learning_rate, Type: Categorical, Choices: {constant, invscaling, adaptive}, Default: constant
    max_iter, Type: UniformInteger, Range: [100, 1000], Default: 200
    n_jobs, Type: UniformInteger, Range: [1, 10], Default: 1
    random_state, Type: UniformInteger, Range: [0, 100], Default: 42
    tol, Type: UniformFloat, Range: [1e-06, 0.01], Default: 0.0001, on log-scale
    validation_fraction, Type: UniformFloat, Range: [0.01, 0.5], Default: 0.1

In [46]:
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import log_loss
from ConfigSpace import Configuration

def train(cfg: Configuration, seed: int, dataset: dict) -> float:
    """
    Train a neural network model on the given dataset.

    Args:
    - cfg (Configuration): A Configuration object containing hyperparameters.
    - seed (int): The random seed for reproducibility.
    - dataset (dict): A dictionary containing the feature matrix 'X' and label vector 'y'.

    Returns:
    - loss (float): The average training loss over 10 epochs.
    """

    # Get input and output dimensions dynamically from the dataset
    input_size = dataset['X'].shape[1]
    num_classes = len(np.unique(dataset['y']))

    # Get hyperparameters from the configuration
    learning_rate = cfg.get('learning_rate')
    eta0 = cfg.get('eta0')
    max_iter = cfg.get('max_iter')
    tol = cfg.get('tol')
    early_stopping = cfg.get('early_stopping') == 'True'
    validation_fraction = cfg.get('validation_fraction')
    n_jobs = cfg.get('n_jobs')
    warm_start = cfg.get('warm_start') == 'True'
    momentum = cfg.get('momentum')
    nesterovs_momentum = cfg.get('nesterovs_momentum') == 'True'
    power_t = cfg.get('power_t')

    # Create a neural network model
    if learning_rate == 'constant':
        # Use SGD with lr=eta0
        model = MLPClassifier(hidden_layer_sizes=(50,), max_iter=max_iter, tol=tol, 
                               early_stopping=early_stopping, validation_fraction=validation_fraction, 
                               random_state=seed, learning_rate_init=eta0, momentum=momentum, 
                               nesterovs_momentum=nesterovs_momentum)
    elif learning_rate == 'invscaling':
        # Use SGD with lr=eta0 and momentum=power_t
        model = MLPClassifier(hidden_layer_sizes=(50,), max_iter=max_iter, tol=tol, 
                               early_stopping=early_stopping, validation_fraction=validation_fraction, 
                               random_state=seed, learning_rate_init=eta0, momentum=momentum, 
                               nesterovs_momentum=nesterovs_momentum, power_t=power_t)
    elif learning_rate == 'adaptive':
        # Use Adam or equivalent with lr=eta0
        model = MLPClassifier(hidden_layer_sizes=(50,), max_iter=max_iter, tol=tol, 
                               early_stopping=early_stopping, validation_fraction=validation_fraction, 
                               random_state=seed, learning_rate_init=eta0, solver='adam')

    # Train the model
    model.fit(dataset['X'], dataset['y'])

    # Calculate the average training loss over 10 epochs
    losses = []
    for _ in range(10):
        model.fit(dataset['X'], dataset['y'])
        losses.append(log_loss(dataset['y'], model.predict_proba(dataset['X'])))
    loss = np.mean(losses)

    return loss


In [47]:
cs = get_configspace()
print(type(cs))  # Should output: <class 'ConfigSpace.configuration_space.ConfigurationSpace'>


<class 'ConfigSpace.configuration_space.ConfigurationSpace'>


In [None]:
from smac import HyperparameterOptimizationFacade, Scenario

smac = HyperparameterOptimizationFacade(
            scenario,
            train,  # We pass the target function here
            overwrite=True,  # Overrides any previous results that are found that are inconsistent with the meta-data
        )
smac.optimize()


[INFO][abstract_initial_design.py:87] Reducing the number of initial configurations from 80 to 2 (max_ratio == 0.25).
[INFO][abstract_initial_design.py:139] Using 2 initial design configurations and 0 additional configurations.
[INFO][abstract_intensifier.py:307] Using only one seed for deterministic scenario.
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 190, in run
    rval = self(config_copy, target_function, kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 264, in __call__
    return algorithm(config, **algorithm_kwargs)
TypeError: train() missing 1 required positional argument: 'dataset'


[INFO][abstract_intensifier.py:517] Added config d8c8c3 as new incumbent because there are no incumbents yet.
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 190, in run
    rval = self(config_copy, target_functi

  diff_b_a = subtract(b, a)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + V

  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 190, in run
    rval = self(config_copy, target_function, kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 264, in __call__
    return algorithm(config, **algorithm_kwargs)
TypeError: train() missing 1 required positional argument: 'dataset'


  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 190, in run
    rval = self(config_copy, target_function, kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 264, in __call__
    return algorithm(config, **algorithm_kwargs)
TypeError: train() missing 1 required positional argument: 'dataset'


  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 190, in run
    rval = self(config_copy, target_function, k

  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)


AttributeError: 'HyperparameterOptimizationFacade' object has no attribute 'run'

In [41]:
configspace = ConfigurationSpace({"C": (0.100, 1000.0)})
print(configspace)

Configuration space object:
  Hyperparameters:
    C, Type: UniformFloat, Range: [0.1, 1000.0], Default: 500.05



In [27]:
from ConfigSpace import Configuration, ConfigurationSpace

import numpy as np
from smac import HyperparameterOptimizationFacade, Scenario
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

iris = datasets.load_iris()


def train(config: Configuration, seed: int = 0) -> float:
    classifier = SVC(C=config["C"], random_state=seed)
    scores = cross_val_score(classifier, iris.data, iris.target, cv=5)
    return 1 - np.mean(scores)


configspace = ConfigurationSpace({"C": (0.100, 1000.0)})

# Scenario object specifying the optimization environment
scenario = Scenario(configspace, deterministic=True, n_trials=200)

# Use SMAC to find the best configuration/hyperparameters
smac = HyperparameterOptimizationFacade(scenario, train)
incumbent = smac.optimize()

[INFO][abstract_initial_design.py:139] Using 10 initial design configurations and 0 additional configurations.
[INFO][smbo.py:509] Continuing from previous run.
[INFO][smbo.py:278] Optimization process was already finished. Returning incumbent...


In [3]:
from configs.api_keys import GROQ_API_KEY, GOOGLE_API_KEY

from scripts.LLMClient import LLMClient

# Initialize the client
llm_client = LLMClient(
    api_key=GOOGLE_API_KEY,
    model_name="gemini-2.0-flash",
    embedding_model="sentence-transformers/all-MiniLM-L6-v2"  # This is a smaller, faster model
)

# Create a vector store with some test documents
documents = [
    "This is a test document about machine learning.",
    "This is another document about artificial intelligence."
]

# Try to create the vector store
try:
    llm_client.create_vector_store(documents)
except ValueError as e:
    print(f"Error: {e}")
    # Handle the error appropriately

# Generate responses with RAG
response = llm_client.generate_with_context(
    "What are the key points from the documents?",
    k=3  # Number of relevant documents to retrieve
)

print(response)

Failed to initialize embeddings model: Could not import sentence_transformers python package. Please install it with `pip install sentence-transformers`.
Attempting fallback to CPU-only mode...


Fallback initialization also failed: Could not import sentence_transformers python package. Please install it with `pip install sentence-transformers`.
RAG capabilities will be disabled
RAG capabilities are disabled. Falling back to standard generation.


Error: Embeddings model not initialized. RAG capabilities are disabled. Please check the logs for initialization errors.
Please provide me with the documents you are referring to. I need the text of the documents to be able to identify the key points for you. 

Once you provide the documents, I will:

*   **Read through them carefully.**
*   **Identify the main topics and arguments.**
*   **Summarize the key points in a concise and clear manner.**

I look forward to helping you!



In [2]:
from scripts.DocumentCollector import DocumentCollector
collector = DocumentCollector(max_workers=8, timeout=50)
print("Starting document collection...")
docs = collector.collect_documentation()
collector.save_documents(docs)
print("Document collection completed.")



Starting document collection...


2025-05-21 09:09:25,057 - INFO - Saved 3 docs for 'smac' to collected_docs/smac_docs.json
2025-05-21 09:09:25,059 - INFO - Saved 4 docs for 'configspace' to collected_docs/configspace_docs.json
2025-05-21 09:09:25,060 - INFO - Saved 2 docs for 'pytorch' to collected_docs/pytorch_docs.json
2025-05-21 09:09:25,061 - INFO - Saved 0 docs for 'tensorflow' to collected_docs/tensorflow_docs.json
2025-05-21 09:09:25,063 - INFO - Saved 4 docs for 'sklearn' to collected_docs/sklearn_docs.json


Document collection completed.


In [1]:
from configs.api_keys import OPENML_API_KEY
import openml

openml.config.apikey = OPENML_API_KEY

In [12]:
!pip install amltk

Collecting amltk
  Downloading amltk-1.12.1.tar.gz (174 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: amltk
  Building wheel for amltk (pyproject.toml) ... [?25ldone
[?25h  Created wheel for amltk: filename=amltk-1.12.1-py3-none-any.whl size=208757 sha256=8ec530ff80ec276549803c420f1614f2af1a41005015afd5072d34e2ad398d27
  Stored in directory: /Users/amirrezaalasti/Library/Caches/pip/wheels/a0/70/a6/f6719d420b33556947bef3dab32023a76424b9f8014b1b6c3f
Successfully built amltk
Installing collected packages: amltk
Successfully installed amltk-1.12.1


In [9]:
import openml

# The keyword you want to search for
search_keyword = "mnist"

print(f"Searching for datasets with the keyword: '{search_keyword}'...")

# Use the 'data_name' parameter to filter.
# This performs a search for datasets containing the keyword in their name.
try:
    search_results_df = openml.datasets.list_datasets(
        data_name=search_keyword,
        output_format="dataframe"
    )

    if search_results_df.empty:
        print("No datasets found with that keyword.")
    else:
        # Display the most relevant columns from the results
        print("Found the following datasets:")
        print(search_results_df[['did', 'name', 'NumberOfInstances', 'NumberOfFeatures', 'NumberOfClasses', 'status']])

except Exception as e:
    print(f"An error occurred: {e}")

Searching for datasets with the keyword: 'mnist'...
No datasets found with that keyword.


In [11]:
import pandas as pd

search_keyword = "mnist"

try:
    all_metafeatures_df = pd.read_csv('./collected_docs/openml_metafeatures_complete.csv', index_col='did')

    # Search the 'name' column.
    # str.contains() allows for partial matches.
    # case=False makes the search case-insensitive.
    search_results = all_metafeatures_df[
        all_metafeatures_df['name'].str.contains(search_keyword, case=False, na=False)
    ]

    if search_results.empty:
        print(f"No datasets found containing the keyword '{search_keyword}' in your local CSV.")
    else:
        print(f"Found the following datasets containing '{search_keyword}':")
        # Display the results
        print(search_results[['name', 'NumberOfInstances', 'NumberOfFeatures', 'NumberOfClasses']])

except FileNotFoundError:
    print("Error: 'openml_metafeatures_complete.csv' not found.")
    print("Please run the generation script first.")
except Exception as e:
    print(f"An error occurred: {e}")

Found the following datasets containing 'mnist':
                                                    name  NumberOfInstances  \
did                                                                           
554                                            mnist_784            70000.0   
40996                                      Fashion-MNIST            70000.0   
41039                                    EMNIST_Balanced           131600.0   
41065                                     mnist_rotation            62000.0   
41982                                    Kuzushiji-MNIST            70000.0   
44698  Fashion-MNIST_seed_0_nrows_2000_nclasses_10_nc...             2000.0   
44699  Fashion-MNIST_seed_1_nrows_2000_nclasses_10_nc...             2000.0   
44700  Fashion-MNIST_seed_2_nrows_2000_nclasses_10_nc...             2000.0   
44701  Fashion-MNIST_seed_3_nrows_2000_nclasses_10_nc...             2000.0   
44702  Fashion-MNIST_seed_4_nrows_2000_nclasses_10_nc...             2000.0   
450

In [4]:
openml.datasets.get_dataset(
    dataset_id=61
)

OpenML Dataset
Name.........: iris
Version......: 1
Format.......: ARFF
Upload Date..: 2014-04-06 23:23:39
Licence......: Public
Download URL.: https://api.openml.org/data/v1/download/61/iris.arff
OpenML URL...: https://www.openml.org/d/61
# of features: None

In [18]:
import openml
import pandas as pd
from tqdm.auto import tqdm # A library to show a progress bar

print("Connecting to OpenML to fetch the list of all active datasets...")

# It is recommended to have your API key configured
# openml.config.apikey = 'YOUR_API_KEY'

# First, get a dataframe of all active datasets. We only need their IDs.
datasets_df = openml.datasets.list_datasets(output_format="dataframe")
active_datasets_df = datasets_df[datasets_df['status'] == 'active']
dataset_ids = active_datasets_df['did'].tolist()

print(f"Found {len(dataset_ids)} active datasets. Now fetching meta-features for each one.")
print("This will take a long time. Please be patient.")

all_metafeatures = []

# Loop through each dataset ID with a progress bar
for did in tqdm(dataset_ids, desc="Fetching Meta-Features"):
    try:
        # Fetch the dataset object without downloading the actual data files
        dataset = openml.datasets.get_dataset(
            did,
            download_data=False,
            download_qualities=True
        )
        # The meta-features are in the 'qualities' attribute, which is a dict
        # We'll also add other useful info like did, name, etc.
        qualities = dataset.qualities
        qualities['did'] = did
        qualities['name'] = dataset.name
        qualities['status'] = 'active' # We already filtered for this
        qualities['NumberOfInstances'] = qualities.get('NumberOfInstances', dataset.qualities.get('MajorityClassSize', 0) * 2) # Fallback for instances

        all_metafeatures.append(qualities)
    except Exception as e:
        # Some datasets might be broken or cause errors, we'll just skip them
        print(f"\nCould not process dataset {did}. Error: {e}")
        continue

# Convert the list of dictionaries into a single DataFrame
metafeatures_df = pd.DataFrame(all_metafeatures)

# Set the dataset ID as the index
if 'did' in metafeatures_df.columns:
    metafeatures_df = metafeatures_df.set_index('did')

# Save the complete DataFrame to a CSV file
output_filename = 'openml_metafeatures_complete.csv'
metafeatures_df.to_csv(output_filename)

print(f"\n✅ Successfully generated '{output_filename}' with meta-features for {len(metafeatures_df)} datasets.")
print("\nHere's a sample of the data:")
print(metafeatures_df[['name', 'NumberOfInstances', 'NumberOfFeatures', 'NumberOfClasses']].head())

  from .autonotebook import tqdm as notebook_tqdm


Connecting to OpenML to fetch the list of all active datasets...
Found 6269 active datasets. Now fetching meta-features for each one.
This will take a long time. Please be patient.


Fetching Meta-Features:   2%|▏         | 125/6269 [00:00<00:26, 235.16it/s]


Could not process dataset 202. Error: https://www.openml.org/api/v1/xml/data/qualities/202 returned code 362: No qualities found - None


Fetching Meta-Features:   6%|▌         | 347/6269 [00:58<26:16,  3.76it/s] 


Could not process dataset 486. Error: https://www.openml.org/api/v1/xml/data/qualities/486 returned code 362: No qualities found - None


Fetching Meta-Features:   6%|▌         | 355/6269 [01:00<27:54,  3.53it/s]


Could not process dataset 495. Error: https://www.openml.org/api/v1/xml/data/qualities/495 returned code 362: No qualities found - None


Fetching Meta-Features:   6%|▌         | 381/6269 [01:07<26:02,  3.77it/s]


Could not process dataset 525. Error: https://www.openml.org/api/v1/xml/data/qualities/525 returned code 362: No qualities found - None


Fetching Meta-Features:   6%|▋         | 394/6269 [01:11<27:05,  3.62it/s]


Could not process dataset 538. Error: https://www.openml.org/api/v1/xml/data/qualities/538 returned code 362: No qualities found - None


Fetching Meta-Features:   7%|▋         | 414/6269 [01:16<25:01,  3.90it/s]


Could not process dataset 559. Error: https://www.openml.org/api/v1/xml/data/qualities/559 returned code 362: No qualities found - None


Fetching Meta-Features:   7%|▋         | 418/6269 [01:17<25:44,  3.79it/s]


Could not process dataset 563. Error: https://www.openml.org/api/v1/xml/data/qualities/563 returned code 362: No qualities found - None


Fetching Meta-Features:   7%|▋         | 420/6269 [01:18<24:53,  3.92it/s]


Could not process dataset 565. Error: https://www.openml.org/api/v1/xml/data/qualities/565 returned code 362: No qualities found - None


Fetching Meta-Features:   8%|▊         | 521/6269 [01:46<26:46,  3.58it/s]


Could not process dataset 669. Error: https://www.openml.org/api/v1/xml/data/qualities/669 returned code 362: No qualities found - None


Fetching Meta-Features:   9%|▉         | 551/6269 [01:56<25:29,  3.74it/s]  


Could not process dataset 700. Error: https://www.openml.org/api/v1/xml/data/qualities/700 returned code 362: No qualities found - None


Fetching Meta-Features:  15%|█▍        | 927/6269 [03:40<23:53,  3.73it/s]  


Could not process dataset 1092. Error: https://www.openml.org/api/v1/xml/data/qualities/1092 returned code 362: No qualities found - None


Fetching Meta-Features:  16%|█▌        | 999/6269 [04:00<23:23,  3.75it/s]


Could not process dataset 1168. Error: https://www.openml.org/api/v1/xml/data/qualities/1168 returned code 362: No qualities found - None


Fetching Meta-Features:  38%|███▊      | 2378/6269 [10:35<18:03,  3.59it/s]  


Could not process dataset 4537. Error: https://www.openml.org/api/v1/xml/data/qualities/4537 returned code 364: Dataset processed with error - XSD does not comply. XSD errors: XML does not correspond to XSD schema. Error Element '{http://openml.org/openml}error': [facet 'maxLength'] The value has a length of '1343'; this exceeds the allowed maximum length of '1024'.
 on line 4 column 0. Error Element '{http://openml.org/openml}error': 'Problem validating uploaded description file: XML does not correspond to XSD schema. Error Element '{http://openml.org/openml}nominal_value': [facet 'pattern'] The value 'Prepara&amp;ccedil;&amp;atilde;o' is not accepted by the pattern '\p{IsBasicLatin}*'.
 on line 336 column 0. Error Element '{http://openml.org/openml}nominal_value': 'Prepara&amp;ccedil;&amp;atilde;o' is not a valid value of the atomic type '{http://openml.org/openml}basic_latin256'.
 on line 336 column 0. Error Element '{http://openml.org/openml}ClassDistribution': [facet 'pattern'] T

Fetching Meta-Features:  38%|███▊      | 2384/6269 [10:36<17:52,  3.62it/s]


Could not process dataset 4546. Error: https://www.openml.org/api/v1/xml/data/qualities/4546 returned code 364: Dataset processed with error - Java heap space


Fetching Meta-Features:  38%|███▊      | 2390/6269 [10:38<17:40,  3.66it/s]


Could not process dataset 4562. Error: https://www.openml.org/api/v1/xml/data/qualities/4562 returned code 364: Dataset processed with error - XSD does not comply. XSD errors: XML does not correspond to XSD schema. Error Element '{http://openml.org/openml}error': [facet 'maxLength'] The value has a length of '1129'; this exceeds the allowed maximum length of '1024'.
 on line 4 column 0. Error Element '{http://openml.org/openml}error': 'Problem validating uploaded description file: XML does not correspond to XSD schema. Error Element '{http://openml.org/openml}nominal_value': [facet 'minLength'] The value has a length of '0'; this underruns the allowed minimum length of '1'.
 on line 654 column 0. Error Element '{http://openml.org/openml}nominal_value': '' is not a valid value of the atomic type '{http://openml.org/openml}basic_latin256'.
 on line 654 column 0. Error Element '{http://openml.org/openml}nominal_value': [facet 'minLength'] The value has a length of '0'; this underruns the

Fetching Meta-Features:  40%|████      | 2526/6269 [11:17<17:12,  3.62it/s]


Could not process dataset 41022. Error: https://www.openml.org/api/v1/xml/data/qualities/41022 returned code 362: No qualities found - None


Fetching Meta-Features:  41%|████      | 2562/6269 [11:27<16:30,  3.74it/s]


Could not process dataset 41197. Error: https://www.openml.org/api/v1/xml/data/qualities/41197 returned code 362: No qualities found - None


Fetching Meta-Features:  47%|████▋     | 2916/6269 [13:07<15:38,  3.57it/s]


Could not process dataset 42057. Error: https://www.openml.org/api/v1/xml/data/qualities/42057 returned code 362: No qualities found - None


Fetching Meta-Features:  47%|████▋     | 2917/6269 [13:07<15:22,  3.63it/s]


Could not process dataset 42058. Error: https://www.openml.org/api/v1/xml/data/qualities/42058 returned code 362: No qualities found - None


Fetching Meta-Features:  47%|████▋     | 2918/6269 [13:07<15:50,  3.52it/s]


Could not process dataset 42059. Error: https://www.openml.org/api/v1/xml/data/qualities/42059 returned code 362: No qualities found - None


Fetching Meta-Features:  47%|████▋     | 2919/6269 [13:08<15:29,  3.60it/s]


Could not process dataset 42060. Error: https://www.openml.org/api/v1/xml/data/qualities/42060 returned code 362: No qualities found - None


Fetching Meta-Features:  47%|████▋     | 2933/6269 [13:11<15:37,  3.56it/s]


Could not process dataset 42090. Error: https://www.openml.org/api/v1/xml/data/qualities/42090 returned code 362: No qualities found - None


Fetching Meta-Features:  47%|████▋     | 2936/6269 [13:12<15:31,  3.58it/s]


Could not process dataset 42093. Error: https://www.openml.org/api/v1/xml/data/qualities/42093 returned code 362: No qualities found - None


Fetching Meta-Features:  47%|████▋     | 2949/6269 [13:16<14:54,  3.71it/s]


Could not process dataset 42118. Error: https://www.openml.org/api/v1/xml/data/qualities/42118 returned code 362: No qualities found - None


Fetching Meta-Features:  47%|████▋     | 2950/6269 [13:16<14:45,  3.75it/s]


Could not process dataset 42121. Error: https://www.openml.org/api/v1/xml/data/qualities/42121 returned code 362: No qualities found - None


Fetching Meta-Features:  47%|████▋     | 2957/6269 [13:18<14:53,  3.70it/s]


Could not process dataset 42134. Error: https://www.openml.org/api/v1/xml/data/qualities/42134 returned code 362: No qualities found - None


Fetching Meta-Features:  47%|████▋     | 2958/6269 [13:18<14:50,  3.72it/s]


Could not process dataset 42136. Error: https://www.openml.org/api/v1/xml/data/qualities/42136 returned code 362: No qualities found - None


Fetching Meta-Features:  47%|████▋     | 2959/6269 [13:19<14:58,  3.68it/s]


Could not process dataset 42139. Error: https://www.openml.org/api/v1/xml/data/qualities/42139 returned code 362: No qualities found - None


Fetching Meta-Features:  48%|████▊     | 3011/6269 [13:33<14:48,  3.67it/s]


Could not process dataset 42332. Error: https://www.openml.org/api/v1/xml/data/qualities/42332 returned code 362: No qualities found - None


Fetching Meta-Features:  48%|████▊     | 3012/6269 [13:33<14:53,  3.64it/s]


Could not process dataset 42333. Error: https://www.openml.org/api/v1/xml/data/qualities/42333 returned code 362: No qualities found - None


Fetching Meta-Features:  48%|████▊     | 3013/6269 [13:33<14:35,  3.72it/s]


Could not process dataset 42334. Error: https://www.openml.org/api/v1/xml/data/qualities/42334 returned code 362: No qualities found - None


Fetching Meta-Features:  49%|████▉     | 3091/6269 [13:55<13:58,  3.79it/s]


Could not process dataset 42476. Error: https://www.openml.org/api/v1/xml/data/qualities/42476 returned code 362: No qualities found - None


Fetching Meta-Features:  52%|█████▏    | 3235/6269 [14:39<14:25,  3.50it/s]  


Could not process dataset 42739. Error: https://www.openml.org/api/v1/xml/data/qualities/42739 returned code 362: No qualities found - None


Fetching Meta-Features:  55%|█████▍    | 3418/6269 [15:30<13:20,  3.56it/s]


Could not process dataset 43068. Error: https://www.openml.org/api/v1/xml/data/qualities/43068 returned code 362: No qualities found - None


Fetching Meta-Features:  57%|█████▋    | 3555/6269 [16:08<12:35,  3.59it/s]


Could not process dataset 43359. Error: https://www.openml.org/api/v1/xml/data/qualities/43359 returned code 362: No qualities found - None


Fetching Meta-Features:  58%|█████▊    | 3623/6269 [16:27<11:54,  3.70it/s]


Could not process dataset 43429. Error: https://www.openml.org/api/v1/xml/data/qualities/43429 returned code 362: No qualities found - None


Fetching Meta-Features:  58%|█████▊    | 3655/6269 [16:36<12:13,  3.56it/s]


Could not process dataset 43462. Error: https://www.openml.org/api/v1/xml/data/qualities/43462 returned code 362: No qualities found - None


Fetching Meta-Features:  64%|██████▍   | 4036/6269 [18:23<10:30,  3.54it/s]


Could not process dataset 43853. Error: https://www.openml.org/api/v1/xml/data/qualities/43853 returned code 362: No qualities found - None


Fetching Meta-Features:  69%|██████▉   | 4321/6269 [19:53<08:43,  3.72it/s]  


Could not process dataset 44229. Error: https://www.openml.org/api/v1/xml/data/qualities/44229 returned code 362: No qualities found - None


Fetching Meta-Features:  69%|██████▉   | 4322/6269 [19:54<08:42,  3.73it/s]


Could not process dataset 44230. Error: https://www.openml.org/api/v1/xml/data/qualities/44230 returned code 362: No qualities found - None


Fetching Meta-Features:  69%|██████▉   | 4325/6269 [19:54<08:44,  3.71it/s]


Could not process dataset 44233. Error: https://www.openml.org/api/v1/xml/data/qualities/44233 returned code 362: No qualities found - None


Fetching Meta-Features:  78%|███████▊  | 4920/6269 [22:40<06:29,  3.46it/s]


Could not process dataset 45002. Error: https://www.openml.org/api/v1/xml/data/qualities/45002 returned code 362: No qualities found - None


Fetching Meta-Features:  87%|████████▋ | 5441/6269 [25:11<04:10,  3.31it/s]


Could not process dataset 45927. Error: https://www.openml.org/api/v1/xml/data/qualities/45927 returned code 362: No qualities found - None


Fetching Meta-Features:  91%|█████████▏| 5725/6269 [26:33<02:39,  3.42it/s]


Could not process dataset 46302. Error: https://www.openml.org/api/v1/xml/data/qualities/46302 returned code 362: No qualities found - None


Fetching Meta-Features:  92%|█████████▏| 5739/6269 [26:36<02:19,  3.80it/s]


Could not process dataset 46317. Error: https://www.openml.org/api/v1/xml/data/qualities/46317 returned code 362: No qualities found - None


Fetching Meta-Features:  94%|█████████▍| 5920/6269 [27:28<01:44,  3.35it/s]


Could not process dataset 46621. Error: https://www.openml.org/api/v1/xml/data/qualities/46621 returned code 362: No qualities found - None


Fetching Meta-Features:  94%|█████████▍| 5921/6269 [27:28<01:41,  3.43it/s]


Could not process dataset 46625. Error: https://www.openml.org/api/v1/xml/data/qualities/46625 returned code 362: No qualities found - None


Fetching Meta-Features:  94%|█████████▍| 5923/6269 [27:29<01:36,  3.57it/s]


Could not process dataset 46629. Error: https://www.openml.org/api/v1/xml/data/qualities/46629 returned code 362: No qualities found - None


Fetching Meta-Features:  98%|█████████▊| 6155/6269 [28:37<00:33,  3.44it/s]


Could not process dataset 46893. Error: https://www.openml.org/api/v1/xml/data/qualities/46893 returned code 362: No qualities found - None


Fetching Meta-Features:  98%|█████████▊| 6161/6269 [28:39<00:32,  3.33it/s]


Could not process dataset 46899. Error: https://www.openml.org/api/v1/xml/data/qualities/46899 returned code 362: No qualities found - None


Fetching Meta-Features:  99%|█████████▉| 6226/6269 [28:57<00:11,  3.80it/s]


Could not process dataset 46972. Error: https://www.openml.org/api/v1/xml/data/qualities/46972 returned code 362: No qualities found - None


Fetching Meta-Features:  99%|█████████▉| 6227/6269 [28:58<00:11,  3.77it/s]


Could not process dataset 46973. Error: https://www.openml.org/api/v1/xml/data/qualities/46973 returned code 362: No qualities found - None


Fetching Meta-Features:  99%|█████████▉| 6228/6269 [28:58<00:10,  3.81it/s]


Could not process dataset 46974. Error: https://www.openml.org/api/v1/xml/data/qualities/46974 returned code 362: No qualities found - None


Fetching Meta-Features: 100%|██████████| 6269/6269 [29:11<00:00,  3.58it/s]



✅ Successfully generated 'openml_metafeatures_complete.csv' with meta-features for 6216 datasets.

Here's a sample of the data:
           name  NumberOfInstances  NumberOfFeatures  NumberOfClasses
did                                                                  
2        anneal              898.0              39.0              5.0
3      kr-vs-kp             3196.0              37.0              2.0
4         labor               57.0              17.0              2.0
5    arrhythmia              452.0             280.0             13.0
6        letter            20000.0              17.0             26.0


In [11]:
import openml
x = openml.evaluations.list_evaluations(tasks=[232800],function="predictive_accuracy", output_format="dataframe")
x

In [2]:
import openml
runs = openml.runs.list_runs(task=[190421, 211149], output_format="dataframe")
print(runs)

Empty DataFrame
Columns: []
Index: []


In [None]:
openml.tasks.get_task(task_id=211149)

OpenMLServerException: https://www.openml.org/api/v1/xml/task/42097 returned code 151: Unknown task - None

In [15]:
import openml
tasks = openml.tasks.list_tasks(data_id=42097, output_format="dataframe")
tasks[tasks["tid"] == 211149]

Unnamed: 0,tid,ttid,did,name,task_type,status,estimation_procedure,source_data,MaxNominalAttDistinctValues,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
211149,211149,TaskType.CLUSTERING,42097,iris,Clustering,active,50 times Clustering,42097,3,5,150,0,0,4,1


In [2]:
# Example Usage:
# You would get these from a secure source in a real application
from configs.api_keys import OPENML_API_KEY
from scripts.OpenMLRAG import OpenMLRAG

OPENML_API_KEY = OPENML_API_KEY
CSV_PATH = "/Users/amirrezaalasti/Desktop/master/semester 2/AutoML-Agent/collected_docs/openml_metafeatures_complete.csv"

# 1. Initialize the helper class
rag_helper = OpenMLRAG(openml_api_key=OPENML_API_KEY, metafeatures_csv_path=CSV_PATH)

# 2. Run the main function to get suggestions for a dataset
# Let's find suggestions for the 'kr-vs-kp' (King-Rook vs King-Pawn) dataset
suggested_params = rag_helper.extract_suggested_config_space_parameters(
    dataset_name_in_openml="iris"
)

# 3. Print the final, clean output
import json
print("\n--- Final LLM-Ready Output ---")
print(json.dumps(suggested_params, indent=2))

1. Searching for source dataset: 'iris'...
   ✅ Found 'iris' with ID: 61

2. Finding the top 3 datasets similar to 'iris'...
   ✅ Found similar datasets: ['Iris', 'iriiiiiis', 'JuanFeldmanIris']

3. Gathering top-performing setups from these similar datasets...
 -> Processing similar dataset: 'Iris' (ID: 44344)
 -> Processing similar dataset: 'iriiiiiis' (ID: 43859)
   -> No evaluations found for any tasks on dataset_id: 43859
 -> Processing similar dataset: 'JuanFeldmanIris' (ID: 42186)
   -> No evaluations found for any tasks on dataset_id: 42186

   ✅ Gathered 3 total setups.

4. Cleaning and simplifying the final list...
   ✅ Final unique configurations found: 3

--- Final LLM-Ready Output ---
[
  {
    "algorithm": "\"auto\"",
    "leaf_size": "30",
    "metric": "\"minkowski\"",
    "metric_params": "null",
    "n_jobs": "null",
    "n_neighbors": "3",
    "p": "2",
    "weights": "\"uniform\""
  },
  {
    "C": "0.025",
    "break_ties": "false",
    "cache_size": "200",
    "cl

In [1]:
from scripts.OpenMLRAG import OpenMLRAG
openml_rag = OpenMLRAG()

related_dataset = openml_rag.get_related_datasets(dataset_name="iris")
print(related_dataset.name)
# convert name to did
similar_datasets = openml_rag.find_similar_datasets(source_dataset_id=int(related_dataset.name), n_similar=3)
print(similar_datasets)
related_tasks = []
for index, datasetrow in similar_datasets.iterrows():
    related_tasks.append(openml_rag.get_related_tasks_of_dataset(index))
# print(related_tasks)

setup_parameters = []
for tasks in related_tasks:
    task_ids = tasks["tid"].tolist()
    setup_parameters.append(openml_rag.get_setup_parameters_of_tasks(task_ids))
print(setup_parameters)
    

TypeError: OpenMLRAG.__init__() missing 2 required positional arguments: 'openml_api_key' and 'metafeatures_csv_path'

In [16]:
openml.datasets.get_dataset(1499, download_data=False, download_qualities=True)

OpenML Dataset
Name..........: seeds
Version.......: 1
Format........: ARFF
Upload Date...: 2015-05-25 22:06:59
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/1592291/seeds.arff
OpenML URL....: https://www.openml.org/d/1499
# of features.: 8
# of instances: 210

In [3]:
import openml
import pandas as pd
from amltk.metalearning import compute_metafeatures, dataset_distance
import numpy as np

def get_dataset_metafeatures(dataset_id: int) -> pd.Series:
    """Fetches a dataset from OpenML and computes its meta-features."""
    try:
        dataset = openml.datasets.get_dataset(dataset_id, download_data=False, download_qualities=True)
        # The 'qualities' attribute contains the meta-features
        return pd.Series(dataset.qualities)
    except Exception as e:
        print(f"Could not process dataset {dataset_id}: {e}")
        return None

# --- 1. Define the source dataset (MNIST) ---
# MNIST's dataset ID on OpenML is 554
source_dataset_id = 40996
source_metafeatures = get_dataset_metafeatures(source_dataset_id)

if source_metafeatures is not None:
    print(f"Successfully fetched meta-features for MNIST (ID: {source_dataset_id})")

    # --- 2. Get a list of active datasets from OpenML ---
    # We'll fetch a subset for this example to keep it manageable
    all_datasets = openml.datasets.list_datasets(output_format="dataframe")[:100]
    active_classification_datasets = all_datasets[
        (all_datasets["status"] == "active") &
        (all_datasets["NumberOfClasses"] > 1)
    ].head(1000)  # Limiting to the first 1000 for demonstration

    # --- 3. Compute meta-features for all candidate datasets ---
    other_metafeatures = {}
    for did in active_classification_datasets["did"]:
        if did != source_dataset_id:
            metafeatures = get_dataset_metafeatures(did)
            if metafeatures is not None:
                other_metafeatures[did] = metafeatures

    print(f"\nComputed meta-features for {len(other_metafeatures)} other datasets.")

    # --- 4. Calculate dataset distances ---
    # Convert the dictionary of meta-features to a DataFrame
    other_metafeatures_df = pd.DataFrame(other_metafeatures).T

    # Align columns between source and other meta-features
    common_features = source_metafeatures.index.intersection(other_metafeatures_df.columns)
    source_aligned = source_metafeatures[common_features]
    others_aligned = other_metafeatures_df[common_features]

    # Drop columns with non-numeric data or too many missing values for this example
    numeric_cols = others_aligned.select_dtypes(include=np.number).columns
    source_numeric = source_aligned[numeric_cols].to_frame().T.dropna(axis=1)
    others_numeric = others_aligned[numeric_cols].dropna(axis=1)

    final_common_features = source_numeric.columns.intersection(others_numeric.columns)
    source_final = source_numeric[final_common_features].iloc[0]
    others_final = others_numeric[final_common_features]


    # Calculate the L2 (Euclidean) distance and get the top 5 closest
    distances = dataset_distance(
        target=source_final,
        dataset_metafeatures=others_final.T.to_dict('series'),
        distance_metric="l2",
        scaler="minmax",  # Normalize features to be on a similar scale
        closest_n=5
    )

    # --- 5. Display the results ---
    print("\nTop 5 similar datasets to MNIST:")
    top_5_similar_datasets = all_datasets.set_index('did').loc[distances.index]
    print(top_5_similar_datasets[['name', 'NumberOfInstances', 'NumberOfFeatures', 'NumberOfClasses']])

else:
    print(f"Could not fetch meta-features for the source dataset (ID: {source_dataset_id}).")

Successfully fetched meta-features for MNIST (ID: 40996)

Computed meta-features for 99 other datasets.

Top 5 similar datasets to MNIST:
             name  NumberOfInstances  NumberOfFeatures  NumberOfClasses
28      optdigits             5620.0              65.0             10.0
12  mfeat-factors             2000.0             217.0             10.0
32      pendigits            10992.0              17.0             10.0
36        segment             2310.0              20.0              7.0
41          glass              214.0              10.0              6.0


In [4]:
openml.datasets.list_datasets(output_format='dataframe')

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
2,2,anneal,1,1,active,ARFF,684.0,7.0,8.0,5.0,39.0,898.0,898.0,22175.0,6.0,33.0
3,3,kr-vs-kp,1,1,active,ARFF,1669.0,3.0,1527.0,2.0,37.0,3196.0,0.0,0.0,0.0,37.0
4,4,labor,1,1,active,ARFF,37.0,3.0,20.0,2.0,17.0,57.0,56.0,326.0,8.0,9.0
5,5,arrhythmia,1,1,active,ARFF,245.0,13.0,2.0,13.0,280.0,452.0,384.0,408.0,206.0,74.0
6,6,letter,1,1,active,ARFF,813.0,26.0,734.0,26.0,17.0,20000.0,0.0,0.0,16.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47020,47020,Shrim,1,49887,active,arff,,,,,3.0,5998.0,2218.0,2974.0,3.0,0.0
47021,47021,Shrimphealth,1,49887,active,arff,,,,,3.0,5998.0,2218.0,2974.0,3.0,0.0
47023,47023,MedMCQA,1,25914,active,arff,53591.0,,38963.0,4.0,10.0,182822.0,95746.0,117567.0,0.0,1.0
47024,47024,Laboratorio_dataset_car,4,50072,active,arff,,,,,1.0,1750.0,0.0,0.0,0.0,0.0


In [3]:
# analysis of the dataset
print(iris.description)
print(iris.qualities)
print(iris.features)


**Author**: R.A. Fisher  
**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Iris) - 1936 - Donated by Michael Marshall  
**Please cite**:   

**Iris Plants Database**  
This is perhaps the best known database to be found in the pattern recognition literature.  Fisher's paper is a classic in the field and is referenced frequently to this day.  (See Duda & Hart, for example.)  The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant.  One class is     linearly separable from the other 2; the latter are NOT linearly separable from each other.

Predicted attribute: class of iris plant.  
This is an exceedingly simple domain.  
 
### Attribute Information:
    1. sepal length in cm
    2. sepal width in cm
    3. petal length in cm
    4. petal width in cm
    5. class: 
       -- Iris Setosa
       -- Iris Versicolour
       -- Iris Virginica
{'AutoCorrelation': 0.9865771812080537, 'CfsSubsetEval_DecisionStumpAUC': 0.9565333333333332, 'C

In [4]:
# get top 5 tasks on the dataset
print(openml.tasks.list_tasks(data_id=61, output_format="dataframe"))

           tid                                           ttid  did  name  \
59          59             TaskType.SUPERVISED_CLASSIFICATION   61  iris   
118        118                        TaskType.LEARNING_CURVE   61  iris   
289        289             TaskType.SUPERVISED_CLASSIFICATION   61  iris   
1758      1758                        TaskType.LEARNING_CURVE   61  iris   
1823      1823             TaskType.SUPERVISED_CLASSIFICATION   61  iris   
1939      1939             TaskType.SUPERVISED_CLASSIFICATION   61  iris   
1992      1992             TaskType.SUPERVISED_CLASSIFICATION   61  iris   
2227      2227  TaskType.SUPERVISED_DATASTREAM_CLASSIFICATION   61  iris   
7306      7306             TaskType.SUPERVISED_CLASSIFICATION   61  iris   
7545      7545                        TaskType.LEARNING_CURVE   61  iris   
7555      7555             TaskType.SUPERVISED_CLASSIFICATION   61  iris   
10107    10107             TaskType.SUPERVISED_CLASSIFICATION   61  iris   
52949    529

In [9]:
# get top validations runs from the tasks
task = openml.tasks.get_task(59)
task

OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 59
Task URL.............: https://www.openml.org/t/59
Estimation Procedure.: crossvalidation
Evaluation Measure...: predictive_accuracy
Target Feature.......: class
# of Classes.........: 3
Cost Matrix..........: Available

In [14]:
openml.evaluations.list_evaluations(tasks=[59], output_format="dataframe", function="predictive_accuracy", sort_order="desc")

Unnamed: 0,run_id,task_id,setup_id,flow_id,flow_name,data_id,data_name,function,upload_time,uploader,uploader_name,value,values,array_data
0,2012930,59,157613,6048,sklearn.pipeline.Pipeline(dualimputer=helper.d...,61,iris,predictive_accuracy,2017-04-06 23:00:24,1104,Jeroen van Hoof,0.986667,,
1,2012939,59,157622,6048,sklearn.pipeline.Pipeline(dualimputer=helper.d...,61,iris,predictive_accuracy,2017-04-06 23:29:28,1104,Jeroen van Hoof,0.986667,,
2,2012941,59,157624,6048,sklearn.pipeline.Pipeline(dualimputer=helper.d...,61,iris,predictive_accuracy,2017-04-07 01:36:00,1104,Jeroen van Hoof,0.986667,,
3,2012943,59,157626,6048,sklearn.pipeline.Pipeline(dualimputer=helper.d...,61,iris,predictive_accuracy,2017-04-07 02:01:33,1104,Jeroen van Hoof,0.986667,,
4,2039748,59,180922,6048,sklearn.pipeline.Pipeline(dualimputer=helper.d...,61,iris,predictive_accuracy,2017-04-09 01:09:01,1104,Jeroen van Hoof,0.986667,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4295,8803221,59,6778302,7707,sklearn.pipeline.Pipeline(imputation=openmlstu...,61,iris,predictive_accuracy,2018-01-22 20:14:33,1,Jan van Rijn,0.000000,,
4296,8805528,59,6780609,7707,sklearn.pipeline.Pipeline(imputation=openmlstu...,61,iris,predictive_accuracy,2018-01-22 20:44:26,1,Jan van Rijn,0.000000,,
4297,8810924,59,6785974,7707,sklearn.pipeline.Pipeline(imputation=openmlstu...,61,iris,predictive_accuracy,2018-01-23 08:26:33,1,Jan van Rijn,0.000000,,
4298,8850710,59,6825760,7707,sklearn.pipeline.Pipeline(imputation=openmlstu...,61,iris,predictive_accuracy,2018-01-27 02:50:20,1,Jan van Rijn,0.000000,,


In [34]:
# get the code of the task
flow = openml.flows.get_flow(6048)
flow


OpenML Flow
Flow ID.........: 6048 (version 1)
Flow URL........: https://www.openml.org/f/6048
Flow Name.......: sklearn.pipeline.Pipeline(dualimputer=helper.dual_imputer.DualImputer,nusvc=sklearn.svm.classes.NuSVC)
Flow Description: Automatically created scikit-learn flow.
Upload Date.....: 2017-04-06 22:42:59
Dependencies....: sklearn==0.18.1
numpy>=1.6.1
scipy>=0.9

In [3]:
# get the code of the task
setup = openml.setups.get_setup(157613)

In [4]:
setup.parameters

{56535: OpenML Parameter
 ID............: 56535
 Flow ID.......: 6048
 Flow Name.....: sklearn.pipeline.Pipeline(dualimputer=helper.dual_imputer.DualImputer,nusvc=sklearn.svm.classes.NuSVC)(1)_steps
 Flow URL......: https://www.openml.org/f/6048
 Parameter Name: steps
   |__Data Type: None
   |__Default..: [{"oml-python:serialized_object": "component_reference", "value": {"key": "dualimputer", "step_name": "dualimputer"}}, {"oml-python:serialized_object": "component_reference", "value": {"key": "nusvc", "step_name": "nusvc"}}]
   |__Value....: [('dualimputer', <helper.dual_imputer.DualImputer object at 0x7f9edc7cfe48>), ('nusvc', NuSVC(cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
    max_iter=-1, nu=0.3, probability=True, random_state=None,
    shrinking=True, tol=0.001, verbose=False))],
 56536: OpenML Parameter
 ID............: 56536
 Flow ID.......: 6049
 Flow Name.....: sklearn.svm.classes.NuSVC(1)_cache_si

In [29]:
run = openml.runs.get_run(2012943)
run

OpenML Run
Uploader Name...: Jeroen van Hoof
Uploader Profile: https://www.openml.org/u/1104
Metric..........: predictive_accuracy
Result..........: 0.986667
Run ID..........: 2012943
Run URL.........: https://www.openml.org/r/2012943
Task ID.........: 59
Task Type.......: Supervised Classification
Task URL........: https://www.openml.org/t/59
Flow ID.........: 6048
Flow Name.......: sklearn.pipeline.Pipeline(dualimputer=helper.dual_imputer.DualImputer,nusvc=sklearn.svm.classes.NuSVC)(1)
Flow URL........: https://www.openml.org/f/6048
Setup ID........: 157626
Setup String....: None
Dataset ID......: 61
Dataset URL.....: https://www.openml.org/d/61

In [30]:
flow = run.flow
print(flow)

None


In [31]:
model = flow.model

AttributeError: 'NoneType' object has no attribute 'model'

In [4]:
openml.datasets.list_datasets(output_format="dataframe")

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
2,2,anneal,1,1,active,ARFF,684.0,7.0,8.0,5.0,39.0,898.0,898.0,22175.0,6.0,33.0
3,3,kr-vs-kp,1,1,active,ARFF,1669.0,3.0,1527.0,2.0,37.0,3196.0,0.0,0.0,0.0,37.0
4,4,labor,1,1,active,ARFF,37.0,3.0,20.0,2.0,17.0,57.0,56.0,326.0,8.0,9.0
5,5,arrhythmia,1,1,active,ARFF,245.0,13.0,2.0,13.0,280.0,452.0,384.0,408.0,206.0,74.0
6,6,letter,1,1,active,ARFF,813.0,26.0,734.0,26.0,17.0,20000.0,0.0,0.0,16.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47020,47020,Shrim,1,49887,active,arff,,,,,3.0,5998.0,2218.0,2974.0,3.0,0.0
47021,47021,Shrimphealth,1,49887,active,arff,,,,,3.0,5998.0,2218.0,2974.0,3.0,0.0
47023,47023,MedMCQA,1,25914,active,arff,53591.0,,38963.0,4.0,10.0,182822.0,95746.0,117567.0,0.0,1.0
47024,47024,Laboratorio_dataset_car,4,50072,active,arff,,,,,1.0,1750.0,0.0,0.0,0.0,0.0


In [1]:
from scripts.OpenMLRAG import OpenMLRAG

openMLRAGAgent = OpenMLRAG()
related_datasets = openMLRAGAgent.get_related_datasets(dataset_name="iris")
tasks = openMLRAGAgent.get_related_tasks_of_dataset(dataset_id=related_datasets.did, task_type="classification")
task_ids = tasks['tid'].tolist()

In [2]:
openMLRAGAgent.get_setup_parameters_of_tasks(task_ids=task_ids)


Evaluations found: 4974
157613
157622
157624
157626
180922
180924
217067
5296637
5485657
8275511


[{56535: OpenML Parameter
  ID............: 56535
  Flow ID.......: 6048
  Flow Name.....: sklearn.pipeline.Pipeline(dualimputer=helper.dual_imputer.DualImputer,nusvc=sklearn.svm.classes.NuSVC)(1)_steps
  Flow URL......: https://www.openml.org/f/6048
  Parameter Name: steps
    |__Data Type: None
    |__Default..: [{"oml-python:serialized_object": "component_reference", "value": {"key": "dualimputer", "step_name": "dualimputer"}}, {"oml-python:serialized_object": "component_reference", "value": {"key": "nusvc", "step_name": "nusvc"}}]
    |__Value....: [('dualimputer', <helper.dual_imputer.DualImputer object at 0x7f9edc7cfe48>), ('nusvc', NuSVC(cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
     max_iter=-1, nu=0.3, probability=True, random_state=None,
     shrinking=True, tol=0.001, verbose=False))],
  56536: OpenML Parameter
  ID............: 56536
  Flow ID.......: 6049
  Flow Name.....: sklearn.svm.classes.N

In [11]:
from pydantic import BaseModel
import instructor
from configs.api_keys import GOOGLE_API_KEY  # Your API key
from google import genai

google_client = genai.Client(api_key=GOOGLE_API_KEY)


class UserInfo(BaseModel):
    dataset_name: str
    dataset_tag: str

client = instructor.from_genai(google_client, model="models/gemini-1.5-flash")

response = client.chat.completions.create(
    response_model=UserInfo,
    messages=[{"role": "user", "content": "Give the relative dataset name and tag for the iris dataset in openml."}],
)

print(response)

dataset_name='iris' dataset_tag=''


In [1]:
from ConfigSpace import ConfigurationSpace, Categorical, Float, Integer


In [10]:
from smac import Scenario
from ConfigSpace import ConfigurationSpace


def generate_scenario(cs: ConfigurationSpace) -> Scenario:
    """
    Generates a SMAC scenario configuration for hyperparameter optimization of image classification models.

    Args:
        cs (ConfigurationSpace): The configuration space from which to sample the configurations.

    Returns:
        Scenario: A properly configured SMAC scenario object.
    """

    scenario = Scenario(
        configspace=cs,
        name="image_classification_optimization",
        output_directory="automl_results",
        deterministic=False,
        walltime_limit=3600,  # 1 hour
        cputime_limit=3600,  # 1 hour
        n_trials=10,
        trial_walltime_limit=600,  # 10 minutes per trial
        # trial_memory_limit=4096,  # 4GB memory limit
        min_budget=1,
        max_budget=9, # Use a number less than 10
        n_workers=1,
        seed=42  # Ensure reproducibility
    )
    return scenario

In [11]:
from ConfigSpace import ConfigurationSpace, UniformFloatHyperparameter, UniformIntegerHyperparameter, CategoricalHyperparameter, Constant
from ConfigSpace.conditions import InCondition
from ConfigSpace.forbidden import ForbiddenEqualsClause, ForbiddenAndConjunction

def get_configspace() -> ConfigurationSpace:
    """
    Defines the configuration space for a CNN model suitable for image classification,
    taking into account the image data handling requirements and the provided
    OpenML parameter suggestions for the FMNIST dataset.
    """
    cs = ConfigurationSpace()

    # --- Model Architecture ---
    n_layers = UniformIntegerHyperparameter("n_layers", lower=2, upper=5, default_value=3)
    cs.add_hyperparameter(n_layers)

    # --- Convolutional Layers ---
    kernel_size = CategoricalHyperparameter("kernel_size", choices=[3, 5], default_value=3)
    cs.add_hyperparameter(kernel_size)

    n_filters_initial = UniformIntegerHyperparameter("n_filters_initial", lower=32, upper=128, default_value=64, log=True)
    cs.add_hyperparameter(n_filters_initial)

    max_pooling = CategoricalHyperparameter("max_pooling", choices=["True", "False"], default_value="True")
    cs.add_hyperparameter(max_pooling)

    # --- Dense Layers ---
    n_dense_units = UniformIntegerHyperparameter("n_dense_units", lower=64, upper=512, default_value=256, log=True)
    cs.add_hyperparameter(n_dense_units)

    use_dropout = CategoricalHyperparameter("use_dropout", choices=["True", "False"], default_value="False")
    cs.add_hyperparameter(use_dropout)

    dropout_rate = UniformFloatHyperparameter("dropout_rate", lower=0.2, upper=0.8, default_value=0.5)
    cs.add_hyperparameter(dropout_rate)


    # --- Optimization ---
    learning_rate = UniformFloatHyperparameter(
        "learning_rate", lower=1e-4, upper=1e-2, default_value=1e-3, log=True
    )
    cs.add_hyperparameter(learning_rate)

    batch_size = CategoricalHyperparameter("batch_size", choices=[32, 64, 128], default_value=64)
    cs.add_hyperparameter(batch_size)

    optimizer = CategoricalHyperparameter("optimizer", choices=["Adam", "SGD"], default_value="Adam")
    cs.add_hyperparameter(optimizer)

    # --- Regularization ---
    weight_decay = UniformFloatHyperparameter(
        "weight_decay", lower=1e-6, upper=1e-3, default_value=1e-5, log=True
    )
    cs.add_hyperparameter(weight_decay)

    # --- Conditions ---
    # Ensure that dropout rate is only relevant if dropout is used
    dropout_condition = InCondition(child=dropout_rate, parent=use_dropout, values=["True"])
    cs.add_condition(dropout_condition)


    # --- Layer-specific hyperparameters (conditional on n_layers) ---
    for i in range(2, 6):
        n_filters = UniformIntegerHyperparameter(f"n_filters_layer_{i}", lower=32, upper=128, default_value=64, log=True)
        cs.add_hyperparameter(n_filters)
        layer_condition = InCondition(child=n_filters, parent=n_layers, values=list(range(i, 6)))
        cs.add_condition(layer_condition)

    return cs

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from ConfigSpace import Configuration
from typing import Any
import math


def train(cfg: Configuration, dataset: Any, seed: int) -> float:
    """
    Trains a CNN model on the given dataset according to the provided configuration.

    Args:
        cfg (Configuration): The configuration object defining the hyperparameters.
        dataset (Any): A dictionary containing the dataset, with 'X' for features and 'y' for labels.
        seed (int): Random seed for reproducibility.

    Returns:
        float: The final training loss.
    """

    torch.manual_seed(seed)
    np.random.seed(seed)

    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Extract data and labels
    X = dataset['X']
    y = dataset['y']

    # Data Preprocessing & Formatting
    X = np.array(X, dtype=np.float32)  # Ensure float32 for numerical stability
    y = np.array(y, dtype=np.int64)  # Ensure int64 for CrossEntropyLoss

    n_samples = X.shape[0]
    if len(X.shape) == 2:  # Flattened input
        n_features = X.shape[1]
        height = width = int(math.sqrt(n_features))
        if height * height != n_features:
            raise ValueError("Input features are not a perfect square.")
        X = X.reshape(n_samples, 1, height, width)
    elif len(X.shape) == 3:  # (N, H, W)
        X = X.reshape(n_samples, 1, X.shape[1], X.shape[2])
    elif len(X.shape) == 4:  # (N, H, W, C) or (N, C, H, W)
        if X.shape[1] > X.shape[2] and X.shape[1] > X.shape[3]:
            X = np.transpose(X, (0, 2, 3, 1))
        if X.shape[3] > X.shape[1] and X.shape[3] > X.shape[2]:
            X = np.transpose(X, (0, 3, 1, 2))
        pass  # Assume correct format, no reordering for minimal intervention.
    else:
        raise ValueError("Unsupported input data format.")

    X = X / 255.0  # Normalize to [0, 1]

    X = torch.from_numpy(X).float().to(device)  # Move to device here
    y = torch.from_numpy(y).long().to(device)  # Move to device here
    num_classes = len(torch.unique(y))

    # Hyperparameter extraction
    n_layers = cfg.get("n_layers")
    kernel_size = cfg.get("kernel_size")
    n_filters_initial = cfg.get("n_filters_initial")
    max_pooling_enabled = cfg.get("max_pooling") == "True"
    n_dense_units = cfg.get("n_dense_units")
    learning_rate = cfg.get("learning_rate")
    batch_size = cfg.get("batch_size")
    optimizer_name = cfg.get("optimizer")
    weight_decay = cfg.get("weight_decay")
    use_dropout = cfg.get("use_dropout") == "True"
    dropout_rate = cfg.get("dropout_rate") if use_dropout else 0.0

    # Model definition
    class CNN(nn.Module):
        def __init__(self, num_classes, height, width, n_layers, kernel_size, n_filters_initial, max_pooling_enabled, n_dense_units, dropout_rate, use_dropout, cfg):
            super(CNN, self).__init__()
            self.features = nn.ModuleList()
            in_channels = 1  # Initial input channels
            n_filters = n_filters_initial

            for i in range(n_layers):
                self.features.append(nn.Conv2d(in_channels, n_filters, kernel_size=kernel_size, padding=kernel_size // 2))
                self.features.append(nn.ReLU())
                if max_pooling_enabled:
                    self.features.append(nn.MaxPool2d(kernel_size=2, stride=2))
                in_channels = n_filters
                if i < n_layers -1:
                    n_filters_key = f"n_filters_layer_{i+2}"
                    if n_filters_key in cfg:
                        n_filters = cfg.get(n_filters_key)


            self.features = nn.Sequential(*self.features)

            # Determine the output size of the convolutional layers
            with torch.no_grad():
                dummy_input = torch.randn(1, 1, height, width).to(device)
                output = self.features(dummy_input)
                feature_size = output.view(output.size(0), -1).shape[1]

            self.classifier = nn.Sequential(
                nn.Linear(feature_size, n_dense_units),
                nn.ReLU(),
                nn.Dropout(dropout_rate) if use_dropout else nn.Identity(),
                nn.Linear(n_dense_units, num_classes)
            )

        def forward(self, x):
            x = self.features(x)
            x = torch.flatten(x, 1)
            x = self.classifier(x)
            return x

    height = X.shape[2]
    width = X.shape[3]
    model = CNN(num_classes, height, width, n_layers, kernel_size, n_filters_initial, max_pooling_enabled, n_dense_units, dropout_rate, use_dropout, cfg).to(device)

    # Optimizer
    if optimizer_name == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    elif optimizer_name == "SGD":
        optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay, momentum=0.9)
    else:
        raise ValueError(f"Unsupported optimizer: {optimizer_name}")

    # Loss function
    criterion = nn.CrossEntropyLoss()

    # Data loader
    dataset = TensorDataset(X, y)
    dataloader = DataLoader(dataset, batch_size=int(batch_size), shuffle=True, num_workers=0, drop_last=False) # num_workers=0 for simpler debugging

    # Training loop
    num_epochs = 5  # Fixed number of epochs
    model.train()  # Set the model to training mode

    final_loss = float('inf')
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in dataloader:
            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward and optimize
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)

        epoch_loss = running_loss / len(dataloader.dataset)
        final_loss = min(final_loss, epoch_loss)

    return float(final_loss)

In [5]:
from smac.facade.hyperparameter_optimization_facade import (
    HyperparameterOptimizationFacade as HPOFacade,
)

In [13]:
cs = get_configspace()
scenario = generate_scenario(cs)
def smac_train_function(config, seed: int = 0) -> float:
            """Wrapper function to call the training function with the correct parameters"""
            return train(cfg=config, dataset=dataset, seed=seed)

smac = HPOFacade(
    scenario,
    smac_train_function,  # We pass the target function here
    overwrite=True,  # Overrides any previous results that are found that are inconsistent with the meta-data
)
smac.optimize()

[INFO][abstract_initial_design.py:87] Reducing the number of initial configurations from 150 to 2 (max_ratio == 0.25).
[INFO][abstract_initial_design.py:139] Using 2 initial design configurations and 0 additional configurations.


  cs.add_hyperparameter(n_layers)
  cs.add_hyperparameter(kernel_size)
  cs.add_hyperparameter(n_filters_initial)
  cs.add_hyperparameter(max_pooling)
  cs.add_hyperparameter(n_dense_units)
  cs.add_hyperparameter(use_dropout)
  cs.add_hyperparameter(dropout_rate)
  cs.add_hyperparameter(learning_rate)
  cs.add_hyperparameter(batch_size)
  cs.add_hyperparameter(optimizer)
  cs.add_hyperparameter(weight_decay)
  cs.add_condition(dropout_condition)
  cs.add_hyperparameter(n_filters)
  cs.add_condition(layer_condition)


  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 190, in run
    rval = self(config_copy, target_function, kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 264, in __call__
    return algorithm(config, **algorithm_kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/pynisher/pynisher.py", line 602, in __call__
    return self._handle_return(err=err)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/pynisher/pynisher.py", line 637, in _handle_return
    raise err
pynisher.exceptions.PynisherException: Unknown reason for exitcode 1, no result or error recieved and  killed process 
smac_train_function(Configuration(values={
  'batch_size': 128,
  'kernel_size': 3,
  'learning_rate': 0.0097270298868,
  'max_pooling': 'True',
  'n_dense_units': 94,
  'n_filters_initial': 107,
  'n_layers': 3,
  'optimizer': 'SGD',
  'use_dropout': 'False',


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'smac_train_function' on <module '__main__' (built-in)>


  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 190, in run
    rval = self(config_copy, target_function, kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 264, in __call__
    return algorithm(config, **algorithm_kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/pynisher/pynisher.py", line 602, in __call__
    return self._handle_return(err=err)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/pynisher/pynisher.py", line 637, in _handle_return
    raise err
pynisher.exceptions.PynisherException: Unknown reason for exitcode 1, no result or error recieved and  killed process 
smac_train_function(Configuration(values={
  'batch_size': 128,
  'kernel_size': 3,
  'learning_rate': 0.0097270298868,
  'max_pooling': 'True',
  'n_dense_units': 94,
  'n_filters_initial': 107,
  'n_layers': 3,
  'optimizer': 'SGD',
  'use_dropout': 'False',


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'smac_train_function' on <module '__main__' (built-in)>


  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 190, in run
    rval = self(config_copy, target_function, kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 264, in __call__
    return algorithm(config, **algorithm_kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/pynisher/pynisher.py", line 602, in __call__
    return self._handle_return(err=err)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/pynisher/pynisher.py", line 637, in _handle_return
    raise err
pynisher.exceptions.PynisherException: Unknown reason for exitcode 1, no result or error recieved and  killed process 
smac_train_function(Configuration(values={
  'batch_size': 32,
  'kernel_size': 5,
  'learning_rate': 0.0004621766987,
  'max_pooling': 'False',
  'n_dense_units': 399,
  'n_filters_initial': 43,
  'n_layers': 5,
  'optimizer': 'Adam',
  'use_dropout': 'True',


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'smac_train_function' on <module '__main__' (built-in)>


  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 190, in run
    rval = self(config_copy, target_function, kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 264, in __call__
    return algorithm(config, **algorithm_kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/pynisher/pynisher.py", line 602, in __call__
    return self._handle_return(err=err)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/pynisher/pynisher.py", line 637, in _handle_return
    raise err
pynisher.exceptions.PynisherException: Unknown reason for exitcode 1, no result or error recieved and  killed process 
smac_train_function(Configuration(values={
  'batch_size': 32,
  'kernel_size': 5,
  'learning_rate': 0.0004621766987,
  'max_pooling': 'False',
  'n_dense_units': 399,
  'n_filters_initial': 43,
  'n_layers': 5,
  'optimizer': 'Adam',
  'use_dropout': 'True',


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'smac_train_function' on <module '__main__' (built-in)>


  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 190, in run
    rval = self(config_copy, target_function, kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 264, in __call__
    return algorithm(config, **algorithm_kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/pynisher/pynisher.py", line 602, in __call__
    return self._handle_return(err=err)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/pynisher/pynisher.py", line 637, in _handle_return
    raise err
pynisher.exceptions.PynisherException: Unknown reason for exitcode 1, no result or error recieved and  killed process 
smac_train_function(Configuration(values={
  'batch_size': 128,
  'kernel_size': 3,
  'learning_rate': 0.0097270298868,
  'max_pooling': 'True',
  'n_dense_units': 94,
  'n_filters_initial': 107,
  'n_layers': 3,
  'optimizer': 'SGD',
  'use_dropout': 'False',


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'smac_train_function' on <module '__main__' (built-in)>
  diff_b_a = subtract(b, a)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY

[INFO][nanny.py:611] Closing Nanny at 'tcp://127.0.0.1:59725'. Reason: nanny-close
[INFO][nanny.py:858] Nanny asking worker to close. Reason: nanny-close
[INFO][nanny.py:611] Closing Nanny at 'tcp://127.0.0.1:59726'. Reason: nanny-close
[INFO][nanny.py:858] Nanny asking worker to close. Reason: nanny-close
[INFO][nanny.py:626] Nanny at 'tcp://127.0.0.1:59725' closed.
[INFO][nanny.py:626] Nanny at 'tcp://127.0.0.1:59726' closed.


  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as

  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 190, in run
    rval = self(config_copy, target_function, kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 264, in __call__
    return algorithm(config, **algorithm_kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/pynisher/pynisher.py", line 602, in __call__
    return self._handle_return(err=err)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/pynisher/pynisher.py", line 637, in _handle_return
    raise err
pynisher.exceptions.PynisherException: Unknown reason for exitcode 1, no result or error recieved and  killed process 
smac_train_function(Configuration(values={
  'batch_size': 64,
  'kernel_size': 5,
  'learning_rate': 0.0016690050539,
  'max_pooling': 'False',
  'n_dense_units': 84,
  'n_filters_initial': 128,
  'n_layers': 4,
  'optimizer': 'Adam',
  'use_dropout': 'True',


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'smac_train_function' on <module '__main__' (built-in)>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'smac_train_function' on <module '__main__' (built-in)>


  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 190, in run
    rval = self(config_copy, target_function, kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 264, in __call__
    return algorithm(config, **algorithm_kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/pynisher/pynisher.py", line 602, in __call__
    return self._handle_return(err=err)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/pynisher/pynisher.py", line 637, in _handle_return
    raise err
pynisher.exceptions.PynisherException: Unknown reason for exitcode 1, no result or error recieved and  killed process 
smac_train_function(Configuration(values={
  'batch_size': 64,
  'kernel_size': 5,
  'learning_rate': 0.0016690050539,
  'max_pooling': 'False',
  'n_dense_units': 84,
  'n_filters_initial': 128,
  'n_layers': 4,
  'optimizer': 'Adam',
  'use_dropout': 'True',


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'smac_train_function' on <module '__main__' (built-in)>


  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 190, in run
    rval = self(config_copy, target_function, kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 264, in __call__
    return algorithm(config, **algorithm_kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/pynisher/pynisher.py", line 602, in __call__
    return self._handle_return(err=err)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/pynisher/pynisher.py", line 637, in _handle_return
    raise err
pynisher.exceptions.PynisherException: Unknown reason for exitcode 1, no result or error recieved and  killed process 
smac_train_function(Configuration(values={
  'batch_size': 64,
  'kernel_size': 3,
  'learning_rate': 0.0011564505462,
  'max_pooling': 'False',
  'n_dense_units': 460,
  'n_filters_initial': 88,
  'n_layers': 2,
  'optimizer': 'Adam',
  'use_dropout': 'False',

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'smac_train_function' on <module '__main__' (built-in)>


  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 190, in run
    rval = self(config_copy, target_function, kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 264, in __call__
    return algorithm(config, **algorithm_kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/pynisher/pynisher.py", line 602, in __call__
    return self._handle_return(err=err)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/pynisher/pynisher.py", line 637, in _handle_return
    raise err
pynisher.exceptions.PynisherException: Unknown reason for exitcode 1, no result or error recieved and  killed process 
smac_train_function(Configuration(values={
  'batch_size': 64,
  'kernel_size': 3,
  'learning_rate': 0.0011564505462,
  'max_pooling': 'False',
  'n_dense_units': 460,
  'n_filters_initial': 88,
  'n_layers': 2,
  'optimizer': 'Adam',
  'use_dropout': 'False',

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'smac_train_function' on <module '__main__' (built-in)>


Configuration(values={
  'batch_size': 128,
  'kernel_size': 3,
  'learning_rate': 0.0097270298868,
  'max_pooling': 'True',
  'n_dense_units': 94,
  'n_filters_initial': 107,
  'n_layers': 3,
  'optimizer': 'SGD',
  'use_dropout': 'False',
  'weight_decay': 0.000500277371,
  'n_filters_layer_2': 78,
  'n_filters_layer_3': 54,
})