In [4]:
# 1st Example: Integer hyperparameters and float hyperparameters
from ConfigSpace import ConfigurationSpace

cs = ConfigurationSpace(
    space={
        "C": (-1.0, 1.0),  # Note the decimal to make it a float
        "max_iter": (10, 100),
    },
    seed=1234,
)

In [None]:
# 2nd Example: Categorical hyperparameters and conditions#
from ConfigSpace import ConfigurationSpace, Categorical, Float, Integer

kernel_type = Categorical('kernel_type', ['linear', 'poly', 'rbf', 'sigmoid'])
degree = Integer('degree', bounds=(2, 4), default=2)
coef0 = Float('coef0', bounds=(0, 1), default=0.0)
gamma = Float('gamma', bounds=(1e-5, 1e2), default=1, log=True)

cs = ConfigurationSpace()
cs.add([kernel_type, degree, coef0, gamma])

from ConfigSpace import EqualsCondition, InCondition, OrConjunction

# read as: "degree is active if kernel_type == 'poly'"
cond_1 = EqualsCondition(degree, kernel_type, 'poly')

# read as: "coef0 is active if (kernel_type == 'poly' or kernel_type == 'sigmoid')"
# You could also define this using an InCondition as shown below
cond_2 = OrConjunction(
    EqualsCondition(coef0, kernel_type, 'poly'),
    EqualsCondition(coef0, kernel_type, 'sigmoid')
)

# read as: "gamma is active if kernel_type in ['rbf', 'poly', 'sigmoid']"
cond_3 = InCondition(gamma, kernel_type, ['rbf', 'poly','sigmoid'])



Configuration space object:
  Hyperparameters:
    coef0, Type: UniformFloat, Range: [0.0, 1.0], Default: 0.0
    degree, Type: UniformInteger, Range: [2, 4], Default: 2
    gamma, Type: UniformFloat, Range: [1e-05, 100.0], Default: 1.0, on log-scale
    kernel_type, Type: Categorical, Choices: {linear, poly, rbf, sigmoid}, Default: linear



In [None]:
# 3rd Example: Forbidden clauses#
from ConfigSpace import ConfigurationSpace, Categorical, Constant

cs = ConfigurationSpace()

penalty = Categorical("penalty", ["l1", "l2"], default="l2")
loss = Categorical("loss", ["hinge", "squared_hinge"], default="squared_hinge")
dual = Constant("dual", "False")
cs.add([penalty, loss, dual])

from ConfigSpace import ForbiddenEqualsClause, ForbiddenAndConjunction

penalty_and_loss = ForbiddenAndConjunction(
    ForbiddenEqualsClause(penalty, "l1"),
    ForbiddenEqualsClause(loss, "hinge")
)
constant_penalty_and_loss = ForbiddenAndConjunction(
    ForbiddenEqualsClause(dual, "False"),
    ForbiddenEqualsClause(penalty, "l2"),
    ForbiddenEqualsClause(loss, "hinge")
)
penalty_and_dual = ForbiddenAndConjunction(
    ForbiddenEqualsClause(dual, "False"),
    ForbiddenEqualsClause(penalty, "l1")
)

In [None]:
# 4th Example Serialization#
from pathlib import Path
from ConfigSpace import ConfigurationSpace

path = Path("configspace.yaml")
cs = ConfigurationSpace(
    space={
        "C": (-1.0, 1.0),  # Note the decimal to make it a float
        "max_iter": (10, 100),
    },
    seed=1234,
)
cs.to_yaml(path)
loaded_cs = ConfigurationSpace.from_yaml(path)

with path.open() as f:
    print(f.read())

In [3]:
# 5th Example: Placing priors on the hyperparameters#
import numpy as np
from ConfigSpace import ConfigurationSpace, Float, Categorical, Beta, Normal

cs = ConfigurationSpace(
    space={
        "lr": Float(
            'lr',
            bounds=(1e-5, 1e-1),
            default=1e-3,
            log=True,
            distribution=Normal(1e-3, 1e-1)
        ),
        "dropout": Float(
            'dropout',
            bounds=(0, 0.99),
            default=0.25,
            distribution=Beta(alpha=2, beta=4)
        ),
        "activation": Categorical(
            'activation',
            items=['tanh', 'relu'],
            weights=[0.2, 0.8]
        ),
    },
    seed=1234,
)
print(cs)


Configuration space object:
  Hyperparameters:
    activation, Type: Categorical, Choices: {tanh, relu}, Default: relu, Probabilities: [0.2 0.8]
    dropout, Type: BetaFloat, Alpha: 2.0, Beta: 4.0, Range: [0.0, 0.99], Default: 0.25
    lr, Type: NormalFloat, Mu: 0.001, Sigma: 0.1, Range: [1e-05, 0.1], Default: 0.001, on log-scale



In [None]:
from ConfigSpace import Configuration


In [4]:
from smac import Scenario

def generate_scenario(cs):
    scenario = Scenario({
        'run_obj': 'quality',
        'output_dir': './automl_results',
        'shared_model': False,
        'multi_objectives': ['validation_loss'],
        'overall_obj': 'validation_loss',
        'deterministic': False,
        'wallclock_limit': 3600.0,
        'abort_on_first_run_crash': True,
        'limit_resources': True,
        'memory_limit': 16000.0,
        'cutoff': 300.0
    })
    return scenario

scenario = generate_scenario(cs)

In [6]:
import numpy as np
from types import SimpleNamespace
X = np.random.rand(10, 5)
y = np.random.rand(10, 1)


mydict = dict(X=X, y=y)


mydict = SimpleNamespace(**mydict)
mydict.X

array([[0.98525351, 0.781331  , 0.90316584, 0.42289328, 0.0836624 ],
       [0.09098289, 0.45530062, 0.8913916 , 0.55417661, 0.19638563],
       [0.34897899, 0.89023083, 0.46529547, 0.62919728, 0.78103712],
       [0.21140623, 0.78827005, 0.41219888, 0.18284856, 0.95699989],
       [0.4160139 , 0.39566742, 0.68413273, 0.9207471 , 0.81198847],
       [0.1554572 , 0.28036314, 0.04413612, 0.39658158, 0.05237825],
       [0.56122639, 0.55022277, 0.39825877, 0.58396533, 0.46107929],
       [0.69087577, 0.2232384 , 0.04952998, 0.64866486, 0.50059245],
       [0.69802054, 0.18925913, 0.25659784, 0.16636338, 0.98976883],
       [0.40259054, 0.33274224, 0.01593082, 0.50959103, 0.43491217]])

In [10]:
from keras.datasets import mnist

ModuleNotFoundError: No module named 'keras'

In [3]:
# load mnist dataset
mnist = fetch_openml('mnist_784', version=1)
X, y = mnist.data, mnist.target


In [42]:
from ConfigSpace import ConfigurationSpace, Categorical, Float, Integer, ForbiddenAndConjunction, ForbiddenEqualsClause, EqualsCondition

def get_configspace():
    cs = ConfigurationSpace(seed=1234)

    learning_rate = Categorical('learning_rate', ['constant', 'invscaling', 'adaptive'])
    eta0 = Float('eta0', bounds=(0.0001, 1.0), default=0.01, log=True)
    max_iter = Integer('max_iter', bounds=(100, 1000), default=200)
    tol = Float('tol', bounds=(1e-6, 1e-2), default=1e-4, log=True)
    early_stopping = Categorical('early_stopping', ['True', 'False'], default='False')
    validation_fraction = Float('validation_fraction', bounds=(0.01, 0.5), default=0.1)
    n_jobs = Integer('n_jobs', bounds=(1, 10), default=1)
    random_state = Integer('random_state', bounds=(0, 100), default=42)

    cs.add([learning_rate, eta0, max_iter, tol, early_stopping, validation_fraction, n_jobs, random_state])

    # cond_eta0 = EqualsCondition(eta0, learning_rate, 'constant')
    # cs.add(cond_eta0)

    # forbidden_eta0_and_max_iter = ForbiddenAndConjunction(
    #     ForbiddenEqualsClause(eta0, 0.01),
    #     ForbiddenEqualsClause(max_iter, 100)
    # )
    # cs.add(forbidden_eta0_and_max_iter)

    return cs

cs = get_configspace()
print(cs)

Configuration space object:
  Hyperparameters:
    early_stopping, Type: Categorical, Choices: {True, False}, Default: False
    eta0, Type: UniformFloat, Range: [0.0001, 1.0], Default: 0.01, on log-scale
    learning_rate, Type: Categorical, Choices: {constant, invscaling, adaptive}, Default: constant
    max_iter, Type: UniformInteger, Range: [100, 1000], Default: 200
    n_jobs, Type: UniformInteger, Range: [1, 10], Default: 1
    random_state, Type: UniformInteger, Range: [0, 100], Default: 42
    tol, Type: UniformFloat, Range: [1e-06, 0.01], Default: 0.0001, on log-scale
    validation_fraction, Type: UniformFloat, Range: [0.01, 0.5], Default: 0.1



In [44]:
from smac.scenario import Scenario

def generate_scenario(cs):
    scenario = Scenario(
        configspace=cs,
        deterministic=True,
        n_trials=10,
    )
    return scenario
scenario = generate_scenario(cs)
print(scenario)

Scenario(configspace=Configuration space object:
  Hyperparameters:
    early_stopping, Type: Categorical, Choices: {True, False}, Default: False
    eta0, Type: UniformFloat, Range: [0.0001, 1.0], Default: 0.01, on log-scale
    learning_rate, Type: Categorical, Choices: {constant, invscaling, adaptive}, Default: constant
    max_iter, Type: UniformInteger, Range: [100, 1000], Default: 200
    n_jobs, Type: UniformInteger, Range: [1, 10], Default: 1
    random_state, Type: UniformInteger, Range: [0, 100], Default: 42
    tol, Type: UniformFloat, Range: [1e-06, 0.01], Default: 0.0001, on log-scale
    validation_fraction, Type: UniformFloat, Range: [0.01, 0.5], Default: 0.1
, name=None, output_directory=PosixPath('smac3_output'), deterministic=True, objectives='cost', crash_cost=inf, termination_cost_threshold=inf, walltime_limit=inf, cputime_limit=inf, trial_walltime_limit=None, trial_memory_limit=None, n_trials=10, use_default_config=False, instances=None, instance_features=None, min

In [32]:
print([p.name for p in list(cs.values())])

['early_stopping', 'eta0', 'learning_rate', 'max_iter', 'n_jobs', 'random_state', 'tol', 'validation_fraction']


In [45]:
scenario.configspace

Configuration space object:
  Hyperparameters:
    early_stopping, Type: Categorical, Choices: {True, False}, Default: False
    eta0, Type: UniformFloat, Range: [0.0001, 1.0], Default: 0.01, on log-scale
    learning_rate, Type: Categorical, Choices: {constant, invscaling, adaptive}, Default: constant
    max_iter, Type: UniformInteger, Range: [100, 1000], Default: 200
    n_jobs, Type: UniformInteger, Range: [1, 10], Default: 1
    random_state, Type: UniformInteger, Range: [0, 100], Default: 42
    tol, Type: UniformFloat, Range: [1e-06, 0.01], Default: 0.0001, on log-scale
    validation_fraction, Type: UniformFloat, Range: [0.01, 0.5], Default: 0.1

In [46]:
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import log_loss
from ConfigSpace import Configuration

def train(cfg: Configuration, seed: int, dataset: dict) -> float:
    """
    Train a neural network model on the given dataset.

    Args:
    - cfg (Configuration): A Configuration object containing hyperparameters.
    - seed (int): The random seed for reproducibility.
    - dataset (dict): A dictionary containing the feature matrix 'X' and label vector 'y'.

    Returns:
    - loss (float): The average training loss over 10 epochs.
    """

    # Get input and output dimensions dynamically from the dataset
    input_size = dataset['X'].shape[1]
    num_classes = len(np.unique(dataset['y']))

    # Get hyperparameters from the configuration
    learning_rate = cfg.get('learning_rate')
    eta0 = cfg.get('eta0')
    max_iter = cfg.get('max_iter')
    tol = cfg.get('tol')
    early_stopping = cfg.get('early_stopping') == 'True'
    validation_fraction = cfg.get('validation_fraction')
    n_jobs = cfg.get('n_jobs')
    warm_start = cfg.get('warm_start') == 'True'
    momentum = cfg.get('momentum')
    nesterovs_momentum = cfg.get('nesterovs_momentum') == 'True'
    power_t = cfg.get('power_t')

    # Create a neural network model
    if learning_rate == 'constant':
        # Use SGD with lr=eta0
        model = MLPClassifier(hidden_layer_sizes=(50,), max_iter=max_iter, tol=tol, 
                               early_stopping=early_stopping, validation_fraction=validation_fraction, 
                               random_state=seed, learning_rate_init=eta0, momentum=momentum, 
                               nesterovs_momentum=nesterovs_momentum)
    elif learning_rate == 'invscaling':
        # Use SGD with lr=eta0 and momentum=power_t
        model = MLPClassifier(hidden_layer_sizes=(50,), max_iter=max_iter, tol=tol, 
                               early_stopping=early_stopping, validation_fraction=validation_fraction, 
                               random_state=seed, learning_rate_init=eta0, momentum=momentum, 
                               nesterovs_momentum=nesterovs_momentum, power_t=power_t)
    elif learning_rate == 'adaptive':
        # Use Adam or equivalent with lr=eta0
        model = MLPClassifier(hidden_layer_sizes=(50,), max_iter=max_iter, tol=tol, 
                               early_stopping=early_stopping, validation_fraction=validation_fraction, 
                               random_state=seed, learning_rate_init=eta0, solver='adam')

    # Train the model
    model.fit(dataset['X'], dataset['y'])

    # Calculate the average training loss over 10 epochs
    losses = []
    for _ in range(10):
        model.fit(dataset['X'], dataset['y'])
        losses.append(log_loss(dataset['y'], model.predict_proba(dataset['X'])))
    loss = np.mean(losses)

    return loss


In [47]:
cs = get_configspace()
print(type(cs))  # Should output: <class 'ConfigSpace.configuration_space.ConfigurationSpace'>


<class 'ConfigSpace.configuration_space.ConfigurationSpace'>


In [None]:
from smac import HyperparameterOptimizationFacade, Scenario

smac = HyperparameterOptimizationFacade(
            scenario,
            train,  # We pass the target function here
            overwrite=True,  # Overrides any previous results that are found that are inconsistent with the meta-data
        )
smac.optimize()


[INFO][abstract_initial_design.py:87] Reducing the number of initial configurations from 80 to 2 (max_ratio == 0.25).
[INFO][abstract_initial_design.py:139] Using 2 initial design configurations and 0 additional configurations.
[INFO][abstract_intensifier.py:307] Using only one seed for deterministic scenario.
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 190, in run
    rval = self(config_copy, target_function, kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 264, in __call__
    return algorithm(config, **algorithm_kwargs)
TypeError: train() missing 1 required positional argument: 'dataset'


[INFO][abstract_intensifier.py:517] Added config d8c8c3 as new incumbent because there are no incumbents yet.
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 190, in run
    rval = self(config_copy, target_functi

  diff_b_a = subtract(b, a)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + V

  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 190, in run
    rval = self(config_copy, target_function, kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 264, in __call__
    return algorithm(config, **algorithm_kwargs)
TypeError: train() missing 1 required positional argument: 'dataset'


  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 190, in run
    rval = self(config_copy, target_function, kwargs)
  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 264, in __call__
    return algorithm(config, **algorithm_kwargs)
TypeError: train() missing 1 required positional argument: 'dataset'


  File "/opt/anaconda3/envs/SMAC/lib/python3.10/site-packages/smac/runner/target_function_runner.py", line 190, in run
    rval = self(config_copy, target_function, k

  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)
  preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)


AttributeError: 'HyperparameterOptimizationFacade' object has no attribute 'run'

In [41]:
configspace = ConfigurationSpace({"C": (0.100, 1000.0)})
print(configspace)

Configuration space object:
  Hyperparameters:
    C, Type: UniformFloat, Range: [0.1, 1000.0], Default: 500.05



In [27]:
from ConfigSpace import Configuration, ConfigurationSpace

import numpy as np
from smac import HyperparameterOptimizationFacade, Scenario
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

iris = datasets.load_iris()


def train(config: Configuration, seed: int = 0) -> float:
    classifier = SVC(C=config["C"], random_state=seed)
    scores = cross_val_score(classifier, iris.data, iris.target, cv=5)
    return 1 - np.mean(scores)


configspace = ConfigurationSpace({"C": (0.100, 1000.0)})

# Scenario object specifying the optimization environment
scenario = Scenario(configspace, deterministic=True, n_trials=200)

# Use SMAC to find the best configuration/hyperparameters
smac = HyperparameterOptimizationFacade(scenario, train)
incumbent = smac.optimize()

[INFO][abstract_initial_design.py:139] Using 10 initial design configurations and 0 additional configurations.
[INFO][smbo.py:509] Continuing from previous run.
[INFO][smbo.py:278] Optimization process was already finished. Returning incumbent...


In [3]:
from configs.api_keys import GROQ_API_KEY, GOOGLE_API_KEY

from scripts.LLMClient import LLMClient

# Initialize the client
llm_client = LLMClient(
    api_key=GOOGLE_API_KEY,
    model_name="gemini-2.0-flash",
    embedding_model="sentence-transformers/all-MiniLM-L6-v2"  # This is a smaller, faster model
)

# Create a vector store with some test documents
documents = [
    "This is a test document about machine learning.",
    "This is another document about artificial intelligence."
]

# Try to create the vector store
try:
    llm_client.create_vector_store(documents)
except ValueError as e:
    print(f"Error: {e}")
    # Handle the error appropriately

# Generate responses with RAG
response = llm_client.generate_with_context(
    "What are the key points from the documents?",
    k=3  # Number of relevant documents to retrieve
)

print(response)

Failed to initialize embeddings model: Could not import sentence_transformers python package. Please install it with `pip install sentence-transformers`.
Attempting fallback to CPU-only mode...


Fallback initialization also failed: Could not import sentence_transformers python package. Please install it with `pip install sentence-transformers`.
RAG capabilities will be disabled
RAG capabilities are disabled. Falling back to standard generation.


Error: Embeddings model not initialized. RAG capabilities are disabled. Please check the logs for initialization errors.
Please provide me with the documents you are referring to. I need the text of the documents to be able to identify the key points for you. 

Once you provide the documents, I will:

*   **Read through them carefully.**
*   **Identify the main topics and arguments.**
*   **Summarize the key points in a concise and clear manner.**

I look forward to helping you!



In [2]:
from scripts.DocumentCollector import DocumentCollector
collector = DocumentCollector(max_workers=8, timeout=50)
print("Starting document collection...")
docs = collector.collect_documentation()
collector.save_documents(docs)
print("Document collection completed.")



Starting document collection...


2025-05-21 09:09:25,057 - INFO - Saved 3 docs for 'smac' to collected_docs/smac_docs.json
2025-05-21 09:09:25,059 - INFO - Saved 4 docs for 'configspace' to collected_docs/configspace_docs.json
2025-05-21 09:09:25,060 - INFO - Saved 2 docs for 'pytorch' to collected_docs/pytorch_docs.json
2025-05-21 09:09:25,061 - INFO - Saved 0 docs for 'tensorflow' to collected_docs/tensorflow_docs.json
2025-05-21 09:09:25,063 - INFO - Saved 4 docs for 'sklearn' to collected_docs/sklearn_docs.json


Document collection completed.


In [1]:
from configs.api_keys import OPENML_API_KEY
import openml

openml.config.apikey = OPENML_API_KEY

In [4]:
openml.datasets.list_datasets(output_format='dataframe')

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
2,2,anneal,1,1,active,ARFF,684.0,7.0,8.0,5.0,39.0,898.0,898.0,22175.0,6.0,33.0
3,3,kr-vs-kp,1,1,active,ARFF,1669.0,3.0,1527.0,2.0,37.0,3196.0,0.0,0.0,0.0,37.0
4,4,labor,1,1,active,ARFF,37.0,3.0,20.0,2.0,17.0,57.0,56.0,326.0,8.0,9.0
5,5,arrhythmia,1,1,active,ARFF,245.0,13.0,2.0,13.0,280.0,452.0,384.0,408.0,206.0,74.0
6,6,letter,1,1,active,ARFF,813.0,26.0,734.0,26.0,17.0,20000.0,0.0,0.0,16.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47020,47020,Shrim,1,49887,active,arff,,,,,3.0,5998.0,2218.0,2974.0,3.0,0.0
47021,47021,Shrimphealth,1,49887,active,arff,,,,,3.0,5998.0,2218.0,2974.0,3.0,0.0
47023,47023,MedMCQA,1,25914,active,arff,53591.0,,38963.0,4.0,10.0,182822.0,95746.0,117567.0,0.0,1.0
47024,47024,Laboratorio_dataset_car,4,50072,active,arff,,,,,1.0,1750.0,0.0,0.0,0.0,0.0


In [3]:
# analysis of the dataset
print(iris.description)
print(iris.qualities)
print(iris.features)


**Author**: R.A. Fisher  
**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Iris) - 1936 - Donated by Michael Marshall  
**Please cite**:   

**Iris Plants Database**  
This is perhaps the best known database to be found in the pattern recognition literature.  Fisher's paper is a classic in the field and is referenced frequently to this day.  (See Duda & Hart, for example.)  The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant.  One class is     linearly separable from the other 2; the latter are NOT linearly separable from each other.

Predicted attribute: class of iris plant.  
This is an exceedingly simple domain.  
 
### Attribute Information:
    1. sepal length in cm
    2. sepal width in cm
    3. petal length in cm
    4. petal width in cm
    5. class: 
       -- Iris Setosa
       -- Iris Versicolour
       -- Iris Virginica
{'AutoCorrelation': 0.9865771812080537, 'CfsSubsetEval_DecisionStumpAUC': 0.9565333333333332, 'C

In [4]:
# get top 5 tasks on the dataset
print(openml.tasks.list_tasks(data_id=61, output_format="dataframe"))

           tid                                           ttid  did  name  \
59          59             TaskType.SUPERVISED_CLASSIFICATION   61  iris   
118        118                        TaskType.LEARNING_CURVE   61  iris   
289        289             TaskType.SUPERVISED_CLASSIFICATION   61  iris   
1758      1758                        TaskType.LEARNING_CURVE   61  iris   
1823      1823             TaskType.SUPERVISED_CLASSIFICATION   61  iris   
1939      1939             TaskType.SUPERVISED_CLASSIFICATION   61  iris   
1992      1992             TaskType.SUPERVISED_CLASSIFICATION   61  iris   
2227      2227  TaskType.SUPERVISED_DATASTREAM_CLASSIFICATION   61  iris   
7306      7306             TaskType.SUPERVISED_CLASSIFICATION   61  iris   
7545      7545                        TaskType.LEARNING_CURVE   61  iris   
7555      7555             TaskType.SUPERVISED_CLASSIFICATION   61  iris   
10107    10107             TaskType.SUPERVISED_CLASSIFICATION   61  iris   
52949    529

In [9]:
# get top validations runs from the tasks
task = openml.tasks.get_task(59)
task

OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 59
Task URL.............: https://www.openml.org/t/59
Estimation Procedure.: crossvalidation
Evaluation Measure...: predictive_accuracy
Target Feature.......: class
# of Classes.........: 3
Cost Matrix..........: Available

In [14]:
openml.evaluations.list_evaluations(tasks=[59], output_format="dataframe", function="predictive_accuracy", sort_order="desc")

Unnamed: 0,run_id,task_id,setup_id,flow_id,flow_name,data_id,data_name,function,upload_time,uploader,uploader_name,value,values,array_data
0,2012930,59,157613,6048,sklearn.pipeline.Pipeline(dualimputer=helper.d...,61,iris,predictive_accuracy,2017-04-06 23:00:24,1104,Jeroen van Hoof,0.986667,,
1,2012939,59,157622,6048,sklearn.pipeline.Pipeline(dualimputer=helper.d...,61,iris,predictive_accuracy,2017-04-06 23:29:28,1104,Jeroen van Hoof,0.986667,,
2,2012941,59,157624,6048,sklearn.pipeline.Pipeline(dualimputer=helper.d...,61,iris,predictive_accuracy,2017-04-07 01:36:00,1104,Jeroen van Hoof,0.986667,,
3,2012943,59,157626,6048,sklearn.pipeline.Pipeline(dualimputer=helper.d...,61,iris,predictive_accuracy,2017-04-07 02:01:33,1104,Jeroen van Hoof,0.986667,,
4,2039748,59,180922,6048,sklearn.pipeline.Pipeline(dualimputer=helper.d...,61,iris,predictive_accuracy,2017-04-09 01:09:01,1104,Jeroen van Hoof,0.986667,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4295,8803221,59,6778302,7707,sklearn.pipeline.Pipeline(imputation=openmlstu...,61,iris,predictive_accuracy,2018-01-22 20:14:33,1,Jan van Rijn,0.000000,,
4296,8805528,59,6780609,7707,sklearn.pipeline.Pipeline(imputation=openmlstu...,61,iris,predictive_accuracy,2018-01-22 20:44:26,1,Jan van Rijn,0.000000,,
4297,8810924,59,6785974,7707,sklearn.pipeline.Pipeline(imputation=openmlstu...,61,iris,predictive_accuracy,2018-01-23 08:26:33,1,Jan van Rijn,0.000000,,
4298,8850710,59,6825760,7707,sklearn.pipeline.Pipeline(imputation=openmlstu...,61,iris,predictive_accuracy,2018-01-27 02:50:20,1,Jan van Rijn,0.000000,,


In [34]:
# get the code of the task
flow = openml.flows.get_flow(6048)
flow


OpenML Flow
Flow ID.........: 6048 (version 1)
Flow URL........: https://www.openml.org/f/6048
Flow Name.......: sklearn.pipeline.Pipeline(dualimputer=helper.dual_imputer.DualImputer,nusvc=sklearn.svm.classes.NuSVC)
Flow Description: Automatically created scikit-learn flow.
Upload Date.....: 2017-04-06 22:42:59
Dependencies....: sklearn==0.18.1
numpy>=1.6.1
scipy>=0.9

In [3]:
# get the code of the task
setup = openml.setups.get_setup(157613)

In [4]:
setup.parameters

{56535: OpenML Parameter
 ID............: 56535
 Flow ID.......: 6048
 Flow Name.....: sklearn.pipeline.Pipeline(dualimputer=helper.dual_imputer.DualImputer,nusvc=sklearn.svm.classes.NuSVC)(1)_steps
 Flow URL......: https://www.openml.org/f/6048
 Parameter Name: steps
   |__Data Type: None
   |__Default..: [{"oml-python:serialized_object": "component_reference", "value": {"key": "dualimputer", "step_name": "dualimputer"}}, {"oml-python:serialized_object": "component_reference", "value": {"key": "nusvc", "step_name": "nusvc"}}]
   |__Value....: [('dualimputer', <helper.dual_imputer.DualImputer object at 0x7f9edc7cfe48>), ('nusvc', NuSVC(cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
    max_iter=-1, nu=0.3, probability=True, random_state=None,
    shrinking=True, tol=0.001, verbose=False))],
 56536: OpenML Parameter
 ID............: 56536
 Flow ID.......: 6049
 Flow Name.....: sklearn.svm.classes.NuSVC(1)_cache_si

In [29]:
run = openml.runs.get_run(2012943)
run

OpenML Run
Uploader Name...: Jeroen van Hoof
Uploader Profile: https://www.openml.org/u/1104
Metric..........: predictive_accuracy
Result..........: 0.986667
Run ID..........: 2012943
Run URL.........: https://www.openml.org/r/2012943
Task ID.........: 59
Task Type.......: Supervised Classification
Task URL........: https://www.openml.org/t/59
Flow ID.........: 6048
Flow Name.......: sklearn.pipeline.Pipeline(dualimputer=helper.dual_imputer.DualImputer,nusvc=sklearn.svm.classes.NuSVC)(1)
Flow URL........: https://www.openml.org/f/6048
Setup ID........: 157626
Setup String....: None
Dataset ID......: 61
Dataset URL.....: https://www.openml.org/d/61

In [30]:
flow = run.flow
print(flow)

None


In [31]:
model = flow.model

AttributeError: 'NoneType' object has no attribute 'model'

In [4]:
openml.datasets.list_datasets(output_format="dataframe")

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
2,2,anneal,1,1,active,ARFF,684.0,7.0,8.0,5.0,39.0,898.0,898.0,22175.0,6.0,33.0
3,3,kr-vs-kp,1,1,active,ARFF,1669.0,3.0,1527.0,2.0,37.0,3196.0,0.0,0.0,0.0,37.0
4,4,labor,1,1,active,ARFF,37.0,3.0,20.0,2.0,17.0,57.0,56.0,326.0,8.0,9.0
5,5,arrhythmia,1,1,active,ARFF,245.0,13.0,2.0,13.0,280.0,452.0,384.0,408.0,206.0,74.0
6,6,letter,1,1,active,ARFF,813.0,26.0,734.0,26.0,17.0,20000.0,0.0,0.0,16.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47020,47020,Shrim,1,49887,active,arff,,,,,3.0,5998.0,2218.0,2974.0,3.0,0.0
47021,47021,Shrimphealth,1,49887,active,arff,,,,,3.0,5998.0,2218.0,2974.0,3.0,0.0
47023,47023,MedMCQA,1,25914,active,arff,53591.0,,38963.0,4.0,10.0,182822.0,95746.0,117567.0,0.0,1.0
47024,47024,Laboratorio_dataset_car,4,50072,active,arff,,,,,1.0,1750.0,0.0,0.0,0.0,0.0


In [1]:
from scripts.OpenMLRAG import OpenMLRAG

openMLRAGAgent = OpenMLRAG()
related_datasets = openMLRAGAgent.get_related_datasets(dataset_name="iris")
tasks = openMLRAGAgent.get_related_tasks_of_dataset(dataset_id=related_datasets.did, task_type="classification")
task_ids = tasks['tid'].tolist()

In [2]:
openMLRAGAgent.get_setup_parameters_of_tasks(task_ids=task_ids)


Evaluations found: 4974
157613
157622
157624
157626
180922
180924
217067
5296637
5485657
8275511


[{56535: OpenML Parameter
  ID............: 56535
  Flow ID.......: 6048
  Flow Name.....: sklearn.pipeline.Pipeline(dualimputer=helper.dual_imputer.DualImputer,nusvc=sklearn.svm.classes.NuSVC)(1)_steps
  Flow URL......: https://www.openml.org/f/6048
  Parameter Name: steps
    |__Data Type: None
    |__Default..: [{"oml-python:serialized_object": "component_reference", "value": {"key": "dualimputer", "step_name": "dualimputer"}}, {"oml-python:serialized_object": "component_reference", "value": {"key": "nusvc", "step_name": "nusvc"}}]
    |__Value....: [('dualimputer', <helper.dual_imputer.DualImputer object at 0x7f9edc7cfe48>), ('nusvc', NuSVC(cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
     max_iter=-1, nu=0.3, probability=True, random_state=None,
     shrinking=True, tol=0.001, verbose=False))],
  56536: OpenML Parameter
  ID............: 56536
  Flow ID.......: 6049
  Flow Name.....: sklearn.svm.classes.N

In [11]:
from pydantic import BaseModel
import instructor
from configs.api_keys import GOOGLE_API_KEY  # Your API key
from google import genai

google_client = genai.Client(api_key=GOOGLE_API_KEY)


class UserInfo(BaseModel):
    dataset_name: str
    dataset_tag: str

client = instructor.from_genai(google_client, model="models/gemini-1.5-flash")

response = client.chat.completions.create(
    response_model=UserInfo,
    messages=[{"role": "user", "content": "Give the relative dataset name and tag for the iris dataset in openml."}],
)

print(response)

dataset_name='iris' dataset_tag=''
