# Cómo sacar runs (configuraciones de algoritmos) de OpenML

Sacado de https://docs.openml.org/examples/40_paper/2015_neurips_feurer_example/

In [None]:
import pandas as pd

import openml

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)


In [2]:
# OpenML dataset IDs (140 entries) for which one classification task each is
# looked up further below. The list comes from the OpenML example for the
# Feurer et al. (NeurIPS 2015) paper linked at the top of this notebook.
dataset_ids = [
    3, 6, 12, 14, 16, 18, 21, 22, 23, 24, 26, 28, 30, 31, 32, 36, 38, 44, 46,
    57, 60, 179, 180, 181, 182, 184, 185, 273, 293, 300, 351, 354, 357, 389,
    390, 391, 392, 393, 395, 396, 398, 399, 401, 554, 679, 715, 718, 720, 722,
    723, 727, 728, 734, 735, 737, 740, 741, 743, 751, 752, 761, 772, 797, 799,
    803, 806, 807, 813, 816, 819, 821, 822, 823, 833, 837, 843, 845, 846, 847,
    849, 866, 871, 881, 897, 901, 903, 904, 910, 912, 913, 914, 917, 923, 930,
    934, 953, 958, 959, 962, 966, 971, 976, 977, 978, 979, 980, 991, 993, 995,
    1000, 1002, 1018, 1019, 1020, 1021, 1036, 1040, 1041, 1049, 1050, 1053,
    1056, 1067, 1068, 1069, 1111, 1112, 1114, 1116, 1119, 1120, 1128, 1130,
    1134, 1138, 1139, 1142, 1146, 1161, 1166,
]

Ahora vamos a considerar:

1. Listar todas las tareas de clasificación supervisada
2. Filtrar por estrategia de remuestreo (resampling); en este caso, holdout con el 33% de los datos para test
3. Buscar el `task_id` correspondiente a cada `dataset_id`; si hay varias tareas para el mismo dataset, tomar la que tiene el ID más bajo
4. Comprobar (paso opcional, desactivado en el código) que la tarea usa el mismo atributo objetivo que el dataset
5. Ordenar y mostrar los `task_ids`

In [4]:
# List the supervised-classification tasks (any status) in "dataframe" format.
tasks = openml.tasks.list_tasks(
    task_type=openml.tasks.TaskType.SUPERVISED_CLASSIFICATION,
    status="all",
    output_format="dataframe",
)

# Keep only the tasks whose resampling strategy is the 33% holdout.
tasks = tasks.query('estimation_procedure == "33% Holdout set"')

task_ids = []
for did in dataset_ids:
    # IDs of all remaining (holdout) tasks defined on this dataset.
    matching_tids = list(tasks.query("did == {}".format(did)).tid)
    if not matching_tids:
        # No holdout task exists for this dataset: report the dataset ID.
        raise ValueError(did)

    # One or more tasks match; take the one with the lowest ID (oldest).
    task_id = min(matching_tids)

    # Optional - Check that the task has the same target attribute as the
    # dataset default target attribute
    # (disabled for this example as it needs to run fast to be rendered online)
    # task = openml.tasks.get_task(task_id)
    # dataset = task.get_dataset()
    # if task.target_name != dataset.default_target_attribute:
    #     raise ValueError(
    #         (task.target_name, dataset.default_target_attribute)
    #     )

    task_ids.append(task_id)

# Sanity check: exactly one task was collected per dataset, 140 in total.
assert len(task_ids) == 140
task_ids = sorted(task_ids)

# These are the tasks to work with:
print(task_ids)

[233, 236, 242, 244, 246, 248, 251, 252, 253, 254, 256, 258, 260, 261, 262, 266, 273, 275, 288, 2117, 2118, 2119, 2120, 2122, 2123, 2350, 3043, 3044, 75090, 75092, 75093, 75098, 75099, 75100, 75103, 75104, 75105, 75106, 75107, 75108, 75111, 75112, 75113, 75114, 75115, 75116, 75117, 75119, 75120, 75121, 75122, 75125, 75126, 75129, 75131, 75133, 75136, 75137, 75138, 75139, 75140, 75142, 75143, 75146, 75147, 75148, 75149, 75150, 75151, 75152, 75153, 75155, 75157, 75159, 75160, 75161, 75162, 75163, 75164, 75165, 75166, 75168, 75169, 75170, 75171, 75172, 75173, 75174, 75175, 75176, 75179, 75180, 75182, 75183, 75184, 75185, 75186, 75188, 75189, 75190, 75191, 75192, 75194, 75195, 75196, 75197, 75198, 75199, 75200, 75201, 75202, 75203, 75204, 75205, 75206, 75207, 75208, 75209, 75210, 75212, 75213, 75216, 75218, 75220, 75222, 75224, 75226, 75228, 75229, 75233, 75238, 75240, 75244, 75245, 75246, 75247, 75248, 75249, 75251, 190400]


Ahora veamos para un caso específico, el de `task_ids[0]`.
1. Obtener la tarea asociada a ese id
2. Listar las evaluaciones hechas sobre ese `task_id` que estén basadas en la métrica `f_measure`.
3. Imprimir las primeras 5 filas.

In [None]:
def evaluation_task(task_id, metric="f_measure"):
    """List the OpenML evaluations recorded for one task and one metric.

    Args:
        task_id: ID of the OpenML task whose evaluations are listed.
        metric: Name of the evaluation function to list; defaults to
            ``"f_measure"``.

    Returns:
        The result of ``openml.evaluations.list_evaluations`` restricted to
        ``task_id``, requested in ``"dataframe"`` format with ``size=None``.

    Side effects:
        Sets the pandas options ``display.max_columns`` and
        ``display.max_colwidth`` to ``None`` for the whole session.
    """
    # NOTE(review): the fetched task object is not used below; the call is
    # kept so that the function behaves exactly as it did before.
    openml.tasks.get_task(task_id)

    listing_options = {
        "tasks": [task_id],
        "output_format": "dataframe",
        "size": None,
    }
    task_evaluations = openml.evaluations.list_evaluations(
        metric, **listing_options
    )

    # Show every column and the full content of each cell when displaying.
    for option_name in ("display.max_columns", "display.max_colwidth"):
        pd.set_option(option_name, None)

    return task_evaluations


# Evaluations for the first (lowest-ID) task selected above.
evaluations = evaluation_task(task_ids[0])


   run_id  task_id  setup_id  flow_id      flow_name  data_id data_name  \
0   90942      233       592      364  weka.ZeroR(2)        3  kr-vs-kp   
1  169035      233      1838     1068   weka.J48(28)        3  kr-vs-kp   
2  169352      233      1839     1069  weka.ZeroR(8)        3  kr-vs-kp   
3  169691      233      1840     1070  weka.Ridor(3)        3  kr-vs-kp   
4  173251      233      1841     1071  weka.OneR(17)        3  kr-vs-kp   

    function          upload_time  uploader uploader_name     value values  \
0  f_measure  2014-12-08 12:02:29         1  Jan van Rijn  0.373052   None   
1  f_measure  2015-02-20 06:23:48         1  Jan van Rijn  0.988616   None   
2  f_measure  2015-02-20 06:41:33         1  Jan van Rijn  0.373052   None   
3  f_measure  2015-02-20 07:28:32         1  Jan van Rijn  0.839545   None   
4  f_measure  2015-02-21 11:48:57         1  Jan van Rijn  0.669583   None   

            array_data  
0         [0.697157,0]  
1   [0.989343,0.98778]  
2    

**Para los primeros 5 runs obtenemos la configuración: cargamos el `setup` (configuraciones) y lo imprimimos. Ahí veremos el nombre del algoritmo usado, la URL y los valores de los parámetros.**

In [8]:
# For each of the first five runs, load the run, then its setup, then the
# setup's flow, and print the run ID, the flow name and the setup parameters.
first_run_ids = evaluations["run_id"].iloc[:5]
for run_id in first_run_ids:
    run = openml.runs.get_run(run_id)
    setup = openml.setups.get_setup(run.setup_id)
    flow = openml.flows.get_flow(setup.flow_id)
    print(run_id, flow.name, setup.parameters)


90942 weka.ZeroR None
169035 weka.J48 {9551: OpenML Parameter
ID............: 9551
Flow ID.......: 1068
Flow Name.....: weka.J48(28)_C
Flow URL......: https://www.openml.org/f/1068
Parameter Name: C
  |__Data Type: option
  |__Default..: 0.25
  |__Value....: 0.25, 9552: OpenML Parameter
ID............: 9552
Flow ID.......: 1068
Flow Name.....: weka.J48(28)_M
Flow URL......: https://www.openml.org/f/1068
Parameter Name: M
  |__Data Type: option
  |__Default..: 2
  |__Value....: 2}
169352 weka.ZeroR None
169691 weka.Ridor {9566: OpenML Parameter
ID............: 9566
Flow ID.......: 1070
Flow Name.....: weka.Ridor(3)_F
Flow URL......: https://www.openml.org/f/1070
Parameter Name: F
  |__Data Type: option
  |__Default..: 3
  |__Value....: 3, 9567: OpenML Parameter
ID............: 9567
Flow ID.......: 1070
Flow Name.....: weka.Ridor(3)_S
Flow URL......: https://www.openml.org/f/1070
Parameter Name: S
  |__Data Type: option
  |__Default..: 1
  |__Value....: 1, 9570: OpenML Parameter
ID......