In [66]:
from model import (
    MetaFeatX, 
    get_cost_matrix, 
    intrinsic_estimator,
    train_fused_gromov_wasserstein, get_ndcg_score)
import pandas as pd
import numpy as np
from experiment import (
    load_bootstrap_representations,
    load_basic_representations,
    load_target_representations
)
from sklearn.metrics import pairwise_distances

In [67]:
class MetaFeatXCustom(MetaFeatX):
    """MetaFeatX variant whose embedding dimension can be chosen by the caller.

    ``d_value`` is forwarded as the intrinsic dimension used for training,
    except when it equals ``AUTO_INTRINSIC_DIM``; in that case the dimension
    is estimated from the cost matrix with ``intrinsic_estimator``.
    """

    # Sentinel ``d_value`` meaning "estimate the intrinsic dimension from the
    # cost matrix" instead of using the number as the dimension itself.
    AUTO_INTRINSIC_DIM = 100

    def __init__(self, d_value, **kwargs):
        """Store ``d_value`` and forward every other keyword to ``MetaFeatX``."""
        super().__init__(**kwargs)

        self.d_value = d_value

    def train(
        self,
        basic_reprs: pd.DataFrame,
        target_reprs: pd.DataFrame,
        column_id: str
    ) -> "MetaFeatXCustom":
        """Fit the model on basic representations against target representations.

        Args:
            basic_reprs: Frame holding the basic features plus the id column.
            target_reprs: Frame holding the target representations plus the
                id column.
            column_id: Name of the column identifying the task in both frames.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: If the set of ids in ``basic_reprs`` differs from the
                set of ids in ``target_reprs``.
        """
        # Sorted unique task ids present in the basic representations.
        list_ids = sorted(basic_reprs[column_id].unique())

        # Every one of those tasks must also have a target representation
        # (and vice versa): the two id sets have to match exactly.
        task_id_has_target_representation = target_reprs[column_id].unique()
        if set(list_ids) != set(task_id_has_target_representation):
            raise ValueError("Inconsistent numbers of instances.")

        # Remember the names of the basic features (everything but the id).
        basic_repr_labels = basic_reprs.columns
        self.basic_repr_labels = [str(_) for _ in basic_repr_labels if _ != column_id]

        self.cost_matrix = get_cost_matrix(
            target_repr=target_reprs,
            task_ids=list_ids,
            verbose=self.verbose,
            ncpus=self.ncpus,
        )

        assert self.cost_matrix.shape[0] == len(list_ids)

        # Intrinsic dimension: estimated only when the sentinel is requested,
        # otherwise the caller-supplied value is used unchanged.
        if self.d_value == self.AUTO_INTRINSIC_DIM:
            self.intrinsic_dim = intrinsic_estimator(self.cost_matrix)
            print(f"Intrinsic dimension: {self.intrinsic_dim}")
        else:
            self.intrinsic_dim = self.d_value

        # Learn the model representation.
        self.model, self.mds = train_fused_gromov_wasserstein(
            basic_representations=basic_reprs.set_index(column_id),
            pairwise_dist_z=self.cost_matrix,
            learning_rate=self.learning_rate,
            seed=self.seed,
            early_stopping=self.early_stopping_patience,
            early_stopping_criterion_ndcg=self.early_stopping_criterion_ndcg,
            alpha=self.alpha,
            intrinsic_dim=self.intrinsic_dim,
            lambda_reg=self.lambda_reg,
            device=self.device,
            list_ids=list_ids,
        )

        # NOTE(review): ``linear_mapping`` is not assigned in this class; it is
        # presumably exposed by ``MetaFeatX`` — confirm against the base class.
        print(f"Trained linear mapping shape: {self.linear_mapping.shape}")
        return self

In [68]:

def load_model_representations(cfg, basic_reprs, target_reprs, list_ids, train_ids, test_ids, d):
    """Train ``MetaFeatXCustom`` on basic + bootstrap rows and return learned representations.

    The basic rows and the bootstrap rows (restricted to ``list_ids``) are
    stacked, reordered so that rows of training tasks come before rows of
    test tasks, and passed to ``MetaFeatXCustom.train_and_predict``. Only the
    learned representations of the original (non-bootstrap) rows are returned.

    Args:
        cfg: Configuration; ``cfg.metafeature``, ``cfg.data_path``,
            ``cfg.task.ndcg`` and ``cfg.seed`` are read.
        basic_reprs: Basic representations with a ``task_id`` column. The
            frame is not modified.
        target_reprs: Target representations forwarded to the model.
        list_ids: Task ids used to filter the bootstrap representations.
        train_ids: Task ids whose rows are used for training.
        test_ids: Task ids whose rows are used for prediction.
        d: Value passed to ``MetaFeatXCustom`` as ``d_value``.

    Returns:
        DataFrame with columns ``col0 .. col{n-1}`` and ``task_id``, holding
        one row per non-bootstrap input row.
    """
    # Tag the rows on copies (``assign`` returns a new frame) so that the
    # caller's ``basic_reprs`` does not silently gain a helper column and the
    # filtered bootstrap slice is never written to in place.
    tagged_basic = basic_reprs.assign(boostrap=0)

    bootstrap_reprs = load_bootstrap_representations(metafeature=cfg.metafeature, path=cfg.data_path)
    print(bootstrap_reprs.shape)

    tagged_bootstrap = bootstrap_reprs[bootstrap_reprs.task_id.isin(list_ids)].assign(boostrap=1)
    print(tagged_bootstrap.shape)

    combined_basic_reprs = pd.concat([tagged_basic, tagged_bootstrap], axis=0)
    print(combined_basic_reprs.shape)

    # Put every training-task row before every test-task row so the row order
    # lines up with the [train, test] concatenation of the predictions below.
    combined_basic_reprs = pd.concat([
        combined_basic_reprs[combined_basic_reprs.task_id.isin(train_ids)],
        combined_basic_reprs[combined_basic_reprs.task_id.isin(test_ids)]
    ], axis=0)
    print(combined_basic_reprs.shape)

    model = MetaFeatXCustom(d_value=d,
                    alpha=0.5,
                    lambda_reg=1e-3,
                    learning_rate=0.01,
                    early_stopping_patience=20,
                    early_stopping_criterion_ndcg=cfg.task.ndcg,
                    verbose=False,
                    seed=cfg.seed)

    repr_train, repr_test = model.train_and_predict(
        basic_reprs=combined_basic_reprs.drop(["boostrap"], axis=1),
        target_reprs=target_reprs,
        column_id="task_id",
        train_ids=train_ids,
        test_ids=test_ids
    )

    # NOTE(review): the labels below are re-attached positionally, which
    # assumes ``train_and_predict`` preserves the input row order within the
    # train and test groups — confirm against its implementation.
    model_reprs = np.concatenate([repr_train, repr_test], axis=0)
    model_reprs = pd.DataFrame(model_reprs, columns=[f"col{_}" for _ in range(model_reprs.shape[1])])
    model_reprs["task_id"] = combined_basic_reprs["task_id"].values
    model_reprs["boostrap"] = combined_basic_reprs["boostrap"].values

    # Keep only the rows that came from ``basic_reprs`` and drop the helper tag.
    return model_reprs[model_reprs.boostrap == 0].drop(["boostrap"], axis=1)


In [69]:

def run_task1(cfg, d):
    """Compute the NDCG score of the held-out task ``cfg.openml_tid``.

    Distances between tasks in the chosen representation are compared with
    the distances derived from the target representations, for the row that
    belongs to the held-out task only.

    Args:
        cfg: Configuration; ``pipeline``, ``data_path``, ``openml_tid``,
            ``metafeature`` (with ``.name``) and ``task.ndcg`` are read.
        d: Forwarded to ``load_model_representations`` when the metafeature
            is ``"metafeatx"``; unused otherwise.

    Returns:
        Whatever ``get_ndcg_score`` returns for the held-out row.

    Raises:
        Exception: If the held-out task has no target representation.
    """
    target_reprs = load_target_representations(pipeline=cfg.pipeline, path=cfg.data_path)
    task_ids = sorted(target_reprs["task_id"].unique())

    held_out = cfg.openml_tid
    if held_out not in task_ids:
        raise Exception(f"OpenML task {held_out} does not have target representations.")

    all_basic = load_basic_representations(metafeature=cfg.metafeature, path=cfg.data_path)
    reprs = all_basic[all_basic.task_id.isin(task_ids)]

    if cfg.metafeature.name == "metafeatx":
        # Leave-one-out split: train on every task except the held-out one.
        train_ids = [tid for tid in task_ids if tid != held_out]
        reprs = load_model_representations(
            cfg, reprs, target_reprs, task_ids, train_ids, [held_out], d
        )

    reprs = reprs.set_index("task_id")

    true_dist = get_cost_matrix(target_repr=target_reprs, task_ids=task_ids, verbose=False)
    pred_dist = pairwise_distances(reprs.loc[task_ids])

    # Both matrices follow the order of ``task_ids``; pick the held-out row
    # and wrap it so each argument is a one-row 2-D array.
    row = task_ids.index(held_out)
    pred_row = np.array([pred_dist[row]])
    true_row = np.array([true_dist[row]])

    return get_ndcg_score(dist_pred=pred_row, dist_true=true_row, k=cfg.task.ndcg)

In [70]:

# Task ids the experiment samples from; each one is used as the held-out
# `openml_tid` of a run. The assert below guards against accidental edits
# to the list.
datasets_has_priors = [
    3, 6, 11, 12, 14, 15, 16, 18, 22, 23, 28, 29, 31, 32, 37, 43, 45, 49,
    53, 219, 2074, 2079, 3021, 3022, 3481, 3549, 3560, 3573, 3902, 3903,
    3904, 3913, 3917, 3918, 7592, 9910, 9946, 9952, 9957, 9960, 9964, 9971,
    9976, 9977, 9978, 9981, 9985, 10093, 10101, 14952, 14954, 14965, 14969,
    14970, 125920, 125922, 146195, 146800, 146817, 146819, 146820, 146821,
    146824, 167125
]
assert len(datasets_has_priors) == 64

In [71]:
from omegaconf import OmegaConf

def compute(dataset, cfg_metafeature, cfg_pipeline, cfg_main, cfg_task):
    """Assemble the run configuration for one dataset.

    Args:
        dataset: Task id used as ``openml_tid`` when ``cfg_main`` does not
            provide one.
        cfg_metafeature: Stored under the ``"metafeature"`` key.
        cfg_pipeline: Stored under the ``"pipeline"`` key.
        cfg_main: General settings; ``seed``, ``openml_tid``, ``data_path``
            and ``output_file`` are looked up with fallbacks.
        cfg_task: Stored under the ``"task"`` key.

    Returns:
        The object produced by ``OmegaConf.create`` for the assembled mapping.
    """
    # NOTE(review): an ``openml_tid`` entry in ``cfg_main`` takes precedence
    # over ``dataset`` — confirm config.yaml does not define one when the
    # caller expects ``dataset`` to be used.
    settings = {
        "seed": cfg_main.get("seed", 42),
        "pipeline": cfg_pipeline,
        "metafeature": cfg_metafeature,
        "task": cfg_task,
        "openml_tid": cfg_main.get("openml_tid", dataset),
        "data_path": cfg_main.get("data_path", "../data"),
        "output_file": cfg_main.get("output_file", None),
    }
    return OmegaConf.create(settings)

In [72]:
import os

def config(pipeline_name, config_base_path="../conf", task_name="task1"):
    """Load the four YAML configuration files needed for one run.

    Args:
        pipeline_name: Stem of the file under ``<base>/pipeline/``.
        config_base_path: Directory containing ``config.yaml`` and the
            ``pipeline``, ``metafeature`` and ``task`` sub-directories.
        task_name: Stem of the file under ``<base>/task/``.

    Returns:
        Tuple ``(cfg_main, cfg_pipeline, cfg_task, cfg_metafeature)``.
    """
    def _load(*relative_parts):
        # Resolve the path below the base directory and parse the file.
        return OmegaConf.load(os.path.join(config_base_path, *relative_parts))

    cfg_main = _load("config.yaml")
    cfg_pipeline = _load("pipeline", f"{pipeline_name}.yaml")
    # The metafeature file is fixed: this notebook only evaluates metafeatx.
    cfg_metafeature = _load("metafeature", "metafeatx.yaml")
    cfg_task = _load("task", f"{task_name}.yaml")

    return cfg_main, cfg_pipeline, cfg_task, cfg_metafeature


In [None]:
import random
from collections import defaultdict

# Experiment grid: one run per (pipeline, NDCG cutoff k, embedding dimension d).
pipelines = ['adaboost', 'random_forest', 'libsvm_svc']
k_values = [10, 15, 20, 25]
# 100 is the sentinel MetaFeatXCustom treats as "estimate the intrinsic dimension".
d_values = [2, 5, 10, 15, 20, 25, 100]

# results[pipeline][k][d] -> {'mean': ..., 'std': ...}
results = defaultdict(lambda: defaultdict(dict))

for pipeline in pipelines:
    for k_value in k_values:
        for d in d_values:
            # Pick one random dataset for this grid point.
            # NOTE(review): `random` is not seeded here, so the chosen
            # datasets differ between executions of this cell.
            dataset_id = random.choice(datasets_has_priors)
            print(f"PIPELINE =={pipeline} ---- NDCG@K=={k_value} ---- dimension=={d} ---- dataset random=={dataset_id}")

            cfg_main, cfg_pipeline, cfg_task, cfg_metafeature = config(pipeline_name=pipeline)

            # Build the configuration for this dataset.
            cfg = compute(
                dataset=dataset_id,
                cfg_metafeature=cfg_metafeature,
                cfg_pipeline=cfg_pipeline,
                cfg_main=cfg_main,
                cfg_task=cfg_task
            )

            # Apply the NDCG cutoff to the configuration that is actually
            # used below. This has to happen after `cfg` is created: setting
            # it beforehand either fails with NameError on a fresh kernel or
            # mutates the previous iteration's `cfg`, leaving this run on the
            # default cutoff from the task YAML.
            cfg.task.ndcg = k_value

            value_ndcg = run_task1(cfg=cfg, d=d)  # single score for this dataset

            # Only one dataset is evaluated per grid point, so the "mean" is
            # that single value and there is no spread to report.
            results[pipeline][k_value][d] = {
                'mean': float(value_ndcg),
                'std': 0.0
            }

print(results)


PIPELINE ==adaboost ---- NDCG@K==10 ---- dimension==2 ---- dataset random==146824
(72000, 136)
(64000, 137)
(64064, 137)
(64064, 137)
Trained linear mapping shape: (2, 135)
PIPELINE ==adaboost ---- NDCG@K==10 ---- dimension==5 ---- dataset random==9978
(72000, 136)
(64000, 137)
(64064, 137)
(64064, 137)
Trained linear mapping shape: (5, 135)
PIPELINE ==adaboost ---- NDCG@K==10 ---- dimension==10 ---- dataset random==14970
(72000, 136)
(64000, 137)
(64064, 137)
(64064, 137)
Trained linear mapping shape: (10, 135)
PIPELINE ==adaboost ---- NDCG@K==10 ---- dimension==15 ---- dataset random==23
(72000, 136)
(64000, 137)
(64064, 137)
(64064, 137)
Trained linear mapping shape: (15, 135)
PIPELINE ==adaboost ---- NDCG@K==10 ---- dimension==20 ---- dataset random==3904
(72000, 136)
(64000, 137)
(64064, 137)
(64064, 137)
Trained linear mapping shape: (20, 135)
PIPELINE ==adaboost ---- NDCG@K==10 ---- dimension==25 ---- dataset random==14952
(72000, 136)
(64000, 137)
(64064, 137)
(64064, 137)
Trai