In [None]:
import sys
import os

# agregar la carpeta padre de notebook y model al path
sys.path.append(os.path.abspath(".."))

import random
from collections import defaultdict
import numpy as np
import math
from sklearn.linear_model import LinearRegression
from model import get_cost_matrix
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [None]:

# List of the datasets used in this notebook. The values are matched against
# the `task_id` column of the representation tables in the experiment cell.
datasets_has_priors = [
    3, 6, 11, 12, 14, 15, 16, 18, 22, 23, 28, 29, 31, 32, 37, 43, 45, 49,
    53, 219, 2074, 2079, 3021, 3022, 3481, 3549, 3560, 3573, 3902, 3903,
    3904, 3913, 3917, 3918, 7592, 9910, 9946, 9952, 9957, 9960, 9964, 9971,
    9976, 9977, 9978, 9981, 9985, 10093, 10101, 14952, 14954, 14965, 14969,
    14970, 125920, 125922, 146195, 146800, 146817, 146819, 146820, 146821,
    146824, 167125
]
# Guard against accidental edits: the list must hold exactly 64 entries.
assert len(datasets_has_priors) == 64

In [None]:
def extrac_matrix_distance(basic_reprs, target_reprs, column_id):
    """Return the cost matrix for the ids shared by both tables.

    Parameters
    ----------
    basic_reprs : pandas.DataFrame-like
        Table with a ``column_id`` column. Only that column is read here, to
        obtain the sorted list of unique ids.
    target_reprs : pandas.DataFrame-like
        Table with a ``column_id`` column. It is forwarded unchanged to
        ``get_cost_matrix``.
    column_id : str
        Name of the id column present in both tables.

    Returns
    -------
    The object returned by ``get_cost_matrix``. Its first dimension is
    asserted to equal the number of unique ids.

    Raises
    ------
    ValueError
        If the two tables do not contain exactly the same set of ids.
    """
    # Sorted unique ids; this order is the one handed to get_cost_matrix.
    list_ids = sorted(list(basic_reprs[column_id].unique()))

    # Every id must also be present in the target table (and vice versa).
    basic_ids = set(list_ids)
    target_ids = set(target_reprs[column_id].unique())
    if basic_ids != target_ids:
        only_basic = ", ".join(map(str, sorted(basic_ids - target_ids)))
        only_target = ", ".join(map(str, sorted(target_ids - basic_ids)))
        raise ValueError(
            f"Inconsistent ids in column '{column_id}': "
            f"only in basic_reprs: [{only_basic}]; "
            f"only in target_reprs: [{only_target}]."
        )

    cost_matrix = get_cost_matrix(
        target_repr=target_reprs,
        task_ids=list_ids,
        verbose=False,
        ncpus=1,
    )

    assert cost_matrix.shape[0] == len(list_ids), (
        "cost matrix size does not match the number of ids"
    )

    return cost_matrix
    

In [None]:

def intrinsic_estimator(matrix_distance, eps=10 ** (-3)):
    """Estimate the intrinsic dimension from a pairwise distance matrix.

    For every row, the second and third smallest *distinct* values are used
    as the first and second nearest-neighbour distances, and their ratio is
    computed. The sorted ratios are trimmed at the top, log-transformed, and
    regressed without intercept against ``-log(1 - F_emp)``, where ``F_emp``
    is the empirical CDF of the ratios. The regression slope is the estimate.
    NOTE(review): this appears to follow the TwoNN estimator -- confirm.

    Parameters
    ----------
    matrix_distance : 2-D array-like
        ``matrix_distance[i]`` must yield the distances from point ``i`` to
        all points. Presumably the smallest value in each row is the
        self-distance 0 -- TODO confirm against ``get_cost_matrix``.
    eps : float, optional
        Constant added to the first-neighbour distance in the denominator.
        The default reproduces the previous hard-coded behaviour. Note that
        a non-zero value makes the ratio depend on the distance scale; pass
        ``0`` for a strictly scale-invariant ratio.

    Returns
    -------
    numpy.float64
        Slope of the through-origin regression (the dimension estimate).

    Raises
    ------
    ValueError
        If the matrix is empty or a row has fewer than three distinct values.
    """
    n_points = len(matrix_distance)
    if n_points == 0:
        raise ValueError("matrix_distance is empty; nothing to estimate.")

    ratios = []
    # One ratio per point (row of the matrix).
    for row_idx in range(n_points):
        # np.unique returns the distinct values already sorted, so positions
        # 1 and 2 are the two nearest distinct neighbours after the row
        # minimum; ties are collapsed.
        distinct = np.unique(matrix_distance[row_idx])
        if distinct.size < 3:
            raise ValueError(
                f"Row {row_idx} of the distance matrix has only "
                f"{distinct.size} distinct value(s); at least 3 are required "
                "(the row minimum plus two nearest-neighbour distances)."
            )
        first_nn, second_nn = distinct[1], distinct[2]
        ratios.append(second_nn / (first_nn + eps))

    # Drop outliers: discard roughly the largest 10% of the ratios.
    ratios = np.sort(ratios)
    cutoff = int(np.floor(0.9 * len(ratios)))
    # NOTE(review): above 10 points one extra element is kept (cutoff + 1),
    # otherwise exactly `cutoff`. Kept as in the original; confirm the
    # asymmetry is intended.
    if len(ratios) > 10:
        ratios = ratios[0 : cutoff + 1]
    else:
        ratios = ratios[0:cutoff]

    # Replace non-positive ratios so the logarithm below is defined.
    ratios = [x if x > 0 else 1 + 10 ** (-3) for x in ratios]
    log_ratios = np.asarray([math.log(x) for x in ratios]).reshape(-1, 1)

    # Empirical CDF on the full sample size. Because of the trimming the last
    # value stays below 1, which keeps log(1 - F) finite.
    step = 1 / n_points
    emp_cdf = [i * step for i in range(1, len(log_ratios) + 1)]
    neg_log_survival = np.asarray(
        [-math.log(1 - x) for x in emp_cdf]
    ).reshape(-1, 1)

    # Through-origin linear regression; its slope is the dimension estimate.
    clf = LinearRegression(fit_intercept=False)
    clf.fit(log_ratios, neg_log_survival)

    return clf.coef_[0][0]

In [None]:
def load_and_preprocess_data(basic_path, target_path, id_column="task_id"):
    """Load both representation tables and standardise the basic one.

    Parameters
    ----------
    basic_path : str or path-like
        CSV file with the basic representations. Missing values are replaced
        by 0 right after loading.
    target_path : str or path-like
        CSV file with the target representations (loaded as is).
    id_column : str, optional
        Name of the id column present in both files. It is used for
        filtering and is excluded from scaling.

    Returns
    -------
    tuple
        ``(basic_representations, target_representations, scaler, cols)``:
        the filtered and standardised basic table, the target table, the
        fitted ``StandardScaler``, and the index of scaled column names.
    """
    # Load both tables.
    basic_representations = pd.read_csv(basic_path).fillna(0)
    target_representations = pd.read_csv(target_path)

    # Keep only the rows whose id also appears in the target table. The
    # explicit copy makes the column assignment below write into an
    # independent frame instead of a slice of the loaded one, which avoids
    # pandas' SettingWithCopyWarning.
    known_ids = target_representations[id_column].unique()
    basic_representations = basic_representations[
        basic_representations[id_column].isin(known_ids)
    ].copy()

    # Standardise every column except the id column.
    cols = basic_representations.columns.drop(id_column)
    scaler = StandardScaler()
    basic_representations[cols] = scaler.fit_transform(basic_representations[cols])

    return basic_representations, target_representations, scaler, cols


In [None]:
def get_basic_target_representations(pipeline):
    """Load the preprocessed basic/target representations for one pipeline.

    The basic table is always read from ``../data/basic_representations.csv``;
    the target table is read from ``../data/<pipeline>_target_representation.csv``.
    Returns the 4-tuple produced by ``load_and_preprocess_data``:
    ``(basic_representations, target_representations, scaler, cols)``.
    """
    target_csv = f"../data/{pipeline}_target_representation.csv"
    loaded = load_and_preprocess_data(
        basic_path="../data/basic_representations.csv",
        target_path=target_csv,
    )
    basic_table, target_table, fitted_scaler, feature_cols = loaded
    return basic_table, target_table, fitted_scaler, feature_cols

In [None]:
# import matplotlib.pyplot as plt
# ratios_np = np.array(ratios)

# plt.figure()

# for pipeline in pipelines:
#     means = [results[pipeline][r]["mean"] for r in ratios]
#     stds = [results[pipeline][r]["std"] for r in ratios]

#     plt.errorbar(
#         ratios_np,
#         means,
#         yerr=stds,
#         marker='o',
#         capsize=5,
#         label=pipeline
#     )

# plt.xlabel("Dataset ratio")
# plt.ylabel("Intrinsic dimension (mean ± std)")
# plt.title("Intrinsic Dimension vs Dataset Ratio per Pipeline")
# plt.legend()
# plt.grid(True)

# plt.show()











# # Construir la matriz de strings
# cell_text = []
# for pipeline in pipelines:
#     row = []
#     for ratio in ratios:
#         mean = results[pipeline][ratio]["mean"]
#         std  = results[pipeline][ratio]["std"]
#         row.append(f"{mean:.2f} ({std:.2f})")
#     cell_text.append(row)

# fig, ax = plt.subplots()
# ax.axis('off')

# table = ax.table(
#     cellText=cell_text,
#     rowLabels=pipelines,
#     colLabels=[str(r) for r in ratios],
#     loc='center'
# )

# table.auto_set_font_size(False)
# table.set_fontsize(10)
# table.scale(1.2, 1.5)

# ax.set_title("Intrinsic Dimension: mean (std)")

# plt.show()

In [None]:
# Experiment: intrinsic dimension of the cost matrix as a function of the
# fraction of datasets used, for each pipeline.
SEED = 42
ratios = [0.1, 0.25, 0.5, 0.75, 1]
pipelines = ['adaboost', 'random_forest', 'autosklearn', 'libsvm_svc']
results = defaultdict(dict)

# Dedicated, seeded generators so that a fresh "Restart & Run All" reproduces
# the same subsets and repeat counts without touching the global RNG state.
subset_rng = random.Random(SEED)
repeat_rng = np.random.default_rng(SEED)

for pipeline in pipelines:

    basic, target, _scaler, _cols = get_basic_target_representations(pipeline)

    for ratio in ratios:

        # Smaller subsets get more repetitions; the full set is evaluated
        # once because sampling every dataset always yields the same subset.
        # NOTE(review): the number of repetitions is itself drawn at random
        # within a range -- confirm this is intended rather than a fixed count.
        if ratio <= 0.25:
            n_repeats = int(repeat_rng.integers(25, 31))
        elif ratio <= 0.75:
            n_repeats = int(repeat_rng.integers(10, 16))
        else:
            n_repeats = 1

        intrinsic_values = []

        n_samples = int(len(datasets_has_priors) * ratio)

        print(f"PIPELINE =={pipeline} ---- RATIO=={ratio} ---- n_repeats=={n_repeats}")

        for _ in range(n_repeats):

            # Random subset of task ids (without replacement).
            task_ids_subset = subset_rng.sample(datasets_has_priors, n_samples)

            basic_subset = basic[basic["task_id"].isin(task_ids_subset)].copy()
            target_subset = target[target["task_id"].isin(task_ids_subset)].copy()

            assert len(basic_subset) == n_samples, (
                "some sampled task ids are missing from the basic representations"
            )

            cost_matrix = extrac_matrix_distance(
                basic_reprs=basic_subset,
                target_reprs=target_subset,
                column_id="task_id",
            )

            intrinsic_values.append(intrinsic_estimator(cost_matrix))

        # Aggregate over the repetitions (np.std is the population std, ddof=0).
        mean_dim = np.mean(intrinsic_values)
        std_dim = np.std(intrinsic_values)

        results[pipeline][ratio] = {"mean": mean_dim, "std": std_dim}


print(results)