In [1]:
import os
import pandas as pd
import yaml
from sklearn.metrics import f1_score

In [2]:
objects_partitions = pd.read_parquet('../../../../../data_acquisition/ztf_forced_photometry/preprocessed/partitions/241209_ndetge8/partitions.parquet')
partitions_test = objects_partitions[objects_partitions['partition'] == 'test'].set_index('oid')
partitions_test

Unnamed: 0_level_0,ra,dec,submitter,class_name,partition
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ZTF17aaazlzl,120.870621,33.462329,amunoz,AGN,test
ZTF22aaaokdq,219.567335,-6.972391,amunoz,AGN,test
ZTF22aaplgna,188.056342,53.776541,amunoz,AGN,test
ZTF22aboawka,85.887358,-27.651368,amunoz,AGN,test
ZTF22abuhxen,7.403244,-17.641729,amunoz,AGN,test
...,...,...,...,...,...
ZTF21aclutgt,83.637554,-5.963146,amunoz,YSO,test
ZTF22aaaftbh,241.747459,-20.551293,amunoz,YSO,test
ZTF22aafvuwd,240.219609,-25.395348,amunoz,YSO,test
ZTF22abuluix,100.011046,9.590104,amunoz,YSO,test


In [3]:
y_test = partitions_test['class_name'].values
y_test

array(['AGN', 'AGN', 'AGN', ..., 'YSO', 'YSO', 'YSO'], dtype=object)

In [13]:
import numpy as np

# Ruta base donde están los trials
parent_dir = "HBRF_20250305-093441"

# Listar los trials
trials = sorted([d for d in os.listdir(parent_dir) if d.startswith("trial_")])

# Lista para almacenar los resultados
results = []

for trial in trials:
    trial_path = os.path.join(parent_dir, trial)
    
    # Cargar hiperparámetros del trial
    hparams_file = os.path.join(trial_path, "hparams.yaml")
    with open(hparams_file, "r") as file:
        hparams = yaml.safe_load(file)

    # Lista para almacenar los F1-scores de los folds de este trial
    f1_scores = []

    # Iterar sobre los folds dentro del trial
    folds = sorted([d for d in os.listdir(trial_path) if d.startswith("fold_")])
    for fold in folds:
        fold_path = os.path.join(trial_path, fold)
        pred_file = os.path.join(fold_path, "predictions_test.parquet")

        if os.path.exists(pred_file):
            # Cargar predicciones
            hbrf_df_predictions = pd.read_parquet(pred_file)
            hbrf_df_predictions = hbrf_df_predictions[hbrf_df_predictions['shorten'] == 'None']
            hbrf_df_predictions.index = hbrf_df_predictions.index.astype(str).str.split('_').str[1]
            hbrf_df_predictions = hbrf_df_predictions.reindex(partitions_test.index) 
            hbrf_y_pred = hbrf_df_predictions['y_pred'].values

            # Calcular F1-score del fold
            f1 = f1_score(y_test, hbrf_y_pred, average="macro")
            f1_scores.append(f1)

    # Promediar F1-score de todos los folds del trial
    mean_f1 = sum(f1_scores) / len(f1_scores) if f1_scores else None
    std_f1 = np.std(f1_scores)

    # Guardar los resultados con los hiperparámetros
    hparams["mean_f1"] = mean_f1
    hparams["std_f1"] = std_f1
    hparams["trial"] = trial
    results.append(hparams)

# Convertir la lista de resultados en un DataFrame
df_results = pd.DataFrame(results)
df_results = df_results.sort_values(by="mean_f1", ascending=False)
df_results

Unnamed: 0,n_trees,criterion,max_depth,n_jobs,verbose,mean_f1,std_f1,trial
13,500,gini,50,8,11,0.726543,0.003625,trial_20
16,500,entropy,50,8,11,0.726543,0.003625,trial_23
6,350,gini,50,8,11,0.725911,0.00391,trial_14
9,350,entropy,50,8,11,0.725911,0.00391,trial_17
15,500,entropy,20,8,11,0.72584,0.004114,trial_22
11,500,gini,20,8,11,0.72584,0.004114,trial_19
5,350,gini,20,8,11,0.725682,0.005324,trial_13
8,350,entropy,20,8,11,0.725682,0.005324,trial_16
21,100,gini,20,8,11,0.720683,0.005856,trial_7
2,100,entropy,20,8,11,0.720683,0.005856,trial_10
