# Hits@K Evaluation per split and stage

In [199]:
import os
import pandas as pd
import numpy as np

In [None]:
# Load the prediction table. Later cells read its `model` and `doi` columns plus
# one column per evaluated property (the keys of `k_di`).
final_results = pd.read_parquet('data/processed/final_results.parquet')

In [201]:
def hits_at_k(k, y, y_pred):
    """Return the fraction of targets in ``y`` found among the first ``k`` predictions.

    Args:
        k: Cut-off rank; only the items of ``y_pred`` at positions
            ``0 .. k-1`` are considered. A value ``<= 0`` considers none.
        y: Iterable of ground-truth values.
        y_pred: Iterable of predictions in rank order.

    Returns:
        The mean of the per-target hit indicators (1 when the target equals
        one of the first ``k`` predictions, otherwise 0). ``nan`` is returned
        when ``y`` is empty.
    """
    # Build the top-k window once instead of re-scanning y_pred for every target.
    top_k = []
    for rank, value in enumerate(y_pred):
        if rank >= k:
            break
        top_k.append(value)

    # Compare with `==` (not `in`) so the matching rule stays purely
    # equality-based, e.g. a NaN target never matches a NaN prediction.
    hits = [1 if any(value == target for value in top_k) else 0 for target in y]

    if not hits:
        # np.mean([]) yields the same nan but also emits a RuntimeWarning.
        return np.nan
    return np.mean(hits)

# Cut-off K for each evaluated property; the evaluation loop passes
# k_di[<property>] as the `k` argument of hits_at_k.
k_di = dict(
    name=50,
    collectionType=1,
    collectionSpecie=50,
    bioActivity=5,
    collectionSite=20,
)

In [202]:
# Evaluation grid: every (stage, fold, property) combination has its own
# test split file under `splits_base_path`.
splits_base_path = r'data/splits'
properties = list(k_di.keys())
folds = [f'{x}' for x in range(10)]
estagios = ['1st', '2nd', '3rd', '4th']
models = final_results.model.unique().tolist()

# Index the predictions once by (model, doi). The original code re-filtered the
# whole `final_results` frame with a boolean mask for every single
# (stage, fold, property, model, doi) combination.
# Row order inside each group is the original row order, so the ranking of
# the predictions is unchanged.
predictions_by_key = {
    key: group
    for key, group in final_results.groupby(['model', 'doi'], sort=False)
}

all_results_splits = []
for estagio in estagios:
    print(f'{estagio} stage')
    for fold in folds:
        print(f'Fold {fold}')
        # `prop` instead of `property`: the latter shadows the Python builtin
        # for the rest of the notebook session.
        for prop in properties:
            print(f'Prop: {prop}')
            filename = os.path.join(splits_base_path, f'test_doi_{prop}_{fold}_{estagio}.csv')
            split = pd.read_csv(filename)
            # Normalise the DOI written in the split file ('@' -> '/', lower case).
            # regex=False makes the literal replacement explicit across pandas versions.
            # NOTE(review): `final_results.doi` is matched as stored - confirm it is
            # normalised the same way upstream, otherwise those DOIs score 0.
            split['node'] = split.node.astype(str).str.replace('@', '/', regex=False).str.lower()

            # Ground-truth values per DOI, computed once per split (they do not
            # depend on the model). sort=False keeps first-appearance order.
            targets_by_doi = {
                doi: neighbors.tolist()
                for doi, neighbors in split.groupby('node', sort=False)['neighbor'].unique().items()
            }

            for model in models:
                for doi, y in targets_by_doi.items():
                    group = predictions_by_key.get((model, doi))
                    # A (model, doi) pair without predictions scores 0 hits.
                    y_pred = [] if group is None else group[prop].unique().tolist()

                    all_results_splits.append({
                        'estagio': estagio,
                        'fold': fold,
                        'model': model,
                        'doi': doi,
                        'property': prop,
                        'hits@k': hits_at_k(k_di[prop], y=y, y_pred=y_pred),
                    })

# One row per (stage, fold, property, model, doi) with its hits@k score.
splits_results_df = pd.DataFrame(all_results_splits)

1st stage
Fold 0
Prop: name
Prop: collectionType
Prop: collectionSpecie
Prop: bioActivity
Prop: collectionSite
Fold 1
Prop: name
Prop: collectionType
Prop: collectionSpecie
Prop: bioActivity
Prop: collectionSite
Fold 2
Prop: name
Prop: collectionType
Prop: collectionSpecie
Prop: bioActivity
Prop: collectionSite
Fold 3
Prop: name
Prop: collectionType
Prop: collectionSpecie
Prop: bioActivity
Prop: collectionSite
Fold 4
Prop: name
Prop: collectionType
Prop: collectionSpecie
Prop: bioActivity
Prop: collectionSite
Fold 5
Prop: name
Prop: collectionType
Prop: collectionSpecie
Prop: bioActivity
Prop: collectionSite
Fold 6
Prop: name
Prop: collectionType
Prop: collectionSpecie
Prop: bioActivity
Prop: collectionSite
Fold 7
Prop: name
Prop: collectionType
Prop: collectionSpecie
Prop: bioActivity
Prop: collectionSite
Fold 8
Prop: name
Prop: collectionType
Prop: collectionSpecie
Prop: bioActivity
Prop: collectionSite
Fold 9
Prop: name
Prop: collectionType
Prop: collectionSpecie
Prop: bioActivity
P

In [208]:
# Mean hits@k for every (model, stage, fold) row, with one column per property.
view_property_pivot_por_fold = (
    splits_results_df
    .pivot_table(
        values='hits@k',
        index=['model', 'estagio', 'fold'],
        columns=['property'],
        aggfunc='mean',
    )
    .reset_index()
    .rename_axis(columns='')  # blank out the 'property' label on the column axis
)
view_property_pivot_por_fold.head(3)

Unnamed: 0,model,estagio,fold,bioActivity,collectionSite,collectionSpecie,collectionType,name
0,ensemble,1st,0,0.809333,0.75431,0.974138,0.952174,0.927229
1,ensemble,1st,1,0.792023,0.765217,0.974359,0.964912,0.9075
2,ensemble,1st,2,0.806944,0.74569,0.957265,0.95614,0.925922


In [204]:
# Mean hits@k per model (rows) and property (columns), over all rows of
# splits_results_df.
view_property_pivot_resumo = (
    splits_results_df
    .pivot_table(
        values='hits@k',
        index='model',
        columns='property',
        aggfunc='mean',
    )
    .rename_axis(columns='')  # blank out the 'property' label on the column axis
)
view_property_pivot_resumo

Unnamed: 0_level_0,bioActivity,collectionSite,collectionSpecie,collectionType,name
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ensemble,0.815809,0.765325,0.968248,0.960545,0.915354
google/gemini-2.5-pro-preview-03-25,0.783351,0.715028,0.960518,0.950364,0.88759
openai/gpt-4.1,0.785566,0.717991,0.92401,0.960545,0.641198


In [216]:
# One summary table per model: rows are properties, columns are stages and
# each cell is the mean hits@k over the remaining rows of splits_results_df.
views_per_model = {}
for model in models:
    view_stage_pivot_resumo = (
        splits_results_df[splits_results_df.model == model]
        .pivot_table(index='property', columns='estagio', aggfunc='mean', values='hits@k')
    )
    # Label the column axis with the model so the rendered table is self-describing.
    view_stage_pivot_resumo.columns.name = model
    views_per_model[model] = view_stage_pivot_resumo

# Pick the table by model name instead of by position: `models` follows the
# row order of `final_results`, so a positional index such as models[2] can
# silently show a different model after the data is regenerated.
model_to_show = 'ensemble'
views_per_model[model_to_show]

ensemble,1st,2nd,3rd,4th
property,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bioActivity,0.805787,0.815079,0.830409,0.830994
collectionSite,0.763342,0.764045,0.770718,0.766667
collectionSpecie,0.971146,0.969902,0.966546,0.954545
collectionType,0.956541,0.95995,0.964022,0.972222
name,0.921033,0.91835,0.904531,0.90438
