In [1]:
import os
import socket
from gp.actors.data import (
    DataActor,
    DataActorArgs,
    DBActor,
    DBActorArgs,
    KGDBArgs,
    KGName,
)
from libactor.storage._global_storage import GlobalStorage
from experiments.config import DATABASE_DIR, LIBACTOR_STORAGE_DIR
from gpp.llm.qa_llm import ExplicitV100, Schema
from gpp.actors.qa_llm_actor import QALLMActor, QALLMActorArgs
from sm.misc.funcs import get_classpath
from libactor.cache import IdentObj
from tqdm import tqdm
from gp.actors.data import KGDB
from gp.misc.evaluation.evaluator import Evaluator
from experiments.dag import *

In [2]:
# Set the HF_REMOTE environment variable for this kernel process.
# NOTE(review): the host/port are hard-coded to a localhost endpoint; presumably
# this is consumed by the LLM backend used further down — confirm before running
# on another machine.
os.environ["HF_REMOTE"] = "http://localhost:31171"
# Initialise libactor's global storage at the configured directory.
# The trailing semicolon stops IPython from echoing the returned GlobalStorage
# object (a repr with a memory address) as the cell's output.
GlobalStorage.init(LIBACTOR_STORAGE_DIR);

[32m2025-05-04 17:30:50.058[0m | [1mINFO    [0m | [36mlibactor.storage._global_storage[0m:[36minit[0m:[36m41[0m - [1mGlobalStorage: /Users/rook/workspace/projects/resm-v2/data/libactor[0m


<libactor.storage._global_storage.GlobalStorage at 0x14edeb770>

In [3]:
# Build the data actor; later cells use it for `get_kgdb` and `load_dataset`.
# NOTE(review): `create_data_actor` is not imported by name in this notebook —
# presumably it comes from the wildcard `from experiments.dag import *`; confirm.
data_actor = create_data_actor()

In [4]:
from gp.actors.data import GPExample
from gp.actors.el.canreg import CanRegActor, CanRegActorArgs, OracleCanReg
from gp.entity_linking.candidate_recognition import HeuristicCanReg
from kgdata.models.ontology import Ontology
from libactor.dag import DAG, Flow, Cardinality
from libactor.cache import BackendFactory, cache
from libactor.cache.identitied_object import IdentObj
from libactor.misc import identity
from sm.misc.ray_helper import get_instance

# "canreg" stage: a CanRegActor configured with the HeuristicCanReg class path
# and no extra class arguments, fed by the output of the "table" stage.
canreg_flow = Flow(
    "table",
    CanRegActor(
        CanRegActorArgs(clspath=get_classpath(HeuristicCanReg), clsargs={})
    ),
)

# "sm" stage: a QALLMActor running the ExplicitV100 model class, fed by both the
# "table" and "canreg" stages.
qa_llm_args = QALLMActorArgs(
    model=get_classpath(ExplicitV100),
    model_args={"model": "meta-llama/Meta-Llama-3.1-8B-Instruct"},
    sample_size=100,
    seed=42,
    can_ask_for_correction=True,
)
sm_flow = Flow(["table", "canreg"], QALLMActor(qa_llm_args))

dag = create_gpp_dag(
    {
        # Alternative wiring that starts from the data actor (currently disabled):
        # "data": data_actor,
        # "table": Flow("data", identity, Cardinality.ONE_TO_MANY),
        "table": [get_table, remove_unknown_columns],
        "canreg": canreg_flow,
        "sm": sm_flow,
    },
)

In [5]:
# Dataset identifier; reused for the KG database lookup, the example load and
# the DAG context built in a later cell.
dataset_name = "wt250"
# KG database handle for this dataset; its ontology and entity labels are handed
# to the Evaluator in a later cell.
kgdb = data_actor.get_kgdb(dataset_name)
# Examples of the dataset; each one becomes one DAG input and is scored later.
examples = data_actor.load_dataset(dataset_name)

[32m2025-05-04 17:30:57.738[0m | [34m[1mDEBUG   [0m | [36mtimer[0m:[36mwatch_and_report[0m:[36m74[0m - [34m[1mDataActor.load_dataset deserialize: 0.199 seconds[0m


In [6]:
# Context function shared by every example of this dataset.
contextfn = get_gpp_context(data_actor, dataset_name)

# One input mapping per example: the example is passed to the "table" stage as a
# single-element tuple.
dag_inputs = [{"table": (ex,)} for ex in examples]
# The same context function object is supplied once per example.
dag_contexts = [contextfn for _ in examples]

# Run the DAG over all examples with two parallel jobs; the "sm" and "table"
# stage results are the ones read back in the following cells.
output = dag.par_process(
    dag_inputs,
    {"sm", "table"},
    dag_contexts,
    n_jobs=2,
)

dag parallel processing:   0%|          | 0/250 [00:00<?, ?it/s]2025-05-04 17:31:02.986 | INFO     | libactor.storage._global_storage:init:41 - GlobalStorage: /Users/rook/workspace/projects/resm-v2/data/libactor
2025-05-04 17:31:02.986 | INFO     | libactor.storage._global_storage:init:41 - GlobalStorage: /Users/rook/workspace/projects/resm-v2/data/libactor
2025-05-04 17:31:03.194 | DEBUG    | timer:watch_and_report:74 - DataActor.load_dataset deserialize: 0.207 seconds
2025-05-04 17:31:03.194 | DEBUG    | timer:watch_and_report:74 - DataActor.load_dataset deserialize: 0.207 seconds
2025-05-04 17:31:03.235 | DEBUG    | gpp.actors.qa_llm_actor:get_agent:89 - Working directory for agent: /Users/rook/workspace/projects/resm-v2/data/libactor/QALLMActor_100/00d4e04c
2025-05-04 17:31:03.235 | DEBUG    | gpp.actors.qa_llm_actor:get_agent:89 - Working directory for agent: /Users/rook/workspace/projects/resm-v2/data/libactor/QALLMActor_100/00d4e04c
dag parallel processing: 100%|██████████| 250/

In [7]:
# Sanity check: the DAG must return results in the same order as the examples.
expected_ids = [ex.id for ex in examples]
returned_ids = [exout["table"][0].value.id for exout in output]
assert expected_ids == returned_ids

# Show the first example: its id, its table as a DataFrame, and its "sm" result
# rendered for the notebook.
first_output = output[0]
first_table = first_output["table"][0].value
display(first_table.id)
display(first_table.table.table.df)
first_output["sm"][0].value.print(env="notebook")

'wt250__11th_Lok_Sabha'

Unnamed: 0,Constituency,Name of Elected M.P.,Party Affiliation
0,Bidar,Ramchandra Veerappa,Bharatiya Janata Party
1,Gulbarga,Qamar ul Islam,Janata Dal
2,Raichur,Raja Rangappa Naik,Janata Dal
3,Koppal,Basavaraj Rayareddy,Janata Dal
4,Bellary,K.C. Kondaiah,Indian National Congress
5,Davangere,Gowdar Mallikarjunappa,Bharatiya Janata Party
6,Chitradurga,Puli Kodandaramaiah,Janata Dal
7,Tumkur,C.N. Bhaskarappa,Janata Dal
8,Chikballapur,R.L. Jalappa,Janata Dal
9,Kolar,K.H. Muniyappa,Indian National Congress


HTML(value='<pre>\n00.\t<span style="background: #b7eb8f; color: black; padding: 2px; border-radius: 3px;">[0]…

In [8]:
import serde.json

from experiments.config import DATA_DIR
from experiments.misc import get_notebook_output_file

# Resolve the output path for this notebook's results under DATA_DIR/experiments.
# NOTE(review): the literal 4 is passed straight to get_notebook_output_file —
# presumably the notebook/experiment number; confirm.
outfile = get_notebook_output_file(DATA_DIR / "experiments", 4, ".json")

# Map each table id to the dict form of its "sm" result, then write it as
# indented JSON.
sm_by_table_id = {
    result["table"][0].value.id: result["sm"][0].value.to_dict()
    for result in output
}
serde.json.ser(sm_by_table_id, outfile, indent=2)

In [9]:
# Evaluator built from the KG database's ontology and its entity labels.
# NOTE(review): `.cache()` presumably returns a cached/in-memory view of
# `entity_labels` — confirm against the kgdata API.
evaluator = Evaluator(kgdb.ontology.value, kgdb.pydb.entity_labels.cache())

In [10]:
# Gather the "sm" result of every example once, then score it with both the
# avg_cta and avg_cpa metrics against the gold examples.
predicted_sms = [result["sm"][0].value for result in output]
ctas = evaluator.avg_cta(examples, predicted_sms)
cpas = evaluator.avg_cpa(examples, predicted_sms)

[32m2025-05-04 17:31:22.593[0m | [1mINFO    [0m | [36mgp.misc.evaluation.evaluator[0m:[36mavg_cta[0m:[36m104[0m - [1mfor copying...
	cta-p	cta-r	cta-f1
63.78,64.53,63.83[0m
[32m2025-05-04 17:31:22.620[0m | [1mINFO    [0m | [36msm.evaluation.sm_metrics[0m:[36mprecision_recall_f1[0m:[36m620[0m - [1mNumber of permutation is: 116280.0[0m
[32m2025-05-04 17:31:22.620[0m | [31m[1mERROR   [0m | [36msm.evaluation.sm_metrics[0m:[36mprecision_recall_f1[0m:[36m626[0m - [31m[1mPermutation explosion: got 116280.0 combinations from 1 pair groups[0m
[32m2025-05-04 17:31:22.621[0m | [1mINFO    [0m | [36msm.evaluation.sm_metrics[0m:[36mprecision_recall_f1[0m:[36m632[0m - [1m- [(label=http://wikiba.se/ontology#Statement, X=(#nodes=4), X_prime=(#nodes=20))][0m
[32m2025-05-04 17:31:22.621[0m | [1mINFO    [0m | [36msm.evaluation.sm_metrics[0m:[36mprecision_recall_f1[0m:[36m620[0m - [1mNumber of permutation is: 116280.0[0m
[32m2025-05-04 17:31:22