In [1]:
import os
import socket
from gp.actors.data import (
    DataActor,
    DataActorArgs,
    DBActor,
    DBActorArgs,
    KGDBArgs,
    KGName,
)
from libactor.storage._global_storage import GlobalStorage
from experiments.config import DATABASE_DIR, LIBACTOR_STORAGE_DIR
from gpp.llm.qa_llm import ExplicitV100, Schema
from gpp.actors.qa_llm_actor import QALLMActor, QALLMActorArgs
from sm.misc.funcs import get_classpath
from libactor.cache import IdentObj
from tqdm import tqdm
from gp.actors.data import KGDB
from gp.misc.evaluation.evaluator import Evaluator
from experiments.dag import *

In [2]:
# Address exported through the HF_REMOTE environment variable.
# NOTE(review): presumably the locally running model server that the LLM actor
# talks to — confirm against the code that reads HF_REMOTE.
HF_REMOTE_URL = "http://localhost:31061"
os.environ["HF_REMOTE"] = HF_REMOTE_URL

# Initialise libactor's global storage at the project-configured directory.
# Kept as the cell's final expression so the returned object is still displayed.
GlobalStorage.init(LIBACTOR_STORAGE_DIR)

[32m2025-05-04 17:20:05.753[0m | [1mINFO    [0m | [36mlibactor.storage._global_storage[0m:[36minit[0m:[36m41[0m - [1mGlobalStorage: /Users/rook/workspace/projects/resm-v2/data/libactor[0m


<libactor.storage._global_storage.GlobalStorage at 0x307325820>

In [3]:
# Build the data actor that later cells use to fetch the KG database, load the
# dataset and create the per-example DAG context.
# NOTE(review): `create_data_actor` is not defined or imported by name in this
# notebook — presumably it arrives via `from experiments.dag import *`; confirm.
data_actor = create_data_actor()

In [5]:
from gp.actors.data import GPExample
from gp.actors.el.canreg import CanRegActor, CanRegActorArgs, OracleCanReg
from gp.entity_linking.candidate_recognition import HeuristicCanReg
from kgdata.models.ontology import Ontology
from libactor.dag import DAG, Flow, Cardinality
from libactor.cache import BackendFactory, cache
from libactor.cache.identitied_object import IdentObj
from libactor.misc import identity
from sm.misc.ray_helper import get_instance

# Candidate-recognition actor: wraps the HeuristicCanReg class (referenced by its
# class path) with no extra constructor arguments.
heuristic_canreg_actor = CanRegActor(
    CanRegActorArgs(
        clspath=get_classpath(HeuristicCanReg),
        clsargs={},
    )
)

# Question-answering LLM actor that produces the "sm" output.
# Alternative `model_args` kept for reference:
#   {"model": "google/gemma-2-9b-it", "max_new_tokens": 64}
#   {"model": "meta-llama/Meta-Llama-3.1-8B-Instruct"}
#   {"model": "meta-llama/Llama-2-7b-hf"}
qa_llm_actor = QALLMActor(
    QALLMActorArgs(
        model=get_classpath(ExplicitV100),
        model_args={"model": "allenai/OLMo-2-1124-7B-Instruct"},
        sample_size=100,
        seed=42,
        can_ask_for_correction=True,
    )
)

# Wire the three stages together: table -> canreg -> sm.
# Disabled alternative wiring, kept for reference:
#   "data": data_actor,
#   "table": Flow("data", identity, Cardinality.ONE_TO_MANY),
dag = create_gpp_dag(
    {
        "table": [get_table, remove_unknown_columns],
        "canreg": [
            Flow("table", heuristic_canreg_actor),
            # NOTE(review): the empty-string source presumably refers to the
            # output of the preceding step in this list — confirm with libactor.
            Flow(["table", ""], filter_skip_entity_columns),
        ],
        "sm": Flow(["table", "canreg"], qa_llm_actor),
    },
)

In [6]:
# Name of the dataset used throughout this run; it is reused when building the
# per-example context for the DAG.
dataset_name = "t2dv2"
# KG database for that dataset; its ontology and entity labels feed the evaluator.
kgdb = data_actor.get_kgdb(dataset_name)
# Examples that are pushed through the DAG and later scored against.
examples = data_actor.load_dataset(dataset_name)

[32m2025-05-04 17:20:24.475[0m | [34m[1mDEBUG   [0m | [36mtimer[0m:[36mwatch_and_report[0m:[36m74[0m - [34m[1mDataActor.load_dataset deserialize: 0.023 seconds[0m


In [7]:
# A single context callable is shared by every example.
contextfn = get_gpp_context(data_actor, dataset_name)

# One input mapping per example: the "table" entry receives the example wrapped
# in a 1-tuple.
dag_inputs = [{"table": (example,)} for example in examples]
# One context entry per example, all pointing at the same callable.
dag_contexts = [contextfn for _ in examples]

# Run the DAG over all examples with two jobs, requesting the "sm" and "table"
# outputs; results are indexed by those names in the cells that follow.
output = dag.par_process(
    dag_inputs,
    {"sm", "table"},
    dag_contexts,
    n_jobs=2,
)

dag parallel processing:   0%|          | 0/224 [00:00<?, ?it/s]2025-05-04 17:20:30.141 | INFO     | libactor.storage._global_storage:init:41 - GlobalStorage: /Users/rook/workspace/projects/resm-v2/data/libactor
2025-05-04 17:20:30.141 | INFO     | libactor.storage._global_storage:init:41 - GlobalStorage: /Users/rook/workspace/projects/resm-v2/data/libactor
2025-05-04 17:20:30.209 | DEBUG    | timer:watch_and_report:74 - DataActor.load_dataset deserialize: 0.068 seconds
2025-05-04 17:20:30.210 | DEBUG    | timer:watch_and_report:74 - DataActor.load_dataset deserialize: 0.069 seconds
2025-05-04 17:20:30.215 | DEBUG    | gpp.actors.qa_llm_actor:get_agent:89 - Working directory for agent: /Users/rook/workspace/projects/resm-v2/data/libactor/QALLMActor_100/a8ea4a43
2025-05-04 17:20:30.216 | DEBUG    | gpp.actors.qa_llm_actor:get_agent:89 - Working directory for agent: /Users/rook/workspace/projects/resm-v2/data/libactor/QALLMActor_100/a8ea4a43
dag parallel processing: 100%|██████████| 224/

In [8]:
# Sanity check: results must come back in the same order as the input examples.
expected_ids = [ex.id for ex in examples]
returned_ids = [exout["table"][0].value.id for exout in output]
assert expected_ids == returned_ids

# Inspect the first result: its id, the underlying table data frame, and the
# rendered semantic-model output.
first_result = output[0]
first_table = first_result["table"][0].value
display(first_table.id)
display(first_table.table.table.df)
first_result["sm"][0].value.print(env="notebook")

't2dv2__10151359_0_8168779773862259178'

Unnamed: 0,Title,Author
0,Adventures of Huckleberry Finn,Mark Twain
1,The Adventures of Super Diaper Baby,Dav Pilkey
2,The Adventures of Tom Sawyer,Mark Twain
3,Alice series,Phyllis Reynolds Naylor
4,All the King's Men,Robert Penn Warren
...,...,...
146,The Wish Giver,Bill Brittain
147,The Witches,Roald Dahl
148,Women in Love,D. H. Lawrence
149,Women on Top: How Real Life Has Changed Women?...,Nancy Friday


HTML(value='<pre>\n00.\t<span style="background: #b7eb8f; color: black; padding: 2px; border-radius: 3px;">[0]…

In [9]:
import serde.json

from experiments.config import DATA_DIR
from experiments.misc import get_notebook_output_file

# Destination JSON file under DATA_DIR / "experiments".
# NOTE(review): the meaning of the positional `4` is not visible here —
# presumably this notebook's / experiment's number; confirm with
# `get_notebook_output_file`.
outfile = get_notebook_output_file(DATA_DIR / "experiments", 4, ".json")

# Map each table id to the dict form of its "sm" output.
sm_by_table_id = {
    result["table"][0].value.id: result["sm"][0].value.to_dict()
    for result in output
}

# Write the mapping to disk with a 2-space indent.
serde.json.ser(sm_by_table_id, outfile, indent=2)

In [8]:
# Evaluator built from the KG database's ontology and the value returned by
# `entity_labels.cache()`.
# NOTE(review): `.cache()` presumably yields a cached view of the entity
# labels — confirm.
evaluator = Evaluator(kgdb.ontology.value, kgdb.pydb.entity_labels.cache())

In [9]:
# Extract the "sm" output of every example once and score it with both metrics,
# instead of rebuilding the identical list for each call.
# NOTE(review): both calls now receive the same list object — this assumes the
# evaluator does not mutate its argument; confirm.
# NOTE(review): the recorded output of this cell is dated 2025-04-18, earlier
# than the 2025-05-04 run logged by the cells above — re-run this cell so the
# reported numbers match the current `output`.
sm_predictions = [result["sm"][0].value for result in output]
ctas = evaluator.avg_cta(examples, sm_predictions)
cpas = evaluator.avg_cpa(examples, sm_predictions)

[32m2025-04-18 14:10:06.231[0m | [1mINFO    [0m | [36mgp.misc.evaluation.evaluator[0m:[36mavg_cta[0m:[36m104[0m - [1mfor copying...
	cta-p	cta-r	cta-f1
91.22,89.44,89.44[0m
[32m2025-04-18 14:10:07.021[0m | [1mINFO    [0m | [36mgp.misc.evaluation.evaluator[0m:[36mavg_cpa[0m:[36m78[0m - [1mfor copying...
	cpa-p	cpa-r	cpa-f1
63.29,60.99,61.50[0m
