In [None]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2

In [None]:
import os

# Anchor on the directory the kernel started in. The original expression was
# relative to the *current* directory, so re-running this cell climbed two
# more levels each time; remembering the start directory makes it idempotent.
_NOTEBOOK_START_DIR = globals().get("_NOTEBOOK_START_DIR") or os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, '..', '..', 'src'))
print("Current working directory:", os.getcwd())

In [None]:
from archaeo_super_prompt.dataset import MagohDataset, SamplingParams
import archaeo_super_prompt.modeling.train as training
import archaeo_super_prompt.modeling.predict as infering
import mlflow
import pandas as pd
from archaeo_super_prompt.visualization import mlflow_logging as mmlflow
from archaeo_super_prompt import visualization as visualizator
from archaeo_super_prompt.config.env import getenv_or_throw
from sklearn.pipeline import Pipeline
from sklearn import set_config

from pathlib import Path
from sklearn.base import BaseEstimator, TransformerMixin
from archaeo_super_prompt.utils.cache import get_cache_dir_for

class LoadScans(BaseEstimator, TransformerMixin):
    """Stateless transformer that joins the input frame with the cached scans CSV.

    Parameters
    ----------
    cache_csv : Path
        Location of the scans CSV; it must expose an ``id`` column.
    """

    def __init__(self, cache_csv: Path):
        # Keep the argument exactly as passed: scikit-learn's ``clone`` rejects
        # estimators whose ``__init__`` converts or replaces a parameter.
        self.cache_csv = cache_csv

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Inner-join ``X`` with the cached scans on ``id``.

        The CSV is re-read on every call and reduced to the first row of each
        ``id`` before merging.
        """
        scans = pd.read_csv(Path(self.cache_csv))
        scans = scans.drop_duplicates(subset=["id"])
        return X.merge(scans, on="id", how="inner")

# Path of the cached scans table inside the ("interim", "miscel") cache directory.
CACHE_CSV = get_cache_dir_for("interim", "miscel") / "scans.csv"
# Loaded once at notebook level; a later cell reads its "id" column to pick
# which dataset records to load.
SCANS_DF = pd.read_csv(CACHE_CSV)


In [None]:
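# Prerequisites: start these services in separate terminals before running the cells below.
# mlflow server --host 127.0.0.1 --port 8887

#source ~/.venvs/vllm/bin/activate
#vllm serve ibm-granite/granite-vision-3.3-2b --host 0.0.0.0 --port 8005
# NOTE(review): later cells point the client at port 8001, not 8005 - confirm which port is intended.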

In [None]:
# Name of the MLflow experiment that receives every run started from this notebook.
EXP_NAME = "Complete training"
# The tracking server address comes from the environment; getenv_or_throw is
# presumably raising when a variable is missing (name-based assumption - confirm).
mlflow.set_tracking_uri(f"http://{getenv_or_throw('MLFLOW_HOST')}:{getenv_or_throw('MLFLOW_PORT')}")
mlflow.set_experiment(EXP_NAME)
# Turn on MLflow's DSPy autologging, including compile and evaluation logging.
mlflow.dspy.autolog(log_compiles=True, log_evals=True, log_traces_from_compile=True)
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Render scikit-learn estimators as diagrams in cell outputs.
set_config(display="diagram")

In [None]:
from archaeo_super_prompt.dataset import MagohDataset

# Ids present in the cached scans table (missing ids are dropped before the
# integer conversion); only those records are loaded into the dataset.
selected_ids = {int(scan_id) for scan_id in SCANS_DF["id"].dropna().tolist()}
ds = MagohDataset(selected_ids)

# Keep only the dataset files that have a cached scan.
inputs = ds.files.merge(SCANS_DF[["id"]].drop_duplicates(), on="id", how="inner")
# The first 10 rows are used for training, everything after them for evaluation.
train_inputs = inputs.iloc[:10]
eval_inputs = inputs.iloc[10:]


In [None]:
import ast
from pathlib import Path
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

from archaeo_super_prompt.utils.cache import get_cache_dir_for
import archaeo_super_prompt.modeling.train as training
from archaeo_super_prompt.modeling.DAG_builder import DAGComponent, DAGBuilder

# Re-declared here (same value as in the first cell) so this cell can run on its own.
CACHE_CSV = get_cache_dir_for("interim", "miscel") / "scans.csv"

def _maybe_eval(v):
    if isinstance(v, str):
        s = v.strip()
        if s.startswith("[") or s.startswith("{") or s.startswith("("):
            try:
                return ast.literal_eval(s)
            except Exception:
                return v
    return v

def _ensure_list(v):
    if isinstance(v, list):
        return v
    if pd.isna(v):
        return []
    if isinstance(v, str):
        e = _maybe_eval(v)
        if isinstance(e, list):
            return e
        return [e]
    return [v]

def _ensure_int_list(v):
    """Coerce ``v`` to a list with ``_ensure_list`` and convert every item with ``int``.

    An item that ``int`` cannot convert raises, as with a plain ``int`` call.
    """
    return list(map(int, _ensure_list(v)))

class LoadScans(BaseEstimator, TransformerMixin):
    """Transformer that replaces the scan step with rows read from a cached CSV.

    ``fit`` loads and normalizes the CSV once; ``transform`` inner-joins the
    incoming frame with it on ``id``.

    Parameters
    ----------
    cache_csv : Path | str
        Location of the cached scans CSV.
    """

    def __init__(self, cache_csv: Path | str):
        # Keep the argument exactly as passed: scikit-learn's ``clone`` rejects
        # estimators whose ``__init__`` converts or replaces a parameter.
        self.cache_csv = cache_csv
        self._df = None

    def fit(self, X, y=None):
        """Read the cached CSV and normalize its columns; ``X`` and ``y`` are unused."""
        df = pd.read_csv(Path(self.cache_csv))
        if "id" in df.columns:
            df["id"] = df["id"].astype("int64").astype(int)
        for col in df.columns:
            if df[col].dtype == object:
                # Share of the first 20 values that look like a serialized
                # list/dict/tuple; columns at 60% or more are parsed back.
                sample = df[col].head(20).apply(lambda x: isinstance(x, str) and x.strip().startswith(("[", "{", "("))).mean()
                if sample >= 0.6:
                    df[col] = df[col].apply(_maybe_eval)
        if "chunk_type" in df.columns:
            df["chunk_type"] = df["chunk_type"].apply(_ensure_list)
        if "chunk_page_position" in df.columns:
            df["chunk_page_position"] = df["chunk_page_position"].apply(_ensure_int_list)
        self._df = df
        return self

    def transform(self, X):
        """Return a copy of ``X`` inner-joined with the cached scans on ``id``.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called yet.
        """
        if self._df is None:
            # Fail with an explicit message instead of the opaque TypeError
            # that merging with ``None`` would raise.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("LoadScans must be fitted before calling transform().")
        X = X.copy()
        if "id" in X.columns:
            # Align the key dtype with the cached frame before merging.
            X["id"] = X["id"].astype(int)
        return X.merge(self._df, on="id", how="inner")

# Replace `VLLM_Preprocessing` on the `training` module with a factory that
# ignores its keyword arguments and returns the cache-backed LoadScans instead.
# NOTE(review): a later cell reloads `training` with importlib.reload, which may
# discard this patch - confirm it is still needed here.
training.VLLM_Preprocessing = lambda **kw: LoadScans(CACHE_CSV)


In [None]:
# Build the training DAG parts, then assemble the complete inference DAG from them.
_base_parts = training.get_training_dag()
from archaeo_super_prompt.modeling import predict as infering
expected_final_pipeline = infering.build_complete_inference_dag(_base_parts)
# Bare last expression: shown as the cell output for visual inspection.
expected_final_pipeline


In [None]:
import os, importlib

# 1) Clear conflicting OPENAI envs that may point to :8000
# NOTE(review): the next cell sets OPENAI_BASE_URL / OPENAI_API_BASE again
# (to :8001), so this removal only holds until that cell runs.
for k in ("OPENAI_BASE_URL", "OPENAI_API_BASE"):
    os.environ.pop(k, None)

# 2) Point the repo's vLLM client to your server on :8001
# NOTE(review): the `vllm serve` command noted earlier in this notebook uses
# port 8005 - confirm which port the server really listens on.
os.environ["VLLM_SERVER_BASE_URL"] = "http://127.0.0.1:8001/v1"
# Placeholder key for the local server.
os.environ["OPENAI_API_KEY"] = "sk-local"

# 3) Reload provider + extractor to drop any previous monkey patches
# The rebound names refer to the freshly reloaded module objects.
from archaeo_super_prompt.modeling.struct_extract import language_model as lm_provider_mod
from archaeo_super_prompt.modeling.struct_extract import field_extractor as fe
lm_provider_mod = importlib.reload(lm_provider_mod)
fe = importlib.reload(fe)

# 4) Keep OCR skipped via cache
import pandas as pd
from pathlib import Path
from sklearn.base import BaseEstimator, TransformerMixin
from archaeo_super_prompt.utils.cache import get_cache_dir_for
from archaeo_super_prompt.modeling import pdf_to_text
import ast
import pandas as pd
from pathlib import Path
from sklearn.base import BaseEstimator, TransformerMixin

from archaeo_super_prompt.utils.cache import get_cache_dir_for
from archaeo_super_prompt.modeling import pdf_to_text

# Same cached scans CSV as in the earlier cells, re-declared so this cell is self-contained.
CACHE_CSV = get_cache_dir_for("interim", "miscel") / "scans.csv"

def _as_list(v):
    if isinstance(v, list): return v
    if pd.isna(v): return []
    if isinstance(v, str):
        s=v.strip()
        if s and s[0] in "[{(":
            try:
                x=ast.literal_eval(s)
                return x if isinstance(x, list) else [x]
            except Exception:
                return [v]
        return [v]
    return [v]

def _as_str_list(v):
    """Coerce ``v`` to a list with ``_as_list`` and convert every item with ``str``."""
    return list(map(str, _as_list(v)))

def _as_int_list(v):
    """Coerce ``v`` to a list of ints, silently skipping unconvertible items.

    Each item from ``_as_list(v)`` is converted with ``int``. If that fails,
    ``int(float(item))`` is tried. Items failing both are left out of the result.
    """
    converters = (int, lambda raw: int(float(raw)))
    ints = []
    for item in _as_list(v):
        for convert in converters:
            try:
                ints.append(convert(item))
            except Exception:
                # Try the next converter; the item is dropped if none succeeds.
                continue
            break
    return ints

class LoadScans(BaseEstimator, TransformerMixin):
    """Transformer that replaces the scan step with rows read from a cached CSV.

    ``fit`` loads the CSV once and turns its list-like columns back into Python
    lists; ``transform`` inner-joins the incoming frame with it on ``id``.

    Parameters
    ----------
    cache_csv : str | Path
        Location of the cached scans CSV.
    """

    def __init__(self, cache_csv: str | Path):
        # Keep the argument exactly as passed: scikit-learn's ``clone`` rejects
        # estimators whose ``__init__`` converts or replaces a parameter.
        self.cache_csv = cache_csv
        self._df = None

    def fit(self, X, y=None):
        """Read the cached CSV and normalize its columns; ``X`` and ``y`` are unused."""
        df = pd.read_csv(Path(self.cache_csv))
        if "id" in df.columns:
            df["id"] = df["id"].astype("int64").astype(int)
        # Columns that are present are coerced to lists of the matching item type.
        if "chunk_type" in df.columns:
            df["chunk_type"] = df["chunk_type"].apply(_as_str_list)
        if "chunk_page_position" in df.columns:
            df["chunk_page_position"] = df["chunk_page_position"].apply(_as_int_list)
        if "identified_thesaurus" in df.columns:
            df["identified_thesaurus"] = df["identified_thesaurus"].apply(_as_int_list)
        if "named_entities" in df.columns:
            df["named_entities"] = df["named_entities"].apply(_as_list)
        self._df = df
        return self

    def transform(self, X):
        """Return a copy of ``X`` inner-joined with the cached scans on ``id``.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called yet.
        """
        if self._df is None:
            # Fail with an explicit message instead of the opaque TypeError
            # that merging with ``None`` would raise.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("LoadScans must be fitted before calling transform().")
        X = X.copy()
        if "id" in X.columns:
            # Align the key dtype with the cached frame before merging.
            X["id"] = X["id"].astype(int)
        return X.merge(self._df, on="id", how="inner")

# Replace `VLLM_Preprocessing` on the `pdf_to_text` module with a factory that
# ignores its keyword arguments and returns the cache-backed LoadScans.
pdf_to_text.VLLM_Preprocessing = lambda **kw: LoadScans(CACHE_CSV)


# 5) Reload the training module to rebuild the DAG with clean state
# NOTE(review): the reload re-executes the module, so attributes patched onto
# `training` in earlier cells are presumably lost; only the `pdf_to_text` patch
# above is expected to survive - confirm how `training` looks this name up.
import archaeo_super_prompt.modeling.train as training
training = importlib.reload(training)


In [None]:
import os
# Point OpenAI-compatible clients at the local server on :8001 with a placeholder key.
# NOTE(review): the previous cell removes OPENAI_BASE_URL / OPENAI_API_BASE from the
# environment; this cell sets them again - confirm which of the two is intended.
os.environ["OPENAI_API_KEY"]  = "sk-local"
os.environ["OPENAI_BASE_URL"] = "http://127.0.0.1:8001/v1"
# Keep the alternative variable name in sync with OPENAI_BASE_URL.
os.environ["OPENAI_API_BASE"] = os.environ["OPENAI_BASE_URL"]


In [None]:
from archaeo_super_prompt.modeling.struct_extract.extractors.intervention_date import (
    InterventionStartExtractor, ITALIAN_MONTHS, Data, DataInterventoInputData
)

def _to_dspy_input_fixed(self, x):
    """Build the DSPy input for the intervention-date extractor from one row.

    The row's ``merged_chunks`` are passed through as the report fragments.
    Its ``data_protocollo`` is split into day, Italian month name (looked up
    in ``ITALIAN_MONTHS`` with the 1-based month number) and year.
    """
    protocol_date = x.data_protocollo
    report_chunks = x.merged_chunks
    day = int(protocol_date.day)
    # ITALIAN_MONTHS is indexed from 0 while month numbers start at 1.
    month_name = ITALIAN_MONTHS[int(protocol_date.month) - 1]
    year = int(protocol_date.year)
    return DataInterventoInputData(
        fragmenti_relazione=report_chunks,
        data_di_archiviazone=Data(giorno=day, mese=month_name, anno=year),
    )

# Override the extractor's `_to_dspy_input` at class level with the notebook's
# version, so every InterventionStartExtractor instance uses it.
InterventionStartExtractor._to_dspy_input = _to_dspy_input_fixed


In [None]:
from archaeo_super_prompt.modeling import predict as infering
# Train and evaluate inside a single MLflow run.
with mlflow.start_run():
    # Train the DAG on the training split.
    trained_dag_parts = training.train_from_scratch(train_inputs, ds)
    # Score the trained parts on the evaluation split; `detailed_results` feeds
    # the visualisation cell below.
    per_field_scores, detailed_results = infering.score_dag(trained_dag_parts, eval_inputs, ds)


In [None]:
# Hand the detailed evaluation results of the run above to the visualisation engine.
visualizator.init_complete_vizualisation_engine(detailed_results)

In [None]:
# Start the display server for the visualisation.
# NOTE(review): presumably blocks the kernel while the server runs - confirm.
visualizator.run_display_server()