# Unified Launcher: Training → Offline Bundle → Inference

This notebook runs the **full pipeline** in a single session:

1) **Training** (`training.py`)
   - Builds the training dataset (if needed)
   - Splits into train/val/test (split ratios are passed in `full` and `build-dataset` modes)
   - Trains the selected model
   - Writes artifacts to a unique run directory under `outputs/`

2) **Offline preprocessing** (`build_ontology_bundle.py`)
   - Builds the internal ontology CSV (`iri, local_name, label, text`)
   - Builds the `offline_bundle.pkl` used for retrieval
   - Computes semantic embeddings using a bi-encoder (for hybrid retrieval)

3) **Inference** (`run_inference.py`)
   - Loads the offline bundle + ontology CSV
   - Retrieves candidates (lexical or hybrid)
   - Scores candidates with the trained cross-encoder
   - Produces a predictions CSV (optionally storing top-N candidates)

**Important:**  
Do **not** zip/download intermediate artifacts if your goal is to run inference immediately after training.  
Zipping is only useful to export results to your local machine at the end.

---

## Setup — Repository and Dependencies

This cell prepares the execution environment and makes the notebook **fully reproducible** on platforms such as Google Colab or similar hosted environments.

Specifically, the cell performs the following steps:

1. **Clones the project repository** (or reuses it if it is already present).
2. **Sets the working directory** to the repository root, so that the core scripts
   (`training.py`, `build_ontology_bundle.py`, `run_inference.py`) can be executed
   directly from the notebook.
3. **Installs Python dependencies** from `requirements.txt`, if the file is present.
4. **Runs sanity checks** to ensure that the main framework scripts are available
   in the expected locations.

After this cell completes successfully, the notebook can safely execute the full
pipeline: **training → offline bundle → inference**, in a single, unified workflow.

In [None]:
# ============================================
# SETUP (clone repo + install deps)
# ============================================
from pathlib import Path
import os
import subprocess

# Git URL of the project repository that the clone step below checks out.
REPO_URL = "https://github.com/adsp-polito/2025-P13-Ontology-Alignment.git"  # <-- change if needed
# Absolute path of the local checkout, resolved against the directory the
# notebook is running in when this cell executes.
REPO_DIR = Path("repo").resolve()

def sh(cmd: str):
    """Echo a shell command line, then execute it through the system shell.

    Args:
        cmd: Complete command line, interpreted by the shell.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    print("\n$", cmd)
    # check=True makes a non-zero exit status raise, so a failing step stops
    # the cell instead of being silently ignored.
    subprocess.run(cmd, shell=True, check=True)

# 1) Clone (or reuse)
# The clone is run with an argument list (no shell) because REPO_DIR is an
# absolute path: if it contains spaces or shell metacharacters, interpolating
# it into a shell string would split or mangle the destination argument.
if not REPO_DIR.exists():
    print("\n$", f"git clone {REPO_URL} {REPO_DIR}")
    subprocess.check_call(["git", "clone", REPO_URL, str(REPO_DIR)])
else:
    print("Repo already present at:", REPO_DIR)

# 2) Move into repo
# Every later cell uses paths relative to the repository root.
os.chdir(REPO_DIR)
print("CWD:", Path(".").resolve())

# 3) Install requirements (best-effort)
# A missing requirements.txt is not an error; the install is simply skipped.
if Path("requirements.txt").exists():
    sh("pip -q install -r requirements.txt")
else:
    print("No requirements.txt found. Skipping pip install.")

# 4) Sanity checks: scripts must exist
# Stops at the first missing script so the pipeline is never started on an
# incomplete checkout.
for p in ["training.py", "build_ontology_bundle.py", "run_inference.py"]:
    if not Path(p).exists():
        raise FileNotFoundError(f"Missing {p} in repo root: {Path('.').resolve()}")

print("Setup OK: repo + scripts found.")

---

## Configuration

Set all parameters below.  
The pipeline will create a unique run directory in `outputs/` and reuse it across all steps.

In [None]:
# ============================================
# CONFIGURATION (unified training -> offline -> inference)
# ============================================
from pathlib import Path
from datetime import datetime

# Absolute path of the directory this cell runs in (the repository root once
# the setup cell has changed into it). The pipeline cell passes it as the
# working directory of every subprocess.
REPO_ROOT = Path(".").resolve()
print("REPO_ROOT:", REPO_ROOT)

# -----------------------------
# Run id / output layout
# -----------------------------
# Timestamped id (second resolution): each execution of this cell gets its own
# output directory.
RUN_ID = f"unified_run_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
# NOTE: relative path, i.e. interpreted against the current working directory.
OUT_DIR = Path("outputs") / RUN_ID
OUT_DIR.mkdir(parents=True, exist_ok=True)

# One sub-directory per pipeline step (training, offline bundle, inference).
TRAIN_DIR = OUT_DIR / "training"
OFFLINE_DIR = OUT_DIR / "offline"
INFER_DIR = OUT_DIR / "inference"
TRAIN_DIR.mkdir(parents=True, exist_ok=True)
OFFLINE_DIR.mkdir(parents=True, exist_ok=True)
INFER_DIR.mkdir(parents=True, exist_ok=True)

print("RUN_ID:", RUN_ID)
print("OUT_DIR:", OUT_DIR)

# -----------------------------
# Training mode and model
# -----------------------------
# Forwarded to training.py as --mode; it also decides which groups of flags
# the pipeline cell adds to the training command.
RUN_MODE = "full"  # "full" | "build-dataset" | "train-only"
MODEL_TYPE = "cross-encoder"  # keep this if you want inference at the end
# Forwarded as --model-name; also reused below as the cross-encoder tokenizer name.
MODEL_NAME = "allenai/scibert_scivocab_uncased"
NUM_EPOCHS = 10

# When True, the training command gets --tune / --n-trials and the fixed
# hyperparameters below are not passed. The pipeline cell only accepts this
# together with RUN_MODE == "full".
HYPERPARAMETER_TUNING = False
N_TRIALS = 5

# Only considered when tuning is off: if True, learning rate, batch size and
# weight decay are passed explicitly; if False, none of the three flags is sent.
USE_FIXED_HYPERPARAMS = True
LEARNING_RATE = 3e-5
BATCH_SIZE = 16
WEIGHT_DECAY = 0.01

# Forwarded verbatim as --split-ratios.
# Presumably train,val,test fractions -- confirm against training.py.
SPLIT_RATIOS = "0.75,0.15,0.10"

# -----------------------------
# Inputs for dataset building
# -----------------------------
# Forwarded as --src / --tgt / --align in "full" and "build-dataset" modes.
SRC_PATH = "data/sweet.owl"
TGT_PATH = "data/envo.owl"
ALIGN_PATH = "data/envo-sweet.rdf"

# Optional; --src-prefix / --tgt-prefix are only added when the value is truthy.
SRC_PREFIX = None
TGT_PREFIX = None  # e.g. "http://purl.obolibrary.org/obo/ENVO_"

# Each True value adds the matching --src-use-* switch to the training command.
# NOTE(review): only --src-use-* switches are emitted by the pipeline cell;
# whether target-side text is configured elsewhere is not visible here.
USE_DESCRIPTION = True
USE_SYNONYMS = True
USE_PARENTS = True
USE_EQUIVALENT = True
USE_DISJOINT = True

# Adds --visualize-alignments to the training command when True.
VISUALIZE = False

# -----------------------------
# Canonical outputs of STEP 1
# -----------------------------
# Forwarded as --out-src / --out-tgt / --out-dataset.
OUT_SRC_CSV = str(TRAIN_DIR / "source_ontology.csv")
OUT_TGT_CSV = str(TRAIN_DIR / "target_ontology.csv")
OUT_DATASET_CSV = str(TRAIN_DIR / "training_dataset.csv")

# with_suffix replaces the trailing ".csv", giving e.g. training_dataset.train.csv.
# These names are assumed to match what training.py writes -- TODO confirm.
TRAIN_SPLIT_CSV = str(Path(OUT_DATASET_CSV).with_suffix(".train.csv"))
VAL_SPLIT_CSV   = str(Path(OUT_DATASET_CSV).with_suffix(".val.csv"))
TEST_SPLIT_CSV  = str(Path(OUT_DATASET_CSV).with_suffix(".test.csv"))

# train-only mode
# Forwarded as --dataset-csv. NOTE(review): this path lives inside the run
# directory created above, so on a fresh run it does not exist yet; point it
# at an existing dataset when using RUN_MODE == "train-only".
DATASET_CSV = TRAIN_SPLIT_CSV

# model outputs
# MODEL_OUT_DIR is forwarded as --model-output-dir. FINAL_CROSS_ENCODER_DIR is
# where the trained model is expected to end up inside it (assumption to be
# confirmed against training.py); it is later passed to inference as the
# cross-encoder model id.
MODEL_OUT_DIR = str(TRAIN_DIR / "models" / f"{MODEL_TYPE}_custom")
FINAL_CROSS_ENCODER_DIR = str(Path(MODEL_OUT_DIR) / "final_cross_encoder_model")

# -----------------------------
# Offline bundle builder
# -----------------------------
# If OFFLINE_EXPORT_CSV is set it is passed as --export-csv and the ontology
# path/prefix are NOT passed; otherwise --ont-path (and --prefix when set) are used.
OFFLINE_EXPORT_CSV = None
OFFLINE_ONT_PATH = TGT_PATH
OFFLINE_PREFIX = TGT_PREFIX

# Tokenizer name shared by the offline builder (--tokenizer-name) and
# inference (--cross-tokenizer-name).
CROSS_TOKENIZER_NAME = MODEL_NAME

# Bi-encoder settings forwarded to build_ontology_bundle.py
# (--bi-encoder-model-id, --semantic-batch-size, --semantic-max-length).
BI_ENCODER_MODEL_ID = "allenai/scibert_scivocab_uncased"
OFFLINE_SEMANTIC_BATCH_SIZE = 64
OFFLINE_SEMANTIC_MAX_LENGTH = 256
# Adds --no-semantic-normalize to the offline command when True.
OFFLINE_NO_SEMANTIC_NORMALIZE = False

# Outputs of STEP 2, consumed by STEP 3 as --ontology-csv and --bundle.
ONTOLOGY_INTERNAL_CSV = str(OFFLINE_DIR / "ontology_internal.csv")
OFFLINE_BUNDLE_PKL = str(OFFLINE_DIR / "offline_bundle.pkl")

# -----------------------------
# Inference
# -----------------------------
# Query file for inference. The pipeline cell expects training to have
# produced it and raises FileNotFoundError otherwise.
INFER_INPUT_CSV = str(Path(OUT_DATASET_CSV).with_suffix(".test.queries.csv"))  # Option B queries
INFER_OUT_CSV = str(INFER_DIR / "predictions.csv")

# Column names of the query CSV, forwarded as --retrieval-col, --scoring-col
# and --id-col (the last two only when non-empty).
RETRIEVAL_COL = "source_label"
SCORING_COL = "source_text"
ID_COL = "source_iri"

# Retrieval settings, forwarded as --mode, --retrieval-*-top-k,
# --hybrid-ratio-semantic and --semantic-batch-size.
INFER_MODE = "hybrid"
RETRIEVAL_LEXICAL_TOP_K = 100
RETRIEVAL_SEMANTIC_TOP_K = 100
RETRIEVAL_MERGED_TOP_K = 150
HYBRID_RATIO_SEMANTIC = 0.5
SEMANTIC_BATCH_SIZE = 64

# Cross-encoder scoring settings, forwarded as --cross-top-k,
# --cross-batch-size and --cross-max-length.
CROSS_TOP_K = 20
CROSS_BATCH_SIZE = 32
CROSS_MAX_LENGTH = 256

# Forwarded as --keep-top-n. Presumably the number of top candidates stored
# per query in the predictions CSV -- confirm against run_inference.py.
KEEP_TOP_N = 0

print("Config OK.")

---

## Run pipeline (3 steps)

This section executes:
1) `training.py`
2) `build_ontology_bundle.py`
3) `run_inference.py`

Logs are written under the run directory.
The notebook stops immediately if any step fails.

In [None]:
# ============================================
# RUN PIPELINE (training -> offline -> inference)
# ============================================
from pathlib import Path

def print_tail(path: Path, n=120):
    """Print the last ``n`` lines of a text file (typically a step log).

    Args:
        path: File to read. A missing file is reported on stdout and
            otherwise ignored.
        n: Maximum number of trailing lines to print. Values <= 0 print no
            log lines.
    """
    p = Path(path)
    if not p.exists():
        print(f"[tail] log not found: {p}")
        return
    # Undecodable bytes are replaced rather than raising, so a log with mixed
    # encodings can still be shown.
    lines = p.read_text(errors="replace").splitlines()
    # A bare lines[-n:] would return the WHOLE file for n == 0 and drop the
    # head (instead of taking the tail) for negative n.
    tail = lines[-n:] if n > 0 else []
    print("\n".join(tail))

def run_cmd(cmd, log_path: Path, cwd: Path):
    """Run one pipeline step as a subprocess, capturing its output in a log.

    Args:
        cmd: Command as a list of strings (program followed by arguments).
        log_path: File that receives the child's combined stdout/stderr.
            Parent directories are created; existing content is overwritten.
        cwd: Working directory for the child process.

    Returns:
        The child's return code (0, since any other value raises).

    Raises:
        RuntimeError: If the command exits with a non-zero status. The tail
            of the log is printed before raising.
    """
    command_line = " ".join(cmd)
    print("\nRunning command:\n", command_line)
    print("CWD:", cwd)
    print("Log:", log_path)
    log_path.parent.mkdir(parents=True, exist_ok=True)

    # stderr is merged into stdout so the log keeps the original interleaving.
    with open(log_path, "w") as log_file:
        completed = subprocess.run(
            cmd,
            cwd=str(cwd),
            stdout=log_file,
            stderr=subprocess.STDOUT,
        )

    exit_code = completed.returncode
    print("Return code:", exit_code)
    if exit_code == 0:
        return exit_code

    print("!!! Error occurred. Last lines of log:")
    print_tail(log_path, n=120)
    raise RuntimeError(f"Command failed with return code {exit_code}. See log: {log_path}")

# Guardrails
# Reject configurations that cannot lead to a usable run before any
# expensive work starts.
if RUN_MODE == "full" and MODEL_TYPE != "cross-encoder":
    raise ValueError("RUN_MODE='full' ends with inference => needs MODEL_TYPE='cross-encoder'.")

if HYPERPARAMETER_TUNING and RUN_MODE != "full":
    raise ValueError("--tune only allowed in RUN_MODE='full'.")

# train-only consumes an already existing dataset. Checking here gives a clear
# message instead of a failure buried in the training log. The path is
# resolved against REPO_ROOT because that is the subprocess working directory
# (an absolute DATASET_CSV is left untouched by the join).
if RUN_MODE == "train-only" and not (REPO_ROOT / DATASET_CSV).exists():
    raise FileNotFoundError(
        f"DATASET_CSV not found: {DATASET_CSV}\n"
        "RUN_MODE='train-only' needs an existing dataset CSV. "
        "Set DATASET_CSV to a file produced by a previous run."
    )

Path(MODEL_OUT_DIR).mkdir(parents=True, exist_ok=True)

# -----------------------------
# STEP 1) TRAINING
# -----------------------------
train_log = TRAIN_DIR / "training.log"

train_cmd = ["python", "training.py", "--mode", RUN_MODE]

# Dataset-building arguments (only for modes that build the dataset).
if RUN_MODE in {"full", "build-dataset"}:
    train_cmd += ["--src", SRC_PATH, "--tgt", TGT_PATH, "--align", ALIGN_PATH]
    train_cmd += ["--out-src", OUT_SRC_CSV, "--out-tgt", OUT_TGT_CSV, "--out-dataset", OUT_DATASET_CSV]
    train_cmd += ["--split-ratios", SPLIT_RATIOS]

    # Optional prefixes are omitted entirely when unset.
    if SRC_PREFIX:
        train_cmd += ["--src-prefix", SRC_PREFIX]
    if TGT_PREFIX:
        train_cmd += ["--tgt-prefix", TGT_PREFIX]

    if USE_DESCRIPTION: train_cmd.append("--src-use-description")
    if USE_SYNONYMS: train_cmd.append("--src-use-synonyms")
    if USE_PARENTS: train_cmd.append("--src-use-parents")
    if USE_EQUIVALENT: train_cmd.append("--src-use-equivalent")
    if USE_DISJOINT: train_cmd.append("--src-use-disjoint")
    if VISUALIZE: train_cmd.append("--visualize-alignments")

# Model-training arguments (only for modes that train a model).
if RUN_MODE in {"full", "train-only"}:
    train_cmd += ["--model-type", MODEL_TYPE, "--model-name", MODEL_NAME, "--model-output-dir", MODEL_OUT_DIR]
    train_cmd += ["--num-epochs", str(NUM_EPOCHS)]

    # Tuning takes precedence over fixed hyperparameters; with both switches
    # off, no hyperparameter flag is passed at all.
    if HYPERPARAMETER_TUNING:
        train_cmd += ["--tune", "--n-trials", str(N_TRIALS)]
    elif USE_FIXED_HYPERPARAMS:
        train_cmd += ["--learning-rate", str(LEARNING_RATE)]
        train_cmd += ["--batch-size", str(BATCH_SIZE)]
        train_cmd += ["--weight-decay", str(WEIGHT_DECAY)]

if RUN_MODE == "train-only":
    train_cmd += ["--dataset-csv", DATASET_CSV]

run_cmd(train_cmd, train_log, cwd=REPO_ROOT)

print("\nTraining completed.")
print("Dataset CSV:", OUT_DATASET_CSV)
print("Train split:", TRAIN_SPLIT_CSV)
print("Val split:", VAL_SPLIT_CSV)
print("Test split:", TEST_SPLIT_CSV)
print("Cross-encoder dir:", FINAL_CROSS_ENCODER_DIR)

# The freshly trained model directory is what inference loads as its
# cross-encoder.
CROSS_ENCODER_MODEL_ID = FINAL_CROSS_ENCODER_DIR

# -----------------------------
# STEP 2) OFFLINE BUNDLE
# -----------------------------
offline_log = OFFLINE_DIR / "offline_bundle.log"

offline_cmd = [
    "python", "build_ontology_bundle.py",
    "--out-csv", ONTOLOGY_INTERNAL_CSV,
    "--out-bundle", OFFLINE_BUNDLE_PKL,
    "--tokenizer-name", CROSS_TOKENIZER_NAME,
    "--bi-encoder-model-id", BI_ENCODER_MODEL_ID,
    "--semantic-batch-size", str(OFFLINE_SEMANTIC_BATCH_SIZE),
    "--semantic-max-length", str(OFFLINE_SEMANTIC_MAX_LENGTH),
]
if OFFLINE_NO_SEMANTIC_NORMALIZE:
    offline_cmd.append("--no-semantic-normalize")

# Either reuse an exported CSV or build from the ontology file; the two input
# styles are mutually exclusive here.
if OFFLINE_EXPORT_CSV:
    offline_cmd += ["--export-csv", OFFLINE_EXPORT_CSV]
else:
    offline_cmd += ["--ont-path", OFFLINE_ONT_PATH]
    if OFFLINE_PREFIX:
        offline_cmd += ["--prefix", OFFLINE_PREFIX]

run_cmd(offline_cmd, offline_log, cwd=REPO_ROOT)

print("\nOffline bundle completed.")
print("Ontology internal CSV:", ONTOLOGY_INTERNAL_CSV)
print("Offline bundle PKL:", OFFLINE_BUNDLE_PKL)

# -----------------------------
# STEP 3) INFERENCE
# -----------------------------
infer_log = INFER_DIR / "inference.log"

if not Path(INFER_INPUT_CSV).exists():
    raise FileNotFoundError(
        f"INFER_INPUT_CSV not found: {INFER_INPUT_CSV}\n"
        "In full/build-dataset mode, training should generate *.test.queries.csv. "
        "If you want a custom query file, set INFER_INPUT_CSV to its path."
    )

# Inference cannot run without the trained model directory (for instance when
# RUN_MODE='build-dataset' trained nothing). Fail here with a clear message
# rather than inside run_inference.py.
if not (REPO_ROOT / CROSS_ENCODER_MODEL_ID).exists():
    raise FileNotFoundError(
        f"Trained cross-encoder not found: {CROSS_ENCODER_MODEL_ID}\n"
        f"RUN_MODE={RUN_MODE!r} did not leave a model at that location. "
        "Run with RUN_MODE='full' (or 'train-only') and MODEL_TYPE='cross-encoder' "
        "before inference."
    )

infer_cmd = [
    "python", "run_inference.py",
    "--bundle", OFFLINE_BUNDLE_PKL,
    "--ontology-csv", ONTOLOGY_INTERNAL_CSV,
    "--input-csv", INFER_INPUT_CSV,
    "--out-csv", INFER_OUT_CSV,
    "--mode", INFER_MODE,
    "--cross-tokenizer-name", CROSS_TOKENIZER_NAME,
    "--cross-encoder-model-id", CROSS_ENCODER_MODEL_ID,
    "--retrieval-col", RETRIEVAL_COL,
    "--retrieval-lexical-top-k", str(RETRIEVAL_LEXICAL_TOP_K),
    "--retrieval-semantic-top-k", str(RETRIEVAL_SEMANTIC_TOP_K),
    "--retrieval-merged-top-k", str(RETRIEVAL_MERGED_TOP_K),
    "--hybrid-ratio-semantic", str(HYBRID_RATIO_SEMANTIC),
    "--semantic-batch-size", str(SEMANTIC_BATCH_SIZE),
    "--cross-top-k", str(CROSS_TOP_K),
    "--cross-batch-size", str(CROSS_BATCH_SIZE),
    "--cross-max-length", str(CROSS_MAX_LENGTH),
    "--keep-top-n", str(KEEP_TOP_N),
]
# Optional column names are only forwarded when configured.
if SCORING_COL:
    infer_cmd += ["--scoring-col", SCORING_COL]
if ID_COL:
    infer_cmd += ["--id-col", ID_COL]

run_cmd(infer_cmd, infer_log, cwd=REPO_ROOT)

print("\nUnified pipeline completed successfully.")
print("Outputs:")
print(" - Training:", TRAIN_DIR)
print(" - Offline bundle:", OFFLINE_DIR)
print(" - Inference:", INFER_DIR)
print("Predictions CSV:", INFER_OUT_CSV)

---

## Optional: package outputs for download (run only at the end)

This is only needed if you want to export artifacts locally.
Skip it if you are iterating quickly.
The download call at the end of the cell relies on `google.colab`, so it is specific to Colab.

In [None]:
import shutil

# The archive is written next to the run directory, as "<run directory>.zip".
ZIP_PATH = str(OUT_DIR) + ".zip"
print("Zipping:", OUT_DIR, "->", ZIP_PATH)

# Make a zip of the whole run dir (simple + safe)
# root_dir=OUT_DIR stores the run directory's contents at the archive root.
shutil.make_archive(str(OUT_DIR), "zip", root_dir=str(OUT_DIR))
print("Created:", ZIP_PATH)

# To download (Colab)
# google.colab only exists on Colab. On any other host the archive has
# already been created, so report its location instead of failing with
# ImportError.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping browser download. Archive:", ZIP_PATH)
else:
    files.download(ZIP_PATH)