# Unified Launcher: Training → Offline Bundle → Inference

This notebook runs the **full pipeline** in a single session:

1) **Training** (`training.py`)
   - Builds the training dataset (if needed)
   - Splits into train/val/test (in `full` and `build-dataset` modes)
   - Trains the selected model
   - Writes artifacts to a unique run directory under `outputs/`

2) **Offline preprocessing** (`build_ontology_bundle.py`)
   - Builds the internal ontology CSV (`iri, local_name, label, text`)
   - Builds the `offline_bundle.pkl` used for retrieval
   - Computes semantic embeddings using a bi-encoder (for hybrid retrieval)

3) **Inference** (`run_inference.py`)
   - Loads the offline bundle + ontology CSV
   - Retrieves candidates (lexical or hybrid)
   - Scores candidates with the trained cross-encoder
   - Produces a predictions CSV (optionally storing top-N candidates)

**Important:**  
Do **not** zip/download intermediate artifacts if your goal is to run inference immediately after training.  
Zipping is only useful to export results to your local machine at the end.

---

## Configuration

Set all parameters below.  
The pipeline will create a unique run directory in `outputs/` and reuse it across all steps.

In [None]:
# ============================================
# CONFIGURATION (unified training -> offline -> inference)
# ============================================
from pathlib import Path
from datetime import datetime

# -----------------------------
# Run id / output layout
# -----------------------------
# Each execution of this cell yields a new timestamped run directory under
# outputs/; every later step writes inside it.
_run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
RUN_ID = f"unified_run_{_run_stamp}"
OUT_DIR = Path("outputs") / RUN_ID

# One subfolder per pipeline step
TRAIN_DIR = OUT_DIR / "training"
OFFLINE_DIR = OUT_DIR / "offline"
INFER_DIR = OUT_DIR / "inference"

for _run_folder in (OUT_DIR, TRAIN_DIR, OFFLINE_DIR, INFER_DIR):
    _run_folder.mkdir(parents=True, exist_ok=True)

print("RUN_ID:", RUN_ID)
print("OUT_DIR:", OUT_DIR)

# -----------------------------
# Training mode and model
# -----------------------------
# Allowed values: "full" | "build-dataset" | "train-only"
RUN_MODE = "full"

# IMPORTANT:
# Inference (run_inference.py) always scores with a cross-encoder, so a
# pipeline that ends with inference needs one of:
#   (A) a cross-encoder trained here (recommended), or
#   (B) an external cross-encoder scorer model path/id.
# Allowed values: "bi-encoder" | "cross-encoder"
MODEL_TYPE = "cross-encoder"
MODEL_NAME = "allenai/scibert_scivocab_uncased"
NUM_EPOCHS = 10

# Hyperparameter search; the pipeline cell rejects it unless RUN_MODE="full"
HYPERPARAMETER_TUNING = False
N_TRIALS = 5

# Explicit hyperparameters, forwarded only when tuning is off
USE_FIXED_HYPERPARAMS = True
LEARNING_RATE = 3e-5
BATCH_SIZE = 16
WEIGHT_DECAY = 0.01

# Train/val/test ratios passed to training.py in full/build-dataset mode
# (must sum to 1.0)
SPLIT_RATIOS = "0.75,0.15,0.10"

# -----------------------------
# Inputs for training dataset building (full/build-dataset)
# -----------------------------
SRC_PATH = "data/sweet.owl"
TGT_PATH = "data/envo.owl"
ALIGN_PATH = "data/envo-sweet.rdf"

# Optional IRI prefixes; None means the corresponding flag is not passed
SRC_PREFIX = None
TGT_PREFIX = None  # e.g., "http://purl.obolibrary.org/obo/ENVO_"

# Feature switches, each mapped to a training.py flag by the pipeline cell
USE_DESCRIPTION = True
USE_SYNONYMS = True
USE_PARENTS = True
USE_EQUIVALENT = True
USE_DISJOINT = True

# Off by default: visualization can be slow / fragile in hosted notebooks
VISUALIZE = False

# -----------------------------
# Canonical outputs of STEP 1 (training.py writes these)
# -----------------------------
_dataset_path = TRAIN_DIR / "training_dataset.csv"

OUT_SRC_CSV = str(TRAIN_DIR / "source_ontology.csv")
OUT_TGT_CSV = str(TRAIN_DIR / "target_ontology.csv")
OUT_DATASET_CSV = str(_dataset_path)  # canonical dataset path for this RUN

# Split files training.py produces in build-dataset/full mode; the names are
# the dataset path with ".csv" swapped for ".<split>.csv"
TRAIN_SPLIT_CSV = str(_dataset_path.with_suffix(".train.csv"))
VAL_SPLIT_CSV = str(_dataset_path.with_suffix(".val.csv"))
TEST_SPLIT_CSV = str(_dataset_path.with_suffix(".test.csv"))

# -----------------------------
# Inputs for train-only
# -----------------------------
# In train-only mode the CSV given here is treated as the TRAINING set (no
# splitting). The default points into this run's fresh directory, so for a
# train-only run set DATASET_CSV explicitly to the training CSV you want.
DATASET_CSV = TRAIN_SPLIT_CSV  # sensible default in unified pipeline

# -----------------------------
# Model output dir (STEP 1)
# -----------------------------
_model_root = TRAIN_DIR / "models" / (MODEL_TYPE + "_custom")
MODEL_OUT_DIR = str(_model_root)

# Local directory later handed to inference as the cross-encoder scorer
# (produced by SentenceTransformers `model.save(...)`)
FINAL_CROSS_ENCODER_DIR = str(_model_root / "final_cross_encoder_model")

# -----------------------------
# Offline bundle builder (build_ontology_bundle.py)
# -----------------------------
# The bundle is built either from an OWL/RDF file or from an already-exported
# CSV; a non-empty OFFLINE_EXPORT_CSV takes precedence in the pipeline cell.
OFFLINE_EXPORT_CSV = None          # e.g., "data/ontology_internal.csv" OR None
OFFLINE_ONT_PATH = TGT_PATH        # typically the target ontology used for retrieval
OFFLINE_PREFIX = TGT_PREFIX

# Tokenizer for lexical retrieval; must match the cross-encoder tokenizer
CROSS_TOKENIZER_NAME = MODEL_NAME  # keep aligned with your scorer tokenizer

# Bi-encoder used ONLY to compute semantic embeddings for hybrid retrieval
BI_ENCODER_MODEL_ID = "allenai/scibert_scivocab_uncased"
OFFLINE_SEMANTIC_BATCH_SIZE = 64
OFFLINE_SEMANTIC_MAX_LENGTH = 256
OFFLINE_NO_SEMANTIC_NORMALIZE = False

# Files written by the offline step
ONTOLOGY_INTERNAL_CSV = str(OFFLINE_DIR / "ontology_internal.csv")
OFFLINE_BUNDLE_PKL = str(OFFLINE_DIR / "offline_bundle.pkl")

# -----------------------------
# Inference (run_inference.py)
# -----------------------------
# Input for inference, taken from the STEP 1 outputs (full/build-dataset).
# NOTE(review): this is the "*.test.queries.csv" file, not TEST_SPLIT_CSV;
# confirm training.py writes it and that it has the columns used below.
INFER_INPUT_CSV = str(_dataset_path.with_suffix(".test.queries.csv"))
INFER_OUT_CSV = str(INFER_DIR / "predictions.csv")

# Test split columns:
# source_iri, target_iri, source_label, target_label, source_text, target_text, sample_type, match
RETRIEVAL_COL = "source_label"   # exact + lexical retrieval
SCORING_COL = "source_text"      # cross-encoder scorer + semantic retrieval query text
ID_COL = "source_iri"            # carry through an id per attribute

# Allowed values: "lexical" | "hybrid"
INFER_MODE = "hybrid"

# Candidate retrieval sizes
RETRIEVAL_LEXICAL_TOP_K = 100
RETRIEVAL_SEMANTIC_TOP_K = 100
RETRIEVAL_MERGED_TOP_K = 150
HYBRID_RATIO_SEMANTIC = 0.5
SEMANTIC_BATCH_SIZE = 64

# Cross-encoder scoring settings
CROSS_TOP_K = 20
CROSS_BATCH_SIZE = 32
CROSS_MAX_LENGTH = 256

# Forwarded as --keep-top-n
KEEP_TOP_N = 0

print("Config OK.")

---

## Run pipeline (3 steps)

This section executes:
1) `training.py`
2) `build_ontology_bundle.py`
3) `run_inference.py`

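Logs are written under the run directory.
The notebook stops immediately if any step fails.
With `RUN_MODE = "build-dataset"`, the run ends after step 2 and inference is skipped, because no model has been trained.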

In [None]:
# ============================================
# RUN PIPELINE (training -> offline -> inference)
# ============================================
import os
import subprocess
import sys

def run_cmd(cmd, log_path: Path):
    """Run *cmd* as a subprocess, capturing stdout and stderr in *log_path*.

    Args:
        cmd: Command and its arguments as a list of strings; it is executed
            directly, without a shell.
        log_path: File that receives the combined stdout/stderr. Missing
            parent directories are created and an existing file is
            overwritten.

    Returns:
        The process return code. Because failures raise, this is 0 whenever
        the function returns.

    Raises:
        RuntimeError: If the command exits with a non-zero return code. The
            last 80 lines of the log are printed before raising.
    """
    print("\nRunning command:\n", " ".join(cmd))
    print("Log:", log_path)
    log_path.parent.mkdir(parents=True, exist_ok=True)

    with open(log_path, "w") as f:
        proc = subprocess.run(cmd, stdout=f, stderr=subprocess.STDOUT)

    print("Return code:", proc.returncode)
    if proc.returncode != 0:
        print("!!! Error occurred. Showing last 80 lines of log:")
        # Read the tail in Python instead of shelling out to `tail`: this
        # works for paths containing spaces, does not need a POSIX shell, and
        # goes through print() so the text lands in the notebook cell output.
        with open(log_path, "r", errors="replace") as f:
            tail_lines = f.readlines()[-80:]
        print("".join(tail_lines), end="")
        raise RuntimeError(f"Command failed with return code {proc.returncode}. See log: {log_path}")
    return proc.returncode

# Sanity checks on the configuration, done before any subprocess starts
# (fail fast instead of producing silent nonsense).
_full_run = RUN_MODE == "full"

if _full_run:
    # A full run finishes with inference, which needs a cross-encoder scorer.
    if MODEL_TYPE != "cross-encoder":
        raise ValueError(
            "RUN_MODE='full' runs inference at the end, which needs a cross-encoder scorer. "
            "Set MODEL_TYPE='cross-encoder' (recommended) or modify the pipeline to provide an external scorer."
        )
elif HYPERPARAMETER_TUNING:
    # Tuning is rejected in every mode other than "full".
    raise ValueError("--tune is only allowed in RUN_MODE='full'.")

# Ensure the model output directory exists before training starts.
os.makedirs(MODEL_OUT_DIR, exist_ok=True)

# -----------------------------
# STEP 1) TRAINING
# -----------------------------
# Build the training.py command line from the configuration and run it;
# its stdout/stderr go to training.log inside the run's training folder.
train_log = TRAIN_DIR / "training.log"

# Launch with the interpreter that runs this notebook rather than whatever
# "python" resolves to on PATH, so the subprocess uses the same environment
# and installed packages as the kernel. Fall back to "python" only if the
# interpreter path is unavailable.
train_cmd = [sys.executable or "python", "training.py", "--mode", RUN_MODE]

# Mode: full or build-dataset => requires ontologies + alignment + output CSV paths
if RUN_MODE in {"full", "build-dataset"}:
    train_cmd += ["--src", SRC_PATH, "--tgt", TGT_PATH, "--align", ALIGN_PATH]
    train_cmd += ["--out-src", OUT_SRC_CSV, "--out-tgt", OUT_TGT_CSV, "--out-dataset", OUT_DATASET_CSV]
    train_cmd += ["--split-ratios", SPLIT_RATIOS]

    # Prefix filters are optional; omit the flag when unset.
    if SRC_PREFIX:
        train_cmd += ["--src-prefix", SRC_PREFIX]
    if TGT_PREFIX:
        train_cmd += ["--tgt-prefix", TGT_PREFIX]

    # Boolean switches: append the flag only when the setting is enabled.
    # NOTE(review): only --src-use-* flags are forwarded here; confirm
    # whether training.py expects target-side equivalents as well.
    for _enabled, _flag in (
        (USE_DESCRIPTION, "--src-use-description"),
        (USE_SYNONYMS, "--src-use-synonyms"),
        (USE_PARENTS, "--src-use-parents"),
        (USE_EQUIVALENT, "--src-use-equivalent"),
        (USE_DISJOINT, "--src-use-disjoint"),
        (VISUALIZE, "--visualize-alignments"),
    ):
        if _enabled:
            train_cmd.append(_flag)

# Mode: full or train-only => requires model args
if RUN_MODE in {"full", "train-only"}:
    train_cmd += ["--model-type", MODEL_TYPE, "--model-name", MODEL_NAME, "--model-output-dir", MODEL_OUT_DIR]
    train_cmd += ["--num-epochs", str(NUM_EPOCHS)]

    # Tuning takes precedence; fixed hyperparameters are passed only when
    # tuning is disabled and USE_FIXED_HYPERPARAMS is set.
    if HYPERPARAMETER_TUNING:
        train_cmd += ["--tune", "--n-trials", str(N_TRIALS)]
    elif USE_FIXED_HYPERPARAMS:
        train_cmd += ["--learning-rate", str(LEARNING_RATE)]
        train_cmd += ["--batch-size", str(BATCH_SIZE)]
        train_cmd += ["--weight-decay", str(WEIGHT_DECAY)]

# Mode: train-only => requires dataset CSV (assumed train set)
if RUN_MODE == "train-only":
    train_cmd += ["--dataset-csv", DATASET_CSV]

# run_cmd raises on a non-zero exit, which stops the cell here.
run_cmd(train_cmd, train_log)

print("\nTraining completed.")
print("Dataset CSV:", OUT_DATASET_CSV)
print("Train split:", TRAIN_SPLIT_CSV)
print("Val split:", VAL_SPLIT_CSV)
print("Test split:", TEST_SPLIT_CSV)
print("Model out dir:", MODEL_OUT_DIR)

# Cross-encoder scorer directory for inference
CROSS_ENCODER_MODEL_ID = FINAL_CROSS_ENCODER_DIR

# -----------------------------
# STEP 2) OFFLINE BUNDLE
# -----------------------------
# Build the build_ontology_bundle.py command line and run it; its
# stdout/stderr go to offline_bundle.log inside the run's offline folder.
offline_log = OFFLINE_DIR / "offline_bundle.log"

# Launch with the interpreter that runs this notebook rather than whatever
# "python" resolves to on PATH, so the subprocess uses the same environment
# and installed packages as the kernel. Fall back to "python" only if the
# interpreter path is unavailable.
offline_cmd = [
    sys.executable or "python", "build_ontology_bundle.py",
    "--out-csv", ONTOLOGY_INTERNAL_CSV,
    "--out-bundle", OFFLINE_BUNDLE_PKL,
    "--tokenizer-name", CROSS_TOKENIZER_NAME,
    "--bi-encoder-model-id", BI_ENCODER_MODEL_ID,
    "--semantic-batch-size", str(OFFLINE_SEMANTIC_BATCH_SIZE),
    "--semantic-max-length", str(OFFLINE_SEMANTIC_MAX_LENGTH),
]

if OFFLINE_NO_SEMANTIC_NORMALIZE:
    offline_cmd.append("--no-semantic-normalize")

# Choose ontology source: either export-csv OR ont-path.
# The prefix is forwarded only together with --ont-path.
if OFFLINE_EXPORT_CSV:
    offline_cmd += ["--export-csv", OFFLINE_EXPORT_CSV]
else:
    offline_cmd += ["--ont-path", OFFLINE_ONT_PATH]
    if OFFLINE_PREFIX:
        offline_cmd += ["--prefix", OFFLINE_PREFIX]

# run_cmd raises on a non-zero exit, which stops the cell here.
run_cmd(offline_cmd, offline_log)

print("\nOffline bundle completed.")
print("Ontology internal CSV:", ONTOLOGY_INTERNAL_CSV)
print("Offline bundle PKL:", OFFLINE_BUNDLE_PKL)

# build-dataset trains no model, so end the cell before the inference step.
if RUN_MODE == "build-dataset":
    print("\nRUN_MODE='build-dataset': dataset + splits + queries/gold generated. Skipping inference (no trained model).")
    raise SystemExit(0)

# -----------------------------
# STEP 3) INFERENCE
# -----------------------------
# Check that the scorer model and the input CSV exist, then build the
# run_inference.py command line and run it; its stdout/stderr go to
# inference.log inside the run's inference folder.
infer_log = INFER_DIR / "inference.log"

# NOTE(review): this check requires a path on the local filesystem, so a
# non-local model id would be rejected here even though the message below
# mentions external scorers.
if not Path(CROSS_ENCODER_MODEL_ID).exists():
    raise FileNotFoundError(
        f"Cross-encoder model not found: {CROSS_ENCODER_MODEL_ID}\n"
        "If you want to run inference with an external scorer, set CROSS_ENCODER_MODEL_ID accordingly."
    )

# If RUN_MODE is train-only, INFER_INPUT_CSV might not exist unless you set it explicitly.
if not Path(INFER_INPUT_CSV).exists():
    raise FileNotFoundError(
        f"INFER_INPUT_CSV not found: {INFER_INPUT_CSV}\n"
        "If RUN_MODE='train-only', set INFER_INPUT_CSV manually to the CSV you want to run inference on."
    )

# Launch with the interpreter that runs this notebook rather than whatever
# "python" resolves to on PATH, so the subprocess uses the same environment
# and installed packages as the kernel. Fall back to "python" only if the
# interpreter path is unavailable.
infer_cmd = [
    sys.executable or "python", "run_inference.py",
    "--bundle", OFFLINE_BUNDLE_PKL,
    "--ontology-csv", ONTOLOGY_INTERNAL_CSV,
    "--input-csv", INFER_INPUT_CSV,
    "--out-csv", INFER_OUT_CSV,
    "--mode", INFER_MODE,
    "--cross-tokenizer-name", CROSS_TOKENIZER_NAME,
    "--cross-encoder-model-id", CROSS_ENCODER_MODEL_ID,
    "--retrieval-col", RETRIEVAL_COL,
    "--retrieval-lexical-top-k", str(RETRIEVAL_LEXICAL_TOP_K),
    "--retrieval-semantic-top-k", str(RETRIEVAL_SEMANTIC_TOP_K),
    "--retrieval-merged-top-k", str(RETRIEVAL_MERGED_TOP_K),
    "--hybrid-ratio-semantic", str(HYBRID_RATIO_SEMANTIC),
    "--semantic-batch-size", str(SEMANTIC_BATCH_SIZE),
    "--cross-top-k", str(CROSS_TOP_K),
    "--cross-batch-size", str(CROSS_BATCH_SIZE),
    "--cross-max-length", str(CROSS_MAX_LENGTH),
    "--keep-top-n", str(KEEP_TOP_N),
]

# Optional columns: the flag is omitted when the setting is empty/None.
if SCORING_COL:
    infer_cmd += ["--scoring-col", SCORING_COL]
if ID_COL:
    infer_cmd += ["--id-col", ID_COL]

# run_cmd raises on a non-zero exit, which stops the cell here.
run_cmd(infer_cmd, infer_log)

print("\nUnified pipeline completed successfully.")
print("Outputs:")
print(" - Training:", TRAIN_DIR)
print(" - Offline bundle:", OFFLINE_DIR)
print(" - Inference:", INFER_DIR)
print("Predictions CSV:", INFER_OUT_CSV)

---

## Optional: package outputs for download (run only at the end)

This is only needed if you want to export artifacts locally.
Skip it if you are iterating quickly.

In [None]:
import shutil

# The archive is written next to the run directory (outputs/<RUN_ID>.zip),
# i.e. outside the folder being zipped, so it never includes itself.
ZIP_PATH = str(OUT_DIR) + ".zip"
print("Zipping:", OUT_DIR, "->", ZIP_PATH)

# Make a zip of the whole run dir; make_archive appends ".zip" to the base
# name, which is why the base name is the run directory path itself.
shutil.make_archive(str(OUT_DIR), "zip", root_dir=str(OUT_DIR))
print("Created:", ZIP_PATH)

# Browser download is only possible on Colab. Elsewhere google.colab is not
# importable, so keep the archive on disk instead of failing the cell.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping download. Archive kept at:", ZIP_PATH)
else:
    files.download(ZIP_PATH)