In [None]:
# Notebook parameters. Per the original note these are injected by
# dsl.run_notebook; each one is looked up in the global namespace so that a
# missing name falls back to None / 0 instead of raising NameError.
input_dataset_path = globals().get("input_dataset_path")
min_length = int(globals().get("min_length", 0))

# Log the start marker followed by the effective parameter values.
print("[nb:process_data] Starting notebook")
print(
    "[nb:process_data] Params -> "
    f"input_dataset_path={input_dataset_path}, min_length={min_length}"
)

import os
import json
from datasets import load_from_disk

# Fail fast when the required parameter is missing or empty.
if not input_dataset_path:
    raise ValueError("input_dataset_path is required")


def _meets_min_length(example):
    """Return True when the row's "length" (0 if the key is absent) is >= min_length."""
    return int(example.get("length", 0)) >= int(min_length)


# Load the dataset from disk and keep only the rows that pass the length check.
dataset = load_from_disk(input_dataset_path)
filtered = dataset.filter(_meets_min_length)

# Sizes before and after filtering, as reported by len().
count_all, count_filtered = len(dataset), len(filtered)

print(f"[nb:process_data] Loaded rows_total={count_all}, rows_kept={count_filtered}")

# Prepare outputs directory
output_dir = "/tmp/kfp_nb_outputs"
os.makedirs(output_dir, exist_ok=True)

# Write metrics.json for the wrapper to consume. Every value is stored as a
# float.
metrics = {
    "rows_total": float(count_all),
    "rows_kept": float(count_filtered),
    "dummy_score": 1.0,
}
# The keep ratio is undefined when there are no input rows, so it is only
# emitted for a non-empty input. A ratio of exactly 0.0 (every row filtered
# out) is a legitimate measurement and is reported rather than being dropped
# by a truthiness check on the float.
if count_all:
    ratio = (count_filtered / count_all) * 100.0
    metrics["keep_ratio_percent"] = float(ratio)

metrics_path = os.path.join(output_dir, "metrics.json")
with open(metrics_path, "w", encoding="utf-8") as f:
    json.dump(metrics, f)
print(f"[nb:process_data] Wrote metrics to {metrics_path}")

# Write results.json summary. The first kept row is included as a sample, or
# None when the filter left nothing.
if count_filtered > 0:
    example_first = filtered[0]
else:
    example_first = None

summary = {
    "rows_total": count_all,
    "rows_kept": count_filtered,
    "min_length": min_length,
    "example_first": example_first,
}

# Pretty-printed (2-space indent) JSON next to metrics.json in output_dir.
results_path = os.path.join(output_dir, "results.json")
with open(results_path, "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2)
print(f"[nb:process_data] Wrote results to {results_path}")

print("[nb:process_data] Finished notebook")
