# Data Preprocess
Produces an artifact at `artifacts/message.json` for the downstream `train.ipynb` step.

This file is collected through the Elyra node's "Output files" setting and pulled into the next node through that node's "File dependencies" setting.


In [None]:
from pathlib import Path
import json
import logging
import os
import socket
import getpass
from datetime import datetime, timezone

# Configure logging.
# The handler is attached only once so re-running this cell in the same kernel
# does not produce duplicated log lines.
logger = logging.getLogger("data_preprocess")
if not logger.handlers:
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s [%(name)s] %(message)s"))
    logger.addHandler(handler)

# LOG_LEVEL comes from the environment, so normalise it and tolerate bad
# values: a misspelled or lowercase level must not abort the preprocessing step.
requested_level = os.getenv("LOG_LEVEL", "INFO").strip().upper()
try:
    logger.setLevel(requested_level)
except ValueError:
    logger.setLevel(logging.INFO)
    logger.warning("Ignoring unrecognised LOG_LEVEL=%r; using INFO", requested_level)


def _current_user() -> str:
    """Return the login name of the current user, or a UID-based placeholder.

    ``getpass.getuser()`` raises when none of the login environment variables
    is set and the password database has no entry for the current UID (or is
    unavailable). In that case the artifact is still produced, with a
    ``uid-<n>`` placeholder (or ``"unknown"`` where ``os.getuid`` is missing).
    """
    try:
        return getpass.getuser()
    except (KeyError, OSError, ImportError):
        # KeyError/ImportError: Python < 3.13; OSError: Python >= 3.13.
        if hasattr(os, "getuid"):
            return f"uid-{os.getuid()}"
        return "unknown"


# Prepare artifact directory (relative to the current working directory)
artifact_dir = Path("artifacts")
artifact_dir.mkdir(parents=True, exist_ok=True)

# Build payload; the message text can be overridden via PREPROCESS_MESSAGE
message = os.getenv("PREPROCESS_MESSAGE", "hello from data-preprocess")
payload = {
    "message": message,
    "created_at": datetime.now(timezone.utc).isoformat(),  # timezone-aware UTC timestamp
    "hostname": socket.gethostname(),
    "user": _current_user(),
}

# Write payload to JSON artifact
output_path = artifact_dir / "message.json"
with output_path.open("w", encoding="utf-8") as f:
    json.dump(payload, f, ensure_ascii=False, indent=2)

# Log details (lazy %-style arguments: formatting happens only if the record is emitted)
logger.info("CWD: %s", Path.cwd())
logger.info("Wrote artifact: %s size=%s bytes", output_path, output_path.stat().st_size)
logger.info("Payload keys: %s", list(payload.keys()))
logger.info("Payload preview: %s", payload)
# Sorted so the listing is stable regardless of filesystem enumeration order.
logger.info("artifacts/ contents: %s", sorted(artifact_dir.iterdir()))


## Elyra node configuration (summary)
- Output files (glob): `artifacts/message.json`
- Environment variable (optional): set `PREPROCESS_MESSAGE` for a custom message

