Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
dd8883b
feat: eid-scoped large binary create + per-execution delete (#4123)
kunwp1 May 28, 2026
d0d6fd9
feat: add executionId to InitializeExecutorRequest (#4123)
kunwp1 May 28, 2026
add9b95
feat: send executionId to workers on executor init (#4123)
kunwp1 May 28, 2026
decab59
feat: set large-binary execution context on JVM worker init (#4123)
kunwp1 May 28, 2026
d236504
fix: scope execution cleanup to the execution's large binaries (#4123)
kunwp1 May 28, 2026
51e49d1
fix: scope deleteWorkflow large-binary cleanup to its executions (#4123)
kunwp1 May 28, 2026
5114610
refactor: remove unused bucket-wide deleteAllObjects (#4123)
kunwp1 May 28, 2026
9845360
feat: eid-scoped large binary create on Python worker (#4123)
kunwp1 May 28, 2026
94c2804
fix: make Python large-binary execution-context guard fail-fast (#4123)
kunwp1 May 28, 2026
8be15ca
refactor: hold large-binary execution id in a dedicated worker holder…
kunwp1 May 28, 2026
f051176
refactor: read large-binary execution id through a getter (#4123)
kunwp1 May 28, 2026
d885c2b
Format and refactoring
kunwp1 May 28, 2026
8e1ebfb
Merge branch 'main' into fix/large-binary-eid-lifecycle
kunwp1 May 28, 2026
3330bf5
Polish comments
kunwp1 May 29, 2026
116291d
refactor: encapsulate large-binary state in a LargeBinaryManager clas…
kunwp1 Jun 1, 2026
ec952b2
test: cover create() in the MinIO-free LargeBinaryManager unit spec (…
kunwp1 Jun 1, 2026
9e78542
Format
kunwp1 Jun 1, 2026
6d51024
refactor: make Python LargeBinaryManager a __new__-guarded singleton …
kunwp1 Jun 2, 2026
aff0ab4
test: guard the LargeBinaryManager singleton invariant (#4123)
kunwp1 Jun 2, 2026
d79e5c9
Merge branch 'main' into fix/large-binary-eid-lifecycle
Xiao-zhen-Liu Jun 2, 2026
a12bc7c
Merge branch 'main' into fix/large-binary-eid-lifecycle
Xiao-zhen-Liu Jun 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,7 @@ message InitializeExecutorRequest {
int32 totalWorkerCount = 1;
core.OpExecInitInfo opExecInitInfo = 2;
bool isSource = 3;
core.ExecutionIdentity executionId = 4;
}

message UpdateExecutorRequest {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,12 @@

class InitializeExecutorHandler(ControlHandler):
async def initialize_executor(self, req: InitializeExecutorRequest) -> EmptyReturn:
from pytexera.storage.large_binary_manager import LargeBinaryManager

op_exec_with_code: OpExecWithCode = get_one_of(req.op_exec_init_info)
LargeBinaryManager().set_current_execution_id(
req.execution_id.id if req.execution_id else None
)
self.context.executor_manager.initialize_executor(
op_exec_with_code.code, req.is_source, op_exec_with_code.language
)
Expand Down
4 changes: 2 additions & 2 deletions amber/src/main/python/core/models/type/large_binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,9 @@ def __init__(self, uri: Optional[str] = None):
"""
if uri is None:
# Lazy import to avoid circular dependencies
from pytexera.storage import large_binary_manager
from pytexera.storage.large_binary_manager import LargeBinaryManager

uri = large_binary_manager.create()
uri = LargeBinaryManager().create()

if not uri.startswith("s3://"):
raise ValueError(f"largebinary URI must start with 's3://', got: {uri}")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ def __init__(self, large_binary: largebinary):

def _lazy_init(self):
"""Download from S3 on first read operation."""
from pytexera.storage import large_binary_manager
from pytexera.storage.large_binary_manager import LargeBinaryManager

s3 = large_binary_manager._get_s3_client()
s3 = LargeBinaryManager()._get_s3_client()
response = s3.get_object(
Bucket=self._large_binary.get_bucket_name(),
Key=self._large_binary.get_object_key(),
Expand Down
129 changes: 83 additions & 46 deletions amber/src/main/python/pytexera/storage/large_binary_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,57 +22,94 @@
and LargeBinaryInputStream/LargeBinaryOutputStream instead.
"""

import time
import uuid
from loguru import logger
from core.storage.storage_config import StorageConfig

# Module-level state
_s3_client = None
DEFAULT_BUCKET = "texera-large-binaries"

class LargeBinaryManager:
"""Manages execution-scoped large binaries in S3 for a worker process.

def _get_s3_client():
"""Get or initialize S3 client (lazy initialization, cached)."""
global _s3_client
if _s3_client is None:
try:
import boto3
from botocore.config import Config
except ImportError as e:
raise RuntimeError("boto3 required. Install with: pip install boto3") from e

_s3_client = boto3.client(
"s3",
endpoint_url=StorageConfig.S3_ENDPOINT,
aws_access_key_id=StorageConfig.S3_AUTH_USERNAME,
aws_secret_access_key=StorageConfig.S3_AUTH_PASSWORD,
region_name=StorageConfig.S3_REGION,
config=Config(signature_version="s3v4", s3={"addressing_style": "path"}),
)
return _s3_client


def _ensure_bucket_exists(bucket: str):
"""Ensure S3 bucket exists, creating it if necessary."""
s3 = _get_s3_client()
try:
s3.head_bucket(Bucket=bucket)
except s3.exceptions.NoSuchBucket:
logger.debug(f"Bucket {bucket} not found, creating it")
s3.create_bucket(Bucket=bucket)
logger.info(f"Created bucket: {bucket}")


def create() -> str:
Implemented as a singleton: ``LargeBinaryManager()`` always returns the same
instance, so the cached S3 client and the current execution id are shared across
all callers in the worker process. A Python worker is a single process serving one
execution. Mirrors the JVM ``object LargeBinaryManager``.
"""
Creates a new largebinary reference with a unique S3 URI.

Returns:
S3 URI string (format: s3://bucket/key)
"""
_ensure_bucket_exists(DEFAULT_BUCKET)
timestamp_ms = int(time.time() * 1000)
unique_id = uuid.uuid4()
object_key = f"objects/{timestamp_ms}/{unique_id}"
return f"s3://{DEFAULT_BUCKET}/{object_key}"
DEFAULT_BUCKET = "texera-large-binaries"

_instance = None

def __new__(cls):
    """Return the shared manager instance, building it on the first call only.

    The first call allocates the instance and seeds its state; every later
    call returns that same object untouched, so state set through one handle
    is visible through all others.
    """
    shared = cls._instance
    if shared is not None:
        return shared

    shared = super().__new__(cls)
    # Cached S3 client; stays None until _get_s3_client() builds it.
    shared._s3_client = None
    # Execution context: set at executor init and read by create() so the
    # user-facing largebinary() API stays execution-id-free.
    shared._current_execution_id = None
    cls._instance = shared
    return shared

def set_current_execution_id(self, execution_id):
    """Sets the execution id used to scope large binaries created by this worker.

    Args:
        execution_id: The value to store as the current execution id. Passing
            None clears the execution context, after which create() raises.
            (presumably an integer id -- confirm against the caller)
    """
    self._current_execution_id = execution_id

def get_current_execution_id(self):
    """Returns the execution id set for this worker, or None if unset.

    Returns:
        Whatever was last passed to set_current_execution_id(), or None when
        no id has been set on this instance yet.
    """
    return self._current_execution_id

def _get_s3_client(self):
    """Return the cached S3 client, constructing it on first use.

    The client is built from the StorageConfig S3 settings and stored on the
    instance, so later calls reuse the same object.

    Returns:
        The boto3 S3 client held by this manager.

    Raises:
        RuntimeError: If boto3 (or botocore) cannot be imported.
    """
    if self._s3_client is not None:
        return self._s3_client

    # Imported lazily so the module can be loaded without boto3 installed.
    try:
        import boto3
        from botocore.config import Config
    except ImportError as e:
        raise RuntimeError("boto3 required. Install with: pip install boto3") from e

    client_kwargs = {
        "endpoint_url": StorageConfig.S3_ENDPOINT,
        "aws_access_key_id": StorageConfig.S3_AUTH_USERNAME,
        "aws_secret_access_key": StorageConfig.S3_AUTH_PASSWORD,
        "region_name": StorageConfig.S3_REGION,
        "config": Config(signature_version="s3v4", s3={"addressing_style": "path"}),
    }
    self._s3_client = boto3.client("s3", **client_kwargs)
    return self._s3_client

def _ensure_bucket_exists(self, bucket: str):
    """Ensure the S3 bucket exists, creating it if necessary.

    Args:
        bucket: Name of the bucket to check and, when missing, create.

    Raises:
        Exception: Any error from head_bucket that does not indicate a missing
            bucket is re-raised unchanged; errors from create_bucket propagate.
    """
    s3 = self._get_s3_client()
    try:
        s3.head_bucket(Bucket=bucket)
        return
    except s3.exceptions.NoSuchBucket:
        pass
    except Exception as e:
        # head_bucket is a HEAD request, so the error response has no body and
        # a missing bucket is reported as a generic client error whose code is
        # "404" rather than as NoSuchBucket. Treat that code as "missing" too;
        # anything else (403, connection failures, ...) is a real error.
        response = getattr(e, "response", None)
        error = response.get("Error") if isinstance(response, dict) else None
        code = str(error.get("Code", "")) if isinstance(error, dict) else ""
        if code not in ("404", "NoSuchBucket"):
            raise

    logger.debug(f"Bucket {bucket} not found, creating it")
    s3.create_bucket(Bucket=bucket)
    logger.info(f"Created bucket: {bucket}")

def create(self) -> str:
    """
    Creates a new largebinary reference with a unique, execution-scoped S3 URI.

    The object key is namespaced by the current execution id so cleanup can delete
    only this execution's objects. The execution id is injected by the system (set
    via set_current_execution_id() when the worker is initialized); callers never
    pass it.

    Returns:
        S3 URI string (format: s3://bucket/objects/{execution_id}/{uuid})

    Raises:
        RuntimeError: If no execution id has been set for this worker.
    """
    # Fail fast: validate the execution context before any S3 round trip, so a
    # missing id never triggers a bucket lookup (or bucket creation) first.
    execution_id = self.get_current_execution_id()
    if execution_id is None:
        raise RuntimeError(
            "largebinary() requires an execution context, but no execution id "
            "has been set for this worker."
        )

    self._ensure_bucket_exists(self.DEFAULT_BUCKET)
    object_key = f"objects/{execution_id}/{uuid.uuid4()}"
    return f"s3://{self.DEFAULT_BUCKET}/{object_key}"
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from typing import Optional, Union
from io import IOBase
from core.models.type.large_binary import largebinary
from pytexera.storage import large_binary_manager
from pytexera.storage.large_binary_manager import LargeBinaryManager
import threading
import queue

Expand Down Expand Up @@ -154,8 +154,9 @@ def write(self, b: Union[bytes, bytearray]) -> int:

def upload_worker():
try:
large_binary_manager._ensure_bucket_exists(self._bucket_name)
s3 = large_binary_manager._get_s3_client()
manager = LargeBinaryManager()
manager._ensure_bucket_exists(self._bucket_name)
s3 = manager._get_s3_client()
reader = _QueueReader(self._queue)
s3.upload_fileobj(reader, self._bucket_name, self._object_key)
except Exception as e:
Expand Down Expand Up @@ -231,7 +232,7 @@ def close(self) -> None:
def _cleanup_failed_upload(self):
"""Clean up a failed upload by deleting the S3 object."""
try:
s3 = large_binary_manager._get_s3_client()
s3 = LargeBinaryManager()._get_s3_client()
s3.delete_object(Bucket=self._bucket_name, Key=self._object_key)
except Exception:
# Ignore cleanup errors - we're already handling an upload failure
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,8 @@ class RegionExecutionCoordinator(
InitializeExecutorRequest(
workerConfigs.length,
physicalOp.opExecInitInfo,
physicalOp.isSourceOperator
physicalOp.isSourceOperator,
Some(physicalOp.executionId)
),
asyncRPCClient.mkContext(workerId)
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import org.apache.texera.amber.engine.architecture.rpc.controlcommands.{
import org.apache.texera.amber.engine.architecture.rpc.controlreturns.EmptyReturn
import org.apache.texera.amber.engine.architecture.worker.DataProcessorRPCHandlerInitializer
import org.apache.texera.amber.util.VirtualIdentityUtils
import org.apache.texera.service.util.LargeBinaryManager

trait InitializeExecutorHandler {
this: DataProcessorRPCHandlerInitializer =>
Expand All @@ -44,6 +45,7 @@ trait InitializeExecutorHandler {
)
)
cachedTotalWorkerCount = req.totalWorkerCount
req.executionId.foreach(eid => LargeBinaryManager.setCurrentExecutionId(eid.id))
setupExecutor(req.opExecInitInfo, workerIdx, cachedTotalWorkerCount)
EmptyReturn()
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -611,7 +611,8 @@ class WorkflowResource extends LazyLogging {
.asScala
.toList

LargeBinaryManager.deleteAllObjects()
// Delete large binaries for each execution belonging to the workflows being removed
eids.foreach(eid => LargeBinaryManager.deleteByExecution(eid.longValue()))

// Collect all URIs related to executions for cleanup
val uris = eids.flatMap { eid =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ class WorkflowService(
* 2. Clears URI references from the execution registry
* 3. Safely clears all result and console message documents
* 4. Expires Iceberg snapshots for runtime statistics
* 5. Deletes large binaries from MinIO
* 5. Deletes this execution's large binaries from MinIO
*
* @param eid The execution identity to clean up resources for
*/
Expand Down Expand Up @@ -355,7 +355,7 @@ class WorkflowService(
logger.debug(s"Error processing document at $uri: ${error.getMessage}")
}
}
// Delete large binaries
LargeBinaryManager.deleteAllObjects()
// Delete this execution's large binaries
LargeBinaryManager.deleteByExecution(eid.id)
}
}
6 changes: 5 additions & 1 deletion amber/src/test/python/core/models/type/test_large_binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
import pytest
from unittest.mock import patch
from core.models.type.large_binary import largebinary
from pytexera.storage.large_binary_manager import LargeBinaryManager

# The manager is a singleton; bind the shared instance for the tests.
large_binary_manager = LargeBinaryManager()


class TestLargeBinary:
Expand All @@ -31,7 +35,7 @@ def test_create_with_uri(self):

def test_create_without_uri(self):
"""Test creating largebinary without URI (calls large_binary_manager.create)."""
with patch("pytexera.storage.large_binary_manager.create") as mock_create:
with patch.object(large_binary_manager, "create") as mock_create:
mock_create.return_value = "s3://bucket/objects/123/uuid"
large_binary = largebinary()
assert large_binary.uri == "s3://bucket/objects/123/uuid"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@
from io import BytesIO
from core.models.type.large_binary import largebinary
from pytexera.storage.large_binary_input_stream import LargeBinaryInputStream
from pytexera.storage import large_binary_manager
from pytexera.storage.large_binary_manager import LargeBinaryManager

# The manager is a singleton; bind the shared instance for the tests.
large_binary_manager = LargeBinaryManager()


class TestLargeBinaryInputStream:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,16 @@
# specific language governing permissions and limitations
# under the License.

import re

import pytest
from unittest.mock import patch, MagicMock
from pytexera.storage import large_binary_manager
from pytexera.storage.large_binary_manager import LargeBinaryManager
from core.storage.storage_config import StorageConfig

# The manager is a singleton; bind the shared instance for the tests.
large_binary_manager = LargeBinaryManager()


class TestLargeBinaryManager:
@pytest.fixture(autouse=True)
Expand All @@ -42,6 +47,11 @@ def setup_storage_config(self):
s3_auth_username="minioadmin",
s3_auth_password="minioadmin",
)
# Provide a default execution id so create() doesn't raise.
original_eid = large_binary_manager.get_current_execution_id()
large_binary_manager.set_current_execution_id(1)
yield
large_binary_manager.set_current_execution_id(original_eid)

def test_get_s3_client_initializes_once(self):
"""Test that S3 client is initialized and cached."""
Expand Down Expand Up @@ -119,7 +129,7 @@ def test_ensure_bucket_exists_creates_bucket_when_missing(self):
mock_client.create_bucket.assert_called_once_with(Bucket="test-bucket")

def test_create_generates_unique_uri(self):
"""Test that create() generates a unique S3 URI."""
"""Test that create() generates a unique execution-scoped S3 URI."""
large_binary_manager._s3_client = None

with patch("boto3.client") as mock_boto3_client:
Expand All @@ -130,10 +140,10 @@ def test_create_generates_unique_uri(self):

uri = large_binary_manager.create()

# Check URI format
# Check URI format: s3://bucket/objects/{eid}/{uuid}
assert uri.startswith("s3://")
assert uri.startswith(f"s3://{large_binary_manager.DEFAULT_BUCKET}/")
assert "objects/" in uri
assert f"objects/{large_binary_manager.get_current_execution_id()}/" in uri

# Verify bucket was checked/created
mock_client.head_bucket.assert_called_once_with(
Expand All @@ -152,3 +162,35 @@ def test_create_uses_default_bucket(self):

uri = large_binary_manager.create()
assert large_binary_manager.DEFAULT_BUCKET in uri
assert f"objects/{large_binary_manager.get_current_execution_id()}/" in uri


def test_create_stamps_execution_id(monkeypatch):
    """create() embeds the current execution id in the generated object key."""

    def skip_bucket_check(bucket):
        # Stand-in for _ensure_bucket_exists so no real S3 call is made.
        return None

    monkeypatch.setattr(
        large_binary_manager, "_ensure_bucket_exists", skip_bucket_check
    )
    monkeypatch.setattr(large_binary_manager, "_current_execution_id", 42)

    generated_uri = large_binary_manager.create()

    expected_shape = r"s3://texera-large-binaries/objects/42/[0-9a-fA-F-]+"
    assert re.fullmatch(expected_shape, generated_uri)


def test_create_without_execution_context_raises(monkeypatch):
    """create() raises RuntimeError when no execution id has been set."""
    # Stub the bucket check and clear the execution id so the only possible
    # failure is the missing-execution-context guard.
    overrides = (
        ("_ensure_bucket_exists", lambda bucket: None),
        ("_current_execution_id", None),
    )
    for attribute, replacement in overrides:
        monkeypatch.setattr(large_binary_manager, attribute, replacement)

    with pytest.raises(RuntimeError):
        large_binary_manager.create()


def test_largebinarymanager_is_a_singleton(monkeypatch):
    """Every LargeBinaryManager() call hands back the one shared instance."""
    first_handle = LargeBinaryManager()
    second_handle = LargeBinaryManager()
    assert first_handle is second_handle

    # A value written through one handle must be readable through a fresh one.
    monkeypatch.setattr(first_handle, "_current_execution_id", 314)
    observed = LargeBinaryManager().get_current_execution_id()
    assert observed == 314
Loading
Loading