Skip to content

Commit

Permalink
fix unhandled lock errors (#1495, #1496, #1498, #1500, #1511)
Browse files Browse the repository at this point in the history
  • Loading branch information
mikkonie committed Oct 31, 2022
1 parent c7d9e9e commit 23013f0
Show file tree
Hide file tree
Showing 9 changed files with 188 additions and 24 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,23 @@ Added
- **Landingzones**
- ``LANDINGZONES_TRIGGER_ENABLE`` Django setting (#1508)

Changed
-------

- **Taskflowbackend**
- Improve project lock error messages (#1496, #1500, #1511)

Fixed
-----

- **General**
- Invalid ``REDIS_URL`` default value (#1497)
- Invalid modify API settings in production config (#1503)
- **Samplesheets**
- Uncaught project lock exceptions in iRODS delete request accepting (#1495)
- **Taskflowbackend**
- Unhandled project lock exceptions (#1496, #1500, #1511)
- Landing zone status not updated on flow lock/build errors (#1498)


v0.12.0 (2022-10-14)
Expand Down
1 change: 1 addition & 0 deletions docs_manual/source/sodar_release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Maintenance and bug fix release.

- Fix incorrect project modify API settings in production
- Fix Tooz and Redis connection issue handling
- Fix unhandled project locking errors


v0.12.0 (2022-10-14)
Expand Down
16 changes: 16 additions & 0 deletions landingzones/tests/test_views_api_taskflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import json

from django.test import override_settings
from django.urls import reverse

# Projectroles dependency
Expand All @@ -25,6 +26,7 @@
LandingZoneTaskflowMixin,
ZONE_TITLE,
ZONE_DESC,
INVALID_REDIS_URL,
)


Expand Down Expand Up @@ -391,3 +393,17 @@ def test_post_move_invalid_status(self):
self.assertEqual(response.status_code, 400)
self.assertEqual(LandingZone.objects.count(), 1)
self.assertEqual(LandingZone.objects.first().status, 'DELETED')

@override_settings(REDIS_URL=INVALID_REDIS_URL)
def test_post_move_lock_failure(self):
"""Test post() for moving with project lock failure"""
url = reverse(
'landingzones:api_submit_move',
kwargs={'landingzone': self.landing_zone.sodar_uuid},
)
response = self.request_knox(url, method='POST')

self.assertEqual(response.status_code, 500)
self.assertEqual(LandingZone.objects.count(), 1)
zone = LandingZone.objects.first()
self.assert_zone_status(zone, 'FAILED')
49 changes: 49 additions & 0 deletions landingzones/tests/test_views_taskflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from django.contrib import auth
from django.core import mail
from django.test import override_settings
from django.urls import reverse

# Projectroles dependency
Expand All @@ -26,6 +27,9 @@
# Taskflowbackend dependency
from taskflowbackend.tests.base import TaskflowbackendTestBase

# Timeline dependency
from timeline.models import ProjectEvent

from landingzones.models import LandingZone
from landingzones.tests.test_models import LandingZoneMixin

Expand All @@ -50,6 +54,7 @@
ASYNC_WAIT_SECONDS = 5
ASYNC_RETRY_COUNT = 3
INVALID_MD5 = '11111111111111111111111111111111'
INVALID_REDIS_URL = 'redis://127.0.0.1:6666/0'


class LandingZoneTaskflowMixin:
Expand Down Expand Up @@ -593,6 +598,50 @@ def test_validate_md5_only(self):
self.assertEqual(len(self.zone_coll.data_objects), 1)
self.assertEqual(len(self.assay_coll.data_objects), 0)

@override_settings(REDIS_URL=INVALID_REDIS_URL)
def test_move_lock_failure(self):
"""Test validating and moving with project lock failure"""
self.irods_obj = self.make_object(self.zone_coll, TEST_OBJ_NAME)
self.md5_obj = self.make_md5_object(self.irods_obj)
zone = LandingZone.objects.first()
self.assertEqual(zone.status, 'ACTIVE')
self.assertEqual(len(self.zone_coll.data_objects), 2)
self.assertEqual(len(self.assay_coll.data_objects), 0)
self.assertEqual(len(mail.outbox), 1)
self.assertEqual(
ProjectEvent.objects.filter(event_name='zone_move').count(), 0
)
self.assertEqual(
AppAlert.objects.filter(alert_name='zone_move').count(), 0
)

with self.login(self.user):
response = self.client.post(
reverse(
'landingzones:move',
kwargs={'landingzone': self.landing_zone.sodar_uuid},
),
)
self.assertRedirects(
response,
reverse(
'landingzones:list',
kwargs={'project': self.project.sodar_uuid},
),
)

self.assert_zone_status(zone, 'FAILED')
self.assertEqual(len(self.zone_coll.data_objects), 2)
self.assertEqual(len(self.assay_coll.data_objects), 0)
self.assertEqual(len(mail.outbox), 1) # TODO: Should this send email?
tl_event = ProjectEvent.objects.filter(event_name='zone_move').first()
self.assertIsInstance(tl_event, ProjectEvent)
self.assertEqual(tl_event.get_status().status_type, 'FAILED')
# TODO: Create app alerts for async failures (see #1499)
self.assertEqual(
AppAlert.objects.filter(alert_name='zone_move').count(), 0
)


class TestLandingZoneDeleteView(
SampleSheetIOMixin,
Expand Down
42 changes: 42 additions & 0 deletions samplesheets/tests/test_views_taskflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
PUBLIC_USER_PASS = 'password'
SOURCE_ID = '0815'
SAMPLE_ID = '0815-N1'
INVALID_REDIS_URL = 'redis://127.0.0.1:6666/0'


class SampleSheetTaskflowMixin:
Expand Down Expand Up @@ -1453,6 +1454,47 @@ def test_accept_one_of_multiple(self):
self.assertEqual(self._get_create_alert_count(self.user), 1)
self.assertEqual(self._get_create_alert_count(self.user_delegate), 1)

@override_settings(REDIS_URL=INVALID_REDIS_URL)
def test_accept_lock_failure(self):
"""Test accepting a delete request with project lock failure"""
self.assert_irods_obj(self.path)

with self.login(self.user_contrib):
self.client.post(
reverse(
'samplesheets:irods_request_create',
kwargs={'project': self.project.sodar_uuid},
),
self.post_data,
)

self.assertEqual(IrodsDataRequest.objects.count(), 1)
obj = IrodsDataRequest.objects.first()
self.assertEqual(self._get_create_alert_count(self.user), 1)
self.assertEqual(self._get_create_alert_count(self.user_delegate), 1)

with self.login(self.user):
response = self.client.post(
reverse(
'samplesheets:irods_request_accept',
kwargs={'irodsdatarequest': obj.sodar_uuid},
),
{'confirm': True},
)
self.assertRedirects(
response,
reverse(
'samplesheets:irods_requests',
kwargs={'project': self.project.sodar_uuid},
),
)

obj.refresh_from_db()
self.assertEqual(obj.status, 'FAILED')
self.assertEqual(self._get_create_alert_count(self.user), 1)
self.assertEqual(self._get_create_alert_count(self.user_delegate), 1)
self.assert_irods_obj(self.path, True)


class TestIrodsRequestRejectView(TestIrodsRequestViewsBase):
"""Tests for IrodsRequestRejectView"""
Expand Down
21 changes: 14 additions & 7 deletions samplesheets/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -2278,6 +2278,10 @@ def form_valid(self, request, *args, **kwargs):
app_alerts = get_backend_api('appalerts_backend')
project = self.get_project()
tl_event = None
redirect_url = reverse(
'samplesheets:irods_requests',
kwargs={'project': project.sodar_uuid},
)

try:
obj = IrodsDataRequest.objects.get(
Expand Down Expand Up @@ -2330,7 +2334,15 @@ def form_valid(self, request, *args, **kwargs):
tl_event.set_status('FAILED', str(ex))
obj.status = 'FAILED'
obj.save()
raise ex
# if settings.DEBUG:
# raise ex
messages.error(
self.request,
'Accepting iRODS data request "{}" failed: {}'.format(
obj.get_display_name(), ex
),
)
return redirect(redirect_url)

# Update cache
if settings.SHEETS_ENABLE_CACHE:
Expand Down Expand Up @@ -2377,12 +2389,7 @@ def form_valid(self, request, *args, **kwargs):
self.request,
'iRODS data request "{}" accepted.'.format(obj.get_display_name()),
)
return redirect(
reverse(
'samplesheets:irods_requests',
kwargs={'project': project.sodar_uuid},
)
)
return redirect(redirect_url)


class IrodsRequestRejectView(
Expand Down
50 changes: 40 additions & 10 deletions taskflowbackend/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
from django.conf import settings
from django.core.exceptions import ImproperlyConfigured

# Landingzones dependency
from landingzones.models import LandingZone

# Projectroles dependency
from projectroles.models import SODAR_CONSTANTS
from projectroles.plugins import get_backend_api
Expand All @@ -27,6 +30,7 @@
# Local constants
DEFAULT_PERMANENT_USERS = ['client_user', 'rods', 'rodsadmin', 'public']
UNKNOWN_RUN_ERROR = 'Running flow failed: unknown error, see server log'
LOCK_FAIL_MSG = 'Unable to acquire project lock'


class TaskflowAPI:
Expand All @@ -35,6 +39,26 @@ class TaskflowAPI:
class FlowSubmitException(Exception):
"""SODAR Taskflow submission exception"""

@classmethod
def _raise_flow_exception(cls, ex_msg, tl_event=None, zone=None):
"""
Handle and raise exception with flow building or execution. Updates an
associated timeline event if present, as well as a landing zone if
provided in the flow data.
:param ex_msg: Exception message (string)
:param tl_event: Timeline event or None
:zone: LandingZone object or None
:raise: FlowSubmitException
"""
if tl_event:
tl_event.set_status('FAILED', ex_msg)
# HACK: Update landing zone
if zone:
zone.set_status('FAILED', ex_msg)
# TODO: Create app alert for failure if async (see #1499)
raise cls.FlowSubmitException(ex_msg)

@classmethod
def get_flow(
cls,
Expand Down Expand Up @@ -97,21 +121,30 @@ def run_flow(
ex_msg = None
coordinator = None
lock = None
# HACK: Get zone if present in flow
zone_uuid = flow.flow_data.get('zone_uuid')
zone = None
if zone_uuid:
zone = LandingZone.objects.filter(sodar_uuid=zone_uuid).first()

# Acquire lock if needed
if flow.require_lock:
# Acquire lock
coordinator = lock_api.get_coordinator()
if not coordinator:
raise Exception('Unable to retrieve lock coordinator')
cls._raise_flow_exception(
LOCK_FAIL_MSG + ': Failed to retrieve lock coordinator',
tl_event,
zone,
)
else:
lock_id = str(project.sodar_uuid)
lock = coordinator.get_lock(lock_id)
try:
lock_api.acquire(lock)
except Exception as ex:
raise Exception(
'Unable to acquire project lock: {}'.format(ex)
cls._raise_flow_exception(
LOCK_FAIL_MSG + ': {}'.format(ex), tl_event, zone
)
else:
logger.info('Lock not required (flow.require_lock=False)')
Expand All @@ -122,9 +155,6 @@ def run_flow(
flow.build(force_fail)
except Exception as ex:
ex_msg = 'Error building flow: {}'.format(ex)
# Set timeline status
if tl_event:
tl_event.set_status('FAILED', ex_msg)

# Run flow
if not ex_msg:
Expand All @@ -141,8 +171,6 @@ def run_flow(
elif not flow_result:
if not ex_msg:
ex_msg = UNKNOWN_RUN_ERROR
if async_mode and tl_event:
tl_event.set_status('FAILED', ex_msg)

# Release lock if acquired
if flow.require_lock and lock:
Expand All @@ -151,8 +179,10 @@ def run_flow(

# Raise exception if failed, otherwise return result
if ex_msg:
logger.error(ex_msg)
raise cls.FlowSubmitException(ex_msg)
logger.error(ex_msg) # TODO: Isn't this redundant?
# NOTE: Not providing zone here since it's handled by flow
# TODO: Replace RevertLandingZoneFailTask with a call here?
cls._raise_flow_exception(ex_msg, tl_event, None)
return flow_result

def submit(
Expand Down
7 changes: 4 additions & 3 deletions taskflowbackend/lock_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
LOCK_ENABLED = settings.TASKFLOW_LOCK_ENABLED
LOCK_RETRY_COUNT = settings.TASKFLOW_LOCK_RETRY_COUNT
LOCK_RETRY_INTERVAL = settings.TASKFLOW_LOCK_RETRY_INTERVAL
REDIS_URL = settings.REDIS_URL


logger = logging.getLogger('__name__')
Expand Down Expand Up @@ -40,7 +39,9 @@ def get_coordinator(cls):
host_id = 'sodar_{}'.format(uuid.uuid4())
try:
coordinator = coordination.get_coordinator(
backend_url=REDIS_URL, member_id=host_id, socket_keepalive=True
backend_url=settings.REDIS_URL,
member_id=host_id,
socket_keepalive=True,
)
if coordinator:
coordinator.start(start_heart=True)
Expand Down Expand Up @@ -78,7 +79,7 @@ def acquire(
return True
time.sleep(retry_interval)
cls._log_status(lock, unlock=False, failed=True)
raise LockAcquireException('Unable to acquire project lock')
raise LockAcquireException('Project is locked by another operation')

@classmethod
def release(cls, lock):
Expand Down
Loading

0 comments on commit 23013f0

Please sign in to comment.