Skip to content

Commit

Permalink
destroy_node() will return True only when all node's resources have b…
Browse files Browse the repository at this point in the history
…een cleaned up successfully. Can be called multiple times to retry when partial errors happen.

Signed-off-by: Quentin Pradet <quentinp@apache.org>
  • Loading branch information
ldipenti authored and pquentin committed Oct 10, 2017
1 parent cd68110 commit fbf015d
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 6 deletions.
19 changes: 15 additions & 4 deletions libcloud/compute/drivers/azure_arm.py
Expand Up @@ -699,20 +699,29 @@ def destroy_node(self, node, ex_destroy_nic=True, ex_destroy_vhd=True):
:rtype: ``bool``
"""

do_node_polling = True
success = True

# This returns a 202 (Accepted) which means that the delete happens
# asynchronously.
# If returns 404, we may be retrying a previous destroy_node call that
# failed to clean up its related resources, so it isn't taken as a
# failure.
try:
self.connection.request(node.id,
params={"api-version": "2015-06-15"},
method='DELETE')
except BaseHTTPError as h:
if h.code == 202:
pass
elif h.code == 404:
# No need to ask again, node already down.
do_node_polling = False
else:
return False

# Need to poll until the node actually goes away.
while True:
while do_node_polling:
try:
time.sleep(10)
self.connection.request(
Expand All @@ -738,13 +747,14 @@ def destroy_node(self, node, ex_destroy_nic=True, ex_destroy_vhd=True):
method='DELETE')
break
except BaseHTTPError as h:
if h.code == 202:
if h.code == 202 or h.code == 404:
break
inuse = h.message.startswith("[NicInUse]")
if h.code == 400 and inuse:
time.sleep(10)
else:
# NIC cleanup failed, try cleaning up the VHD.
success = False
break

# Optionally clean up OS disk VHD.
Expand All @@ -766,9 +776,10 @@ def destroy_node(self, node, ex_destroy_nic=True, ex_destroy_vhd=True):
# LibcloudError. Wait a bit and try again.
time.sleep(10)
else:
raise
success = False
break

return True
return success

def create_volume(self, size, name, location=None, snapshot=None,
ex_resource_group=None, ex_account_type=None,
Expand Down
22 changes: 20 additions & 2 deletions libcloud/test/compute/test_azure_arm.py
Expand Up @@ -21,6 +21,7 @@
import mock

from libcloud.common.exceptions import BaseHTTPError
from libcloud.common.types import LibcloudError
from libcloud.compute.base import (NodeLocation, NodeSize, VolumeSnapshot,
StorageVolume)
from libcloud.compute.drivers.azure_arm import AzureImage, NodeAuthPassword
Expand Down Expand Up @@ -147,6 +148,23 @@ def error(e, **kwargs):
ret = self.driver.destroy_node(node)
self.assertTrue(ret)

def test_destroy_node__node_not_found(self):
"""
This simulates the case when destroy_node is being called for the 2nd
time because some related resource failed to clean up, so the DELETE
operation on the node will return 404 (because it was already deleted)
but the method should return success.
"""
def error(e, **kwargs):
raise e(**kwargs)
node = self.driver.list_nodes()[0]
AzureMockHttp.responses = [
# 404 (Not Found) to the DELETE request
lambda f: error(BaseHTTPError, code=404, message='Not found'),
]
ret = self.driver.destroy_node(node)
self.assertTrue(ret)

@mock.patch('time.sleep', return_value=None)
def test_destroy_node__async(self, time_sleep_mock):
def error(e, **kwargs):
Expand All @@ -172,10 +190,10 @@ def error(e, **kwargs):
# 404 means node destroyed successfully
lambda f: error(BaseHTTPError, code=404, message='Not found'),
# 500 - transient error when trying to clean up the NIC
lambda f: error(BaseHTTPError, code=500, message="Cloud weather")
lambda f: error(BaseHTTPError, code=500, message="Cloud weather"),
]
ret = self.driver.destroy_node(node)
self.assertTrue(ret)
self.assertFalse(ret)

def test_destroy_node__failed(self):
def error(e, **kwargs):
Expand Down

0 comments on commit fbf015d

Please sign in to comment.