-
Notifications
You must be signed in to change notification settings - Fork 1.3k
fix(linstor): pre-flight check destination is a LINSTOR satellite before live migration #13077
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -314,6 +314,58 @@ private boolean needsExactSizeProp(VolumeInfo srcVolumeInfo) { | |
| return true; | ||
| } | ||
|
|
||
| /** | ||
| * Verify that the destination KVM host is a registered LINSTOR satellite on the controller | ||
| * backing every destination pool involved in this migration. Throws CloudRuntimeException | ||
| * with a clear message when it isn't, instead of letting the resource creation later fail | ||
| * obscurely inside auto-placement. | ||
| * | ||
| * Best-effort: a transient controller error during this check does not block the migration | ||
| * — we log a warning and let the downstream resource-create surface the real issue. Only a | ||
| * confirmed "host not in node list" outcome aborts the migration up-front. | ||
| */ | ||
| private void verifyDestinationIsLinstorSatellite(Map<VolumeInfo, DataStore> volumeDataStoreMap, Host destHost) { | ||
| if (destHost == null || destHost.getName() == null) { | ||
| // Without a destination host name to match, the only sensible thing is to let the | ||
| // existing flow run and report whatever it would have reported. | ||
| return; | ||
| } | ||
| for (Map.Entry<VolumeInfo, DataStore> entry : volumeDataStoreMap.entrySet()) { | ||
| DataStore destDataStore = entry.getValue(); | ||
| StoragePoolVO destStoragePool = _storagePool.findById(destDataStore.getId()); | ||
| if (destStoragePool == null | ||
| || destStoragePool.getPoolType() != Storage.StoragePoolType.Linstor) { | ||
| continue; | ||
| } | ||
| DevelopersApi api = LinstorUtil.getLinstorAPI(destStoragePool.getHostAddress()); | ||
| try { | ||
| List<String> nodes = LinstorUtil.getLinstorNodeNames(api); | ||
| if (nodes == null) { | ||
| logger.warn("LINSTOR controller {} returned null node list; skipping pre-flight", | ||
| destStoragePool.getHostAddress()); | ||
| return; | ||
| } | ||
| if (!nodes.contains(destHost.getName())) { | ||
| throw new CloudRuntimeException(String.format( | ||
| "Cannot migrate to host '%s': it is not a registered LINSTOR satellite on " + | ||
| "controller %s (pool '%s'). Known satellites: %s. Either register the " + | ||
| "host with `linstor node create` or pick a different destination.", | ||
| destHost.getName(), | ||
| destStoragePool.getHostAddress(), | ||
| destStoragePool.getName(), | ||
| nodes)); | ||
| } | ||
| } catch (ApiException apiEx) { | ||
| // Don't block migration on a transient controller hiccup — log and let the | ||
| // downstream resource creation handle the real failure. | ||
| logger.warn("LINSTOR pre-flight check could not contact controller {}: {}; " + | ||
| "letting downstream resource creation proceed", | ||
| destStoragePool.getHostAddress(), apiEx.getBestMessage()); | ||
| return; | ||
| } | ||
|
Comment on lines
+358
to
+365
|
||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public void copyAsync(Map<VolumeInfo, DataStore> volumeDataStoreMap, VirtualMachineTO vmTO, Host srcHost, | ||
| Host destHost, AsyncCompletionCallback<CopyCommandResult> callback) { | ||
|
|
@@ -323,6 +375,15 @@ public void copyAsync(Map<VolumeInfo, DataStore> volumeDataStoreMap, VirtualMach | |
| String.format("Invalid hypervisor type [%s]. Only KVM supported", srcHost.getHypervisorType())); | ||
| } | ||
|
|
||
| // Pre-flight: verify the destination KVM host is registered as a satellite on the | ||
| // LINSTOR controller backing each destination pool. Without this check, resource | ||
| // creation falls through to the resource-group's auto-placement filters and may | ||
| // either silently place the resource on the wrong node or fail with an opaque | ||
| // auto-place error from the LINSTOR API. Failing fast here gives operators a clear | ||
| // actionable message instead of having to correlate the live-migration failure with | ||
| // an unrelated LINSTOR controller log entry. | ||
| verifyDestinationIsLinstorSatellite(volumeDataStoreMap, destHost); | ||
|
|
||
|
Comment on lines
+378
to
+386
|
||
| String errMsg = null; | ||
| VMInstanceVO vmInstance = _vmDao.findById(vmTO.getId()); | ||
| vmTO.setState(vmInstance.getState()); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Inside the loop, the method
returns whennodes == null. That exits the entire pre-flight check and skips verifying any remaining destination pools/volumes. If the intent is best-effort per pool, this shouldcontinueto the next entry (or better: de-duplicate pools/controllers and keep checking the rest).