From 8dcd7bfc41953a79567f76e3753ddcdd6727259b Mon Sep 17 00:00:00 2001 From: Bernhard Kaindl Date: Thu, 27 Feb 2025 12:00:00 +0100 Subject: [PATCH 1/2] (docs) Improve xc_domain_node_setaffinity.md, add xc_vcpu_setaffinity.md Signed-off-by: Bernhard Kaindl --- .../lib/xenctrl/xc_domain_node_setaffinity.md | 99 ++++++++++++++++--- .../xenctrl/xc_vcpu_setaffinity-simplified.md | 30 ++++++ .../xenctrl/xc_vcpu_setaffinity-xenopsd.md | 90 +++++++++++++++++ .../lib/xenctrl/xc_vcpu_setaffinity.md | 58 +++++++++++ 4 files changed, 266 insertions(+), 11 deletions(-) create mode 100644 doc/content/lib/xenctrl/xc_vcpu_setaffinity-simplified.md create mode 100644 doc/content/lib/xenctrl/xc_vcpu_setaffinity-xenopsd.md create mode 100644 doc/content/lib/xenctrl/xc_vcpu_setaffinity.md diff --git a/doc/content/lib/xenctrl/xc_domain_node_setaffinity.md b/doc/content/lib/xenctrl/xc_domain_node_setaffinity.md index 82cf2e36f08..32d8ca42806 100644 --- a/doc/content/lib/xenctrl/xc_domain_node_setaffinity.md +++ b/doc/content/lib/xenctrl/xc_domain_node_setaffinity.md @@ -1,13 +1,32 @@ --- title: xc_domain_node_setaffinity() -description: Set a Xen domain's NUMA node affinity +description: Set a Xen domain's NUMA node affinity for memory allocations +mermaid: + force: true --- -`xc_domain_node_setaffinity()` controls the NUMA node affinity of a domain. +`xc_domain_node_setaffinity()` controls the NUMA node affinity of a domain, +but it only updates the Xen hypervisor domain's `d->node_affinity` mask. +This mask is read by the Xen memory allocator as the 2nd preference for the +NUMA node to allocate memory from for this domain. -By default, Xen enables the `auto_node_affinity` feature flag, -where setting the vCPU affinity also sets the NUMA node affinity for -memory allocations to be aligned with the vCPU affinity of the domain. +> [!info] Preferences of the Xen memory allocator: +> 1. A NUMA node passed to the allocator directly takes precedence, if present. +> 2. 
Then, if the allocation is for a domain, its `node_affinity` mask is tried.
+> 3. Finally, it falls back to spread the pages over all remaining NUMA nodes.
+
+As this call has no practical effect on the Xen scheduler, vCPU affinities
+need to be set separately anyway.
+
+And, the domain's `auto_node_affinity` flag is enabled by default. It means
+that when setting vCPU affinities, Xen updates the `d->node_affinity` mask
+to consist of the NUMA nodes to which its vCPUs have affinity.
+
+See [xc_vcpu_setaffinity()](xc_vcpu_setaffinity) for more information
+on how `d->auto_node_affinity` is used to set the NUMA node affinity.
+
+Thus, so far, there is no obvious need to call `xc_domain_node_setaffinity()`
+when building a domain.
 
 Setting the NUMA node affinity using this call can be used,
 for example, when there might not be enough memory on the
@@ -63,18 +82,76 @@ https://github.com/xen-project/xen/blob/master/xen/common/domain.c#L943-L970"
 This function implements the functionality of `xc_domain_node_setaffinity`
 to set the NUMA affinity of a domain as described above.
 If the new_affinity does not intersect the `node_online_map`,
-it returns `-EINVAL`, otherwise on success `0`.
+it returns `-EINVAL`. Otherwise, the result is a success, and it returns `0`.
 
 When the `new_affinity` is a specific set of NUMA nodes, it updates the NUMA
-`node_affinity` of the domain to these nodes and disables `auto_node_affinity`
-for this domain. It also notifies the Xen scheduler of the change.
+`node_affinity` of the domain to these nodes and disables `d->auto_node_affinity`
+for this domain. With `d->auto_node_affinity` disabled,
+[xc_vcpu_setaffinity()](xc_vcpu_setaffinity) no longer updates the NUMA affinity
+of this domain. &#13;
+
+If `new_affinity` has all bits set, it re-enables the `d->auto_node_affinity`
+for this domain and calls
+[domain_update_node_aff()](https://github.com/xen-project/xen/blob/e16acd80/xen/common/sched/core.c#L1809-L1876)
+to re-set the domain's `node_affinity` mask to the NUMA nodes of the current
+hard and soft affinity of the domain's online vCPUs.
+
+### Flowchart in relation to xc_vcpu_setaffinity()
+
+The effect of `domain_set_node_affinity()` can be seen more clearly on this
+flowchart which shows how `xc_vcpu_setaffinity()` is currently used to set
+the NUMA affinity of a new domain, but also shows how `domain_set_node_affinity()`
+relates to it:
+
+{{% include "xc_vcpu_setaffinity-xenopsd.md" %}}
+
+Essentially, `xc_domain_node_setaffinity` can be used to:
+
+- Set the domain's `node_affinity` which is normally set by
+  `xc_vcpu_setaffinity()` to a different set of NUMA nodes that are not
+  aligned with the CPU affinity of the vCPUs of the domain.
 
-This sets the preference the memory allocator to the new NUMA nodes,
-and in theory, it could also alter the behaviour of the scheduler.
-This of course depends on the scheduler and its configuration.
+  This can be useful for special situations:
+
+  - If we like to use the CPUs of one set of NUMA nodes for booting a VM,
+    but allocate or spread the memory of this VM on/over other NUMA nodes.
+
+    This can be useful if we want to avoid using memory from some NUMA nodes,
+    for example, to keep those NUMA nodes free for other VMs,
+    but still want to run the CPUs on those NUMA nodes, which
+    might be helpful to better define on which NUMA nodes the vCPUs
+    may wander to in order to prevent vCPUs from wandering to another
+    CPU package. Such preventions might be valid use of vCPU hard-affinity.
+
+- Run tests that check the performance difference from using remote memory
+  explicitly when starting a VM. &#13;
This can be useful for testing if a given
+  performance reading matches the performance of local or remote memory
+  on a given tested system.
+
+#### Effect on the Xen scheduler
+
+If `d->node_affinity` is set before vCPU creation, the initial pCPU
+of the new vCPU is the first pCPU of the first NUMA node in the domain's
+`node_affinity`. This is further changed when one or more `cpupools` are set up.
+
+However, as this is only the initial pCPU of the vCPU, this alone does
+not have a lot of effect on the Xen scheduler.
+
 ## Notes on future design improvements
 
+### It may be possible to call it before vCPUs are created
+
+When done early, before vCPU creation, some domain-related data structures
+could be allocated using the domain's `d->node_affinity` NUMA node mask.
+
+With further changes in Xen and `xenopsd`, Xen could allocate the vCPU structs
+on the affine NUMA nodes of the domain.
+
+The pre-condition for this would be that `xenopsd` needs to call this function
+before vCPU creation and after having decided the domain's NUMA placement,
+preferably including claiming the required memory for the domain to ensure
+that the domain will be populated from the same NUMA node(s).
+
 This call cannot influence the past: The `xenopsd`
 [VM_create](../../xenopsd/walkthroughs/VM.start.md#2-create-a-xen-domain)
 micro-ops calls `Xenctrl.domain_create`. &#13;
It currently creates diff --git a/doc/content/lib/xenctrl/xc_vcpu_setaffinity-simplified.md b/doc/content/lib/xenctrl/xc_vcpu_setaffinity-simplified.md new file mode 100644 index 00000000000..48ebf1185dd --- /dev/null +++ b/doc/content/lib/xenctrl/xc_vcpu_setaffinity-simplified.md @@ -0,0 +1,30 @@ +--- +title: Simplified flowchart of xc_vcpu_setaffinity() +description: See lib/xenctrl/xc_vcpu_setaffinity-xenopsd.md for an extended version +hidden: true +--- +```mermaid +flowchart TD +subgraph libxenctrl + xc_vcpu_setaffinity("xc_vcpu_setaffinity()")--hypercall-->xen +end +subgraph xen[Xen Hypervisor] +direction LR +vcpu_set_affinity("vcpu_set_affinity()
set the vCPU affinity") + -->check_auto_node{"Is the domain's
auto_node_affinity
enabled?"} + --"yes
(default)"--> + auto_node_affinity("Set the
domain's
node_affinity + mask as well
(used for further
NUMA memory
allocation)") + +click xc_vcpu_setaffinity +"https://github.com/xen-project/xen/blob/7cf16387/tools/libs/ctrl/xc_domain.c#L199-L250" _blank +click vcpu_set_affinity +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1353-L1393" _blank +click domain_update_node_aff +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1809-L1876" _blank +click check_auto_node +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1840-L1870" _blank +click auto_node_affinity +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1867-L1869" _blank +end +``` diff --git a/doc/content/lib/xenctrl/xc_vcpu_setaffinity-xenopsd.md b/doc/content/lib/xenctrl/xc_vcpu_setaffinity-xenopsd.md new file mode 100644 index 00000000000..312b572e46a --- /dev/null +++ b/doc/content/lib/xenctrl/xc_vcpu_setaffinity-xenopsd.md @@ -0,0 +1,90 @@ +--- +title: Flowchart of the use of xc_vcpu_setaffinity() by xenopsd +description: Shows how xenopsd uses xc_vcpu_setaffinity() to set NUMA affinity +hidden: true +--- +```mermaid +flowchart TD +subgraph xenopsd["xenopsd VM.build"] + +Host.numa_affinity_policy{Host.numa_affinity_policy
is} + --best_effort-->numa_placement-->XenStore +end + +subgraph xenguest +Host.numa_affinity_policy=="default: disabled"==>stub_xc_hvm_build +XenStore(Add to Xenstore:
platform/vcpu/#domid/affinity)--> + stub_xc_hvm_build("stub_xc_hvm_build()") + ==> configure_vcpus("configure_vcpus()") + ==> affinity_set{"Is
platform/vcpu/#domid/affinity
set?"} + =="affinity is found (automatically only set on numa_placement success)"==> + xc_vcpu_setaffinity("xc_vcpu_setaffinity()") + + stub_xc_hvm_build["stub_xc_hvm_build()"] + <==> get_flags["get_flags()
gets Xenstore platform data"] + + subgraph xenctrl[XenCtrl] + xc_vcpu_setaffinity + xc_domain_node_setaffinity + end +end + +xc_vcpu_setaffinity ==Currently used hypercall==> do_domctl +xc_domain_node_setaffinity --Currently not used by the Xapi toolstack--> do_domctl + +subgraph xen[Xen Hypervisor] + + subgraph domain_update_node_affinity["domain_update_node_affinity()"] + domain_update_node_aff("domain_update_node_aff()") + ==> check_auto_node{"Is
d->auto_node_affinity
enabled?"} + =="yes (default)"==>set_node_affinity_from_vcpu_affinities(" + Set the domain's node_affinity mask as well + (used for further NUMA memory allocation for the domain)") + end + + do_domctl{"do_domctl()
op->cmd=?"} + ==XEN_DOMCTL_setvcpuaffinity==> + vcpu_set_affinity("vcpu_set_affinity()
set the vCPU affinity") + ==>domain_update_node_aff + do_domctl + --XEN_DOMCTL_setnodeaffinity (not used currently) + -->nodes_full + + subgraph domain_set_node_affinity["domain_set_node_affinity()"] + nodes_full{new_affinity
is #34;all#34;?} + --is #34;all#34;--> + set_auto("auto_node_affinity=1")--> + domain_update_node_aff + nodes_full + --not #34;all#34;-->fixed("auto_node_affinity=0 + node_affinity=new_affinity") + -->domain_update_node_aff + end +end +click Host.numa_affinity_policy +"https://github.com/xapi-project/xen-api/blob/90ef043c1f3a3bc20f1c5d3ccaaf6affadc07983/ocaml/xenopsd/xc/domain.ml#L951-L962" +click numa_placement +"https://github.com/xapi-project/xen-api/blob/90ef043c/ocaml/xenopsd/xc/domain.ml#L862-L897" +click stub_xc_hvm_build +"https://github.com/xenserver/xen.pg/blob/65c0438b/patches/xenguest.patch#L2329-L2436" _blank +click get_flags +"https://github.com/xenserver/xen.pg/blob/65c0438b/patches/xenguest.patch#L1164-L1288" _blank +click do_domctl +"https://github.com/xen-project/xen/blob/7cf163879/xen/common/domctl.c#L282-L894" _blank +click domain_set_node_affinity +"https://github.com/xen-project/xen/blob/7cf163879/xen/common/domain.c#L943-L970" _blank +click configure_vcpus +"https://github.com/xenserver/xen.pg/blob/65c0438b/patches/xenguest.patch#L1297-L1348" _blank +click affinity_set +"https://github.com/xenserver/xen.pg/blob/65c0438b/patches/xenguest.patch#L1305-L1326" _blank +click xc_vcpu_setaffinity +"https://github.com/xen-project/xen/blob/7cf16387/tools/libs/ctrl/xc_domain.c#L199-L250" _blank +click vcpu_set_affinity +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1353-L1393" _blank +click domain_update_node_aff +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1809-L1876" _blank +click check_auto_node +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1840-L1870" _blank +click set_node_affinity_from_vcpu_affinities +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1867-L1869" _blank +``` diff --git a/doc/content/lib/xenctrl/xc_vcpu_setaffinity.md b/doc/content/lib/xenctrl/xc_vcpu_setaffinity.md new file mode 100644 index 00000000000..f7c5561ef8a 
--- /dev/null +++ b/doc/content/lib/xenctrl/xc_vcpu_setaffinity.md @@ -0,0 +1,58 @@ +--- +title: xc_vcpu_setaffinity() +description: Set a Xen vCPU's pCPU affinity and the domain's NUMA node affinity +mermaid: + force: true +--- +## Purpose + +The libxenctrl library call `xc_set_vcpu_affinity()` +controls the pCPU affinity of the given vCPU. + +[xenguest](../../../xenopsd/walkthroughs/VM.build/xenguest/#walkthrough-of-the-xenguest-build-mode) +uses it when building domains if +[xenopsd](../../xenopsd/walkthroughs/VM.build/Domain.build) +added vCPU affinity information to the XenStore platform data path +`platform/vcpu/#domid/affinity` of the domain. + +### Updating the NUMA node affinity of a domain + +Besides that, `xc_set_vcpu_affinity()` can also modify the NUMA node +affinity of the Xen domain if the vCPU: + +When Xen creates a domain, it enables the domain's `d->auto_node_affinity` +feature flag. + +When it is enabled, setting the vCPU affinity also updates the NUMA node +affinity which is used for memory allocations for the domain: + +### Simplified flowchart + +{{% include "xc_vcpu_setaffinity-simplified.md" %}} + +## Current use by xenopsd and xenguest + +When `Host.numa_affinity_policy` is set to +[best_effort](../../../toolstack/features/NUMA/#xapi-datamodel-design), +[xenopsd](../../../xenopsd/walkthroughs/VM.build) attempts NUMA node placement +when building new VMs and instructs +[xenguest](../../../xenopsd/walkthroughs/VM.build/xenguest/#walkthrough-of-the-xenguest-build-mode) +to set the vCPU affinity of the domain. + +With the domain's `auto_node_affinity` flag enabled by default in Xen, +this automatically also sets the `d->node_affinity` mask of the domain. + +This then causes the Xen memory allocator to prefer the NUMA nodes in the +`d->node_affinity` NUMA node mask when allocating memory. + +That is, (for completeness) unless Xen's allocation function +`alloc_heap_pages()` receives a specific NUMA node in its `memflags` +argument when called. 
+ +See [xc_domain_node_setaffinity()](xc_domain_node_setaffinity) for more +information about another way to set the `node_affinity` NUMA node mask +of Xen domains and more depth on how it is used in Xen. + +### Flowchart of its current use for NUMA affinity + +{{% include "xc_vcpu_setaffinity-xenopsd.md" %}} From 766ac533420cf9d960b9c4c24d78e61aadb6a1c5 Mon Sep 17 00:00:00 2001 From: Bernhard Kaindl Date: Mon, 3 Mar 2025 12:00:00 +0100 Subject: [PATCH 2/2] Update to the current situation where numa_placement calls xc_vcpu_setaffinity() Signed-off-by: Bernhard Kaindl --- .../xenctrl/xc_vcpu_setaffinity-xenopsd.md | 155 ++++++++++++++---- 1 file changed, 122 insertions(+), 33 deletions(-) diff --git a/doc/content/lib/xenctrl/xc_vcpu_setaffinity-xenopsd.md b/doc/content/lib/xenctrl/xc_vcpu_setaffinity-xenopsd.md index 312b572e46a..ca7c2c0aa1d 100644 --- a/doc/content/lib/xenctrl/xc_vcpu_setaffinity-xenopsd.md +++ b/doc/content/lib/xenctrl/xc_vcpu_setaffinity-xenopsd.md @@ -3,42 +3,114 @@ title: Flowchart of the use of xc_vcpu_setaffinity() by xenopsd description: Shows how xenopsd uses xc_vcpu_setaffinity() to set NUMA affinity hidden: true --- +Two code paths are set in bold to show +- when numa_affinity_policy is the default (off) in `xenopsd`. +- when `xc_vcpu_setaffinity(XEN_VCPUAFFINITY_SOFT)` is called in Xen, + and the auto_node_affinity flag is enabled (default), + which updates the node_affinity as well. + ```mermaid flowchart TD -subgraph xenopsd["xenopsd VM.build"] -Host.numa_affinity_policy{Host.numa_affinity_policy
is} - --best_effort-->numa_placement-->XenStore +subgraph VM.create["xenopsd VM.create"] + + %% Is xe vCPU-params:mask= set? If yes, write to Xenstore: + + is_xe_vCPUparams_mask_set?{" + + Is + xe vCPU-params:mask= + set? Example: 1,2,3 + (Is used to enable vCPU
hard-affinity) + + "} --"yes"--> set_hard_affinity("Write hard-affinity to XenStore: + platform/vcpu/#domid/affinity") + +end + +subgraph VM.build["xenopsd VM.build"] + + %% Labels of the decision nodes + + is_Host.numa_affinity_policy_set?{ + Is

Host.numa_affinity_policy

set?} + has_hard_affinity?{ + Is hard-affinity configured in

platform/vcpu/#domid/affinity?} + + %% Connections from VM.create: + set_hard_affinity --> is_Host.numa_affinity_policy_set? + is_xe_vCPUparams_mask_set? == "no"==> is_Host.numa_affinity_policy_set? + + %% The Subgraph itself: + + %% Check Host.numa_affinity_policy + + is_Host.numa_affinity_policy_set? + + %% If Host.numa_affinity_policy is "best_effort": + + -- Host.numa_affinity_policy is

best_effort --> + + %% If has_hard_affinity is set, skip numa_placement: + + has_hard_affinity? + --"yes"-->exec_xenguest + + %% If has_hard_affinity is not set, run numa_placement: + + has_hard_affinity? + --"no"-->numa_placement-->exec_xenguest + + %% If Host.numa_affinity_policy is off (default, for now), + %% skip NUMA placement: + + is_Host.numa_affinity_policy_set? + =="default: disabled"==> + exec_xenguest end +%% xenguest subgraph + subgraph xenguest -Host.numa_affinity_policy=="default: disabled"==>stub_xc_hvm_build -XenStore(Add to Xenstore:
platform/vcpu/#domid/affinity)--> - stub_xc_hvm_build("stub_xc_hvm_build()") - ==> configure_vcpus("configure_vcpus()") - ==> affinity_set{"Is
platform/vcpu/#domid/affinity
set?"} - =="affinity is found (automatically only set on numa_placement success)"==> - xc_vcpu_setaffinity("xc_vcpu_setaffinity()") - - stub_xc_hvm_build["stub_xc_hvm_build()"] - <==> get_flags["get_flags()
gets Xenstore platform data"] - - subgraph xenctrl[XenCtrl] - xc_vcpu_setaffinity - xc_domain_node_setaffinity - end + + exec_xenguest + + ==> stub_xc_hvm_build("stub_xc_hvm_build()") + + ==> configure_vcpus("configure_vcpus()") + + %% Decision + ==> set_hard_affinity?{" + Is platform/
vcpu/#domid/affinity
+ set?"} + end -xc_vcpu_setaffinity ==Currently used hypercall==> do_domctl -xc_domain_node_setaffinity --Currently not used by the Xapi toolstack--> do_domctl +%% do_domctl Hypercalls + +numa_placement + --Set the NUMA placement using soft-affinity--> + XEN_VCPUAFFINITY_SOFT("xc_vcpu_setaffinity(SOFT)") + ==> do_domctl + +set_hard_affinity? + --yes--> + XEN_VCPUAFFINITY_HARD("xc_vcpu_setaffinity(HARD)") + --> do_domctl + +xc_domain_node_setaffinity + --Currently not used by the Xapi toolstack + --> do_domctl + +%% Xen subgraph subgraph xen[Xen Hypervisor] subgraph domain_update_node_affinity["domain_update_node_affinity()"] domain_update_node_aff("domain_update_node_aff()") - ==> check_auto_node{"Is
d->auto_node_affinity
enabled?"} + ==> check_auto_node{"Is domain's
auto_node_affinity
enabled?"} =="yes (default)"==>set_node_affinity_from_vcpu_affinities(" - Set the domain's node_affinity mask as well + Calculate the domain's node_affinity mask from vCPU affinity (used for further NUMA memory allocation for the domain)") end @@ -48,20 +120,37 @@ subgraph xen[Xen Hypervisor] ==>domain_update_node_aff do_domctl --XEN_DOMCTL_setnodeaffinity (not used currently) - -->nodes_full + -->is_new_affinity_all_nodes? subgraph domain_set_node_affinity["domain_set_node_affinity()"] - nodes_full{new_affinity
is #34;all#34;?} - --is #34;all#34;--> - set_auto("auto_node_affinity=1")--> - domain_update_node_aff - nodes_full - --not #34;all#34;-->fixed("auto_node_affinity=0 - node_affinity=new_affinity") - -->domain_update_node_aff + + is_new_affinity_all_nodes?{new_affinity
is #34;all#34;?} + + --is #34;all#34; + + --> enable_auto_node_affinity("auto_node_affinity=1") + --> domain_update_node_aff + + is_new_affinity_all_nodes? + + --not #34;all#34; + + --> disable_auto_node_affinity("auto_node_affinity=0") + --> domain_update_node_aff end + +%% setting and getting the struct domain's node_affinity: + +disable_auto_node_affinity + --node_affinity=new_affinity--> + domain_node_affinity + +set_node_affinity_from_vcpu_affinities + ==> domain_node_affinity@{ shape: bow-rect,label: "domain: node_affinity" } + --XEN_DOMCTL_getnodeaffinity--> do_domctl + end -click Host.numa_affinity_policy +click is_Host.numa_affinity_policy_set? "https://github.com/xapi-project/xen-api/blob/90ef043c1f3a3bc20f1c5d3ccaaf6affadc07983/ocaml/xenopsd/xc/domain.ml#L951-L962" click numa_placement "https://github.com/xapi-project/xen-api/blob/90ef043c/ocaml/xenopsd/xc/domain.ml#L862-L897" @@ -75,7 +164,7 @@ click domain_set_node_affinity "https://github.com/xen-project/xen/blob/7cf163879/xen/common/domain.c#L943-L970" _blank click configure_vcpus "https://github.com/xenserver/xen.pg/blob/65c0438b/patches/xenguest.patch#L1297-L1348" _blank -click affinity_set +click set_hard_affinity? "https://github.com/xenserver/xen.pg/blob/65c0438b/patches/xenguest.patch#L1305-L1326" _blank click xc_vcpu_setaffinity "https://github.com/xen-project/xen/blob/7cf16387/tools/libs/ctrl/xc_domain.c#L199-L250" _blank