Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
spapr: Support NVIDIA V100 GPU with NVLink2
The NVIDIA V100 GPU comes with some on-board RAM which is mapped into
the host memory space and accessible as normal RAM via NVLink bus.
The VFIO-PCI driver implements special regions for such a GPU and the emulated
NVLink bridge (referred to below as NPU). The POWER9 CPU also provides
address translation services which include a TLB invalidation register
exposed via the NVLink bridge; the feature is called "ATSD".

This adds a quirk to VFIO to map the memory and create an MR; the new MR
is stored in a GPU as a QOM link. The sPAPR PCI uses this to get the MR
and map it to the system address space. Another quirk does the same for
ATSD.

This adds 4 additional steps to the FDT builder in spapr-pci:

1. Search for specific GPUs and NPUs, and collect the findings in sPAPRPHBState;

2. Add properties in the DT: "ibm,npu", "ibm,gpu", "memory-block",
and some others, as these are required by the guest platform and GPU driver;

3. Add new memory blocks with one addition - they have the
"linux,memory-usable" property configured in a way which prevents
the guest from onlining them automatically, as onlining needs to be deferred
until the guest GPU driver trains NVLink.

4. Add a npuphb# node representing an NPU for every vPHB; the pseries
guest uses it to detect NPU2 hardware.

A couple of notes:
- this changes the FDT renderer, as doing steps 1-2-3 from sPAPRPHBClass::realize
is impossible - devices are not yet attached;
- this does not add VFIO quirk MRs to the system address space as
the address is selected in sPAPRPHBState, similar to MMIO.

This puts new memory nodes in a separate NUMA node to replicate the host
system setup as close as possible (the GPU driver relies on this too).

This adds requirement similar to EEH - one IOMMU group per vPHB; if not
met, this disables ATSD support for a vPHB (with a warning message)
as we present ATSD via the vPHB's device tree and we must not mix ATSD
registers from different NPU units on the host system. The IOMMU
grouping makes sure this does not happen so QEMU only follows the lead
here.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---

The example command line for redbud system:

pbuild/qemu-aiku1804le-ppc64/ppc64-softmmu/qemu-system-ppc64 \
-nodefaults \
-chardev stdio,id=STDIO0,signal=off,mux=on \
-device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
-mon id=MON0,chardev=STDIO0,mode=readline -nographic -vga none \
-enable-kvm -m 384G \
-chardev socket,id=SOCKET0,server,nowait,host=localhost,port=40000 \
-mon chardev=SOCKET0,mode=control \
-smp 80,sockets=1,threads=4 \
-netdev "tap,id=TAP0,helper=/home/aik/qemu-bridge-helper --br=br0" \
-device "virtio-net-pci,id=vnet0,mac=52:54:00:12:34:56,netdev=TAP0" \
img/vdisk0.img \
-device "vfio-pci,id=vfio0004_04_00_0,host=0004:04:00.0" \
-device "vfio-pci,id=vfio0006_00_00_0,host=0006:00:00.0" \
-device "vfio-pci,id=vfio0006_00_00_1,host=0006:00:00.1" \
-device "vfio-pci,id=vfio0006_00_00_2,host=0006:00:00.2" \
-device "vfio-pci,id=vfio0004_05_00_0,host=0004:05:00.0" \
-device "vfio-pci,id=vfio0006_00_01_0,host=0006:00:01.0" \
-device "vfio-pci,id=vfio0006_00_01_1,host=0006:00:01.1" \
-device "vfio-pci,id=vfio0006_00_01_2,host=0006:00:01.2" \
-device spapr-pci-host-bridge,id=phb1,index=1 \
-device "vfio-pci,id=vfio0035_03_00_0,host=0035:03:00.0" \
-device "vfio-pci,id=vfio0007_00_00_0,host=0007:00:00.0" \
-device "vfio-pci,id=vfio0007_00_00_1,host=0007:00:00.1" \
-device "vfio-pci,id=vfio0007_00_00_2,host=0007:00:00.2" \
-device "vfio-pci,id=vfio0035_04_00_0,host=0035:04:00.0" \
-device "vfio-pci,id=vfio0007_00_01_0,host=0007:00:01.0" \
-device "vfio-pci,id=vfio0007_00_01_1,host=0007:00:01.1" \
-device "vfio-pci,id=vfio0007_00_01_2,host=0007:00:01.2" -snapshot \
-machine pseries \
-L /home/aik/t/qemu-ppc64-bios/ -d guest_errors

Note that QEMU attaches PCI devices to the last added vPHB, so the first
8 devices - 4:04:00.0 through 6:00:01.2 - go to the default vPHB, and
35:03:00.0 through 7:00:01.2 go to the vPHB with id=phb1.

---
Changes:
v2:
* got rid of extra compatible types as the updated guest won't need them
* due to the host firmware bug, some NVLink bridges come without ATSD
register, skip those
  • Loading branch information
aik committed Dec 20, 2018
1 parent a1bdecf commit 7073cad
Show file tree
Hide file tree
Showing 8 changed files with 494 additions and 4 deletions.
12 changes: 10 additions & 2 deletions hw/ppc/spapr.c
Expand Up @@ -3760,7 +3760,8 @@ static const CPUArchIdList *spapr_possible_cpu_arch_ids(MachineState *machine)
static void spapr_phb_placement(sPAPRMachineState *spapr, uint32_t index,
uint64_t *buid, hwaddr *pio,
hwaddr *mmio32, hwaddr *mmio64,
unsigned n_dma, uint32_t *liobns, Error **errp)
unsigned n_dma, uint32_t *liobns,
hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
{
/*
* New-style PHB window placement.
Expand Down Expand Up @@ -3807,6 +3808,9 @@ static void spapr_phb_placement(sPAPRMachineState *spapr, uint32_t index,
*pio = SPAPR_PCI_BASE + index * SPAPR_PCI_IO_WIN_SIZE;
*mmio32 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM32_WIN_SIZE;
*mmio64 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM64_WIN_SIZE;

*nv2gpa = SPAPR_PCI_NV2RAM64_WIN_BASE + index * SPAPR_PCI_NV2RAM64_WIN_SIZE;
*nv2atsd = SPAPR_PCI_NV2ATSD_WIN_BASE + index * SPAPR_PCI_NV2ATSD_WIN_SIZE;
}

static ICSState *spapr_ics_get(XICSFabric *dev, int irq)
Expand Down Expand Up @@ -4191,7 +4195,8 @@ DEFINE_SPAPR_MACHINE(2_8, "2.8", false);
static void phb_placement_2_7(sPAPRMachineState *spapr, uint32_t index,
uint64_t *buid, hwaddr *pio,
hwaddr *mmio32, hwaddr *mmio64,
unsigned n_dma, uint32_t *liobns, Error **errp)
unsigned n_dma, uint32_t *liobns,
hwaddr *nv2_gpa, hwaddr *nv2atsd, Error **errp)
{
/* Legacy PHB placement for pseries-2.7 and earlier machine types */
const uint64_t base_buid = 0x800000020000000ULL;
Expand Down Expand Up @@ -4235,6 +4240,9 @@ static void phb_placement_2_7(sPAPRMachineState *spapr, uint32_t index,
* fallback behaviour of automatically splitting a large "32-bit"
* window into contiguous 32-bit and 64-bit windows
*/

*nv2_gpa = 0;
*nv2atsd = 0;
}

static void spapr_machine_2_7_class_options(MachineClass *mc)
Expand Down

0 comments on commit 7073cad

Please sign in to comment.