From bf5308cfce2a58332a334b8c33773f72697b8b62 Mon Sep 17 00:00:00 2001 From: Alex Richardson Date: Wed, 13 Oct 2021 11:37:22 +0100 Subject: [PATCH] Avoid calling probe_read() twice in load_cap_from_memory_raw_tag_mmu_idx() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit probe_read() is a very hot function when booting CheriBSD (6.42% of the total time for a purecap MFS_ROOT /sbin/startup-benchmark.sh boot). Looking at the perf report, we were calling probe_read() once inside load_cap_from_memory_raw_tag_mmu_idx() and then calling it again in cheri_tag_get(), even though we know that the second call cannot trigger an MMU fault. Pass the host address as an argument to cheri_tag_get() and skip the probe_read() call if the passed argument is non-NULL. With this change probe_read() is down to 3.69% of the total. ``` hyperfine -L qemu qemu-system-riscv64cheri,qemu-system-riscv64cheri.dev-post-morello '/local/scratch/alr48/cheri/output/sdk/bin/{qemu} -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh -device virtio-rng-pci' -m 10 Benchmark #1: /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh -device virtio-rng-pci Time (mean ± σ): 8.005 s ± 0.461 s [User: 7.585 s, System: 0.151 s] Range (min … max): 7.741 s … 8.908 s 10 runs Warning: Statistical outliers were detected. Consider re-running this benchmark on a quiet PC without any interferences from other programs. It might help to use the '--warmup' or '--prepare' options. 
Benchmark #2: /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.dev-post-morello -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh -device virtio-rng-pci Time (mean ± σ): 8.599 s ± 0.530 s [User: 7.816 s, System: 0.122 s] Range (min … max): 7.821 s … 8.989 s 10 runs Summary '/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh -device virtio-rng-pci' ran 1.07 ± 0.09 times faster than '/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.dev-post-morello -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh -device virtio-rng-pci' ``` ``` perf stat -r 5 /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh -device virtio-rng-pci > /dev/null Performance counter stats for '/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh -device virtio-rng-pci' (5 runs): 8177.658418 task-clock (msec) # 0.936 CPUs utilized ( +- 0.71% ) 2,398 context-switches # 0.293 K/sec ( +- 1.64% ) 0 cpu-migrations # 0.000 K/sec ( +-100.00% ) 12,858 page-faults # 0.002 M/sec ( +- 0.59% ) 27,261,165,519 cycles # 3.334 GHz 
( +- 0.17% ) 75,402,569,943 instructions # 2.77 insn per cycle ( +- 0.16% ) 11,183,417,524 branches # 1367.557 M/sec ( +- 0.15% ) 162,719,113 branch-misses # 1.46% of all branches ( +- 0.20% ) 8.738376962 seconds time elapsed ( +- 3.03% ) ``` Before this change: ``` Performance counter stats for '/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.dev-post-morello -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh -device virtio-rng-pci' (5 runs): 8429.485885 task-clock (msec) # 0.911 CPUs utilized ( +- 0.35% ) 2,508 context-switches # 0.298 K/sec ( +- 0.61% ) 1 cpu-migrations # 0.000 K/sec ( +- 66.67% ) 12,867 page-faults # 0.002 M/sec ( +- 0.79% ) 27,831,470,600 cycles # 3.302 GHz ( +- 0.08% ) 77,801,651,546 instructions # 2.80 insn per cycle ( +- 0.01% ) 11,223,140,290 branches # 1331.415 M/sec ( +- 0.01% ) 164,672,766 branch-misses # 1.47% of all branches ( +- 0.08% ) 9.256983251 seconds time elapsed ( +- 0.23% ) ``` --- target/cheri-common/cheri_tagmem.c | 7 +++++-- target/cheri-common/cheri_tagmem.h | 7 ++++++- target/cheri-common/op_helper_cheri_common.c | 3 ++- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/target/cheri-common/cheri_tagmem.c b/target/cheri-common/cheri_tagmem.c index ed25865e91f..5b601a646f5 100644 --- a/target/cheri-common/cheri_tagmem.c +++ b/target/cheri-common/cheri_tagmem.c @@ -553,10 +553,13 @@ void cheri_tag_set(CPUArchState *env, target_ulong vaddr, int reg, } bool cheri_tag_get(CPUArchState *env, target_ulong vaddr, int reg, - hwaddr *ret_paddr, int *prot, uintptr_t pc, int mmu_idx) + hwaddr *ret_paddr, int *prot, uintptr_t pc, int mmu_idx, + void *host_addr) { - void *host_addr = probe_read(env, vaddr, 1, mmu_idx, pc); + if (host_addr == NULL) { + host_addr = probe_read(env, vaddr, 1, mmu_idx, pc); + } handle_paddr_return(read); uintptr_t 
tagmem_flags; diff --git a/target/cheri-common/cheri_tagmem.h b/target/cheri-common/cheri_tagmem.h index e54185fbc6b..0da0232e731 100644 --- a/target/cheri-common/cheri_tagmem.h +++ b/target/cheri-common/cheri_tagmem.h @@ -56,8 +56,13 @@ void cheri_tag_invalidate(CPUArchState *env, target_ulong vaddr, int32_t size, */ void cheri_tag_invalidate_aligned(CPUArchState *env, target_ulong vaddr, uintptr_t pc, int mmu_idx); +/** + * If probe_read() has already been called, the result can be passed as the + * @p host_addr argument to avoid another (expensive) probe_read() call. + */ bool cheri_tag_get(CPUArchState *env, target_ulong vaddr, int reg, - hwaddr *ret_paddr, int *prot, uintptr_t pc, int mmu_idx); + hwaddr *ret_paddr, int *prot, uintptr_t pc, int mmu_idx, + void *host_addr); /* * Get/set many currently don't have an mmu_idx because no targets currently * require it. diff --git a/target/cheri-common/op_helper_cheri_common.c b/target/cheri-common/op_helper_cheri_common.c index d84fa4c7beb..d2e81ff953a 100644 --- a/target/cheri-common/op_helper_cheri_common.c +++ b/target/cheri-common/op_helper_cheri_common.c @@ -1276,7 +1276,8 @@ bool load_cap_from_memory_raw_tag_mmu_idx( *cursor = cpu_ld_cap_word_ra(env, vaddr + CHERI_MEM_OFFSET_CURSOR, retpc); } int prot; - bool tag = cheri_tag_get(env, vaddr, cb, physaddr, &prot, retpc, mmu_idx); + bool tag = + cheri_tag_get(env, vaddr, cb, physaddr, &prot, retpc, mmu_idx, host); if (raw_tag) { *raw_tag = tag; }