Skip to content

Commit

Permalink
Avoid calling probe_read() twice in load_cap_from_memory_raw_tag_mmu_…
Browse files Browse the repository at this point in the history
…idx()

probe_read() is a very hot function when booting CheriBSD (6.42% of the
total time for a purecap MFS_ROOT /sbin/startup-benchmark.sh boot).
Looking at the perf report, we were calling probe_read() once inside
load_cap_from_memory_raw_tag_mmu_idx() and then calling it again in
cheri_tag_get(), even though we know that the second call cannot trigger
an MMU fault. Pass the host address as an argument to cheri_tag_get() and
skip the probe_read() call if the passed argument is non-NULL.

With this change, probe_read() is down to 3.69% of the total time.
```
hyperfine -L qemu qemu-system-riscv64cheri,qemu-system-riscv64cheri.dev-post-morello  '/local/scratch/alr48/cheri/output/sdk/bin/{qemu} -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh -device virtio-rng-pci' -m 10
Benchmark CTSRD-CHERI#1: /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh -device virtio-rng-pci
  Time (mean ± σ):      8.005 s ±  0.461 s    [User: 7.585 s, System: 0.151 s]
  Range (min … max):    7.741 s …  8.908 s    10 runs

  Warning: Statistical outliers were detected. Consider re-running this benchmark on a quiet PC without any interferences from other programs. It might help to use the '--warmup' or '--prepare' options.

Benchmark CTSRD-CHERI#2: /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.dev-post-morello -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh -device virtio-rng-pci
  Time (mean ± σ):      8.599 s ±  0.530 s    [User: 7.816 s, System: 0.122 s]
  Range (min … max):    7.821 s …  8.989 s    10 runs

Summary
  '/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh -device virtio-rng-pci' ran
    1.07 ± 0.09 times faster than '/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.dev-post-morello -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh -device virtio-rng-pci'
```

```
perf stat -r 5 /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh -device virtio-rng-pci > /dev/null

 Performance counter stats for '/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh -device virtio-rng-pci' (5 runs):

       8177.658418      task-clock (msec)         #    0.936 CPUs utilized            ( +-  0.71% )
             2,398      context-switches          #    0.293 K/sec                    ( +-  1.64% )
                 0      cpu-migrations            #    0.000 K/sec                    ( +-100.00% )
            12,858      page-faults               #    0.002 M/sec                    ( +-  0.59% )
    27,261,165,519      cycles                    #    3.334 GHz                      ( +-  0.17% )
    75,402,569,943      instructions              #    2.77  insn per cycle           ( +-  0.16% )
    11,183,417,524      branches                  # 1367.557 M/sec                    ( +-  0.15% )
       162,719,113      branch-misses             #    1.46% of all branches          ( +-  0.20% )

       8.738376962 seconds time elapsed                                          ( +-  3.03% )
```

Before this change:
```
 Performance counter stats for '/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.dev-post-morello -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh -device virtio-rng-pci' (5 runs):

       8429.485885      task-clock (msec)         #    0.911 CPUs utilized            ( +-  0.35% )
             2,508      context-switches          #    0.298 K/sec                    ( +-  0.61% )
                 1      cpu-migrations            #    0.000 K/sec                    ( +- 66.67% )
            12,867      page-faults               #    0.002 M/sec                    ( +-  0.79% )
    27,831,470,600      cycles                    #    3.302 GHz                      ( +-  0.08% )
    77,801,651,546      instructions              #    2.80  insn per cycle           ( +-  0.01% )
    11,223,140,290      branches                  # 1331.415 M/sec                    ( +-  0.01% )
       164,672,766      branch-misses             #    1.47% of all branches          ( +-  0.08% )

       9.256983251 seconds time elapsed                                          ( +-  0.23% )
```
  • Loading branch information
arichardson committed Oct 13, 2021
1 parent bfbf8bb commit bf5308c
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 4 deletions.
7 changes: 5 additions & 2 deletions target/cheri-common/cheri_tagmem.c
Original file line number Diff line number Diff line change
Expand Up @@ -553,10 +553,13 @@ void cheri_tag_set(CPUArchState *env, target_ulong vaddr, int reg,
}

bool cheri_tag_get(CPUArchState *env, target_ulong vaddr, int reg,
hwaddr *ret_paddr, int *prot, uintptr_t pc, int mmu_idx)
hwaddr *ret_paddr, int *prot, uintptr_t pc, int mmu_idx,
void *host_addr)
{

void *host_addr = probe_read(env, vaddr, 1, mmu_idx, pc);
if (host_addr == NULL) {
host_addr = probe_read(env, vaddr, 1, mmu_idx, pc);
}
handle_paddr_return(read);

uintptr_t tagmem_flags;
Expand Down
7 changes: 6 additions & 1 deletion target/cheri-common/cheri_tagmem.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,13 @@ void cheri_tag_invalidate(CPUArchState *env, target_ulong vaddr, int32_t size,
*/
void cheri_tag_invalidate_aligned(CPUArchState *env, target_ulong vaddr,
uintptr_t pc, int mmu_idx);
/**
* If probe_read() has already been called, the result can be passed as the
* @p host_addr argument to avoid another (expensive) probe_read() call.
*/
bool cheri_tag_get(CPUArchState *env, target_ulong vaddr, int reg,
hwaddr *ret_paddr, int *prot, uintptr_t pc, int mmu_idx);
hwaddr *ret_paddr, int *prot, uintptr_t pc, int mmu_idx,
void *host_addr);
/*
* Get/set many currently don't have an mmu_idx because no targets currently
* require it.
Expand Down
3 changes: 2 additions & 1 deletion target/cheri-common/op_helper_cheri_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -1276,7 +1276,8 @@ bool load_cap_from_memory_raw_tag_mmu_idx(
*cursor = cpu_ld_cap_word_ra(env, vaddr + CHERI_MEM_OFFSET_CURSOR, retpc);
}
int prot;
bool tag = cheri_tag_get(env, vaddr, cb, physaddr, &prot, retpc, mmu_idx);
bool tag =
cheri_tag_get(env, vaddr, cb, physaddr, &prot, retpc, mmu_idx, host);
if (raw_tag) {
*raw_tag = tag;
}
Expand Down

0 comments on commit bf5308c

Please sign in to comment.