Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion cranelift/codegen/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ use crate::{timing, CompileError};
use alloc::string::String;
use alloc::vec::Vec;
use cranelift_control::ControlPlane;
use target_lexicon::Architecture;

#[cfg(feature = "souper-harvest")]
use crate::souper_harvest::do_souper_harvest;
Expand Down Expand Up @@ -282,7 +283,15 @@ impl Context {

/// Perform NaN canonicalizing rewrites on the function.
///
/// Rewrites every FP arithmetic result so that any NaN it produces is
/// replaced with the canonical quiet-NaN bit pattern. The pass emits a
/// vectorized select sequence when the target has SIMD support, so we
/// probe the ISA first and tell the pass which lowering to use.
pub fn canonicalize_nans(&mut self, isa: &dyn TargetIsa) -> CodegenResult<()> {
    // Currently RiscV64 is the only arch that may lack vector support:
    // its SIMD lowering is gated behind the `has_v` ISA flag. Every other
    // backend is assumed to handle the vector select sequence.
    let has_vector_support = match isa.triple().architecture {
        Architecture::Riscv64(_) => isa
            .isa_flags()
            .iter()
            .find(|f| f.name == "has_v")
            .and_then(|f| f.as_bool())
            .unwrap_or(false),
        _ => true,
    };
    do_nan_canonicalization(&mut self.func, has_vector_support);
    // Re-run the verifier (when enabled) since we just rewrote the IR.
    self.verify_if(isa)
}

Expand Down
1 change: 1 addition & 0 deletions cranelift/codegen/src/isa/x64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -1428,6 +1428,7 @@
(decl pure partial all_ones_or_all_zeros (Value) bool)
(rule (all_ones_or_all_zeros (and (icmp _ _ _) (value_type (multi_lane _ _)))) $true)
(rule (all_ones_or_all_zeros (and (fcmp _ _ _) (value_type (multi_lane _ _)))) $true)
(rule (all_ones_or_all_zeros (and (bitcast _ (fcmp _ _ _)) (value_type (multi_lane _ _)))) $true)
Copy link
Contributor Author

@adambratschikaye adambratschikaye Apr 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The original pattern was never triggered when doing NaN-canonicalization because fcmp will result in either an I32X4 or I64X2 which always needs to be bitcast back to F32X4 or F64X2 before it can be passed to bitselect.

(rule (all_ones_or_all_zeros (vconst (vconst_all_ones_or_all_zeros))) $true)

(decl pure vconst_all_ones_or_all_zeros () Constant)
Expand Down
38 changes: 29 additions & 9 deletions cranelift/codegen/src/nan_canonicalization.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
use crate::cursor::{Cursor, FuncCursor};
use crate::ir::condcodes::FloatCC;
use crate::ir::immediates::{Ieee32, Ieee64};
use crate::ir::types;
use crate::ir::types::{self};
use crate::ir::{Function, Inst, InstBuilder, InstructionData, Opcode, Value};
use crate::opts::MemFlags;
use crate::timing;
Expand All @@ -15,13 +15,13 @@ static CANON_32BIT_NAN: u32 = 0b01111111110000000000000000000000;
static CANON_64BIT_NAN: u64 = 0b0111111111111000000000000000000000000000000000000000000000000000;

/// Perform the NaN canonicalization pass.
pub fn do_nan_canonicalization(func: &mut Function) {
pub fn do_nan_canonicalization(func: &mut Function, has_vector_support: bool) {
let _tt = timing::canonicalize_nans();
let mut pos = FuncCursor::new(func);
while let Some(_block) = pos.next_block() {
while let Some(inst) = pos.next_inst() {
if is_fp_arith(&mut pos, inst) {
add_nan_canon_seq(&mut pos, inst);
add_nan_canon_seq(&mut pos, inst, has_vector_support);
}
}
}
Expand Down Expand Up @@ -57,24 +57,36 @@ fn is_fp_arith(pos: &mut FuncCursor, inst: Inst) -> bool {
}

/// Append a sequence of canonicalizing instructions after the given instruction.
fn add_nan_canon_seq(pos: &mut FuncCursor, inst: Inst) {
fn add_nan_canon_seq(pos: &mut FuncCursor, inst: Inst, has_vector_support: bool) {
// Select the instruction result, result type. Replace the instruction
// result and step forward before inserting the canonicalization sequence.
let val = pos.func.dfg.first_result(inst);
let val_type = pos.func.dfg.value_type(val);
let new_res = pos.func.dfg.replace_result(val, val_type);
let _next_inst = pos.next_inst().expect("block missing terminator!");

// Insert a comparison instruction, to check if `inst_res` is NaN. Select
// the canonical NaN value if `val` is NaN, assign the result to `inst`.
let is_nan = pos.ins().fcmp(FloatCC::NotEqual, new_res, new_res);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Without any of the other changes, just changing this comparison from NotEqual to Unordered removes one of the two jumps which is a significant improvement. With the other changes I don't think there's a difference between using NotEqual or Unordered, but Unordered seemed more precise.

// Insert a comparison instruction, to check if `inst_res` is NaN (comparing
// against NaN is always unordered). Select the canonical NaN value if `val`
// is NaN, assign the result to `inst`.
let comparison = FloatCC::Unordered;

let vectorized_scalar_select = |pos: &mut FuncCursor, canon_nan: Value, ty: types::Type| {
let canon_nan = pos.ins().scalar_to_vector(ty, canon_nan);
let new_res = pos.ins().scalar_to_vector(ty, new_res);
let is_nan = pos.ins().fcmp(comparison, new_res, new_res);
let is_nan = pos.ins().bitcast(ty, MemFlags::new(), is_nan);
let simd_result = pos.ins().bitselect(is_nan, canon_nan, new_res);
pos.ins().with_result(val).extractlane(simd_result, 0);
};
let scalar_select = |pos: &mut FuncCursor, canon_nan: Value| {
let is_nan = pos.ins().fcmp(comparison, new_res, new_res);
pos.ins()
.with_result(val)
.select(is_nan, canon_nan, new_res);
};

let vector_select = |pos: &mut FuncCursor, canon_nan: Value| {
let is_nan = pos.ins().fcmp(comparison, new_res, new_res);
let is_nan = pos.ins().bitcast(val_type, MemFlags::new(), is_nan);
pos.ins()
.with_result(val)
Expand All @@ -84,11 +96,19 @@ fn add_nan_canon_seq(pos: &mut FuncCursor, inst: Inst) {
match val_type {
types::F32 => {
let canon_nan = pos.ins().f32const(Ieee32::with_bits(CANON_32BIT_NAN));
scalar_select(pos, canon_nan);
if has_vector_support {
vectorized_scalar_select(pos, canon_nan, types::F32X4);
} else {
scalar_select(pos, canon_nan);
}
}
types::F64 => {
let canon_nan = pos.ins().f64const(Ieee64::with_bits(CANON_64BIT_NAN));
scalar_select(pos, canon_nan);
if has_vector_support {
vectorized_scalar_select(pos, canon_nan, types::F64X2);
} else {
scalar_select(pos, canon_nan);
}
}
types::F32X4 => {
let canon_nan = pos.ins().f32const(Ieee32::with_bits(CANON_32BIT_NAN));
Expand Down
140 changes: 140 additions & 0 deletions cranelift/filetests/filetests/isa/x64/nan-canonicalization.clif
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
test compile precise-output
set enable_nan_canonicalization=true
target x86_64 sse41

function %f0(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):
v2 = fadd v0, v1
return v2
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; addps %xmm0, %xmm1, %xmm0
; movl $2143289344, %r10d
; movd %r10d, %xmm7
; shufps $0, %xmm7, const(0), %xmm7
; movdqa %xmm0, %xmm1
; cmpps $3, %xmm1, %xmm0, %xmm1
; movdqa %xmm0, %xmm2
; movdqa %xmm1, %xmm0
; movdqa %xmm2, %xmm1
; pblendvb %xmm1, %xmm7, %xmm1
; movdqa %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; addps %xmm1, %xmm0
; movl $0x7fc00000, %r10d
; movd %r10d, %xmm7
; shufps $0, 0x26(%rip), %xmm7
; movdqa %xmm0, %xmm1
; cmpunordps %xmm0, %xmm1
; movdqa %xmm0, %xmm2
; movdqa %xmm1, %xmm0
; movdqa %xmm2, %xmm1
; pblendvb %xmm0, %xmm7, %xmm1
; movdqa %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
; sarb $0, (%rdi)
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)

function %f1(f64, f64) -> f64 {
block0(v0: f64, v1: f64):
v2 = fadd v0, v1
return v2
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; addsd %xmm0, %xmm1, %xmm0
; movabsq $9221120237041090560, %r9
; movq %r9, %xmm1
; movdqa %xmm0, %xmm7
; cmppd $3, %xmm7, %xmm0, %xmm7
; movdqa %xmm0, %xmm5
; movdqa %xmm7, %xmm0
; pblendvb %xmm5, %xmm1, %xmm5
; movdqa %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; addsd %xmm1, %xmm0
; movabsq $0x7ff8000000000000, %r9
; movq %r9, %xmm1
; movdqa %xmm0, %xmm7
; cmpunordpd %xmm0, %xmm7
; movdqa %xmm0, %xmm5
; movdqa %xmm7, %xmm0
; pblendvb %xmm0, %xmm1, %xmm5
; movdqa %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %f1(f32, f32) -> f32 {
block0(v0: f32, v1: f32):
v2 = fadd v0, v1
return v2
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; addss %xmm0, %xmm1, %xmm0
; movl $2143289344, %r9d
; movd %r9d, %xmm1
; movdqa %xmm0, %xmm7
; cmpps $3, %xmm7, %xmm0, %xmm7
; movdqa %xmm0, %xmm5
; movdqa %xmm7, %xmm0
; pblendvb %xmm5, %xmm1, %xmm5
; movdqa %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; addss %xmm1, %xmm0
; movl $0x7fc00000, %r9d
; movd %r9d, %xmm1
; movdqa %xmm0, %xmm7
; cmpunordps %xmm0, %xmm7
; movdqa %xmm0, %xmm5
; movdqa %xmm7, %xmm0
; pblendvb %xmm0, %xmm1, %xmm5
; movdqa %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

Loading