Skip to content

Commit

Permalink
implement a SIMD version of drawFillOver
Browse files Browse the repository at this point in the history
  • Loading branch information
aarzilli committed Aug 26, 2018
1 parent 1d42e34 commit 14e8ef6
Show file tree
Hide file tree
Showing 6 changed files with 336 additions and 18 deletions.
96 changes: 79 additions & 17 deletions context.go
Expand Up @@ -283,10 +283,10 @@ func (ctx *context) restackClick(w *Window) bool {
}

var cnt = 0
var ln, frect, brrect, frrect, ftri, circ, fcirc, txt int
var ln, frect, frectover, brrect, frrect, ftri, circ, fcirc, txt int

func (ctx *context) Draw(wimg *image.RGBA) int {
var txttim, tritim, brecttim, frecttim, frrecttim time.Duration
var txttim, tritim, brecttim, frecttim, frectovertim, frrecttim time.Duration
var t0 time.Time

img := wimg
Expand Down Expand Up @@ -331,11 +331,11 @@ func (ctx *context) Draw(wimg *image.RGBA) int {
if cmd.Begin.X == cmd.End.X {
// draw vertical line
r := image.Rect(cmd.Begin.X-h1, cmd.Begin.Y, cmd.Begin.X+h2, cmd.End.Y)
draw.Draw(img, r, colimg, r.Min, op)
drawFill(img, r, colimg, r.Min, op)
} else if cmd.Begin.Y == cmd.End.Y {
// draw horizontal line
r := image.Rect(cmd.Begin.X, cmd.Begin.Y-h1, cmd.End.X, cmd.Begin.Y+h2)
draw.Draw(img, r, colimg, r.Min, op)
drawFill(img, r, colimg, r.Min, op)
} else {
if rasterizer == nil {
setupRasterizer()
Expand Down Expand Up @@ -399,8 +399,8 @@ func (ctx *context) Draw(wimg *image.RGBA) int {
top := image.Rect(body.Min.X, body.Min.Y, body.Max.X, body.Min.Y+border)
bot := image.Rect(body.Min.X, body.Max.Y-border, body.Max.X, body.Max.Y)

draw.Draw(img, top, colimg, top.Min, op)
draw.Draw(img, bot, colimg, bot.Min, op)
drawFill(img, top, colimg, top.Min, op)
drawFill(img, bot, colimg, bot.Min, op)

if border < int(cmd.Rounding) {
// wings need shrinking
Expand All @@ -414,23 +414,27 @@ func (ctx *context) Draw(wimg *image.RGBA) int {
xlwing := image.Rect(top.Min.X, top.Max.Y, top.Min.X+d, bot.Min.Y)
xrwing := image.Rect(top.Max.X-d, top.Max.Y, top.Max.X, bot.Min.Y)

draw.Draw(img, xlwing, colimg, xlwing.Min, op)
draw.Draw(img, xrwing, colimg, xrwing.Min, op)
drawFill(img, xlwing, colimg, xlwing.Min, op)
drawFill(img, xrwing, colimg, xrwing.Min, op)
}

brrect++
} else {
draw.Draw(img, body, colimg, body.Min, op)
drawFill(img, body, colimg, body.Min, op)
if cmd.Rounding == 0 {
frect++
if op == draw.Src {
frect++
} else {
frectover++
}
} else {
frrect++
}
}

if rounding {
draw.Draw(img, lwing, colimg, lwing.Min, op)
draw.Draw(img, rwing, colimg, rwing.Min, op)
drawFill(img, lwing, colimg, lwing.Min, op)
drawFill(img, rwing, colimg, rwing.Min, op)

rangle := math.Pi / 2

Expand All @@ -454,7 +458,15 @@ func (ctx *context) Draw(wimg *image.RGBA) int {
if cmd.Rounding > 0 {
frrecttim += time.Now().Sub(t0)
} else {
frecttim += time.Now().Sub(t0)
d := time.Now().Sub(t0)
if op == draw.Src {
frecttim += d
} else {
if d > 8*time.Millisecond {
fmt.Printf("outstanding rect")
}
frectovertim += d
}
}
}
}
Expand Down Expand Up @@ -529,13 +541,13 @@ func (ctx *context) Draw(wimg *image.RGBA) int {
}

if perfUpdate {
fmt.Printf("triangle: %0.4fms text: %0.4fms brect: %0.4fms frect: %0.4fms frrect %0.4f\n", tritim.Seconds()*1000, txttim.Seconds()*1000, brecttim.Seconds()*1000, frecttim.Seconds()*1000, frrecttim.Seconds()*1000)
fmt.Printf("triangle: %0.4fms text: %0.4fms brect: %0.4fms frect: %0.4fms frectover: %0.4fms frrect %0.4f\n", tritim.Seconds()*1000, txttim.Seconds()*1000, brecttim.Seconds()*1000, frecttim.Seconds()*1000, frectovertim.Seconds()*1000, frrecttim.Seconds()*1000)
}

cnt++
if perfUpdate && (cnt%100) == 0 {
fmt.Printf("ln %d, frect %d, frrect %d, brrect %d, ftri %d, circ %d, fcirc %d, txt %d\n", ln, frect, frrect, brrect, ftri, circ, fcirc, txt)
ln, frect, frrect, brrect, ftri, circ, fcirc, txt = 0, 0, 0, 0, 0, 0, 0, 0
if perfUpdate /*&& (cnt%100) == 0*/ {
fmt.Printf("ln %d, frect %d, frectover %d, frrect %d, brrect %d, ftri %d, circ %d, fcirc %d, txt %d\n", ln, frect, frectover, frrect, brrect, ftri, circ, fcirc, txt)
ln, frect, frectover, frrect, brrect, ftri, circ, fcirc, txt = 0, 0, 0, 0, 0, 0, 0, 0, 0
}

return len(ctx.cmds)
Expand Down Expand Up @@ -942,3 +954,53 @@ func percentages(bounds rect.Rect, f float64) (r [4]rect.Rect) {
r[3].W = pw
return
}

// clip restricts the destination rectangle *r to the area that can actually
// be drawn: the part lying inside dst's bounds and inside src's bounds once
// src is positioned so that *sp lines up with the original r.Min.
//
// *sp is moved by the same amount r.Min moved, so it keeps naming the source
// point that corresponds to the (possibly new) r.Min. Both r and sp are
// updated in place.
func clip(dst *image.RGBA, r *image.Rectangle, src image.Image, sp *image.Point) {
	start := r.Min

	// src's bounds expressed in destination coordinates.
	srcInDst := src.Bounds().Add(start.Sub(*sp))

	*r = r.Intersect(dst.Bounds()).Intersect(srcInDst)

	// Shift the source point by however far the top-left corner moved
	// (a zero shift leaves it untouched).
	*sp = sp.Add(r.Min.Sub(start))
}

// drawFill paints the rectangle r of dst with the uniform color src using
// the compositing operator op. It is a specialised replacement for
// draw.Draw when the source is an *image.Uniform.
//
// r is first clipped against dst and src (sp is the source point matching
// r.Min). draw.Over and draw.Src are dispatched to drawFillOver and
// drawFillSrc; any other operator falls back to draw.Draw.
func drawFill(dst *image.RGBA, r image.Rectangle, src *image.Uniform, sp image.Point, op draw.Op) {
	clip(dst, &r, src, &sp)
	if r.Empty() {
		// Nothing is visible. The fill helpers compute pixel offsets from
		// r.Min and index dst.Pix directly, so they must not be handed an
		// empty (zero) rectangle: for a dst whose origin is not (0, 0) the
		// offset would be negative, and an empty dst has no Pix[0].
		return
	}
	sr, sg, sb, sa := src.RGBA()
	switch op {
	case draw.Over:
		drawFillOver(dst, r, sr, sg, sb, sa)
	case draw.Src:
		drawFillSrc(dst, r, sr, sg, sb, sa)
	default:
		draw.Draw(dst, r, src, sp, op)
	}
}

// drawFillSrc overwrites every pixel of r in dst with the color given by the
// 16-bit-per-channel components sr, sg, sb, sa (the high byte of each is
// stored). The destination's previous contents are ignored.
//
// r is expected to be non-empty and to lie inside dst's bounds; the pixel
// buffer is indexed directly without further clipping.
func drawFillSrc(dst *image.RGBA, r image.Rectangle, sr, sg, sb, sa uint32) {
	r8, g8, b8, a8 := uint8(sr>>8), uint8(sg>>8), uint8(sb>>8), uint8(sa>>8)

	start := dst.PixOffset(r.Min.X, r.Min.Y)
	end := start + 4*r.Dx()

	// Paint the first row one pixel at a time...
	for p := start; p < end; p += 4 {
		dst.Pix[p], dst.Pix[p+1], dst.Pix[p+2], dst.Pix[p+3] = r8, g8, b8, a8
	}

	// ...then replicate it into every following row. The built-in copy is
	// faster than a plain loop but needs a slice as its source, which the
	// already painted first row provides.
	painted := dst.Pix[start:end]
	for remaining := r.Dy() - 1; remaining > 0; remaining-- {
		start += dst.Stride
		end += dst.Stride
		copy(dst.Pix[start:end], painted)
	}
}
14 changes: 14 additions & 0 deletions drawfillover_amd64.go
@@ -0,0 +1,14 @@
package nucular

import "image"

// drawFillOver_SIMD_internal is implemented in assembly
// (drawfillover_amd64.s).
//
// base points at the first byte of the pixel buffer. For each of n rows it
// walks the byte offsets [i0, i1) in steps of 4 (one RGBA pixel), replaces
// each component c with byte 1 of c*adivm + the matching source component
// (sr, sg, sb, sa), and then advances i0 and i1 by stride for the next row.
func drawFillOver_SIMD_internal(base *uint8, i0, i1 int, stride, n int, adivm, sr, sg, sb, sa uint32)

// drawFillOver blends the uniform color (sr, sg, sb, sa), given as
// alpha-premultiplied 16-bit components, over the rectangle r of dst using
// the assembly routine drawFillOver_SIMD_internal.
//
// r must lie inside dst's bounds; an empty r is a no-op.
//
// NOTE(review): the scale factor a/m is truncated to an integer here, before
// it is multiplied with each destination component, whereas the portable
// implementation computes component*a/m. Results may therefore differ
// slightly between the two; confirm this is acceptable.
//
// NOTE(review): the assembly uses VEX-encoded (V-prefixed) instructions and
// this wrapper performs no CPU feature check; confirm all target machines
// support them.
func drawFillOver(dst *image.RGBA, r image.Rectangle, sr, sg, sb, sa uint32) {
	if r.Empty() {
		// Nothing to blend. Returning here also avoids taking the address
		// of dst.Pix[0] when the pixel buffer is empty.
		return
	}
	const m = 1<<16 - 1
	a := (m - sa) * 0x101
	adivm := a / m
	i0 := dst.PixOffset(r.Min.X, r.Min.Y)
	i1 := i0 + r.Dx()*4
	drawFillOver_SIMD_internal(&dst.Pix[0], i0, i1, dst.Stride, r.Dy(), adivm, sr, sg, sb, sa)
}
77 changes: 77 additions & 0 deletions drawfillover_amd64.s
@@ -0,0 +1,77 @@
#include "textflag.h"

// Byte-shuffle control used with VPSHUFB in drawFillOver_SIMD_internal.
// Its four bytes are the source indices 1, 5, 9 and 13, i.e. byte 1 of each
// of the four 32-bit lanes, so the shuffle both performs the ">> 8" and packs
// the four results into the low 32 bits of the register.
GLOBL drawFillOver_SIMD_shufflemap<>(SB), (NOPTR+RODATA), $4
DATA drawFillOver_SIMD_shufflemap<>+0x00(SB)/4, $0x0d090501

// func drawFillOver_SIMD_internal(base *uint8, i0, i1 int, stride, n int, adivm, sr, sg, sb, sa uint32)
//
// Blends a uniform source color over n rows of RGBA pixels. For every pixel
// at byte offsets [i0, i1) from base, each component c is replaced by
// byte 1 of (c*adivm + source component); i0 and i1 then advance by stride
// to address the next row.
TEXT ·drawFillOver_SIMD_internal(SB),0,$0-60
	// Argument layout:
	//   base+0(FP)
	//   i0+8(FP)
	//   i1+16(FP)
	//   stride+24(FP)
	//   n+32(FP)
	//   adivm+40(FP)
	//   sr+44(FP)
	//   sg+48(FP)
	//   sb+52(FP)
	//   sa+56(FP)

	// Register usage:
	//   DX  row index
	//   CX  column index (byte offset of the current pixel from base)
	//   AX  pointer to current pixel
	//   R14 i0 (offset of the first pixel of the current row)
	//   R15 i1 (offset one past the last pixel of the current row)
	//
	//   X0  zeroed register, used to widen bytes with zeros
	//   X1  current pixel
	//   X2  adivm replicated into all four 32-bit lanes
	//   X3  source pixel (sr, sg, sb, sa as four 32-bit lanes)
	//   X4  shuffle map to do the >> 8 and pack everything back into a single 32bit value

	MOVSS drawFillOver_SIMD_shufflemap<>(SB), X4

	PXOR X0, X0
	MOVQ i0+8(FP), R14
	MOVQ i1+16(FP), R15

	// load adivm to X2, fill all uint32s with it
	MOVSS adivm+40(FP), X2
	VBROADCASTSS X2, X2

	// load source pixel to X3: sr, sg, sb, sa are four consecutive uint32
	// arguments, so a single 16-byte load picks up all of them
	VMOVDQU sr+44(FP), X3

	MOVQ $0, DX
row_loop:
	CMPQ DX, n+32(FP)
	JGE row_loop_end

	// start of row: CX = i0, AX = base + i0
	MOVQ R14, CX
	MOVQ base+0(FP), AX
	LEAQ (AX)(CX*1), AX
column_loop:
	CMPQ CX, R15
	JGE column_loop_end

	// load current pixel to X1, unpack twice to get uint32s
	MOVSS (AX), X1
	PUNPCKLBW X0, X1
	VPUNPCKLWD X0, X1, X1

	VPMULLD X2, X1, X1 // component * a/m
	VPADDD X3, X1, X1 // (component * a/m) + source_component

	VPSHUFB X4, X1, X1 // get the second byte of every 32bit word and pack it into the lowest word of X1
	MOVSS X1, (AX) // write back to memory

	// advance to the next pixel (4 bytes)
	ADDQ $4, CX
	ADDQ $4, AX
	JMP column_loop

column_loop_end:
	// advance both row bounds to the next row
	ADDQ stride+24(FP), R14
	ADDQ stride+24(FP), R15
	INCQ DX
	JMP row_loop

row_loop_end:

	RET
31 changes: 31 additions & 0 deletions drawfillover_other.go
@@ -0,0 +1,31 @@
// +build !amd64

package nucular

import (
"image"
)

// drawFillOver blends the uniform color (sr, sg, sb, sa), given as
// alpha-premultiplied 16-bit components, over the rectangle r of dst
// (the draw.Over operator with a uniform source). This is the portable
// implementation used on architectures without an assembly version.
//
// r must lie inside dst's bounds; the pixel buffer is indexed directly
// without further clipping.
func drawFillOver(dst *image.RGBA, r image.Rectangle, sr, sg, sb, sa uint32) {
	const m = 1<<16 - 1
	// a is the fraction of the destination that stays visible, scaled so
	// that an 8-bit destination component times a/m yields a 16-bit value.
	// The 0x101 is here for the same reason as in drawRGBA.
	a := (m - sa) * 0x101
	i0 := dst.PixOffset(r.Min.X, r.Min.Y)
	i1 := i0 + r.Dx()*4
	for y := r.Min.Y; y < r.Max.Y; y++ {
		for i := i0; i < i1; i += 4 {
			dr := &dst.Pix[i+0]
			dg := &dst.Pix[i+1]
			db := &dst.Pix[i+2]
			da := &dst.Pix[i+3]

			// 16-bit result = dst*(1-alpha) + src; keep its high byte.
			*dr = uint8((uint32(*dr)*a/m + sr) >> 8)
			*dg = uint8((uint32(*dg)*a/m + sg) >> 8)
			*db = uint8((uint32(*db)*a/m + sb) >> 8)
			*da = uint8((uint32(*da)*a/m + sa) >> 8)
		}
		// Move both row bounds down one scanline.
		i0 += dst.Stride
		i1 += dst.Stride
	}
}

0 comments on commit 14e8ef6

Please sign in to comment.