In [2]:
from halide import *
import numpy as np
import math

In [3]:
# Pure Halide variables shared by every pipeline definition in this notebook.
x = Var("x")
y = Var("y")

In [3]:
# Producer/consumer pipeline with no explicit scheduling calls.
print("=" * 50)
producer = Func("producer_default")
consumer = Func("consumer_default")
producer[x, y] = sqrt(x * y)
# Each consumer pixel sums a 2x2 window of the producer.
consumer[x, y] = (
    producer[x, y]
    + producer[x, y + 1]
    + producer[x + 1, y]
    + producer[x + 1, y + 1]
)

# Request store tracing on both stages before realizing.
consumer.trace_stores()
producer.trace_stores()

print("\nEvaluating producer-consumer pipeline with default schedule")
consumer.realize(4,4)

# Hand-written reference in plain Python: every output pixel recomputes its
# four sqrt terms directly, with no intermediate producer buffer.
result = np.empty((4, 4), dtype=np.float32)
for yy in range(4):
    for xx in range(4):
        here = math.sqrt(xx * yy)
        below = math.sqrt(xx * (yy + 1))
        right = math.sqrt((xx + 1) * yy)
        below_right = math.sqrt((xx + 1) * (yy + 1))
        result[yy, xx] = here + below + right + below_right
print()

print("Pseudo-code for the schedule:")
consumer.print_loop_nest()
print()


Evaluating producer-consumer pipeline with default schedule

Pseudo-code for the schedule:



In [4]:
# Same pipeline, with the producer scheduled via compute_root().
print("=" * 50)
producer = Func("producer_root")
consumer = Func("consumer_root")
producer[x, y] = sqrt(x * y)
consumer[x, y] = (
    producer[x, y]
    + producer[x, y + 1]
    + producer[x + 1, y]
    + producer[x + 1, y + 1]
)

producer.compute_root()

consumer.trace_stores()
producer.trace_stores()

print("\nEvaluating producer.compute_root()")
consumer.realize(4,4)

# Equivalent C (written here as plain Python):
# first fill a full 5x5 producer buffer (the 4x4 output reads one extra
# row and column), then compute every consumer pixel from that buffer.
producer_storage = np.empty((5, 5), dtype=np.float32)
for yy in range(5):
    for xx in range(5):
        producer_storage[yy, xx] = math.sqrt(xx * yy)

result = np.empty((4, 4), dtype=np.float32)
for yy in range(4):
    for xx in range(4):
        result[yy, xx] = (
            producer_storage[yy, xx]
            + producer_storage[yy + 1, xx]
            + producer_storage[yy, xx + 1]
            + producer_storage[yy + 1, xx + 1]
        )

print("Pseudo-code for the schedule:")
consumer.print_loop_nest()
print()


Evaluating producer.compute_root()
Pseudo-code for the schedule:



In [5]:
# Same pipeline, with the producer scheduled via compute_at(consumer, y).
print("=" * 50)
producer = Func("producer_y")
consumer = Func("consumer_y")
producer[x, y] = sqrt(x * y)
consumer[x, y] = (
    producer[x, y]
    + producer[x, y + 1]
    + producer[x + 1, y]
    + producer[x + 1, y + 1]
)

producer.compute_at(consumer, y)

producer.trace_stores()
consumer.trace_stores()

print("\nEvaluating producer.compute_at(consumer,y)")
consumer.realize(4,4)

# Plain-Python equivalent: for every consumer scanline, allocate a fresh
# two-row producer buffer, fill rows yy and yy+1, then consume them.
result = np.empty((4, 4), dtype=np.float32)

for yy in range(4):
    producer_storage = np.empty((2, 5), dtype=np.float32)
    for row_offset in range(2):
        py = yy + row_offset
        for px in range(5):
            producer_storage[row_offset, px] = math.sqrt(px * py)

    for xx in range(4):
        result[yy, xx] = (
            producer_storage[0, xx]
            + producer_storage[1, xx]
            + producer_storage[0, xx + 1]
            + producer_storage[1, xx + 1]
        )

print("Pseudo-code for the schedule:")
consumer.print_loop_nest()
print()


Evaluating producer.compute_at(consumer,y)
Pseudo-code for the schedule:



In [6]:
# Same pipeline, with the producer scheduled via store_root() plus
# compute_at(consumer, y).
print("=" * 50)
# The producer name previously contained a stray comma
# ("producer_store,root_compute_y"); it now matches the consumer's naming.
producer, consumer = Func("producer_store_root_compute_y"), Func("consumer_store_root_compute_y")
producer[x,y] = sqrt(x * y)
consumer[x,y] = producer[x,y] + producer[x,y+1] + producer[x+1,y] + producer[x+1,y+1]

producer.store_root()
producer.compute_at(consumer, y)

producer.trace_stores()
consumer.trace_stores()

print("\nEvaluating producer.store_root().compute_at(consumer,y)")
consumer.realize(4,4)

# Plain-Python equivalent.
# Allocate the output in this cell so it does not depend on a `result`
# array left in the kernel by an earlier cell.
result = np.empty((4,4), dtype=np.float32)

# A single two-row buffer allocated once, outside the loops; rows are
# addressed by parity (py & 1) so consecutive scanlines alternate slots.
producer_storage = np.empty((2,5), dtype=np.float32)

for yy in range(4):
    for py in range(yy, yy + 2):
        # Row yy was already written as row (yy-1)+1 on the previous
        # iteration, so only the first scanline computes both rows.
        if yy > 0 and py == yy:
            continue

        for px in range(5):
            producer_storage[py & 1][px] = math.sqrt(px * py)
    for xx in range(4):
        result[yy][xx] = producer_storage[yy&1][xx] + producer_storage[(yy+1)&1][xx] + producer_storage[yy&1][xx+1] + producer_storage[(yy+1)&1][xx+1]

print("Pseudo-code for the schedule:")
consumer.print_loop_nest()
print()


Evaluating producer.store_root().compute_at(consumer,y)
Pseudo-code for the schedule:



In [7]:
# Same pipeline, with the producer scheduled via store_root() plus
# compute_at(consumer, x).
print("=" * 50)
producer = Func("producer_store_root_compute_x")
consumer = Func("consumer_store_root_compute_x")
producer[x, y] = sqrt(x * y)
consumer[x, y] = (
    producer[x, y]
    + producer[x, y + 1]
    + producer[x + 1, y]
    + producer[x + 1, y + 1]
)

producer.store_root()
producer.compute_at(consumer, x)

producer.trace_stores()
consumer.trace_stores()

print("\nEvaluating producer.store_root().compute_at(consumer,x)")
consumer.realize(4,4)

# Plain-Python equivalent: one two-row buffer shared by all scanlines,
# filled pixel by pixel just before the consumer pixel that needs it.
result = np.empty((4, 4), dtype=np.float32)
producer_storage = np.empty((2, 5), dtype=np.float32)

for yy in range(4):
    # Buffer slots (by row parity) holding producer rows yy and yy+1.
    cur_row = yy & 1
    next_row = (yy + 1) & 1
    for xx in range(4):
        # Only compute the producer values not already written by an
        # earlier pixel or scanline.
        if yy == 0 and xx == 0:
            producer_storage[cur_row, xx] = math.sqrt(xx * yy)
        if yy == 0:
            producer_storage[cur_row, xx + 1] = math.sqrt((xx + 1) * yy)
        if xx == 0:
            producer_storage[next_row, xx] = math.sqrt(xx * (yy + 1))

        producer_storage[next_row, xx + 1] = math.sqrt((xx + 1) * (yy + 1))
        result[yy, xx] = (
            producer_storage[cur_row, xx]
            + producer_storage[next_row, xx]
            + producer_storage[cur_row, xx + 1]
            + producer_storage[next_row, xx + 1]
        )

print("Pseudo-code for the schedule:")
consumer.print_loop_nest()
print()


Evaluating producer.store_root().compute_at(consumer,x)
Pseudo-code for the schedule:



In [8]:
# Same pipeline, with the consumer tiled 2x2 and the producer scheduled via
# compute_at(consumer, x_outer).
print("="*50)
producer, consumer = Func("producer_tile"), Func("consumer_tile")
producer[x, y] = sqrt(x * y)
consumer[x, y] = (producer[x, y] +
                  producer[x, y+1] +
                  producer[x+1, y] +
                  producer[x+1, y+1])

x_outer, y_outer = Var("x_outer"), Var("y_outer")
x_inner, y_inner = Var("x_inner"), Var("y_inner")
consumer.tile(x, y, x_outer, y_outer, x_inner, y_inner, 2, 2)

producer.compute_at(consumer, x_outer)

producer.trace_stores()
consumer.trace_stores()

print("\nEvaluating:"
      "consumer.tile(x, y, x_outer, y_outer, x_inner, y_inner, 2, 2)"
      "producer.compute_at(consumer, x_outer)")
consumer.realize(4,4)

# Plain-Python equivalent. The loop indices deliberately use their own names
# (tile_y, tile_x, in_y, in_x) so they do not rebind the Halide Vars
# x_outer / y_outer / x_inner / y_inner defined above.
result = np.empty((4,4), dtype=np.float32)
for tile_y in range(2):
    for tile_x in range(2):
        # Top-left consumer coordinate of this 2x2 tile.
        x_base = tile_x * 2
        y_base = tile_y * 2

        # A 2x2 consumer tile reads a 3x3 window of the producer.
        producer_storage = np.empty((3,3), dtype=np.float32)
        for py in range(y_base, y_base + 3):
            # Start at x_base (not 0) so px - x_base is never negative.
            for px in range(x_base, x_base + 3):
                producer_storage[py-y_base][px-x_base] = math.sqrt(px * py)

        for in_y in range(2):
            for in_x in range(2):
                xx = x_base + in_x
                yy = y_base + in_y
                result[yy][xx] = producer_storage[yy-y_base][xx-x_base] + producer_storage[yy-y_base+1][xx-x_base] + producer_storage[yy-y_base][xx-x_base+1] + producer_storage[yy-y_base+1][xx-x_base+1]

print("Pseudo-code for the schedule:")
consumer.print_loop_nest()
print()


Evaluating:consumer.tile(x, y, x_outer, y_outer, x_inner, y_inner, 2, 2)producer.compute_at(consumer, x_outer)
Pseudo-code for the schedule:



In [5]:
# Exploratory cell: apply split() to the consumer's y dimension with
# y_outer / y_inner and a factor of 2.
# NOTE(review): this relies on `consumer`, `y_outer` and `y_inner` already
# existing in the kernel from another cell, and its execution count (5) is
# out of sequence with the surrounding cells — confirm it is still needed.
#consumer.split(x, x_outer, x_inner, 2)
consumer.split(y, y_outer, y_inner, 2)

<halide.Func 'consumer_tile'>

In [None]:
# Exploratory cell: apply split() to the consumer's x dimension with
# x_outer / x_inner and a factor of 2.
# NOTE(review): this cell has no execution count, and it relies on
# `consumer`, `x_outer` and `x_inner` defined in another cell — confirm
# whether it should be kept.
consumer.split(x, x_outer, x_inner, 2)

In [3]:
# Stand-alone definition of the tiled pipeline (same Func names as the
# tile cell earlier in the notebook), without realizing it.
print("="*50)
producer = Func("producer_tile")
consumer = Func("consumer_tile")
producer[x, y] = sqrt(x * y)
consumer[x, y] = (
    producer[x, y]
    + producer[x, y + 1]
    + producer[x + 1, y]
    + producer[x + 1, y + 1]
)


# Tile the consumer using 2x2 tiles.
x_outer = Var("x_outer")
y_outer = Var("y_outer")
x_inner = Var("x_inner")
y_inner = Var("y_inner")
# Left as the last bare expression so the cell displays the returned Func.
consumer.tile(x, y, x_outer, y_outer, x_inner, y_inner, 2, 2)



<halide.Func 'consumer_tile'>

In [9]:
# Mixed schedule: split + parallel + vectorize on the consumer, with the
# producer stored per strip and computed per scanline.
print("="*50)
producer = Func("producer_mixed")
consumer = Func("consumer_mixed")
producer[x, y] = sqrt(x * y)
consumer[x, y] = (
    producer[x, y]
    + producer[x, y + 1]
    + producer[x + 1, y]
    + producer[x + 1, y + 1]
)

# Split the consumer's y coordinate into strips of 16 scanlines.
yo, yi = Var("yo"), Var("yi")
consumer.split(y, yo, yi, 16)
# Compute the strips using a thread pool and a task queue.
consumer.parallel(yo)
# Vectorize across x by a factor of four.
consumer.vectorize(x, 4)

# Store the producer per strip. That is 17 producer scanlines (16+1), but
# hopefully it folds down into a circular buffer of two scanlines.
producer.store_at(consumer, yo)
# Within each strip, compute the producer per consumer scanline, skipping
# work already done on previous scanlines.
producer.compute_at(consumer, yi)
# Also vectorize the producer (sqrt is vectorizable on x86 using SSE).
producer.vectorize(x, 4)

# Tracing stays off this time because the image is much larger.
# consumer.trace_stores()
# producer.trace_stores()

halide_result = consumer.realize(800, 600)

# Here's the equivalent (serial) C, written as plain Python.
# Note: the loop indices below reuse the names yo / yi and therefore rebind
# them from Halide Vars to ints; the schedule above has already been applied.

c_result = np.empty((600, 800), dtype=np.float32)

# For every strip of 16 scanlines (this loop is parallel in the Halide version)
for yo in range(600//16 + 1):
    # 16 doesn't divide 600, so push the last strip upwards so that it fits
    # within [0, 599] (see lesson 05).
    y_base = min(yo * 16, 600 - 16)

    # Two-scanline circular buffer for the producer, one per strip.
    producer_storage = np.empty((2, 801), dtype=np.float32)

    # For every scanline in the strip of 16:
    for yi in range(16):
        yy = y_base + yi

        # Producer rows yy and yy+1 are needed; row yy was already computed
        # *within this task* unless this is the strip's first scanline.
        first_py = yy if yi == 0 else yy + 1
        for py in range(first_py, yy + 2):
            # Compute this producer scanline in 4-wide vectors.
            for x_vec in range(800//4 + 1):
                # 4 doesn't divide 801, so push the last vector left
                # (see lesson 05).
                x_base = min(x_vec * 4, 801 - 4)

                # If you're on x86, Halide generates SSE code for this part:
                xx = [x_base + lane for lane in range(4)]
                vec = [math.sqrt(px * py) for px in xx]
                for px, lane_value in zip(xx, vec):
                    producer_storage[py & 1][px] = lane_value

        # Now compute the consumer for this scanline:
        for x_vec in range(800//4):
            x_base = x_vec * 4
            # Again, Halide's equivalent here uses SSE.
            xx = [x_base + lane for lane in range(4)]
            vec = [
                (producer_storage[yy & 1][px] +
                 producer_storage[(yy+1) & 1][px] +
                 producer_storage[yy & 1][px+1] +
                 producer_storage[(yy+1) & 1][px+1])
                for px in xx
            ]

            for px, lane_value in zip(xx, vec):
                c_result[yy][px] = lane_value


print("Pseudo-code for the schedule:")
consumer.print_loop_nest()
print()

# Look on my code, ye mighty, and despair!

# Check the C-style result against the Halide result. Doing this turned up
# several bugs in the hand-written implementation, which should tell you
# something.
for yy in range(600):
    for xx in range(800):
        error = halide_result(xx, yy) - c_result[yy][xx]
        # It's floating-point math, so allow some slop:
        if abs(error) > 0.001:
            raise Exception("halide_result(%d, %d) = %f instead of %f" % (
                   xx, yy, halide_result(xx, yy), c_result[yy][xx]))

Pseudo-code for the schedule:

