## Agile Hardware Design
***
# Queue Design Case Study

## Prof. Scott Beamer
### sbeamer@ucsc.edu

## [CSE 293](https://classes.soe.ucsc.edu/cse293/Winter22/)

## Plan for Today

* Designing for reuse
* Designing a Queue
* Iteratively improving Queue design

## Loading The Chisel Library Into a Notebook

In [None]:
// Build the location of the Chisel dependency script relative to the JVM's working directory.
val path = System.getProperty("user.dir") + "/../resource/chisel_deps.sc"
// Hand that script to the notebook interpreter so later cells can import chisel3.
// NOTE(review): relies on the notebook being started from a directory that has ../resource next to it.
interp.load.module(ammonite.ops.Path(java.nio.file.FileSystems.getDefault().getPath(path)))

In [None]:
import chisel3._
import chisel3.util._
import chisel3.tester._
import chisel3.tester.RawTester.test

## Goals for Reuse

* Need to recognize _pattern_ of functionality

* Include necessary parameterization and generation to support users' needs

## Planning for Progressive Design

* Reduce the complexity/challenge of any one step
* _Close the loop_ as early as possible, then augment/extend/revise
* Look for opportunities to defer features/optimizations to later
* While developing, re-evaluate plan and revise as needed
* Consider
  * What is the simplest thing I can implement?
  * How can I test it? (both at start and as it evolves)
  * Plotting a roadmap of order to develop features/optimizations

## Case Study: Designing a Queue

* **Goal:** A _Queue_ with `Decoupled` interfaces on both sides
  * Would like to parameterize queue depth and data types
  * Power/performance/area (PPA) goals

* **How to get started:** deferring features
  * Parameters (queue depth & data type)
  * Performance (correct but slow ok at first)

<img src="images/queue.svg" alt="queue high-level" style="width:70%;margin-left:auto;margin-right:auto"/>

## First Attempt at Queue

* _Simplification_: only single entry
* _Behavior_: can enqueue if full and dequeueing (`pipe` is true), but can't bypass if empty (`flow` is false)

<img src="images/single.svg" alt="single-entry queue" style="width:60%;margin-left:auto;margin-right:auto"/>

## V0 - First Attempt at Queue

In [None]:
/** IO bundle shared by all queue versions in this notebook.
  *
  * @param bitWidth width in bits of the `UInt` payload carried on both ports
  */
class QueueIO(bitWidth: Int) extends Bundle {
    // producer-facing port: a Decoupled interface wrapped in Flipped
    val enq = Flipped(Decoupled(UInt(bitWidth.W)))
    // consumer-facing port
    val deq = Decoupled(UInt(bitWidth.W))
}

/** Single-entry queue with `Decoupled` ports on both sides.
  *
  * A new item is accepted when the slot is free, or in the same cycle the
  * stored item is being dequeued. The stored item is only visible on `deq`
  * after it has been registered (no combinational bypass from `enq` to `deq`).
  *
  * @param bitWidth width in bits of the `UInt` payload
  */
class MyQueueV0(bitWidth: Int) extends Module {
    val io = IO(new QueueIO(bitWidth))
    val entry = Reg(UInt(bitWidth.W))
    val full = RegInit(false.B)

    // output side simply mirrors the stored state
    io.deq.bits := entry
    io.deq.valid := full
    // free slot, or the slot is being vacated this cycle
    io.enq.ready := io.deq.fire || !full

    when (io.enq.fire) {
        // an arriving item always leaves the slot occupied, even if one also departs
        entry := io.enq.bits
        full := true.B
    } .elsewhen (io.deq.fire) {
        full := false.B
    }
}

## Testing Our Queue - Scala Model

In [None]:
/** Software reference model of a bounded FIFO of `Int`s, used to check the hardware queues.
  *
  * The test harness sets `deqReady` for the current cycle, then calls
  * `attemptDeq()` before `attemptEnq()` so that a slot freed by a dequeue is
  * visible to the enqueue of the same cycle.
  *
  * @param numEntries maximum number of items the model holds
  */
class QueueModel(numEntries: Int) {
    val mq: scala.collection.mutable.Queue[Int] = scala.collection.mutable.Queue[Int]()
    var deqReady: Boolean = false

    /** Appends `elem` if the model can accept an item this cycle; otherwise the item is dropped. */
    def attemptEnq(elem: Int): Unit = {
        if (enqReady()) mq += elem
    }

    /** Removes and returns the oldest item when the consumer is ready and an item exists.
      *
      * Call first within a cycle. Returns the sentinel `-1` when nothing is dequeued,
      * including the case where the consumer is ready but the model is empty.
      */
    // improve with Option & None
    def attemptDeq(): Int = if (deqReady && mq.nonEmpty) mq.dequeue() else -1

    /** True when there is a free slot, or the model is exactly full and the consumer is ready. */
    def enqReady(): Boolean = mq.size < numEntries || (mq.size == numEntries && deqReady)

    /** True when at least one item is stored. */
    def deqValid(): Boolean = mq.nonEmpty
}

## Testing Our Queue - Harness + Simulation

In [None]:
/** Runs one clock cycle of `MyQueueV0` alongside the reference model and checks that they agree.
  *
  * Order within the cycle: drive `deq.ready`, check any dequeue, check `enq.ready`,
  * drive the enqueue side, update the model, then step the clock. The model's
  * contents are printed after the step.
  *
  * @param qm       reference model, mutated to reflect this cycle's traffic
  * @param c        device under test
  * @param enqValid whether the producer offers an item this cycle
  * @param deqReady whether the consumer accepts an item this cycle
  * @param enqData  item offered by the producer (ignored by the model unless `enqValid`)
  */
def simCycle(qm: QueueModel, c: MyQueueV0, enqValid: Boolean, deqReady: Boolean, enqData: Int=0): Unit = {
    qm.deqReady = deqReady
    c.io.deq.ready.poke(qm.deqReady.B)
    // only compare dequeued data when the hardware actually presents an item
    if (c.io.deq.valid.peek().litToBoolean && deqReady) {
        assert(qm.deqValid())
        c.io.deq.bits.expect(qm.attemptDeq().U)
    }
    // checked after the model's dequeue so a slot freed this cycle is accounted for
    c.io.enq.ready.expect(qm.enqReady().B)
    c.io.enq.valid.poke(enqValid.B)
    c.io.enq.bits.poke(enqData.U)
    if (enqValid)
        qm.attemptEnq(enqData)
    c.clock.step()
    println(qm.mq)
}

## Testing Our Queue - Simulation

In [None]:
// Drive an 8-bit MyQueueV0 and a 1-entry reference model through the same stimulus.
test(new MyQueueV0(8)) { dut =>
    val model = new QueueModel(1)
    // one row per clock cycle: (enqValid, deqReady, enqData)
    val stimulus = Seq(
        (false, false, 0),
        (true,  false, 1),
        (true,  false, 2),
        (true,  true,  2),
        (false, true,  0)
    )
    for ((enqValid, deqReady, enqData) <- stimulus)
        simCycle(model, dut, enqValid, deqReady, enqData)
}

## Assessing MyQueue `V0`

* Accomplished
    * Implements queueing behavior
    * Parameterized data width (still limited to `UInt`)
* Shortcomings
    * Only one entry (_next goal_ to fix)

## Parameterizing Number of Queue Entries

* First attempt at parameterizing number of entries: _shift register_

<img src="images/shift.svg" alt="queue via shift register" style="width:60%;margin-left:auto;margin-right:auto"/>

## V1 - Parameterizing Number of Queue Entries

In [None]:
/** Multi-entry queue built as a shift register of `numEntries` slots.
  *
  * Items enter at the last index and leave from index 0, so every item
  * passes through every slot.
  *
  * @param numEntries number of slots (must be positive)
  * @param bitWidth   width in bits of the `UInt` payload
  */
class MyQueueV1(numEntries: Int, bitWidth: Int) extends Module {
    val io = IO(new QueueIO(bitWidth))
    require(numEntries > 0)
    // enqueue into index numEntries-1 (last) and dequeue from index 0 (head)
    val entries = Seq.fill(numEntries)(Reg(UInt(bitWidth.W)))
    // one occupancy flag per slot, all cleared at reset
    val fullBits = Seq.fill(numEntries)(RegInit(false.B))
    // the whole chain advances one slot when the head is leaving or the head is empty
    val shiftDown = io.deq.fire || !fullBits.head
    // the last slot can take an item if it is free or is being shifted out this cycle
    io.enq.ready := !fullBits.last || shiftDown
    io.deq.valid := fullBits.head
    io.deq.bits := entries.head
    // NOTE: while the head is occupied and not dequeued nothing shifts, so empty
    // slots between the head and the last slot are not filled in
    when (shiftDown) { // dequeue / shift
        for (i <- 0 until numEntries - 1) {
            entries(i) := entries(i+1)
            fullBits(i) := fullBits(i+1)
        }
        fullBits.last := false.B
    }
    // this block comes after the shift, so its writes to the last slot take precedence
    when (io.enq.fire) { // enqueue
        entries.last := io.enq.bits
        fullBits.last := true.B
    }
    // disabled alternative that expresses shift and enqueue with foldRight:
//     when (shiftDown || io.enq.fire) {
//         entries.foldRight(io.enq.bits){(thisEntry, lastEntry) => thisEntry := lastEntry; thisEntry}
//         fullBits.foldRight(io.enq.fire){(thisEntry, lastEntry) => thisEntry := lastEntry; thisEntry}
//     }
}

## Testing Our Queue

In [None]:
/** Runs one clock cycle of `MyQueueV1` alongside the reference model and checks that they agree.
  *
  * Dequeued data is compared only in cycles where the hardware presents a valid
  * item; `deq.valid` itself is not compared against the model.
  * The model's contents are printed after the clock step.
  *
  * @param qm       reference model, mutated to reflect this cycle's traffic
  * @param c        device under test
  * @param enqValid whether the producer offers an item this cycle
  * @param deqReady whether the consumer accepts an item this cycle
  * @param enqData  item offered by the producer (ignored by the model unless `enqValid`)
  */
def simCycle(qm: QueueModel, c: MyQueueV1, enqValid: Boolean, deqReady: Boolean, enqData: Int=0): Unit = {
    qm.deqReady = deqReady
    c.io.deq.ready.poke(qm.deqReady.B)
    // the model only dequeues when the hardware actually hands an item over
    if (c.io.deq.valid.peek().litToBoolean && deqReady) {
        assert(qm.deqValid())
        c.io.deq.bits.expect(qm.attemptDeq().U)
    }
    // checked after the model's dequeue so a slot freed this cycle is accounted for
    c.io.enq.ready.expect(qm.enqReady().B)
    c.io.enq.valid.poke(enqValid.B)
    c.io.enq.bits.poke(enqData.U)
    if (enqValid)
        qm.attemptEnq(enqData)
    c.clock.step()
    println(qm.mq)
}

// Exercise a 3-slot, 8-bit MyQueueV1 against a 3-entry reference model.
test(new MyQueueV1(3,8)) { dut =>
    val model = new QueueModel(3)
    simCycle(model, dut, enqValid = false, deqReady = false)
    simCycle(model, dut, enqValid = true,  deqReady = false, enqData = 1)
    simCycle(model, dut, enqValid = true,  deqReady = true,  enqData = 2)
    // keep the consumer ready for two more cycles with no new items offered
    simCycle(model, dut, enqValid = false, deqReady = true)
    simCycle(model, dut, enqValid = false, deqReady = true)
}

## Assessing MyQueue `V1`

* Accomplished
    * Implements queueing behavior
    * Parameterized data width & _number of entries_
* Shortcomings
    * Long latency when queue is empty (all elements go through all entries)
    * Not good at handling bubbles midway (might even be buggy)

## Squishing Bubbles in Queue

* Use a _Priority Encoder_ to squeeze out bubbles
  * Insert in first free slot

<img src="images/priority.svg" alt="priority encoder queue" style="width:60%;margin-left:auto;margin-right:auto"/>

## V2 - Using Priority Encoder for Insertion

In [None]:
/** Multi-entry queue that inserts each new item into the lowest-indexed free slot.
  *
  * Items leave from index 0; on a dequeue every remaining item moves one slot
  * toward index 0. `enq.ready` depends only on whether some slot is currently free.
  *
  * @param numEntries number of slots (must be positive)
  * @param bitWidth   width in bits of the `UInt` payload
  */
class MyQueueV2(numEntries: Int, bitWidth: Int) extends Module {
    val io = IO(new QueueIO(bitWidth))
    require(numEntries > 0)
    val entries = Reg(Vec(numEntries, UInt(bitWidth.W)))
    val fullBits = RegInit(VecInit(Seq.fill(numEntries)(false.B)))
    val emptyBits = fullBits.map(slotFull => !slotFull)
    val anyEmpty = emptyBits.reduce(_ || _)

    io.enq.ready := anyEmpty
    io.deq.valid := fullBits.head
    io.deq.bits := entries.head

    when (io.deq.fire) {
        // move every slot's contents and occupancy flag one position toward the head
        (1 until numEntries).foreach { src =>
            entries(src - 1) := entries(src)
            fullBits(src - 1) := fullBits(src)
        }
        // the slot furthest from the head is vacated by the shift
        fullBits.last := false.B
    }
    when (io.enq.fire) {
        val currFreeIndex = PriorityEncoder(emptyBits)
        // if a shift happens in the same cycle, the first free slot ends up one lower
        val writeIndex = Mux(io.deq.fire, currFreeIndex - 1.U, currFreeIndex)
        // placed after the shift so these writes win for the chosen slot
        entries(writeIndex) := io.enq.bits
        fullBits(writeIndex) := true.B
    }
}

## Testing Our Queue

In [None]:
/** Runs one clock cycle of `MyQueueV2` alongside the reference model and checks that they agree.
  *
  * Dequeued data is compared only in cycles where the hardware presents a valid
  * item; `deq.valid` itself is not compared against the model.
  * The model's contents are printed after the clock step.
  *
  * @param qm       reference model, mutated to reflect this cycle's traffic
  * @param c        device under test
  * @param enqValid whether the producer offers an item this cycle
  * @param deqReady whether the consumer accepts an item this cycle
  * @param enqData  item offered by the producer (ignored by the model unless `enqValid`)
  */
def simCycle(qm: QueueModel, c: MyQueueV2, enqValid: Boolean, deqReady: Boolean, enqData: Int=0): Unit = {
    qm.deqReady = deqReady
    c.io.deq.ready.poke(qm.deqReady.B)
    // the model only dequeues when the hardware actually hands an item over
    if (c.io.deq.valid.peek().litToBoolean && deqReady) {
        assert(qm.deqValid())
        c.io.deq.bits.expect(qm.attemptDeq().U)
    }
    // checked after the model's dequeue so a slot freed this cycle is accounted for
    c.io.enq.ready.expect(qm.enqReady().B)
    c.io.enq.valid.poke(enqValid.B)
    c.io.enq.bits.poke(enqData.U)
    if (enqValid)
        qm.attemptEnq(enqData)
    c.clock.step()
    println(qm.mq)
}

// Drive a 4-slot, 8-bit MyQueueV2 and a 4-entry reference model through the same stimulus.
test(new MyQueueV2(4, 8)) { dut =>
    val model = new QueueModel(4)
    // one row per clock cycle: (enqValid, deqReady, enqData)
    val stimulus = Seq(
        (false, false, 0),
        (true,  false, 1),
        (false, true,  0)
    )
    stimulus.foreach { case (enqValid, deqReady, enqData) =>
        simCycle(model, dut, enqValid, deqReady, enqData)
    }
}

## Assessing MyQueue `V2`

* Accomplished
  * Implements queueing behavior
  * Parameterized data width & number of entries
  * Latency based on occupancy

* Shortcomings
  * _Performance:_ can't simultaneously enqueue/dequeue to a full queue
  * _Power Efficiency:_ lots of bits shifting
  * _Potential Critical Path:_ priority encoder logic depth

## Keeping Data in Place with a Circular Buffer

* _Circular buffer_ uses two pointers (indices) and fixed size storage to make a FIFO
  * Insert new data at _in_ (and increment _in_)
  * Pop from _out_ (and increment _out_)
  * Wrap pointers around when they get to end
* How to tell when empty vs full?
  * First try: _empty_ when pointers are equal, _full_ when in+1 == out

<img src="images/circular.svg" alt="circular buffer" style="width:60%;margin-left:auto;margin-right:auto"/>

## V3 - Keeping Data in Place with Circular Buffer

In [None]:
/** Circular-buffer queue: data stays in place while two indices chase each other.
  *
  * Equal indices mean empty; the queue reports full when advancing the enqueue
  * index would make it equal to the dequeue index, so one slot always stays unused.
  *
  * @param numEntries number of storage slots (must be a power of two greater than 1)
  * @param bitWidth   width in bits of the `UInt` payload
  */
class MyQueueV3(numEntries: Int, bitWidth: Int) extends Module {
    val io = IO(new QueueIO(bitWidth))
    require(numEntries > 1)
    require(isPow2(numEntries))
    val indexWidth = log2Ceil(numEntries)
    val entries = Reg(Vec(numEntries, UInt(bitWidth.W))) // Mem?
    val enqIndex = RegInit(0.U(indexWidth.W))
    val deqIndex = RegInit(0.U(indexWidth.W))
    // +% keeps the operand width, so the indices wrap back to 0 after the last slot
    val enqNext = enqIndex +% 1.U
    val deqNext = deqIndex +% 1.U
    val empty = enqIndex === deqIndex
    val full = enqNext === deqIndex

    io.deq.bits := entries(deqIndex)
    io.deq.valid := !empty
    io.enq.ready := !full

    when (io.enq.fire) {
        entries(enqIndex) := io.enq.bits
        enqIndex := enqNext
    }
    when (io.deq.fire) {
        deqIndex := deqNext
    }
}

In [None]:
/** Runs one clock cycle of `MyQueueV3` alongside the reference model and checks that they agree.
  *
  * Compares `deq.valid`, any dequeued data and `enq.ready` against the model.
  * The model's contents are printed after the clock step.
  *
  * @param qm       reference model, mutated to reflect this cycle's traffic
  * @param c        device under test
  * @param enqValid whether the producer offers an item this cycle
  * @param deqReady whether the consumer accepts an item this cycle
  * @param enqData  item offered by the producer (ignored by the model unless `enqValid`)
  */
def simCycle(qm: QueueModel, c: MyQueueV3, enqValid: Boolean, deqReady: Boolean, enqData: Int=0): Unit = {
    qm.deqReady = deqReady
    c.io.deq.ready.poke(qm.deqReady.B)
    c.io.deq.valid.expect(qm.deqValid().B)
    if (deqReady && qm.deqValid())
        c.io.deq.bits.expect(qm.attemptDeq().U)
    // checked after the model's dequeue so a slot freed this cycle is accounted for
    c.io.enq.ready.expect(qm.enqReady().B)
    c.io.enq.valid.poke(enqValid.B)
    c.io.enq.bits.poke(enqData.U)
    if (enqValid)
        qm.attemptEnq(enqData)
    c.clock.step()
    println(qm.mq)
}

// Drive a 4-slot, 8-bit MyQueueV3 against the reference model.
// MyQueueV3 reports full one item before all slots are used, so a 4-slot
// instance holds at most 3 items; the model is sized to that usable capacity.
test(new MyQueueV3(4, 8)) { c =>
    val qm = new QueueModel(3)
    simCycle(qm, c, false, false)
    simCycle(qm, c, true, false, 1)
    simCycle(qm, c, false, true)
}

## Assessing MyQueue `V3`

* Accomplished
  * Implements queueing behavior
  * Parameterized data width & number of entries
  * Latency based on occupancy
  * Efficiency? Less bits shifting and shallower logic

* Shortcomings
  * _Capacity:_ lose one entry (to detect if full), and must be power of 2
  * _Performance:_ can't simultaneously enqueue/dequeue to a full queue

## Reclaiming Last Entry

* _Problem:_ with circular buffer (initially), had to keep last entry empty to differentiate a full queue from an empty queue
    * Otherwise, if `enqIndex === deqIndex`, is it full or empty?
* _Solution:_ add an extra bit of state (`maybeFull`) to capture this corner case
    * If indices are equal and `maybeFull` => _full_
    * If indices are equal and `!maybeFull` => _empty_
    * If indices are not equal => not full or empty (has room)

## V4 - Adding State (`maybeFull`) to Track Last Entry

In [None]:
/** Circular-buffer queue that can use every slot.
  *
  * An extra register, `maybeFull`, tells a full queue from an empty one when
  * the enqueue and dequeue indices are equal.
  *
  * @param numEntries number of storage slots (must be a power of two greater than 1)
  * @param bitWidth   width in bits of the `UInt` payload
  */
class MyQueueV4(numEntries: Int, bitWidth: Int) extends Module {
    val io = IO(new QueueIO(bitWidth))
    require(numEntries > 1)
    require(isPow2(numEntries))
    val entries = Reg(Vec(numEntries, UInt(bitWidth.W)))
    val enqIndex = RegInit(0.U(log2Ceil(numEntries).W))
    val deqIndex = RegInit(0.U(log2Ceil(numEntries).W))
    val maybeFull = RegInit(false.B)

    val indicesEqual = enqIndex === deqIndex
    val enqNext = enqIndex +% 1.U
    val empty = indicesEqual && !maybeFull
    val full = indicesEqual && maybeFull

    io.deq.bits := entries(deqIndex)
    io.deq.valid := !empty
    io.enq.ready := !full

    when (io.deq.fire) {
        deqIndex := deqIndex +% 1.U
        // only cleared when the indices currently differ
        when (!indicesEqual) {
            maybeFull := false.B
        }
    }
    // kept after the dequeue block so a same-cycle set of maybeFull overrides the clear
    when (io.enq.fire) {
        entries(enqIndex) := io.enq.bits
        enqIndex := enqNext
        // this enqueue brings the enqueue index up to the dequeue index
        when (enqNext === deqIndex) {
            maybeFull := true.B
        }
    }
}

In [None]:
/** Runs one clock cycle of `MyQueueV4` alongside the reference model and checks that they agree.
  *
  * Compares `deq.valid`, any dequeued data and `enq.ready` against the model.
  * The model's contents are printed after the clock step.
  *
  * @param qm       reference model, mutated to reflect this cycle's traffic
  * @param c        device under test
  * @param enqValid whether the producer offers an item this cycle
  * @param deqReady whether the consumer accepts an item this cycle
  * @param enqData  item offered by the producer (ignored by the model unless `enqValid`)
  */
def simCycle(qm: QueueModel, c: MyQueueV4, enqValid: Boolean, deqReady: Boolean, enqData: Int=0): Unit = {
    qm.deqReady = deqReady
    c.io.deq.ready.poke(qm.deqReady.B)
    c.io.deq.valid.expect(qm.deqValid().B)
    if (deqReady && qm.deqValid())
        c.io.deq.bits.expect(qm.attemptDeq().U)
    // checked after the model's dequeue so a slot freed this cycle is accounted for
    c.io.enq.ready.expect(qm.enqReady().B)
    c.io.enq.valid.poke(enqValid.B)
    c.io.enq.bits.poke(enqData.U)
    if (enqValid)
        qm.attemptEnq(enqData)
    c.clock.step()
    println(qm.mq)
}

// Exercise a 2-slot, 8-bit MyQueueV4 against a 2-entry reference model.
test(new MyQueueV4(2, 8)) { dut =>
    val model = new QueueModel(2)
    simCycle(model, dut, enqValid = false, deqReady = false)
    simCycle(model, dut, enqValid = true,  deqReady = false, enqData = 1)
    simCycle(model, dut, enqValid = false, deqReady = true)
}

## Assessing MyQueue `V4`

* Accomplished
  * Implements queueing behavior
  * Parameterized data width & number of entries (can now use all of them)
  * Latency based on occupancy
  * Efficiency? Less bits shifting and shallower logic

* Shortcomings
  * _Capacity:_ must be power of 2
  * _Performance:_ can't simultaneously enqueue/dequeue to a full queue

## V5 - Simultaneous Enqueue/Dequeue When Full

In [None]:
/** Circular-buffer queue that also accepts a new item while full if the consumer is ready.
  *
  * Same storage and `maybeFull` bookkeeping as the previous version; the only
  * difference is that `enq.ready` is additionally driven by `deq.ready`.
  *
  * @param numEntries number of storage slots (must be a power of two greater than 1)
  * @param bitWidth   width in bits of the `UInt` payload
  */
class MyQueueV5(numEntries: Int, bitWidth: Int) extends Module {
    val io = IO(new QueueIO(bitWidth))
    require(numEntries > 1)
    require(isPow2(numEntries))
    val entries = Reg(Vec(numEntries, UInt(bitWidth.W)))
    val enqIndex = RegInit(0.U(log2Ceil(numEntries).W))
    val deqIndex = RegInit(0.U(log2Ceil(numEntries).W))
    val maybeFull = RegInit(false.B)
    val empty = enqIndex === deqIndex && !maybeFull
    val full = enqIndex === deqIndex && maybeFull

    // NOTE: io.enq.ready now attached to io.deq.ready (combinational path between the two ports)
    io.enq.ready := io.deq.ready || !full
    io.deq.valid := !empty
    io.deq.bits := entries(deqIndex)

    // index and storage updates
    when (io.enq.fire) {
        entries(enqIndex) := io.enq.bits
        enqIndex := enqIndex +% 1.U
    }
    when (io.deq.fire) {
        deqIndex := deqIndex +% 1.U
    }

    // maybeFull update: setting on enqueue takes priority over clearing on dequeue
    when (io.enq.fire && (enqIndex +% 1.U) === deqIndex) {
        maybeFull := true.B
    } .elsewhen (io.deq.fire && enqIndex =/= deqIndex) {
        maybeFull := false.B
    }
}

In [None]:
/** Runs one clock cycle of `MyQueueV5` alongside the reference model and checks that they agree.
  *
  * Compares `deq.valid`, any dequeued data and `enq.ready` against the model.
  * The model's contents are printed after the clock step.
  *
  * @param qm       reference model, mutated to reflect this cycle's traffic
  * @param c        device under test
  * @param enqValid whether the producer offers an item this cycle
  * @param deqReady whether the consumer accepts an item this cycle
  * @param enqData  item offered by the producer (ignored by the model unless `enqValid`)
  */
def simCycle(qm: QueueModel, c: MyQueueV5, enqValid: Boolean, deqReady: Boolean, enqData: Int=0): Unit = {
    qm.deqReady = deqReady
    // deq.ready is driven before enq.ready is checked because enq.ready depends on it
    c.io.deq.ready.poke(qm.deqReady.B)
    c.io.deq.valid.expect(qm.deqValid().B)
    if (deqReady && qm.deqValid())
        c.io.deq.bits.expect(qm.attemptDeq().U)
    c.io.enq.ready.expect(qm.enqReady().B)
    c.io.enq.valid.poke(enqValid.B)
    c.io.enq.bits.poke(enqData.U)
    if (enqValid)
        qm.attemptEnq(enqData)
    c.clock.step()
    println(qm.mq)
}

// Drive a 2-slot, 8-bit MyQueueV5 and a 2-entry reference model through the same stimulus.
test(new MyQueueV5(2, 8)) { dut =>
    val model = new QueueModel(2)
    // one row per clock cycle: (enqValid, deqReady, enqData)
    val stimulus = Seq(
        (false, false, 0),
        (true,  false, 1),
        (true,  false, 2),
        (true,  true,  3),   // offer an item while the consumer is ready
        (false, true,  0)
    )
    for ((enqValid, deqReady, enqData) <- stimulus)
        simCycle(model, dut, enqValid, deqReady, enqData)
}

## Assessing MyQueue `V5`

* Accomplished
  * Implements queueing behavior
  * Parameterized data width & number of entries
  * Latency based on occupancy
  * Efficiency? Less bits shifting and shallower logic
  * Can now enqueue/dequeue in same cycle

* Shortcomings
  * _Capacity:_ must be power of 2
  * _Possible combinational loop_ more likely with `io.enq.ready` now attached to `io.deq.ready`

## V6 - Tidying up Code

In [None]:
/** Circular-buffer queue built from a `Mem` and two `Counter`s.
  *
  * `maybeFull` distinguishes full from empty when the two indices are equal.
  * It changes only in cycles where exactly one of enqueue/dequeue happens.
  *
  * @param numEntries number of storage slots (must be greater than 1; the
  *                   power-of-two check used by earlier versions is not applied)
  * @param bitWidth   width in bits of the `UInt` payload
  * @param pipe       when true, `enq.ready` is also asserted whenever `deq.ready` is high
  */
class MyQueueV6(numEntries: Int, bitWidth: Int, pipe: Boolean=true) extends Module {
    val io = IO(new QueueIO(bitWidth))
    require(numEntries > 1)
    val entries = Mem(numEntries, UInt(bitWidth.W))
    val enqIndex = Counter(numEntries)
    val deqIndex = Counter(numEntries)
    val maybeFull = RegInit(false.B)
    val indicesEqual = enqIndex.value === deqIndex.value
    val empty = indicesEqual && !maybeFull
    val full = indicesEqual && maybeFull

    // chosen at elaboration time: only one of the two expressions becomes hardware
    io.enq.ready := (if (pipe) !full || io.deq.ready else !full)
    io.deq.valid := !empty
    io.deq.bits := entries(deqIndex.value)

    when (io.enq.fire) {
        entries(enqIndex.value) := io.enq.bits
        enqIndex.inc()
    }
    when (io.deq.fire) {
        deqIndex.inc()
    }
    // a lone enqueue sets maybeFull, a lone dequeue clears it, both together leave it alone
    when (io.deq.fire =/= io.enq.fire) {
        maybeFull := io.enq.fire
    }
}

In [None]:
/** Runs one clock cycle of `MyQueueV6` alongside the reference model and checks that they agree.
  *
  * Compares `deq.valid`, any dequeued data and `enq.ready` against the model.
  * The model's contents are printed after the clock step.
  *
  * @param qm       reference model, mutated to reflect this cycle's traffic
  * @param c        device under test
  * @param enqValid whether the producer offers an item this cycle
  * @param deqReady whether the consumer accepts an item this cycle
  * @param enqData  item offered by the producer (ignored by the model unless `enqValid`)
  */
def simCycle(qm: QueueModel, c: MyQueueV6, enqValid: Boolean, deqReady: Boolean, enqData: Int=0): Unit = {
    qm.deqReady = deqReady
    // deq.ready is driven before enq.ready is checked because enq.ready may depend on it
    c.io.deq.ready.poke(qm.deqReady.B)
    c.io.deq.valid.expect(qm.deqValid().B)
    if (deqReady && qm.deqValid())
        c.io.deq.bits.expect(qm.attemptDeq().U)
    c.io.enq.ready.expect(qm.enqReady().B)
    c.io.enq.valid.poke(enqValid.B)
    c.io.enq.bits.poke(enqData.U)
    if (enqValid)
        qm.attemptEnq(enqData)
    c.clock.step()
    println(qm.mq)
}

// Exercise a 3-slot, 8-bit MyQueueV6 (a non-power-of-two depth) against a 3-entry reference model.
test(new MyQueueV6(3, 8)) { dut =>
    val model = new QueueModel(3)
    simCycle(model, dut, enqValid = false, deqReady = false)
    // offer three items in a row with the consumer stalled
    (1 to 3).foreach { item =>
        simCycle(model, dut, enqValid = true, deqReady = false, enqData = item)
    }
    simCycle(model, dut, enqValid = false, deqReady = true)
}

## Assessing MyQueue `V6`

* Accomplished
  * Implements queueing behavior
  * Parameterized data width & number of entries
  * Latency based on occupancy
  * Efficiency? Less bits shifting and shallower logic
  * Can now enqueue/dequeue in same cycle (optionally) and support non-power of 2 capacities

* Shortcomings
  * Data type is `UInt` - What about arbitrary data?