In [None]:
from kernel_runner import KernelRunner
from dataclasses import dataclass

@dataclass(frozen=True)
class RegisterTiledKernelRunner(KernelRunner):
    """Runner configuration for the ``register_tiled`` kernel.

    A frozen dataclass (instances cannot be mutated after construction)
    that subclasses ``KernelRunner`` and declares two string fields with
    defaults. How ``KernelRunner`` consumes these fields is not visible
    in this file.
    """

    # Template-argument string for the kernel.
    # NOTE(review): presumably <dtype, BM, BN, BK, TM, TN>; the launch cell
    # uses BM = BN = 128 and TM = TN = 8 -- confirm the order against the
    # kernel source and keep the two in sync.
    template: str = "<float, 128, 128, 8, 8, 8>"
    # Kernel identifier. NOTE(review): presumably used by KernelRunner to
    # locate the kernel to launch -- confirm.
    kernel_name: str = "register_tiled"

# Exploiting another level of the memory hierarchy

Registers are pieces of memory where compute happens on a chip. This is the
ideal place to keep values if at all possible.

Instead of splitting loops, we can permute their order to change how the
compute happens. Register tiling relies on this permutation.

Here's the shared memory loop structure. Can you permute the loops to expose more
reuse that we can exploit with registers?
```python
for io in range(N // BM):                  # parallel outer
    for jo in range(N // BN):              # parallel outer
        
        for ii in range(BM):               # parallel inner
            for ji in range(BN):           # parallel inner

                for ko in range(N // BK):  # sequential outer
                    for ki in range(BK):   # sequential inner
                        # compute
```

In [None]:
# Problem size (the loop extent N in the loop nest shown above).
N = 8192

# Block tile extents (BM x BN) and per-thread tile extents (TM x TN).
# NOTE(review): these should agree with the values baked into the runner's
# template string -- confirm when changing either.
BM, BN = 128, 128
TM, TN = 8, 8

# Each block dimension is the block tile extent divided by the per-thread
# tile extent, so every thread covers a TM x TN patch of the block tile.
block_dim = (BN // TN, BM // TM)

# Ceiling division, written as negated floor division: -(-a // b) equals
# (a + b - 1) // b for positive integers. Rounds up so the grid covers all
# of N even when N is not a multiple of the block tile.
grid_dim = (-(-N // BM), -(-N // BN))

runner = RegisterTiledKernelRunner()
# The return value is discarded. NOTE(review): niterations presumably sets
# the number of timed launches -- confirm against KernelRunner.
_ = runner(block_dim, grid_dim, (N,), read_full_src=False, niterations=20)