# Using Numba to Accelerate 2048

In [1]:
import numpy as np
import cupy as cp
from numba import cuda, jit, guvectorize

In [2]:
def base_rotate_index(x, y, rotation, board_size):
    """Map board coordinates through a number of quarter turns.

    One quarter turn sends ``(x, y)`` to ``(y, board_size - x - 1)``. The
    turn is applied ``rotation % 4`` times, so any integer rotation
    (including negative values) is reduced to 0-3 turns.

    Returns the rotated ``(x, y)`` pair.
    """
    last = board_size - 1
    for _ in range(rotation % 4):
        # Quarter turn: the old column becomes the row, the row is mirrored.
        x, y = y, last - x
    return x, y

In [3]:
def base_do_action(state, action, punishment, simulate):
    """Slide and merge the tiles of a 2048 board for one move.

    Every line of the board is processed as "slide towards index 0"; the
    coordinates are passed through the module-level ``rotate_index`` with
    ``action`` as the rotation, so the same scan serves every direction.

    Parameters
    ----------
    state : 2-D integer array
        The board. A cell value of 0 is treated as empty; a merge increments
        the surviving cell by 1 and scores ``2 ** new_value``, i.e. cells
        hold tile exponents. The board is assumed to be square: only
        ``state.shape[0]`` is passed to ``rotate_index`` as the board size.
    action : int
        Rotation passed to ``rotate_index``.
    punishment : int
        Magnitude of the negative value returned when nothing can move.
    simulate : bool
        When true, ``state`` is left untouched.

    Returns
    -------
    The accumulated merge reward if at least one tile moved or merged,
    otherwise ``-1 * punishment``.

    Notes
    -----
    In simulate mode the board is not mutated, so tiles that would already
    have slid or merged are still seen at their original positions. The
    returned reward is therefore an estimate and can differ from the reward
    of the real move (e.g. a line ``0, 1, 1, 1`` counts two merges instead of
    one); whether any move is possible is still reported through the
    ``-1 * punishment`` return.
    """
    reward = 0
    update = False
    board_size = state.shape[0]
    for x in range(board_size):
        for y1 in range(state.shape[1] - 1):
            # The target cell does not depend on y2, so rotate it only once.
            rot_x1, rot_y1 = rotate_index(x, y1, action, board_size)
            for y2 in range(y1 + 1, state.shape[1]):
                rot_x2, rot_y2 = rotate_index(x, y2, action, board_size)
                if state[rot_x2, rot_y2] == 0:
                    # Nothing to pull from this position.
                    continue
                elif state[rot_x1, rot_y1] == 0:
                    # Empty target: the tile slides into it; keep scanning
                    # because a later tile may still merge with it.
                    if not simulate:
                        state[rot_x1, rot_y1] = state[rot_x2, rot_y2]
                        state[rot_x2, rot_y2] = 0
                    update = True
                else:
                    if state[rot_x1, rot_y1] == state[rot_x2, rot_y2]:
                        # Compute the merged exponent before mutating so that
                        # simulate and real moves score the same merge alike.
                        # int() keeps the power out of the array's narrow
                        # dtype, where 2 ** value could overflow.
                        merged = int(state[rot_x1, rot_y1]) + 1
                        if not simulate:
                            state[rot_x1, rot_y1] += 1
                            state[rot_x2, rot_y2] = 0
                        reward += 2 ** merged
                        update = True
                    # The target is occupied: whether or not it merged, no
                    # further tile can reach it during this move.
                    break
    return reward if update else -1 * punishment

## Pure Python

In [4]:
# `base_do_action` calls the module-level name `rotate_index`, so these two
# aliases select the implementation in use. Here: the uncompiled Python ones.
rotate_index = base_rotate_index
do_action = base_do_action

In [5]:
# A single 4x4 board holding the values 0..15 (`base_do_action` treats 0 as an
# empty cell).
state = np.arange(16, dtype=np.uint8).reshape(4, 4)
action = 3  # forwarded to `rotate_index` as the rotation
punishment = 1  # `base_do_action` returns -punishment when nothing moves
simulate = False  # False: the board is modified in place

In [6]:
# Baseline: uncompiled Python. With simulate=False `state` is mutated in place,
# so timing iterations after the first may run on an already-changed board.
%timeit base_do_action(state, action, punishment, simulate)

49.7 µs ± 1.09 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## Just in Time Compilation

In [7]:
# `base_do_action` looks up the global `rotate_index`, so that name has to
# refer to a Numba-compiled function by the time `do_action` is compiled.
rotate_index = jit(nopython=True)(base_rotate_index)
do_action = jit(nopython=True)(base_do_action)

In [8]:
# Warm-up call: no signature was given to `jit`, so compilation happens on the
# first call. Running it here keeps compile time out of the timing that follows.
# The trailing semicolon suppresses the cell output.
do_action(state, action, punishment, simulate);

In [9]:
# Time the JIT-compiled single-board call.
%timeit do_action(state, action, punishment, simulate)

440 ns ± 1.18 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


## Generalized Universal Functions

In [10]:
num = int(1e4)  # number of boards in the batch
# NOTE(review): uint8 / int8 cannot hold the full arange, so the values are
# presumably wrapped around. Fine as benchmark input, but these are not
# meaningful game boards or actions.
states = np.arange(16 * num, dtype=np.uint8).reshape(num, 4, 4)
actions = np.arange(num, dtype=np.int8)

In [11]:
%%timeit
# Python-level loop over the batch: one call of the jitted `do_action` per board.
for index in range(num):
    do_action(states[index], actions[index], punishment, simulate)

8.15 ms ± 174 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
@guvectorize(['void(u1[:,:], i1[:], i4, b1, i4[:])'], '(n,n),(),(),()->()', target='cpu', nopython=True)
def ufunc_do_action(state, action, punishment, simulate, reward):
    """CPU generalized-ufunc wrapper around ``do_action``.

    The layout ``(n,n),(),(),()->()`` maps one square board, one action, the
    punishment and the simulate flag to one reward. ``action`` and ``reward``
    are declared as 1-D arrays in the signature, hence the ``[0]`` indexing;
    the reward is written into ``reward[0]`` instead of being returned.
    """
    reward[0] = do_action(state, action[0], punishment, simulate)

In [13]:
# Single-board call of the gufunc (one 4x4 `state`, scalar `action`); the
# trailing semicolon suppresses the cell output.
ufunc_do_action(state, action, punishment, simulate);

In [14]:
# Same batch as the explicit Python loop, now passed to the gufunc in one call.
%timeit ufunc_do_action(states, actions, punishment, simulate)

778 µs ± 7.09 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## CUDA Generalized Universal Functions

In [15]:
num = int(1e8)  # 1e8 boards x 16 uint8 cells = 1.6 GB of board data
# NOTE(review): as with the smaller batch, uint8 / int8 cannot hold the full
# arange, so the values are presumably wrapped around.
states = np.arange(16 * num, dtype=np.uint8).reshape(num, 4, 4)
actions = np.arange(num, dtype=np.int8)

In [16]:
# CPU gufunc on the large batch: the reference point for the GPU runs.
%timeit ufunc_do_action(states, actions, punishment, simulate)

7.93 s ± 147 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
# Compile the same Python sources as CUDA device functions. `rotate_index` is
# rebound too because `base_do_action` resolves it as a global name.
rotate_index = cuda.jit(device=True)(base_rotate_index)
do_action = cuda.jit(device=True)(base_do_action)

In [18]:
@guvectorize(['void(u1[:,:], i1[:], i4, b1, i4[:])'], '(n,n),(),(),()->()', target='cuda', nopython=True)
def gpu_do_action(state, action, punishment, simulate, reward):
    """CUDA generalized-ufunc wrapper around ``do_action``.

    Same signature and ``(n,n),(),(),()->()`` layout as the CPU gufunc, but
    built with ``target='cuda'``. ``action`` and ``reward`` are declared as
    1-D arrays in the signature, hence the ``[0]`` indexing; the reward is
    written into ``reward[0]`` instead of being returned.
    """
    reward[0] = do_action(state, action[0], punishment, simulate)

In [19]:
# Single-board call of the CUDA gufunc; the trailing semicolon suppresses the
# cell output.
gpu_do_action(state, action, punishment, simulate);

In [20]:
# `states` / `actions` are still NumPy (host) arrays at this point, so this
# timing presumably includes host<->device copies -- TODO confirm.
%timeit gpu_do_action(states, actions, punishment, simulate)

356 ms ± 3.84 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
# Rebind the inputs as CuPy arrays and preallocate the int32 output, presumably
# so the CUDA gufunc can operate on data that already lives on the GPU.
# Note that `state`, `states` and `actions` no longer refer to NumPy arrays.
state = cp.arange(16, dtype=cp.uint8).reshape(4, 4)
states = cp.arange(16 * num, dtype=cp.uint8).reshape(num, 4, 4)
actions = cp.arange(num, dtype=cp.int8)
rewards = cp.zeros(num, dtype=cp.int32)

In [22]:
# Inputs and `out=` are all CuPy arrays; rewards are written into the
# preallocated `rewards` array.
%timeit gpu_do_action(states, actions, punishment, simulate, out=rewards)

33 ms ± 2.07 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [23]:
# Single board on the GPU: `state` is now the CuPy array, `action` is still the
# plain Python int.
%timeit gpu_do_action(state, action, punishment, simulate)

1.33 ms ± 17 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [24]:
import torch

In [27]:
# NOTE(review): no `device=` is passed, so these tensors are presumably created
# on the CPU -- confirm whether CUDA tensors were intended.
# NOTE(review): `states`, `actions` and `rewards` rebind the CuPy arrays of the
# same names.
states = torch.arange(16 * num, dtype=torch.uint8).reshape(num, 4, 4)
actions = torch.arange(num, dtype=torch.int8)
rewards = torch.zeros(num, dtype=torch.int32)

In [28]:
# NOTE(review): this passes `state` and `action` (the single CuPy board and the
# Python int from earlier cells), not the torch `states` / `actions` / `rewards`
# -- confirm that this is the intended measurement.
%timeit gpu_do_action(state, action, punishment, simulate)

1.37 ms ± 23 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
