In [1]:
!pip install pycuda

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

import numpy as np
import math

# Memory Management

In [None]:
cuda.mem_get_info()

(15734669312, 15843721216)

In [None]:
a = np.zeros((100000000)).astype(np.float32)
a_gpu = cuda.mem_alloc(a.nbytes)

In [None]:
cuda.mem_get_info()

(15334113280, 15843721216)

In [None]:
cuda.memcpy_htod(a_gpu, a)

a_out = np.empty_like(a)

cuda.memcpy_dtoh(a_out, a_gpu)

print(a_out)

[0. 0. 0. ... 0. 0. 0.]


In [None]:
cuda.mem_get_info()

(15334113280, 15843721216)

In [None]:
a_gpu.free()

In [None]:
cuda.mem_get_info()

(15734669312, 15843721216)

# Memory Training

In [None]:
from pycuda.compiler import SourceModule

In [None]:
## block=(6,1,1), grid=(step,1,1)
get_G_matrix_ker_function = \
"""
#define tx (threadIdx.x)
#define bx (blockIdx.x)
#define step (gridDim.x)

__global__ void get_G_matrix(float* input_matrix, float dt, float* G) {
    // 6: DOF, 18: DOF * axis
    int index = tx + (tx%3) * 6 + bx * 18;

    if (tx < 3) {
        float value;
        value = input_matrix[0] + (step - bx - 1) * dt * input_matrix[1];

        G[index] = value;
    }
    else {
        G[index] = dt;
    }

    __syncthreads();
}
"""
get_G_matrix_ker = SourceModule(get_G_matrix_ker_function)

get_G_matrix = get_G_matrix_ker.get_function("get_G_matrix")

In [None]:
step = 100000
a = np.float32(np.array([1]))
a.nbytes

4

In [None]:
## check memory usage
print(cuda.mem_get_info())

## set constant
dt = np.float32(0.05)

## G: 6 x 150 matrix... each number has 4 bytes
G_cpu = np.zeros((6*3*step)).astype(np.float32)
G = cuda.mem_alloc(4*6*3*step)

## check memory usage
print(cuda.mem_get_info())

## input_matrix: 2 x 1 matrix... 
input_matrix = cuda.mem_alloc(4*2)
input_matrix_cpu = np.array([0.5*dt*dt, dt]).astype(np.float32)
cuda.memcpy_htod(input_matrix, input_matrix_cpu)

## feel G matrix
get_G_matrix(input_matrix,
             dt,
             G,
             block=(6,1,1),
             grid=(step,1,1))

## check memory usage
print(cuda.mem_get_info())

## copy result...
G_out = np.empty_like(G_cpu)
cuda.memcpy_dtoh(G_out, G)

(15734669312, 15843721216)
(15726280704, 15843721216)
(15726280704, 15843721216)


In [None]:
G_out.reshape(3*step,6).T[:,:3]

array([[2.4999876e+02, 0.0000000e+00, 0.0000000e+00],
       [0.0000000e+00, 2.4999876e+02, 0.0000000e+00],
       [0.0000000e+00, 0.0000000e+00, 2.4999876e+02],
       [5.0000001e-02, 0.0000000e+00, 0.0000000e+00],
       [0.0000000e+00, 5.0000001e-02, 0.0000000e+00],
       [0.0000000e+00, 0.0000000e+00, 5.0000001e-02]], dtype=float32)

In [None]:
(99999+0.5) * 0.05 * 0.05

249.99875000000003

In [None]:
cuda.mem_get_info()

(15726280704, 15843721216)

In [None]:
G.free()
input_matrix.free()

In [None]:
cuda.mem_get_info()

(15734669312, 15843721216)

# First of all, Let's rebuild our MEC class

In [3]:
class MinimumEnergyControl:
    def __init__(self, x_des, x_0, step=50, dt=0.05):

        ## very important constants
        self.byte = 4
        self.axis = 3
        self.DOF  = 6
        self.step = step

        ## gravity, criterion: moon
        gravity = 1.62      # N/kg

        ## A
        state_transition_matrix = \
        np.array([[ 1, 0, 0,dt, 0, 0],
                  [ 0, 1, 0, 0,dt, 0],
                  [ 0, 0, 1, 0, 0,dt],
                  [ 0, 0, 0, 1, 0, 0],
                  [ 0, 0, 0, 0, 1, 0],
                  [ 0, 0, 0, 0, 0, 1]])

        ## B
        input_matrix = \
        np.array([[0.5*dt*dt,        0,        0],
                  [        0,0.5*dt*dt,        0],
                  [        0,        0,0.5*dt*dt],
                  [        dt,       0,        0],
                  [        0,        dt,       0],
                  [        0,        0,       dt]])
        self.input_matrix = cuda.mem_alloc(self.byte*2)
        cuda.memcpy_htod(self.input_matrix, input_matrix[::3,0].astype(np.float32))

        ## g
        gravity_matrix = \
        np.array([[                0],
                  [                0],
                  [0.5*gravity*dt*dt],
                  [                0],
                  [                0],
                  [       gravity*dt]]) 
        self.gravity_matrix = cuda.mem_alloc(self.byte*2)
        cuda.memcpy_htod(self.gravity_matrix, gravity_matrix[2::3].astype(np.float32))

        ## desired state: x_des
        self.x_des = cuda.mem_alloc(self.byte*self.DOF)
        cuda.memcpy_htod(self.x_des, x_des.astype(np.float32))

        ## initial state: x_0
        self.x_0 = cuda.mem_alloc(self.byte*self.DOF)
        cuda.memcpy_htod(self.x_0, x_0.astype(np.float32))

        ## current state: x_current
        self.x_current = cuda.mem_alloc(self.byte*self.DOF)
        cuda.memcpy_htod(self.x_0, x_0.astype(np.float32))
        
        ## dt
        self.dt = np.float32(dt)

        ## weight
        self.rho = 100

        ## define kernel function
        self.kernel_function()

    def run(self):
        ## get_gradient
        self.get_gradient(self.gram_G,
                          self.u,
                          self.G_C,
                          self.iteration,
                          self.gradient,
                          np.int32(self.step),
                          block=(self.TPB,1,1),
                          grid=(self.axis*self.step,1,1))

    def define_problem(self):
        ## initialize
        try:
            self.memory_free()
        except:
            pass

        ## TPB, iteration
        self.TPB, self.iteration = self.define_optimal_kernel_size()

        ## matrices
        self.memory_allocation()
        self.define_matrix()

    def define_optimal_kernel_size(self):
        thread_per_block = int(math.sqrt(self.step/2))
        
        iteration = int(self.step / thread_per_block) + 1

        return thread_per_block, np.int32(iteration)

    def memory_allocation(self):
        ## rho matrix: 36 * step * step bytes
        rho_matrix = math.sqrt(self.rho) * np.identity(self.axis*self.step)
        rho_matrix_byte = self.byte * self.axis * self.axis * self.step * self.step
        self.rho_matrix = cuda.mem_alloc(rho_matrix_byte)
        cuda.memcpy_htod(self.rho_matrix, rho_matrix.astype(np.float32))

        ## solution!!!
        u = np.zeros((self.axis*self.step))
        u_byte = self.byte * self.axis * self.step
        self.u = cuda.mem_alloc(u_byte)
        cuda.memcpy_htod(self.u, u.astype(np.float32))

        ## G
        G_byte = self.byte * self.DOF * self.axis * self.step
        self.G = cuda.mem_alloc(G_byte)

        ## gram_G
        gram_G_byte = self.byte * self.axis * self.axis * self.step * self.step
        self.gram_G = cuda.mem_alloc(gram_G_byte)

        ## Q
        Q_byte = self.byte * self.DOF
        self.Q = cuda.mem_alloc(Q_byte)

        ## C
        C_byte = self.byte * self.DOF
        self.C = cuda.mem_alloc(C_byte)

        ## G_C
        G_C_byte = self.byte * self.axis * self.step
        self.G_C = cuda.mem_alloc(G_C_byte)

        ## gradient
        gradient_byte = self.byte * self.axis * self.step
        self.gradient = cuda.mem_alloc(gradient_byte)

    def define_matrix(self):
        self.get_G_matrix(self.input_matrix,
                          self.dt,
                          self.G,
                          block=(6,1,1),
                          grid=(self.step,1,1))
        
        self.get_Q_matrix(self.gravity_matrix,
                          self.dt,
                          self.Q,
                          block=(self.step,1,1),
                          grid=(2,1,1))
        
        self.get_G_gram_matrix(self.G,
                               self.rho_matrix,
                               self.gram_G,
                               np.int32(self.step),
                               block=(3,1,1),
                               grid=(self.step,self.step,1))
                               
        self.get_G_C_matrix(self.G,
                            self.x_des,
                            self.x_0,
                            self.Q,
                            self.C,
                            self.G_C,
                            block=(3,1,1),
                            grid=(self.step,1,1))

    def memory_free(self):
        self.rho_matrix.free()
        self.u.free()
        self.G.free()
        self.gram_G.free()
        self.Q.free()
        self.C.free()
        self.G_C.free()
        self.gradient.free()

    def memory_freeall(self):

        try:
            self.memory_free()
        except:
            pass

        self.input_matrix.free()
        self.gravity_matrix.free()
        self.x_des.free()
        self.x_0.free()
        self.x_current.free()

    def kernel_function(self):
        ## block=(TPB,1,1), grid=(axis*step,1,1)
        get_gradient_ker_function = \
        """
        #define tx (threadIdx.x)
        #define bx (blockIdx.x)
        #define bs (blockDim.x)
        #define gs (gridDim.x)

        __global__ void get_gradient(float* matrix, float* vector1, float* vector2, int iteration, float* result, int step) {

            __shared__ float result_jerk[1000];

            result_jerk[tx] = 0.0;

            for (int i = 0; i < iteration; i++) {            
                int index1 = i + tx * iteration;
                int index2 = index1 + bx * 3 * step;

                if (index1 < gs) {
                    result_jerk[tx] += matrix[index2] * vector1[index1];
                }
                else {
                    result_jerk[1000-tx] = 0.0;
                }
            }

            __syncthreads();

            if (tx == 0) {
                for (int j = 0; j < bs; j++) {
                    result[bx] += result_jerk[j];
                }

                result[bx] -= vector2[bx];
            }
            else {
                result_jerk[1000-tx] = 0.0;
            }

            __syncthreads();
        }
        """
        get_gradient_ker = SourceModule(get_gradient_ker_function)

        ## block=(6,1,1), grid=(step,1,1)
        get_G_matrix_ker_function = \
        """
        #define tx (threadIdx.x)
        #define bx (blockIdx.x)
        #define step (gridDim.x)

        __global__ void get_G_matrix(float* input_matrix, float dt, float* G) {
            // 6: DOF, 18: axis * DOF
            int index = tx + (tx%3) * 6 + bx * 18;

            if (tx < 3) {
                float value;
                value = input_matrix[0] + (step - bx - 1) * dt * input_matrix[1];

                G[index] = value;
            }
            else {
                G[index] = dt;
            }

            __syncthreads();
        }
        """
        get_G_matrix_ker = SourceModule(get_G_matrix_ker_function)

        ## block=(step,1,1), grid=(2,1,1)
        get_Q_matrix_ker_function = \
        """
        #define tx (threadIdx.x)
        #define bx (blockIdx.x)
        #define step (blockDim.x)

        __global__ void get_Q_matrix(float* gravity, float dt, float* Q) {
            
            __shared__ float value[1000];

            if (bx == 0) {
                value[tx] = gravity[0] + (tx * dt) * gravity[1];
            }
            else {
                value[tx] = gravity[1];
            }

            __syncthreads();

            if (bx == 0) {
                if (tx == 0) {
                    for (int i = 0; i < step; i++) {
                        Q[2] += value[i];
                    }
                }
            }
            else {
                if (tx == 0) {
                    for (int i = 0; i < step; i++) {
                        Q[5] += value[i];
                    }
                }
            }

            __syncthreads();
        }
        """
        get_Q_matrix_ker = SourceModule(get_Q_matrix_ker_function)

        ## block=(3,1,1), grid=(step,step,1)
        get_G_gram_matrix_ker_function = \
        """
        #define tx (threadIdx.x)
        #define bx (blockIdx.x)
        #define by (blockIdx.y)
        #define step (gridDim.x)

        __global__ void get_G_gram_matrix(float* G, float* rho_matrix, float* gram_G) {
            // 3: axis
            int index1 = 3 * step + 1;
            int index2 = 3 * 3 * step;
            int index3 = tx * index1 + bx * 3 + by * index2;

            // 7: DOF+1, 18: axis*DOF
            int index4 = tx * 7 + bx * 18;
            int index5 = tx * 7 + by * 18;

            float value = 0.0;
            value = G[index4] * G[index5] + G[index4+3] * G[index5+3];

            gram_G[index3] = value;

            __syncthreads();

            gram_G[index3] += rho_matrix[index3] * rho_matrix[index3];

            __syncthreads();
        }
        """
        get_G_gram_matrix_ker = SourceModule(get_G_gram_matrix_ker_function)

        ## block=(3,1,1), grid=(step,1,1)
        get_G_C_matrix_ker_function = \
        """
        #define tx (threadIdx.x)
        #define bx (blockIdx.x)

        __global__ void get_G_C_matrix(float* G, float* x_des, float* x_current, float* Q, float* C, float * G_C) {

            __shared__ float C_jerk[6];

            C_jerk[tx] = x_des[tx] - Q[tx] - x_current[tx];
            C_jerk[tx+3] = x_des[tx+3] - Q[tx+3] - x_current[tx+3];

            __syncthreads();

            C[tx] = C_jerk[tx];
            C[tx+3] = C_jerk[tx+3];


            __syncthreads();

            // 7: DOF+1, 18: axis*DOF;
            int index1 = tx * 7 + bx * 18;
            int index2 = tx + bx * 3;

            float value;
            value = G[index1] * C_jerk[tx] + G[index1+3] * C_jerk[tx+3];

            __syncthreads();

            G_C[index2] = value;

            __syncthreads();
        }
        """
        get_G_C_matrix_ker = SourceModule(get_G_C_matrix_ker_function)

        self.get_G_matrix      = get_G_matrix_ker.get_function("get_G_matrix")
        self.get_Q_matrix      = get_Q_matrix_ker.get_function("get_Q_matrix")
        self.get_G_gram_matrix = get_G_gram_matrix_ker.get_function("get_G_gram_matrix")
        self.get_G_C_matrix    = get_G_C_matrix_ker.get_function("get_G_C_matrix")
        self.get_gradient      = get_gradient_ker.get_function("get_gradient")

    def copy_and_unpack_result(self):
        ## copy rho matrix
        rho_matrix = np.empty((self.axis*self.axis*self.step*self.step)).astype(np.float32)
        cuda.memcpy_dtoh(rho_matrix, self.rho_matrix)

        ## copy solution
        u = np.empty((self.axis*self.step)).astype(np.float32)
        cuda.memcpy_dtoh(u, self.u)

        ## copy G matrix        
        G = np.empty((self.DOF*self.axis*self.step)).astype(np.float32)
        cuda.memcpy_dtoh(G, self.G)

        ## copy gram matrix of G
        gram_G = np.empty((self.axis*self.axis*self.step*self.step)).astype(np.float32)
        cuda.memcpy_dtoh(gram_G, self.gram_G)

        ## copy Q matrix
        Q = np.empty((self.DOF)).astype(np.float32)
        cuda.memcpy_dtoh(Q, self.Q)

        ## copy C matrix
        C = np.empty((self.DOF)).astype(np.float32)
        cuda.memcpy_dtoh(C, self.C)

        ## copy G_C matrix
        G_C = np.empty((self.axis*self.step)).astype(np.float32)
        cuda.memcpy_dtoh(G_C, self.G_C)

        ## pack data
        matrices = dict()
        matrices["rho_matrix"] = rho_matrix.reshape(self.axis*self.step,self.axis*self.step)
        matrices["u"]          = u.reshape(self.axis*self.step,1)
        matrices["G"]          = G.reshape(self.axis*self.step,self.DOF).T 
        matrices["gram_G"]     = gram_G.reshape(self.axis*self.step,self.axis*self.step) 
        matrices["Q"]          = Q.reshape(self.DOF,1)
        matrices["C"]          = C.reshape(self.DOF,1)
        matrices["G_C"]        = G_C.reshape(self.axis*self.step,1)

        ## delete all memory
        self.memory_free()

        return matrices

## Test

In [None]:
x_des = np.array([0,0,0,0,0,0])

x_0 = np.array([100,0,-1500,1-10,0,80])

MEC = MinimumEnergyControl(x_des, x_0, step=50)

In [None]:
MEC.define_problem()

In [None]:
rho_matrix_cpu = np.empty((MEC.axis*MEC.axis*MEC.step*MEC.step)).astype(np.float32)
cuda.memcpy_dtoh(rho_matrix_cpu, MEC.rho_matrix)
rho_matrix_cpu.reshape(150,150)

array([[10.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0., 10.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0., 10., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0., 10.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0., 10.]], dtype=float32)

In [None]:
G_cpu = np.empty((MEC.DOF*MEC.axis*MEC.step)).astype(np.float32)
cuda.memcpy_dtoh(G_cpu, MEC.G)
G_cpu.reshape(150,6).T[:,:3]

array([[0.12375, 0.     , 0.     ],
       [0.     , 0.12375, 0.     ],
       [0.     , 0.     , 0.12375],
       [0.05   , 0.     , 0.     ],
       [0.     , 0.05   , 0.     ],
       [0.     , 0.     , 0.05   ]], dtype=float32)

In [None]:
Q_cpu = np.empty((MEC.DOF)).astype(np.float32)
cuda.memcpy_dtoh(Q_cpu, MEC.Q)
Q_cpu.reshape(6,1)

array([[0.       ],
       [0.       ],
       [5.062501 ],
       [0.       ],
       [0.       ],
       [4.0500016]], dtype=float32)

In [None]:
gram_G_cpu = np.empty((MEC.axis*MEC.axis*MEC.step*MEC.step)).astype(np.float32)
cuda.memcpy_dtoh(gram_G_cpu, MEC.gram_G)
gram_G_cpu.reshape(150,150)[:3,:3]

array([[100.017815,   0.      ,   0.      ],
       [  0.      , 100.017815,   0.      ],
       [  0.      ,   0.      , 100.017815]], dtype=float32)

In [None]:
C_cpu = np.empty((MEC.DOF)).astype(np.float32)
cuda.memcpy_dtoh(C_cpu, MEC.C)
C_cpu.reshape(6,1)

array([[-100.    ],
       [   0.    ],
       [1494.9375],
       [   9.    ],
       [   0.    ],
       [ -84.05  ]], dtype=float32)

In [None]:
G_C_cpu = np.empty((MEC.axis*MEC.step)).astype(np.float32)
cuda.memcpy_dtoh(G_C_cpu, MEC.G_C)
G_C_cpu.reshape(150,1)[:6]

array([[-11.925  ],
       [  0.     ],
       [180.79602],
       [-11.675  ],
       [  0.     ],
       [177.05867]], dtype=float32)

In [None]:
MEC.memory_freeall()

## Test with more step

In [None]:
x_des = np.array([0,0,0,0,0,0])

x_0 = np.array([100,0,-1500,1-10,0,80])

MEC = MinimumEnergyControl(x_des, x_0, step=70)

In [None]:
MEC.define_problem()

In [None]:
rho_matrix_cpu = np.empty((MEC.axis*MEC.axis*MEC.step*MEC.step)).astype(np.float32)
cuda.memcpy_dtoh(rho_matrix_cpu, MEC.rho_matrix)
rho_matrix_cpu.reshape(210,210)

array([[10.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0., 10.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0., 10., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0., 10.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0., 10.]], dtype=float32)

In [None]:
G_cpu = np.empty((MEC.DOF*MEC.axis*MEC.step)).astype(np.float32)
cuda.memcpy_dtoh(G_cpu, MEC.G)
G_cpu.reshape(210,6).T[:,:3]

array([[0.17375, 0.     , 0.     ],
       [0.     , 0.17375, 0.     ],
       [0.     , 0.     , 0.17375],
       [0.05   , 0.     , 0.     ],
       [0.     , 0.05   , 0.     ],
       [0.     , 0.     , 0.05   ]], dtype=float32)

In [None]:
Q_cpu = np.empty((MEC.DOF)).astype(np.float32)
cuda.memcpy_dtoh(Q_cpu, MEC.Q)
Q_cpu.reshape(6,1)

array([[0.       ],
       [0.       ],
       [9.922501 ],
       [0.       ],
       [0.       ],
       [5.6699986]], dtype=float32)

In [None]:
C_cpu = np.empty((MEC.DOF)).astype(np.float32)
cuda.memcpy_dtoh(C_cpu, MEC.C)
C_cpu.reshape(6,1)

array([[-100.    ],
       [   0.    ],
       [1490.0775],
       [   9.    ],
       [   0.    ],
       [ -85.67  ]], dtype=float32)

In [None]:
G_C_cpu = np.empty((MEC.axis*MEC.step)).astype(np.float32)
cuda.memcpy_dtoh(G_C_cpu, MEC.G_C)
G_C_cpu.reshape(210,1)[:6]

array([[-16.925  ],
       [  0.     ],
       [254.61746],
       [-16.675  ],
       [  0.     ],
       [250.89227]], dtype=float32)

In [None]:
MEC.memory_freeall()

In [None]:
cuda.mem_get_info()

(15734669312, 15843721216)

## 문제 해결!!

In [4]:
x_des = np.array([0,0,0,0,0,0])

x_0 = np.array([100,0,-1500,1-10,0,80])

MEC = MinimumEnergyControl(x_des, x_0, step=300)

In [5]:
MEC.define_problem()

In [6]:
rho_matrix_cpu = np.empty((MEC.axis*MEC.axis*MEC.step*MEC.step)).astype(np.float32)
cuda.memcpy_dtoh(rho_matrix_cpu, MEC.rho_matrix)
rho_matrix_cpu.reshape(900,900)

array([[10.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0., 10.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0., 10., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0., 10.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0., 10.]], dtype=float32)

In [7]:
G_cpu = np.empty((MEC.DOF*MEC.axis*MEC.step)).astype(np.float32)
cuda.memcpy_dtoh(G_cpu, MEC.G)
G_cpu.reshape(900,6).T[:,:3]

array([[0.74875003, 0.        , 0.        ],
       [0.        , 0.74875003, 0.        ],
       [0.        , 0.        , 0.74875003],
       [0.05      , 0.        , 0.        ],
       [0.        , 0.05      , 0.        ],
       [0.        , 0.        , 0.05      ]], dtype=float32)

In [8]:
Q_cpu = np.empty((MEC.DOF)).astype(np.float32)
cuda.memcpy_dtoh(Q_cpu, MEC.Q)
Q_cpu.reshape(6,1)

array([[  0.      ],
       [  0.      ],
       [182.25    ],
       [  0.      ],
       [  0.      ],
       [ 24.299963]], dtype=float32)

In [9]:
C_cpu = np.empty((MEC.DOF)).astype(np.float32)
cuda.memcpy_dtoh(C_cpu, MEC.C)
C_cpu.reshape(6,1)

array([[-100.      ],
       [   0.      ],
       [1317.75    ],
       [   9.      ],
       [   0.      ],
       [-104.299965]], dtype=float32)

In [10]:
G_C_cpu = np.empty((MEC.axis*MEC.step)).astype(np.float32)
cuda.memcpy_dtoh(G_C_cpu, MEC.G_C)
G_C_cpu.reshape(900,1)[:6]

array([[-74.425 ],
       [  0.    ],
       [981.4504],
       [-74.175 ],
       [  0.    ],
       [978.156 ]], dtype=float32)

In [11]:
cuda.mem_get_info()

(15726280704, 15843721216)

In [12]:
MEC.memory_freeall()

In [13]:
cuda.mem_get_info()

(15736766464, 15843721216)

# Next...

In [17]:
class OptimizerForGuidance:
    def __init__(self, learning_rate):
        ## set parameters
        self.learning_rate = np.float32(learning_rate)
        
        ## kernel function
        self.kernel_function()

    def run(self, theta, gradient, step):
        ## theta, gradient: gpuarray type variable
        self.basic_optimizer(theta,
                             gradient,
                             self.learning_rate,
                             block=(3,1,1),
                             grid=(step,1,1))

    def kernel_function(self):
        ## block=(3,1,1), grid=(step,1,1)
        basic_optimizer_ker_function = \
        """
        #define tx (threadIdx.x)
        #define bx (blockIdx.x)

        __global__ void basic_optimizer(float* theta, float* gradient, float learning_rate) {
            int index = tx + bx * 3;

            theta[index] -= gradient[index] * learning_rate;

            __syncthreads();
        }
        """
        basic_optimizer_ker = SourceModule(basic_optimizer_ker_function)

        self.basic_optimizer = basic_optimizer_ker.get_function("basic_optimizer")

In [18]:
class ConstraintsForInput:
    def __init__(self, problem, upper_boundary, downer_boundary):
        ## ex> MEC(minimum energy control)
        self.problem = problem

        self.upper_boundary = np.float32(upper_boundary)
        self.downer_boundary = np.float32(downer_boundary)

        ## kernel function
        self.kernel_function()

    def projection(self):
        self.project_function(self.problem.u,
                              self.upper_boundary,
                              self.downer_boundary,
                              block=(3,1,1),
                              grid=(self.problem.step,1,1))

    def kernel_function(self):
        ##block=(3,1,1), grid=(step,1,1)
        projection_ker_function = \
        """
        #define tx (threadIdx.x)
        #define bx (blockIdx.x)

        __device__ float square_root(float value) {
            float s = 0;
            float t = 0;

            s = value / 2;

            for (;s != t;) {
                t = s;
                s = ((value/t) + t) / 2;
            }

            return s;
        }

        __device__ float get_norm(float* vector, int length) {
            float value = 0.0;
            float norm;

            for (int i = 0; i < length; i++) {
                value += vector[i] * vector[i];
            }

            norm = square_root(value);

            return norm;
        }

        __global__ void projection(float* theta, float upper_boundary, float downer_boundary) {

            __shared__ float u[3];
            __shared__ float norm[1];
            __shared__ float value[1];

            int index = tx + bx * 3;

            u[tx] = theta[index];

            __syncthreads();

            if (tx == 0) {
                norm[0] = get_norm(u, 3);
            }

            __syncthreads();

            if ((norm[0] > downer_boundary) && (norm[0] < upper_boundary)) {
                value[0] = u[tx];
            }
            else {
                value[0] = u[tx] * upper_boundary / norm[0];
            }

            __syncthreads();

            theta[index] = value[0];
        }
        """
        projection_ker = SourceModule(projection_ker_function)

        self.project_function = projection_ker.get_function("projection")

# Last One... MECS class

In [31]:
class MinimumEnergyControlSolver:
    def __init__(self, x_des, x_0, upper_boundary, downer_boundary, step=50, max_iteration=200):
        ## important constants
        self.byte = 4
        self.axis = 3
        self.DOF  = 6
        self.step = step
        self.initial_step = step

        ## max iteration
        self.max_iteration = max_iteration

        ## initialize MEC(minimum energy control)
        self.MEC = MinimumEnergyControl(x_des, x_0, step=step)

        ## initialize optimizer
        learning_rate = 1e-4

        self.optimizer = OptimizerForGuidance(learning_rate)

        ## constraint
        self.upper_boundary  = upper_boundary
        self.downer_boundary = downer_boundary

        self.constraint = ConstraintsForInput(self.MEC, self.upper_boundary, self.downer_boundary)

        ## evaluate
        ## error_vector
        error_vector_byte = self.byte * (self.DOF + self.axis*self.step)
        self.error_vector = cuda.mem_alloc(error_vector_byte)

        ## error
        error_byte = self.byte * self.initial_step
        self.error = cuda.mem_alloc(error_byte)

        ## state record
        state_byte = self.byte * self.DOF * self.step
        self.state = cuda.mem_alloc(state_byte)

        ## kernel function
        self.kernel_function()

    def solve(self):
    # for step in range(self.MEC.step-1):
        ## define problem: fit matrices for left step
        self.MEC.define_problem()

        for iteration in range(self.max_iteration):
            ## get gradient
            self.MEC.run()

            ## optimize
            self.optimizer.run(self.MEC.u, self.MEC.gradient, self.MEC.step)

            ## constraint
            self.constraint.projection()

            ## evaluate
            self.evaluate(0)

            ## update state
            self.update_state(0)

            ## record data

            ## next step
            self.MEC.step -= 1
    
    def define_optimal_kernel_size(self):
        thread_per_block = int(math.sqrt(self.step/2))
        
        iteration = int(self.step / thread_per_block) + 1

        return thread_per_block, np.int32(iteration)

    def evaluate(self, current_step):
        ## set size
        block_size = self.step + 2
        grid_size  = self.axis * self.step + self.DOF

        ## evaluate learning
        self.get_error_vector(self.MEC.G,
                              self.MEC.rho_matrix,
                              self.MEC.u,
                              self.MEC.C,
                              self.MEC.iteration,
                              self.error_vector,
                              block=(self.MEC.TPB,1,1),
                              grid=(grid_size,1,1))
        
        self.get_error(self.error_vector,
                       self.error,
                       np.int32(current_step),
                       block=(block_size,1,1),
                       grid=(1,1,1))

    def update_state(self, step):
        ## update state
        self.get_next_state(self.MEC.x_current,
                            self.MEC.u,
                            self.MEC.dt,
                            self.MEC.gravity_matrix,
                            self.state,
                            np.int32(self.step),
                            block=(6,1,1),
                            grid=(1,1,1))

    def memory_freeall(self):

        try:
            self.MEC.memory_freeall()

        except:
            pass

        self.error_vector.free()
        self.error.free()
        self.state.free()

    def kernel_function(self):
        ## block=(TPB,1,1), grid=(DOF+axis*step,1,1)
        get_error_vector_ker_function = \
        """
        #define tx (threadIdx.x)
        #define bx (blockIdx.x)
        #define bs (blockDim.x)
        #define gs (gridDim.x)

        __global__ void get_error_vector(float* G, float* rho_matrix, float* u, float* C, int iteration, float* error_vector) {

            if (bx < 6) {

                __shared__ float value[100];

                value[tx] = 0.0;

                __syncthreads();

                for (int i = 0; i < iteration; i++) {
                    int index1 = bx % 3;
                    int index2 = i * 5 + tx % 5;
                    int index3 = index1 + index2 * 3;
                    int index4 = bx + index1 * 6 + index2 * 18;

                    value[tx] += G[index4] * u[index3];
                }

                __syncthreads();

                if (tx == 0) {
                    value[50] = 0.0;

                    for (int j = 0; j < bs; j++) {
                        value[50] += value[j];
                    }

                    error_vector[bx] = value[50] - C[bx];
                }

                __syncthreads();
            }
            else {
                if (tx == 0) {
                    int index1 = bx - 6;
                    int index2 = gs - 5;
                    int index3 = index1 * index2;

                    error_vector[bx] = rho_matrix[index3] * u[index1];
                }

                __syncthreads();
            }
        }
        """
        get_error_vector_ker = SourceModule(get_error_vector_ker_function)

        ## block=(step+2,1,1), grid=(1,1,1)
        get_error_ker_function = \
        """
        #define tx (threadIdx.x)
        #define bx (blockIdx.x)
        #define bs (blockDim.x)

        __device__ float square_root(float value) {
            float s = 0;
            float t = 0;

            s = value / 2;

            for (;s != t;) {
                t = s;
                s = ((value/t) + t) / 2;
            }

            return s;
        }

        __device__ float get_norm(float* vector, int length) {
            float value = 0.0;
            float norm;

            for (int i = 0; i < length; i++) {
                value += vector[i] * vector[i];
            }

            norm = square_root(value);

            return norm;
        }

        __global__ void get_error(float* error_vector, float* error, int current_step) {

            __shared__ float value[1000];

            int index = tx * 3;

            for (int i = 0; i < 3; i++) {
                value[index+i] = error_vector[index+i];
            }

            __syncthreads();

            if (tx == 0) {
                int length = bs;

                error[current_step] = get_norm(value, length);
            }

            __syncthreads();
        }
        """
        get_error_ker = SourceModule(get_error_ker_function)

        ## block=(6,1,1), grid=(1,1,1)
        get_next_state_ker_function = \
        """
        #define tx (threadIdx.x)

        __global__ void get_next_state(float* x, float* u, float dt, float* gravity_matrix, float* state, int step) {

            __shared__ float momentum[6];
            __shared__ float input[6];
            __shared__ float gravity[6];

            int index1 = tx + step * 6;

            if (tx < 3) {
                int index2 = tx % 3;

                momentum[tx] = x[tx] + dt * x[tx+3];
                input[tx]    = 0.5*dt*dt * u[index2];

                if (index2 == 2) {
                    gravity[tx] = gravity_matrix[tx];
                }
                else {
                    gravity[tx] = 0.0;
                }
            }
            else {
                int index2 = tx % 3;

                momentum[tx] = x[tx];
                input[tx]    = dt * u[index2];

                if (index2 == 2) {
                    gravity[tx] = gravity_matrix[tx];
                }
                else {
                    gravity[tx] = 0.0;
                }
            }

            __syncthreads();

            x[tx] = momentum[tx] + input[tx] + gravity[tx];
            state[index1] = x[tx];

            __syncthreads();
        }
        """
        get_next_state_ker = SourceModule(get_next_state_ker_function)

        self.get_error_vector = get_error_vector_ker.get_function("get_error_vector")
        self.get_error        = get_error_ker.get_function("get_error")
        self.get_next_state   = get_next_state_ker.get_function("get_next_state")

    def copy_and_unpack_result(self):
        ## copy state
        state = np.empty((self.DOF*self.initial_step)).astype(np.float32)
        cuda.memcpy_dtoh(state, self.state)

        ## copy error
        error = np.empty((self.initial_step)).astype(np.float32)
        cuda.memcpy_dtoh(error, self.error)

        ## pack data
        matrices = dict()
        matrices["state"] = state.reshape(self.initial_step,self.DOF).T
        matrices["error"] = error.reshape(self.initial_step)

        ## delete all memory
        self.memory_freeall()

        return matrices

# Test

In [52]:
x_des = np.array([0,0,0,0,0,0])

x_0 = np.array([100,0,-1500,1-10,0,80])

MEC = MinimumEnergyControl(x_des, x_0, step=50)

In [53]:
cuda.mem_get_info()

(15726280704, 15843721216)

In [54]:
MEC.define_problem()
result = MEC.copy_and_unpack_result()

In [55]:
cuda.mem_get_info()

(15726280704, 15843721216)

In [56]:
result["G"][:,:3]

array([[0.12375, 0.     , 0.     ],
       [0.     , 0.12375, 0.     ],
       [0.     , 0.     , 0.12375],
       [0.05   , 0.     , 0.     ],
       [0.     , 0.05   , 0.     ],
       [0.     , 0.     , 0.05   ]], dtype=float32)

In [57]:
MEC.memory_freeall()

In [58]:
cuda.mem_get_info()

(15726280704, 15843721216)

In [59]:
x_des = np.array([0,0,0,0,0,0])

x_0 = np.array([100,0,-1500,1-10,0,80])

MEC = MinimumEnergyControl(x_des, x_0, step=300)

In [60]:
cuda.mem_get_info()

(15726280704, 15843721216)

In [61]:
MEC.define_problem()
result = MEC.copy_and_unpack_result()

In [62]:
cuda.mem_get_info()

(15726280704, 15843721216)

In [63]:
result["G"][:,:3]

array([[0.74875003, 0.        , 0.        ],
       [0.        , 0.74875003, 0.        ],
       [0.        , 0.        , 0.74875003],
       [0.05      , 0.        , 0.        ],
       [0.        , 0.05      , 0.        ],
       [0.        , 0.        , 0.05      ]], dtype=float32)

In [64]:
MEC.memory_freeall()

In [65]:
cuda.mem_get_info()

(15726280704, 15843721216)

# Test 2

In [32]:
x_des = np.array([0,0,0,0,0,0])

x_0 = np.array([100,0,-1500,1-10,0,80])

upper_boundary = 5.8
downer_boundary = 0.0

MECS = MinimumEnergyControlSolver(x_des, x_0, upper_boundary, downer_boundary, step=300)

In [33]:
cuda.mem_get_info()

(15726280704, 15843721216)

In [34]:
MECS.solve()

In [35]:
cuda.mem_get_info()

(15717892096, 15843721216)

In [36]:
result = MECS.copy_and_unpack_result()

In [50]:
cuda.mem_get_info()

(15726280704, 15843721216)

In [47]:
result["state"][:,0]

array([17.041866, 17.041866, 17.041866, 18.413485, 18.413485, 18.413485],
      dtype=float32)

In [48]:
result["error"].shape

(300,)

In [49]:
MECS.memory_freeall()

LogicError: ignored