In [1]:
!pip install pycuda

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycuda
  Downloading pycuda-2022.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 33.2 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pytools>=2011.2
  Downloading pytools-2022.1.12.tar.gz (70 kB)
[K     |████████████████████████████████| 70 kB 10.8 MB/s 
[?25hCollecting mako
  Downloading Mako-1.2.1-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 9.3 MB/s 
Collecting platformdirs>=2.2.0
  Downloading platformdirs-2.5.2-py3-none-any.whl (14 kB)
Building wheels for collected packages: pycuda, pytools
  Building wheel for pycuda (PEP 517) ... [?25l[?25hdone
  Created wheel for pycuda: filename=pycuda-2022.1-cp37-cp37m-linux_x86_64.whl size=629484 sha256=08d3ef0f1a88cb3bdb303f15040fd132219f7a5e558

In [2]:
import pycuda.autoinit
import numpy as np
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import math

In [286]:
class MinimumEnergyControl:
    def __init__(self, x_des, x_0, step=50, dt=0.05, damping=False):

        ## gravity, criterion: moon
        gravity = -1.62     # N/kg

        ## no drag or something disturb movement
        if not damping:
            ## A
            self.state_transition_matrix = \
                np.array([[ 1, 0, 0,dt, 0, 0],
                          [ 0, 1, 0, 0,dt, 0],
                          [ 0, 0, 1, 0, 0,dt],
                          [ 0, 0, 0, 1, 0, 0],
                          [ 0, 0, 0, 0, 1, 0],
                          [ 0, 0, 0, 0, 0, 1]])

            ## B
            input_matrix = \
                np.array([[dt*dt/2,      0,      0],
                          [      0,dt*dt/2,      0],
                          [      0,      0,dt*dt/2],
                          [     dt,      0,      0],
                          [      0,     dt,      0],
                          [      0,      0,     dt]])
            
            self.input_matrix = \
                gpuarray.to_gpu(np.float32(np.array([0.5*dt*dt, dt])))

            ## g
            gravity_matrix = \
                np.array([[              0],
                          [              0],
                          [gravity*dt*dt/2],
                          [              0],
                          [              0],
                          [     gravity*dt]])
                
            self.gravity_matrix = \
                gpuarray.to_gpu(np.float32(np.array([0.5*gravity*dt*dt, gravity*dt])))

        ## drag or something exist...
        else:
            pass

        ## desired state: x_des
        self.x_des = gpuarray.to_gpu(np.float32(x_des))

        ## initial state: x_0
        self.x_0 = gpuarray.to_gpu(np.float32(x_0))

        self.dt = np.float32(dt)

        self.step = step

        ## weight
        self.rho = 3

        self.rho_matrix = \
            gpuarray.to_gpu(np.float32(self.rho * np.identity(3*self.step).reshape(3*3*self.step*self.step)))
        
        ## solution!!!
        self.u = gpuarray.to_gpu(np.float32(np.zeros((3*self.step,1))))

        ## G, gram_G, Q
        self.G = gpuarray.to_gpu(np.float32(np.zeros((3*self.step,6)).reshape(6*3*self.step)))
        self.gram_G = gpuarray.to_gpu(np.float32(np.zeros((3*self.step,3*self.step)).reshape(3*3*self.step*self.step)))
        self.Q = gpuarray.to_gpu(np.float32(np.zeros((6,1))))
        self.G_C = gpuarray.to_gpu(np.float32(np.zeros((150,1))))
        self.gradient = gpuarray.to_gpu(np.float32(np.zeros((150,1))))

        ## TPB: thread_per_block, BPG: block_per_grid
        self.TPB, self.iteration = self.optimal_size(3*self.step) 

    def optimal_size(self, n):
        thread_per_block = int(math.sqrt(n/2))

        iteration = int(n / thread_per_block) + 1

        return thread_per_block, np.int32(iteration)

    def define_object_function_at_kernel(self):
        self.ker_function()

        self.get_G_matrix(self.input_matrix, self.dt, self.G, block=(6,1,1), grid=(self.step,1,1))
        self.get_Q_matrix(self.gravity_matrix, self.dt, self.Q, block=(self.step,1,1), grid=(2,1,1))
        self.get_G_gram_matrix(self.G, self.rho_matrix, self.gram_G, block=(3,1,1), grid=(self.step,self.step,1))
        self.get_G_C_matrix(self.G, self.x_des, self.x_0, self.Q, self.G_C, block=(3,1,1), grid=(self.step,1,1))

    def ker_function(self):
        ## We'll gonna do 150 x 150 @ 150 x 1
        ## block=(thread_per_block,1,1), grid=(3*self.step,1,1)
        get_gradient_ker_function = \
        """
        #define tx (threadIdx.x)
        #define bx (blockIdx.x)
        #define bs (blockDim.x)
        #define gs (gridDim.x)

        __global__ void get_gradient(float* matrix, float* vector1, float* vector2, int iteration, float* result) {

            __shared__ float result_jerk[1000];

            result_jerk[tx] = 0.0;

            for (int i = 0; i < iteration; i++) {
                int index1 = i + tx * iteration;
                int index2 = index1 + bx * 150;

                if (index1 < gs) {
                    result_jerk[tx] += matrix[index2] * vector1[index1];
                }
                else {
                    result_jerk[1000-tx] = 0;
                }
            }

            __syncthreads();

            if (tx == 0) {
                for (int j = 0; j < bs; j++) {
                    result[bx] += result_jerk[j];
                }

                result[bx] -= vector2[bx];
            }
            else {
                result_jerk[1000-tx] = 0;
            }

            __syncthreads();
        }
        """
        get_gradient_ker = SourceModule(get_gradient_ker_function)

        ## block=(6,1,1), grid=(self.step,1,1)
        get_G_matrix_ker_function = \
        """
        #define bx (blockIdx.x)
        #define tx (threadIdx.x)
        #define step (gridDim.x)

        __global__ void get_G_matrix(float* input_matrix, float dt, float* G) {
            // 6: DOF, 18: DOF*axis
            int index = tx + (tx%3) * 6 + bx * 18;

            if (tx < 3) {
                float value;
                value = input_matrix[0] + (step - bx - 1) * input_matrix[1];

                G[index] = value;
            }
            else {
                G[index] = dt;
            }

            __syncthreads();
        }
        """
        get_G_matrix_ker = SourceModule(get_G_matrix_ker_function)

        ## block=(self.step,1,1), grid=(2,1,1)
        get_Q_matrix_ker_function = \
        """
        #define bx (blockIdx.x)
        #define tx (threadIdx.x)
        #define step (blockDim.x)

        __global__ void get_Q_matrix(float* gravity, float dt, float* Q) {
            
            __shared__ float value[50];
            
            if (bx == 0) {
                value[tx] = gravity[0] + (tx * dt) * gravity[1];
            }
            else {
                value[tx] = gravity[1];
            }

            __syncthreads();

            if (bx == 0) {
                if(tx == 0) {
                    for (int i = 0; i < step; i++) {
                        Q[2] += value[i];
                    }
                }
            }
            else {
                if(tx == 0) {
                    for (int i = 0; i < step; i++) {
                        Q[5] += value[i];
                    }
                }
            }

            __syncthreads();
        }
        """
        get_Q_matrix_ker = SourceModule(get_Q_matrix_ker_function)

        ## block=(3,1,1), grid=(self.step,self.step,1)
        get_G_gram_matrix_ker_function = \
        """
        #define bx (blockIdx.x)
        #define by (blockIdx.y)
        #define tx (threadIdx.x)
        #define step (gridDim.x)

        __global__ void get_G_gram_matrix(float* G, float* rho_matrix, float* gram_G) {
            // 9: axis, 151: axis*step+1, 450: axis*axis*step
            int index1 = tx * 151 + bx * 3 + by * 450;

            // 7: DOF+1, 18: DOF*axis
            int index2 = tx * 7 + bx * 18;
                
            float value;
            value = G[index2] * G[index2] + G[index2+3] * G[index2+3];

            gram_G[index1] = value; 

            __syncthreads();

            gram_G[index1] += rho_matrix[index1];

            __syncthreads();
        }
        """
        get_G_gram_matrix_ker = SourceModule(get_G_gram_matrix_ker_function)

        ## block=(3,1,1), grid=(self.step,1,1)
        get_G_C_matrix_ker_function = \
        """
        #define bx (blockIdx.x)
        #define tx (threadIdx.x)

        __global__ void get_G_C_matrix(float* G, float* x_des, float* x_0, float* Q, float* G_C) {
            // C first in each block
            __shared__ float C[6];

            C[tx] = x_des[tx] - Q[tx] - x_0[tx];
            C[tx+3] = x_des[tx+3] - Q[tx+3] - x_0[tx+3];

            __syncthreads();

            // G_C Next
            int index1 = tx * 7 + bx * 18;
            int index2 = tx + bx * 3;

            float value;
            value = G[index1] * C[tx] + G[index1+3] * C[tx+3];

            __syncthreads();

            G_C[index2] = value;

            __syncthreads();
        }
        """
        get_G_C_matrix_ker = SourceModule(get_G_C_matrix_ker_function)

        self.get_G_matrix = get_G_matrix_ker.get_function("get_G_matrix")
        self.get_Q_matrix = get_Q_matrix_ker.get_function("get_Q_matrix")
        self.get_G_gram_matrix = get_G_gram_matrix_ker.get_function("get_G_gram_matrix")
        self.get_G_C_matrix = get_G_C_matrix_ker.get_function("get_G_C_matrix")
        self.get_gradient = get_gradient_ker.get_function("get_gradient")

    def define_object_function(self):

        ## 6 x 150 matrix...
        self.G = np.zeros((6,3*self.step))
        
        for n in range(self.step):
            ## could be calculated in GPU
            value = self.input_matrix[0,0] + (self.step - n - 1) * self.input_matrix[3,0]

            self.G[0,3*n+0] = value
            self.G[1,3*n+1] = value
            self.G[2,3*n+2] = value

            self.G[3,3*n+0] = self.dt
            self.G[4,3*n+1] = self.dt
            self.G[5,3*n+2] = self.dt

        ## 6 x 1 matrix...
        self.Q = np.zeros((6,1))

        for n in range(self.step):
           ## could be calculated in GPU with shared memory 
           self.Q[2] += self.gravity[2] + (n * self.dt) * self.gravity[5]
           self.Q[5] += self.gravity[5]

        ## 6 x 6 matrix...
        value = self.step * self.dt

        A_power_n = \
            np.array([[    1,    0,    0,value,    0,    0],
                      [    0,    1,    0,    0,value,    0],
                      [    0,    0,    1,    0,    0,value],
                      [    0,    0,    0,    1,    0,    0],
                      [    0,    0,    0,    0,    1,    0],
                      [    0,    0,    0,    0,    0,    1]])
                      
        ## 6 x 1 matrix...
        self.C = self.x_des - self.Q - np.dot(A_power_n, self.x_0)

        ## object function: norm(Ax - b)
        self.rho_matrix = \
            gpuarray.to_gpu(np.float32(self.rho * np.identity(3*self.step)))
        self.A = np.vstack((self.G, self.rho_matrix))

        self.b = np.vstack((self.C, np.zeros(3*self.step)))

        ## get gradient: (gram(G) + rho) @ u - G.T @ C
        self.gram_G = np.zeros((3*self.step,3*self.step))

        for p in range(self.step):
            for q in range(self.step):
                ## could be calculated in GPU with 2 dimension thread
                self.gram_G[3*p+0,3*q+0] = self.G[0,3*p+0]*self.G[0,3*q+0] + self.G[3,3*p+0]*self.G[3,3*q+0] + self.rho
                self.gram_G[3*p+1,3*q+1] = self.G[1,3*p+1]*self.G[1,3*q+1] + self.G[3,3*p+1]*self.G[3,3*q+1] + self.rho
                self.gram_G[3*p+2,3*q+2] = self.G[1,3*p+2]*self.G[1,3*q+2] + self.G[3,3*p+2]*self.G[3,3*q+2] + self.rho

        self.G_C = np.zeros((3*self.step,1))

In [287]:
MEC = MinimumEnergyControl(np.array([1,1,1,1,1,1]), np.array([0,0,0,0,0,0]))

In [214]:
MEC.input_matrix.get()

array([0.00125, 0.05   ], dtype=float32)

In [215]:
MEC.gravity_matrix.get()

array([-0.002025, -0.081   ], dtype=float32)

In [288]:
MEC.define_object_function_at_kernel()
MEC.get_gradient(MEC.gram_G, MEC.u, MEC.G_C, MEC.iteration, MEC.gradient, block=(MEC.TPB,1,1), grid=(3*MEC.step,1,1))

## $G$ matrix check

In [217]:
MEC.G.get().reshape(150,6).T[:,0]

array([2.45125, 0.     , 0.     , 0.05   , 0.     , 0.     ],
      dtype=float32)

In [218]:
## 6 x 150 matrix...
G_cpu = np.zeros((6,3*MEC.step))

for n in range(MEC.step):
    ## could be calculated in GPU
    value = MEC.input_matrix[0].get() + (MEC.step - n - 1) * MEC.input_matrix[1].get()
    
    G_cpu[0,3*n+0] = value
    G_cpu[1,3*n+1] = value
    G_cpu[2,3*n+2] = value

    G_cpu[3,3*n+0] = MEC.dt
    G_cpu[4,3*n+1] = MEC.dt
    G_cpu[5,3*n+2] = MEC.dt

In [219]:
G_cpu[:,0]

array([2.45125004, 0.        , 0.        , 0.05      , 0.        ,
       0.        ])

In [220]:
np.allclose(MEC.G.get().reshape(150,6).T, G_cpu)

True

## $Q$ matrix check

In [221]:
MEC.Q.get()

array([[ 0.       ],
       [ 0.       ],
       [-5.062501 ],
       [ 0.       ],
       [ 0.       ],
       [-4.0500016]], dtype=float32)

In [222]:
## 6 x 1 matrix...
Q_cpu = np.zeros((6,1))

for n in range(MEC.step):
    Q_cpu[2] += MEC.gravity_matrix.get()[0] + (n * MEC.dt) * MEC.gravity_matrix.get()[1]
    Q_cpu[5] += MEC.gravity_matrix.get()[1]

In [223]:
Q_cpu

array([[ 0.        ],
       [ 0.        ],
       [-5.06250009],
       [ 0.        ],
       [ 0.        ],
       [-4.05000001]])

In [224]:
np.allclose(MEC.Q.get(), Q_cpu)

True

## $G^TG + \rho I$ check

In [225]:
MEC.gram_G.get().reshape(150,150)[12:15,12:15]

array([[8.070627, 0.      , 0.      ],
       [0.      , 8.070627, 0.      ],
       [0.      , 0.      , 8.070627]], dtype=float32)

In [226]:
## 150 x 150 matrix...
gram_G_cpu = np.zeros((3*MEC.step,3*MEC.step))

for p in range(MEC.step):
    for q in range(MEC.step):
        G = MEC.G.get().reshape(150,6).T

        gram_G_cpu[3*p+0,3*q+0] = G[0,3*p+0]*G[0,3*p+0] + G[3,3*p+0] *G[3,3*p+0]
        gram_G_cpu[3*p+1,3*q+1] = G[1,3*p+1]*G[1,3*p+1] + G[3,3*p+1] *G[3,3*p+1]
        gram_G_cpu[3*p+2,3*q+2] = G[2,3*p+2]*G[2,3*p+2] + G[3,3*p+2] *G[3,3*p+2]

        if (p == q):
            gram_G_cpu[3*p+0,3*q+0] += MEC.rho
            gram_G_cpu[3*p+1,3*q+1] += MEC.rho
            gram_G_cpu[3*p+2,3*q+2] += MEC.rho
        
        else:
            pass

In [227]:
gram_G_cpu.T[12:15,12:15]

array([[8.07062674, 0.        , 0.        ],
       [0.        , 8.06812668, 0.        ],
       [0.        , 0.        , 8.06812668]])

In [228]:
np.allclose(MEC.gram_G.get().reshape(150,150), gram_G_cpu.T)

False

In [229]:
np.linalg.norm(MEC.gram_G.get().reshape(150,150) - gram_G_cpu.T)

0.17677749200417425

## $G^TC$ matrix check

In [230]:
MEC.G_C.get()[:12]

array([[ 2.50125  ],
       [ 2.50125  ],
       [15.113206 ],
       [ 2.45125  ],
       [ 2.45125  ],
       [14.8100815],
       [ 2.40125  ],
       [ 2.40125  ],
       [14.506955 ],
       [ 2.35125  ],
       [ 2.35125  ],
       [14.203831 ]], dtype=float32)

In [231]:
## 150 x 1 matrix...
G_C_cpu = np.zeros((150,1))

C_cpu = np.zeros((6,1))
C_cpu = MEC.x_des.get() - MEC.Q.get().T - MEC.x_0.get()

G_C_cpu = np.dot(G_cpu.T, C_cpu.T)

In [232]:
G_C_cpu[:12]

array([[ 2.50125004],
       [ 2.50125004],
       [15.11320577],
       [ 2.45125004],
       [ 2.45125004],
       [14.81008072],
       [ 2.40125004],
       [ 2.40125004],
       [14.50695566],
       [ 2.35125003],
       [ 2.35125003],
       [14.20383061]])

In [233]:
np.allclose(MEC.G_C.get(), G_C_cpu)

True

## gradient check 
$\nabla Obj = (G^TG + \rho I)u - G^TC$

In [291]:
MEC.gradient.get()[:12]

array([[ -2.50125  ],
       [ -2.50125  ],
       [-15.113206 ],
       [ -2.45125  ],
       [ -2.45125  ],
       [-14.8100815],
       [ -2.40125  ],
       [ -2.40125  ],
       [-14.506955 ],
       [ -2.35125  ],
       [ -2.35125  ],
       [-14.203831 ]], dtype=float32)

In [292]:
gradient_cpu = np.dot(gram_G_cpu, np.zeros((150,1)))- G_C_cpu

In [293]:
gradient_cpu[:12]

array([[ -2.50125004],
       [ -2.50125004],
       [-15.11320577],
       [ -2.45125004],
       [ -2.45125004],
       [-14.81008072],
       [ -2.40125004],
       [ -2.40125004],
       [-14.50695566],
       [ -2.35125003],
       [ -2.35125003],
       [-14.20383061]])

In [294]:
np.allclose(MEC.gradient.get(), gradient_cpu)

True

## NICE!!