In [23]:
import numpy as np
import gym
from gym import spaces

# Limits used by SingleLinkManipulator for its spaces and for state clipping.
# x1 is passed to np.sin in the dynamics, so it is presumably an angle in radians.
X1_BOUND = 0.5 * np.pi      # |x1| limit
X2_BOUND = X3_BOUND = 1.0   # |x2| and |x3| limits
U_BOUND = np.pi             # |u| limit of the declared action space


class SingleLinkManipulator(gym.Env):
    """Gym environment that integrates a three-state model with forward Euler.

    The state ``(x1, x2, x3)`` evolves as::

        x1' = x2
        x2' = -(N/M)*sin(x1) - (B/M)*x2 + x3/M
        x3' = -(Kb/L)*x2 - (R/L)*x3 + u/L

    where ``u`` is the action. The reward returned by ``step`` is the negative
    of ``(signal - x1)**2 + x2**2 + x3**2``.
    """

    # metadata = {
    #     "render_modes": ["rgb_array"]
    # }

    def __init__(self, N, B, M, Kb, R, L):
        """Store the model coefficients and declare the action/observation spaces.

        Args:
            N, B, M, Kb, R, L: coefficients of the differential equations
                integrated in ``step`` (see the class docstring).
        """
        self.N = N
        self.B = B
        self.M = M
        self.Kb = Kb
        self.R = R
        self.L = L
        self.dt = 0.05   # Euler integration step
        self.t = 0.0     # elapsed simulated time, advanced by dt on every step
        self.x1 = 0.0
        self.x2 = 0.0
        self.x3 = 0.0

        # Build the bounds directly as float32 so Box does not have to down-cast
        # float64 arrays (which makes gym emit a "precision lowered" warning).
        self.action_space = spaces.Box(
            low=np.array([-U_BOUND], dtype=np.float32),
            high=np.array([U_BOUND], dtype=np.float32),
            shape=(1,),
            dtype=np.float32,
        )
        self.observation_space = spaces.Box(
            low=np.array([-X1_BOUND, -X2_BOUND, -X3_BOUND], dtype=np.float32),
            high=np.array([X1_BOUND, X2_BOUND, X3_BOUND], dtype=np.float32),
            dtype=np.float32,
        )

    def _signal(self):
        """Return the reference value that x1 is compared against in the loss."""
        # return (np.pi/2.0)*np.sin(8.0*np.pi*self.t/5.0)
        return np.pi/3.0

    def _get_obs(self):
        """Return the current state as a 1-D array ``[x1, x2, x3]``."""
        return np.array([self.x1, self.x2, self.x3])

    def _normalize_x1(self, x1):
        """Wrap ``x1`` into ``[-pi/2, pi/2)`` with period ``pi``.

        Values already inside the interval are returned unchanged (the shift is
        applied before the modulo, not only after it).
        """
        half_pi = np.pi/2.0
        return ((x1 + half_pi) % np.pi) - half_pi

    def step(self, u):
        """Advance the model by one Euler step of length ``dt``.

        Args:
            u: the action, either a scalar or a one-element array (the shape
                declared by ``action_space``).

        Returns:
            ``(observation, reward, False, False, {})`` where the two flags are
            always False and the info dict is always empty.

        Raises:
            ValueError: if ``u`` does not contain exactly one element.
        """
        N = self.N
        B = self.B
        M = self.M
        Kb = self.Kb
        R = self.R
        L = self.L
        dt = self.dt

        # Reduce a shape-(1,) action to a scalar; otherwise x3 would become an
        # array and the observation would be ragged.
        u = np.asarray(u)
        if u.size != 1:
            raise ValueError(f"expected a single action value, got shape {u.shape}")

        # clip action (voltage can't be negative)
        # NOTE(review): action_space allows [-U_BOUND, U_BOUND], but only
        # [0, U_BOUND] is ever applied.
        u = np.clip(u.reshape(-1)[0], 0, U_BOUND)

        # calculate update parameters
        x1_dot = self.x2
        x2_dot = -(N/M)*np.sin(self.x1) - (B/M)*self.x2 + (self.x3/M)
        x3_dot = -(Kb/L)*self.x2 - (R/L)*self.x3 + (u/L)

        # update parameters
        # self.x1 = self._normalize_x1(self.x1 + x1_dot*dt)
        self.x1 = np.clip(self.x1 + x1_dot*dt, -X1_BOUND, X1_BOUND)
        self.x2 = np.clip(self.x2 + x2_dot*dt, -X2_BOUND, X2_BOUND)
        # NOTE(review): x3 is kept non-negative here although observation_space
        # declares a lower bound of -X3_BOUND.
        self.x3 = np.clip(self.x3 + x3_dot*dt, 0, X3_BOUND)

        self.t += dt

        # calculate results
        x1_expected = self._signal()
        loss = (x1_expected - self.x1)**2 + self.x2**2 + self.x3**2

        return self._get_obs(), -loss, False, False, {}

    def reset(self, state=None):
        """Reset the clock and the state.

        Args:
            state: optional length-3 sequence ``(x1, x2, x3)`` to start from.
                When omitted, a state is sampled from ``observation_space``
                (which may yield a negative x3).

        Returns:
            ``(observation, {})``.
        """
        self.t = 0.0
        # `is None` rather than `== None`: the latter is evaluated element-wise
        # when an array is passed in.
        if state is None:
            self.x1, self.x2, self.x3 = self.observation_space.sample()
        else:
            assert len(state) == 3, "state must hold exactly (x1, x2, x3)"
            self.x1, self.x2, self.x3 = state
        return self._get_obs(), {}


In [24]:
import math
import os

# TensorFlow's native logger picks up TF_CPP_MIN_LOG_LEVEL when it first logs,
# which can already happen while the package is being imported, so the variable
# has to be set before the tensorflow import below.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import matplotlib.pyplot as plt
import numpy as np
from tensorflow import keras


# --- dynamic-programming settings -------------------------------------------
DISCRETIZATION_LEVEL = 50   # number of grid points on every state/action axis
NEGINF = -1e6               # large negative sentinel (not used in this cell)
gamma = 0.99                # discount applied to the successor value

# --- state grids -------------------------------------------------------------
# x1 and x2 span their symmetric bounds; x3 starts at 0, matching the lower
# clip applied to x3 inside SingleLinkManipulator.step.
X1, X2, X3 = (
    np.linspace(lo, hi, DISCRETIZATION_LEVEL)
    for lo, hi in ((-X1_BOUND, X1_BOUND), (-X2_BOUND, X2_BOUND), (0, X3_BOUND))
)
N1, N2, N3 = X1.size, X2.size, X3.size

# --- action grid -------------------------------------------------------------
U = np.linspace(0, U_BOUND, DISCRETIZATION_LEVEL)

# --- tables filled in by the following cells ----------------------------------
policy = np.zeros((N1, N2, N3))   # selected action for each grid state
V = np.zeros_like(policy)         # value estimate for each grid state
nextV = np.zeros(U.size)          # scratch: successor value per candidate action

# --- model parameters ----------------------------------------------------------
# NOTE(review): J, B0 and L look like they may have been "x10^-3" figures whose
# exponent was lost when pasted (1.625103, 16.25163, 25.0103) - confirm against
# the source of the model before trusting the simulation.
g = 9.8
Kb = 0.90
Kt = Kb
R = 5.0
L = 25.0103
B0 = 16.25163
R0 = 0.023
L0 = 0.305
M0 = 0.434
m = 0.506
J = 1.625103

# NOTE(review): as written, only the last term of M and of N is divided by Kt;
# verify that this is intended and not an operator-precedence slip.
M = (
    J
    + m*L0*L0/3.0
    + M0*L0*L0
    + 2*M0*R0*R0/5/Kt
)
N = (
    m*L0*g/2.0
    + M0*L0*g/Kt
)
B = B0/Kt

# --- environment ---------------------------------------------------------------
env = SingleLinkManipulator(N=N, B=B, M=M, Kb=Kb, R=R, L=L)



  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [25]:
def find_nearest(array, value):
    """Return the index of the entry of ``array`` closest to ``value``.

    ``array`` is searched with ``np.searchsorted`` and is therefore expected to
    be sorted in ascending order. When ``value`` is exactly halfway between two
    neighbouring entries, the index of the larger entry is returned.
    """
    pos = np.searchsorted(array, value, side="left")

    # value is at or below the first entry: nothing on the left to compare with
    if pos == 0:
        return pos

    # value is beyond the last entry: the last index is the closest one
    if pos == len(array):
        return pos - 1

    gap_left = math.fabs(value - array[pos - 1])
    gap_right = math.fabs(value - array[pos])
    return pos - 1 if gap_left < gap_right else pos


In [None]:
# Value iteration on the discretised state grid.
N_SWEEPS = 100

# The successor of (grid state, action) is obtained by resetting the env to the
# grid state and taking one step; it does not depend on V. Simulate every pair
# once and store the nearest-grid indices instead of repeating the same
# simulation on every sweep.
next_index = np.empty((N1, N2, N3, U.size, 3), dtype=np.int32)
for i in range(N1):
    for j in range(N2):
        for k in range(N3):
            state = (X1[i], X2[j], X3[k])
            for u in range(U.size):
                env.reset(state)
                new_state, _, _, _, _ = env.step(U[u])
                x1, x2, x3 = new_state
                next_index[i, j, k, u] = (
                    find_nearest(X1, x1),
                    find_nearest(X2, x2),
                    find_nearest(X3, x3),
                )

for _ in range(N_SWEEPS):
    for i in range(N1):
        # The stage reward only penalises the x1 tracking error.
        reward = -(env._signal() - X1[i])**2
        for j in range(N2):
            for k in range(N3):
                successors = next_index[i, j, k]
                # V is updated in place, so states visited later in a sweep
                # already see the values written earlier in the same sweep.
                nextV[:] = V[successors[:, 0], successors[:, 1], successors[:, 2]]
                Vbest = np.max(nextV)
                V[i, j, k] = reward + gamma*Vbest


In [None]:
# Greedy policy extraction: for every grid state pick the action whose
# one-step successor has the highest value in V.
for i in range(N1):
    for j in range(N2):
        for k in range(N3):
            state = (X1[i], X2[j], X3[k])
            for u in range(U.size):
                # Restart from the grid state for every candidate action;
                # without this each action would be applied to the state left
                # behind by the previous candidate.
                env.reset(state)
                new_state, _, _, _, _ = env.step(U[u])
                x1, x2, x3 = new_state
                x1 = find_nearest(X1, x1)
                x2 = find_nearest(X2, x2)
                x3 = find_nearest(X3, x3)
                nextV[u] = V[x1, x2, x3]
            Vbest_idx = np.argmax(nextV)
            policy[i, j, k] = U[Vbest_idx]


In [None]:
# Small MLP that maps a state (x1, x2, x3) to the tabulated action.
model = keras.Sequential([
    keras.layers.Input(shape=(3,)),
    keras.layers.Dense(10, activation='softplus'),
    keras.layers.Dense(10, activation='softplus'),
    keras.layers.Dense(1, activation='relu'),
])

model.summary()
model.compile(
    optimizer=keras.optimizers.Adam(),  # Optimizer
    # Loss function to minimize
    loss=keras.losses.MeanSquaredError(),
    # List of metrics to monitor
    metrics=[keras.metrics.MeanSquaredError()],
)

# Training inputs are the grid *state values*, not the grid indices: the model
# is later queried with env._get_obs(), which holds state values.
# indexing="ij" makes the flattened order match policy.ravel() (i, then j, then k).
grid_x1, grid_x2, grid_x3 = np.meshgrid(X1, X2, X3, indexing="ij")
x_train = np.column_stack([grid_x1.ravel(), grid_x2.ravel(), grid_x3.ravel()])
y_train = policy.ravel()
assert x_train.shape == (N1*N2*N3, 3)
assert y_train.shape == (N1*N2*N3,)

history = model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=100,
)


In [None]:
# Roll the fitted policy out from a randomly sampled initial state and plot the
# reward obtained at every step.
N_STEPS = 200

env.reset()
step_rewards = []
for step_idx in range(N_STEPS):
    state = env._get_obs()
    obs_batch = np.atleast_2d(state).astype(np.float32)   # shape (1, 3)
    # verbose=0 keeps predict from printing a progress bar on every step
    action = model.predict(obs_batch, verbose=0)[0][0]
    _, reward, _, _, _ = env.step(action)
    step_rewards.append(reward)

rewards = np.array(step_rewards, dtype=float)

fig, ax = plt.subplots()
ax.plot(rewards)
ax.set(xlabel="step", ylabel="reward", title="Reward per step under the fitted policy")
plt.show()