In [15]:
import numpy as np
from tensorflow import keras
import matplotlib.pyplot as plt

X1_BOUND = 30
X2_BOUND = 15
X3_BOUND = 15
U_BOUND = 40

In [16]:
def SingleLinkManipulator(curr_state, u):
    J = 1.625103
    m = 0.506
    M0 = 0.434
    L0 = 0.305
    R0 = 0.023
    B0 = 16.25163
    L = 25.0103
    R = 5.0
    Kt = 0.90
    Kb = 0.90
    g = 9.8
    M = J + m*L0*L0/3.0 + M0*L0*L0 + 2*M0*R0*R0/5/Kt
    N = m*L0*g/2.0 + M0*L0*g/Kt
    B = B0/Kt

    x1 = curr_state[0,0]
    x2 = curr_state[0,1]
    x3 = curr_state[0,2]
    x1dot = x2;
    x2dot = -(N/M)*np.sin(x1) - (B/M)*x2 + (x3/M);
    x3dot = -(Kb/L)*x2 - (R/L)*x3 + (u/L);
    dyndot = np.zeros((1,3));
    dyndot[0,0] = x1dot;
    dyndot[0,1] = x2dot;
    dyndot[0,2] = x3dot;
    return dyndot


In [17]:
def Reward(x1, **kwargs):
    return -np.abs((x1-(np.pi/3.0)))

In [18]:
def Quantize(state, X1,X2,X3):
    new_state = np.zeros((1,3))
    new_state[0,0] = np.digitize(state[0,0], X1)
    if new_state[0,0]>=X1_BOUND:
        new_state[0,0] = X1_BOUND-1
    new_state[0,1] = np.digitize(state[0,1], X2)
    if new_state[0,1]>=X2_BOUND:
        new_state[0,1] = X2_BOUND-1
    new_state[0,2] = np.digitize(state[0,2], X3)
    if new_state[0,2]>=X3_BOUND:
        new_state[0,2] = X3_BOUND-1
    return new_state

In [19]:
gamma = 0.99
X1 = np.linspace(0, 3.0, X1_BOUND)
X2 = np.linspace(0, 1.5, X2_BOUND)
X3 = np.linspace(0, 1.5, X3_BOUND)
N1 = X1.shape[0]
N2 = X2.shape[0]
N3 = X3.shape[0]
U = np.linspace(-10, 10, U_BOUND)
policy = np.zeros((N1, N2, N3))
pol = np.zeros((N1, N2, N3))
V = np.zeros((N1, N2, N3))


In [20]:
for runs in range(40):
    for i in range(N1):
        for j in range(N2):
            for k in range(N3):
                nextV = np.zeros((1,U.shape[0]))
                curr_state=np.zeros((1,3))
                curr_state[0,0] = X1[i]
                curr_state[0,1] = X2[j]
                curr_state[0,2] = X3[k]
                for u in range(U.shape[0]):
                    action = U[u];
                    dyndot = SingleLinkManipulator(curr_state, action)
                    next_state = curr_state+0.1*dyndot
                    quant_state = Quantize(next_state, X1,X2,X3)
                    x1 = int(quant_state[0,0])
                    x2 = int(quant_state[0,1])
                    x3 = int(quant_state[0,2])
                    nextV[0,u] = V[x1,x2,x3]
                print(nextV)
                [Vbest,bestind] = np.max(nextV), np.argmax(nextV) ;
                V[i,j,k]= Reward(X1[i]) +  gamma*Vbest ;
                pol[i,j,k]= U[bestind];

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 

array([[[-1.04719755, -1.04719755, -1.04719755, -1.04719755],
        [-1.04719755, -1.04719755, -1.04719755, -1.04719755],
        [-1.04719755, -1.04719755, -1.04719755, -1.04719755],
        [-1.04719755, -1.04719755, -1.04719755, -1.04719755]],

       [[-0.04719755, -0.04719755, -0.04719755, -0.04719755],
        [-0.04719755, -0.04719755, -0.04719755, -0.04719755],
        [-0.04719755, -0.04719755, -0.04719755, -0.04719755],
        [-0.04719755, -0.04719755, -0.04719755, -0.04719755]],

       [[-0.95280245, -0.95280245, -0.95280245, -0.95280245],
        [-0.95280245, -0.95280245, -0.95280245, -0.95280245],
        [-0.95280245, -0.95280245, -0.95280245, -0.95280245],
        [-0.95280245, -0.95280245, -0.95280245, -0.95280245]],

       [[-1.95280245, -1.95280245, -1.95280245, -1.95280245],
        [-3.88607687, -3.88607687, -1.95280245, -1.95280245],
        [-3.88607687, -3.88607687, -3.88607687, -3.88607687],
        [-3.88607687, -3.88607687, -3.88607687, -3.88607687]]])

In [34]:
for i in range(N1):
    for j in range(N2):
        for k in range(N3):
            nextV = np.zeros((1,U.shape[0]))
            curr_state=np.zeros((1,3))
            curr_state[0,0] = X1[i]
            curr_state[0,1] = X2[j]
            curr_state[0,2] = X3[k]
            for u in range(U.shape[0]):
                action = U[u];
                dyndot = SingleLinkManipulator(curr_state, action)
                next_state = curr_state+0.1*dyndot
                quant_state = Quantize(next_state, X1,X2,X3)
                x1 = int(quant_state[0,0])
                x2 = int(quant_state[0,1])
                x3 = int(quant_state[0,2])
                nextV[0,u] = V[x1,x2,x3]
            bestind = np.argmax(nextV) ;
            
            policy[i,j,k]= U[bestind];

In [33]:
np.savetxt("policy.csv", policy[:,:,0], delimiter=",")