In [1]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from functions import *

In [2]:
# grid problem from example 1
#
# Layout implied by the transition entries below (state labels; X = blocked cell):
#
#     13  14  15  16        16 = SUCCESS (reward +10)
#     12   X  10   9         X = block location 11
#      5   6   7   8         8 = DANGER  (reward -10)
#      X   3   2   1         X = block location 4
#
# State 17 is an absorbing EXIT state, reached from states 8 and 16 via STOP.
# Every array below is indexed with the 0-based index j = label - 1.

# states
# we include the block locations 4 and 11 for convenience of coding; though they will never be reached
states = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] # s = 17 is the EXIT state
NS = len(states) # number of states

# actions (the position in this list is the action index used everywhere below)
actions = ['up', 'down', 'left', 'right', 'stop']
NA = len(actions) # number of actions

# rewards: -0.1 everywhere except at the DANGER, SUCCESS and EXIT states
reward = -0.1*np.ones(NS)
reward[7] = -10 # reward at state s = 8 (DANGER)
reward[15] = +10 # reward at state s = 16 (SUCCESS)
reward[16] = 0 # reward at exit state s = 17

# target policy pi(a|s)
Pi = np.zeros((NA, NS)) # matrix Pi specifies the policy pi(a|s)
                      # each row is an action; each column is a state

for j in range(NS):
    s = states[j]
    if s in [1, 2, 3, 5, 6, 7, 9, 10, 12, 13, 14, 15]:
        # ordinary cells: the four moves are equally likely, STOP is not allowed
        Pi[0,j] = 1/4 # up
        Pi[1,j] = 1/4 # down
        Pi[2,j] = 1/4 # left
        Pi[3,j] = 1/4 # right
        Pi[4,j] = 0  # STOP

    else:
        # DANGER, SUCCESS, EXIT and the two block locations: only STOP is allowed
        Pi[0,j] = 0 # up
        Pi[1,j] = 0 # down
        Pi[2,j] = 0 # left
        Pi[3,j] = 0 # right
        Pi[4,j] = 1 # STOP

# transition kernel
# P[s, a, s'] = Prob(next state s' | state s, action a), all 0-based indices.
# Pattern followed by every entry: the intended move succeeds with probability 0.70
# and the agent slips to each perpendicular direction with probability 0.15;
# a move into a wall or a block leaves the agent where it is.
P = np.zeros((NS, NA, NS)) # entries are Prob(s, a, s')

# ---- state 1 (index 0) ----
P[0,0,7] = 0.70 # UP -> state 8
P[0,0,0] = 0.15 # slip right into the wall: stay
P[0,0,1] = 0.15 # slip left -> state 2

P[0,1,0] = 0.85 # DOWN: wall below (0.70) + wall on the right (0.15): stay
P[0,1,1] = 0.15 # slip left -> state 2

P[0,2,1] = 0.70 # LEFT -> state 2
P[0,2,0] = 0.15 # slip down into the wall: stay
P[0,2,7] = 0.15 # slip up -> state 8

P[0,3,0] = 0.85 # RIGHT: wall on the right (0.70) + wall below (0.15): stay
P[0,3,7] = 0.15 # slip up -> state 8

# ---- state 2 (index 1) ----
P[1,0,6] = 0.70 # UP -> state 7
P[1,0,0] = 0.15 # slip right -> state 1
P[1,0,2] = 0.15 # slip left -> state 3

P[1,1,1] = 0.70 # DOWN: wall below: stay
P[1,1,0] = 0.15 # slip right -> state 1
P[1,1,2] = 0.15 # slip left -> state 3

P[1,2,2] = 0.70 # LEFT -> state 3
P[1,2,6] = 0.15 # slip up -> state 7
P[1,2,1] = 0.15 # slip down into the wall: stay

P[1,3,0] = 0.70 # RIGHT -> state 1
P[1,3,6] = 0.15 # slip up -> state 7
P[1,3,1] = 0.15 # slip down into the wall: stay

# ---- state 3 (index 2) ----
P[2,0,5] = 0.70 # UP -> state 6
P[2,0,2] = 0.15 # slip left into block 4: stay
P[2,0,1] = 0.15 # slip right -> state 2

P[2,1,2] = 0.85 # DOWN: wall below (0.70) + block on the left (0.15): stay
P[2,1,1] = 0.15 # slip right -> state 2

P[2,2,2] = 0.85 # LEFT: block 4 (0.70) + wall below (0.15): stay
P[2,2,5] = 0.15 # slip up -> state 6

P[2,3,1] = 0.70 # RIGHT -> state 2
P[2,3,5] = 0.15 # slip up -> state 6
P[2,3,2] = 0.15 # slip down into the wall: stay
                # (was P[2,3,4], i.e. a jump to the non-adjacent state 5)

# ---- location 4 (index 3): block, never reached, so these values are irrelevant ----
P[3,0,3] = 1
P[3,1,3] = 1
P[3,2,3] = 1
P[3,3,3] = 1
P[3,4,3] = 1

# ---- state 5 (index 4) ----
P[4,0,11] = 0.70 # UP -> state 12
P[4,0,4]  = 0.15 # slip left into the wall: stay
P[4,0,5]  = 0.15 # slip right -> state 6

P[4,1,4] = 0.85 # DOWN: block 4 below (0.70) + wall on the left (0.15): stay
P[4,1,5] = 0.15 # slip right -> state 6

P[4,2,4]  = 0.85 # LEFT: wall (0.70) + block below (0.15): stay
P[4,2,11] = 0.15 # slip up -> state 12

P[4,3,5]  = 0.70 # RIGHT -> state 6
P[4,3,11] = 0.15 # slip up -> state 12
P[4,3,4]  = 0.15 # slip down into block 4: stay

# ---- state 6 (index 5) ----
P[5,0,5] = 0.70 # UP: block 11 above: stay
P[5,0,4] = 0.15 # slip left -> state 5
P[5,0,6] = 0.15 # slip right -> state 7

P[5,1,2] = 0.70 # DOWN -> state 3
P[5,1,4] = 0.15 # slip left -> state 5
P[5,1,6] = 0.15 # slip right -> state 7

P[5,2,4] = 0.70 # LEFT -> state 5
P[5,2,5] = 0.15 # slip up into block 11: stay
P[5,2,2] = 0.15 # slip down -> state 3

P[5,3,6] = 0.70 # RIGHT -> state 7
P[5,3,5] = 0.15 # slip up into block 11: stay
P[5,3,2] = 0.15 # slip down -> state 3

# ---- state 7 (index 6) ----
P[6,0,9] = 0.70 # UP -> state 10
P[6,0,5] = 0.15 # slip left -> state 6
P[6,0,7] = 0.15 # slip right -> state 8

P[6,1,1] = 0.70 # DOWN -> state 2
P[6,1,5] = 0.15 # slip left -> state 6
P[6,1,7] = 0.15 # slip right -> state 8

P[6,2,5] = 0.70 # LEFT -> state 6
P[6,2,9] = 0.15 # slip up -> state 10
P[6,2,1] = 0.15 # slip down -> state 2

P[6,3,7] = 0.70 # RIGHT -> state 8
P[6,3,1] = 0.15 # slip down -> state 2
P[6,3,9] = 0.15 # slip up -> state 10

# ---- state 8 (index 7): DANGER, only STOP is permitted and it leads to EXIT ----
P[7,0,16] = 0
P[7,1,16] = 0
P[7,2,16] = 0
P[7,3,16] = 0
P[7,4,16] = 1 # STOP action

# ---- state 9 (index 8) ----
P[8,0,15] = 0.70 # UP -> state 16
P[8,0,9]  = 0.15 # slip left -> state 10
P[8,0,8]  = 0.15 # slip right into the wall: stay

P[8,1,7]  = 0.70 # DOWN -> state 8
P[8,1,9]  = 0.15 # slip left -> state 10
P[8,1,8]  = 0.15 # slip right into the wall: stay

P[8,2,9]  = 0.70 # LEFT -> state 10
P[8,2,15] = 0.15 # slip up -> state 16
P[8,2,7]  = 0.15 # slip down -> state 8

P[8,3,8]  = 0.70 # RIGHT: wall: stay
P[8,3,7]  = 0.15 # slip down -> state 8
P[8,3,15] = 0.15 # slip up -> state 16

# ---- state 10 (index 9) ----
P[9,0,14] = 0.70 # UP -> state 15
P[9,0,8]  = 0.15 # slip right -> state 9
P[9,0,9]  = 0.15 # slip left into block 11: stay

P[9,1,6]  = 0.70 # DOWN -> state 7
P[9,1,8]  = 0.15 # slip right -> state 9
P[9,1,9]  = 0.15 # slip left into block 11: stay

P[9,2,9]  = 0.70 # LEFT: block 11: stay
P[9,2,14] = 0.15 # slip up -> state 15
P[9,2,6]  = 0.15 # slip down -> state 7

P[9,3,8]  = 0.70 # RIGHT -> state 9
P[9,3,6]  = 0.15 # slip down -> state 7
P[9,3,14] = 0.15 # slip up -> state 15

# ---- location 11 (index 10): block, never reached, so these values are irrelevant ----
# Made a self-loop like location 4 (the entries previously pointed at index 3).
P[10,0,10] = 1
P[10,1,10] = 1
P[10,2,10] = 1
P[10,3,10] = 1
P[10,4,10] = 1

# ---- state 12 (index 11) ----
P[11,0,12] = 0.70 # UP -> state 13
P[11,0,11] = 0.30 # wall on the left and block 11 on the right: stay

P[11,1,4]  = 0.70 # DOWN -> state 5
P[11,1,11] = 0.30 # wall on the left and block 11 on the right: stay

P[11,2,11] = 0.70 # LEFT: wall: stay
P[11,2,12] = 0.15 # slip up -> state 13
P[11,2,4]  = 0.15 # slip down -> state 5

P[11,3,11] = 0.70 # RIGHT: block 11: stay
P[11,3,4]  = 0.15 # slip down -> state 5
P[11,3,12] = 0.15 # slip up -> state 13

# ---- state 13 (index 12) ----
P[12,0,12] = 0.85 # UP: wall above (0.70) + wall on the left (0.15): stay
P[12,0,13] = 0.15 # slip right -> state 14

P[12,1,11] = 0.70 # DOWN -> state 12
P[12,1,12] = 0.15 # slip left into the wall: stay
P[12,1,13] = 0.15 # slip right -> state 14

P[12,2,12] = 0.85 # LEFT: wall (0.70) + wall above (0.15): stay
P[12,2,11] = 0.15 # slip down -> state 12

P[12,3,13] = 0.70 # RIGHT -> state 14
P[12,3,11] = 0.15 # slip down -> state 12
P[12,3,12] = 0.15 # slip up into the wall: stay

# ---- state 14 (index 13) ----
P[13,0,13] = 0.70 # UP: wall above: stay
P[13,0,12] = 0.15 # slip left -> state 13
P[13,0,14] = 0.15 # slip right -> state 15

P[13,1,13] = 0.70 # DOWN: block 11 below: stay
P[13,1,12] = 0.15 # slip left -> state 13
P[13,1,14] = 0.15 # slip right -> state 15

P[13,2,12] = 0.70 # LEFT -> state 13
P[13,2,13] = 0.30 # wall above and block 11 below: stay

P[13,3,14] = 0.70 # RIGHT -> state 15
P[13,3,13] = 0.30 # wall above and block 11 below: stay

# ---- state 15 (index 14) ----
P[14,0,14] = 0.70 # UP: wall above: stay
P[14,0,13] = 0.15 # slip left -> state 14
P[14,0,15] = 0.15 # slip right -> state 16

P[14,1,9]  = 0.70 # DOWN -> state 10
P[14,1,13] = 0.15 # slip left -> state 14
P[14,1,15] = 0.15 # slip right -> state 16

P[14,2,13] = 0.70 # LEFT -> state 14
P[14,2,9]  = 0.15 # slip down -> state 10
P[14,2,14] = 0.15 # slip up into the wall: stay

P[14,3,15] = 0.70 # RIGHT -> state 16
P[14,3,9]  = 0.15 # slip down -> state 10
P[14,3,14] = 0.15 # slip up into the wall: stay

# ---- state 16 (index 15): SUCCESS, only STOP is permitted and it leads to EXIT ----
P[15,0,16] = 0
P[15,1,16] = 0
P[15,2,16] = 0
P[15,3,16] = 0
P[15,4,16] = 1 # STOP action

# ---- state 17 (index 16): EXIT state, absorbing under STOP ----
P[16,0,16] = 0
P[16,1,16] = 0
P[16,2,16] = 0
P[16,3,16] = 0
P[16,4,16] = 1

# sanity check: every (state, action) pair the policy can actually take
# must carry a proper probability distribution over next states
assert np.allclose(P.sum(axis=2)[Pi.T > 0], 1.0), "rows of the transition kernel must sum to 1"

# Expected one-step reward under the target policy:
#   r^pi(s) = sum_a sum_s' pi(a|s) * P(s, a, s') * r(s)
# (the reward depends on the current state only)
rpi = np.zeros(NS)
for s, a, sprime in np.ndindex(NS, NA, NS):
    rpi[s] += Pi[a, s]*P[s, a, sprime]*reward[s]

# State-to-state transition matrix under the target policy:
#   P^pi(s, s') = sum_a pi(a|s) * P(s, a, s')
Ppi = np.zeros((NS, NS))
for s, sprime in np.ndindex(NS, NS):
    for a in range(NA):
        Ppi[s, sprime] += Pi[a, s]*P[s, a, sprime]

# behavior policy phi(a|s) used to simulate off-policy algorithms
# Phi has one row per action and one column per state, like Pi
Phi = np.zeros((NA, NS))

phi_move = [3/8, 1/8, 2/6, 1/6, 0]  # up, down, left, right, STOP
phi_stop = [0, 0, 0, 0, 1]          # only STOP is possible

for j, s in enumerate(states):
    if s in [1, 2, 3, 5, 6, 7, 9, 10, 12, 13, 14, 15]:
        Phi[:, j] = phi_move
    else:
        Phi[:, j] = phi_stop

# one-hot encoding for the actions: row a is the indicator vector of action a
# (order: up, down, left, right, STOP)
A = np.eye(5)

# Reduced 4x1 state features, four binary entries per state:
#   [ same row as SUCCESS, same row as DANGER,
#     in the rightmost two columns, in the leftmost two columns ]
#
# No offset entry is included since v^{\pi} = 0 at state 17.
# The state-value model is v^{\pi}(s) = h'*w.
Mr = 4
Hr = np.array([
    [0, 0, 1, 0],  # state 1
    [0, 0, 1, 0],  # state 2
    [0, 0, 0, 1],  # state 3
    [0, 0, 0, 0],  # not a valid state
    [0, 1, 0, 1],  # state 5...
    [0, 1, 0, 1],
    [0, 1, 1, 0],
    [0, 1, 1, 0],
    [0, 0, 1, 0],
    [0, 0, 1, 0],
    [0, 0, 0, 0],  # not a valid state
    [0, 0, 0, 1],
    [1, 0, 0, 1],
    [1, 0, 0, 1],
    [1, 0, 1, 0],
    [1, 0, 1, 0],  # state 16
    [0, 0, 0, 0],  # EXIT state
], dtype=float)

# state-action features: Kronecker product of dimensions (NSxNA) x (MrxNA)
Fr = np.kron(Hr, A)
Tr = Mr*NA

# One-hot encoded state features: state j gets the j-th basis vector.
# The EXIT row is all zeros (no offset needed because v^{\pi} = 0 at state 17);
# the two invalid block locations keep their own basis vector.
# The state-value model is again v^{\pi}(s) = h'*w.
Me = NS
He = np.eye(NS, Me)
He[16, :] = 0  # EXIT state

# state-action features: Kronecker product of dimensions (NSxNA) x (MexNA)
Fe = np.kron(He, A)
Te = Me*NA


In [8]:
def compute_policy(F,theta,T,NA,NS,Pi):

    # F: matrix of feature vectors f_{s,a}
    # theta: parameter for pi(a|h;theta)
    # T: size of theta
    # NA: number of actions
    # NS: number of states
    # Pi: original pi(a|s) used to determine which actions are permissible

    sum_ = np.zeros(NS)
    Pi_theta = np.zeros((NA, NS))
    for s in range(NS):
        pi_vec = Pi[:, s] # helps reveal which actions are permissible at state s
        for a in range(NA): # iterate over actions
            if pi_vec[a] > 0:
                row_idx = (s-1)*NA + a # index of row in F corresponding to (s, a)
                f = F[int(row_idx), :].T # feature vector
                sum_[s] += np.exp(f.T@theta)
    
    for s in range(NS):
        pi_vec = Pi[:, s]
        for a in range(NA):
            if pi_vec[a] > 0:
                row_idx = (s-1)*NA + a #index of row in F corresponding to (s, a)
                f = F[int(row_idx), :].T
                Pi_theta[a, s] = np.exp(f.T@theta)/sum_[s]
    
    return Pi_theta

def compute_gradient_log_pi(F,Pi_theta,s,a,NA):
    # F: matrix of feature vectors f_{s,a}
    # Pi_theta: Gibbs distribution matrix NA x NS
    # s: state
    # action
    # NA: number of actions

    pi_vec = Pi_theta[:, s] # pi(a|s; theta)
    row_idx = (s-1)*NA + a # index of row in F corresponding to (s, a)
    f = F[int(row_idx), :].T # feature vector is at row of index sxa; f_{s,a}
    fs_bar = 0
    for aprime in range(NA):
        row_idx_prime = (s-1)*NA + aprime
        faprime = F[int(row_idx_prime), :].T # f_{s, a'}
        fs_bar += pi_vec[aprime]*faprime
    g_vec = f - fs_bar
    return g_vec

In [None]:
# Advantage actor-critic (A2C) algorithm for optimal policy design

E = 100 # number of episodes within each iteration
max_episode_duration = 50
delta = 0
gamma = 0.9
mu = 0.01 # step-size for critic (w)
mug = 0.001 # step-size for actor (theta)
iter_ = 4000 # repeat for this many iterations to learn theta; each iteration has E episodes
use_reduced_features = 0 # set to zero to use the one-hot encoded extended feature

if use_reduced_features == 1:
    H = Hr 
    M = Mr 
    F = Fr 
    T = Tr 
else:
    H = He 
    M = Me 
    F = Fe 
    T = Te 

w = np.zeros(M) # linear value function model; includes bias coefficient
theta = np.zeros(T) # parameter for Gibbs distribution

saved_states = [] # need to save states, features, rewards actions during all episodes
saved_features = []
saved_rewards = []
saved_actions = []

kernel = np.zeros(NS)

for m in tqdm(range(iter_)): # each iteration involves multiple episodes
    # Current Gibbs distribution using current model theta
    Pi_theta = compute_policy(F,theta,T,NA,NS,Pi)
    counter_transitions = 1 # counts # of state transitions during this iteration

    for e in range(E): # iterates over episodes
        counter = 0
        sample = 1
        while sample == 1:
            idx = np.random.randint(NS-1)+1 # select a random non-exit state index
            if (idx != 4) and (idx != 11) and (idx != 17): # excluding the block locations and exit state
                s = states[idx-1]
                sample = 0

        while (s!=17) and (counter < max_episode_duration): # state s different from EXIT state
            h = H[s, :].T # state feature vector 
            policy = Pi_theta[:, s] # pi(a|s; theta) at state s
            act = select_action(policy)

            for j in range(NS):
                kernel[j] = P[s, act, j]
            
            sprime = select_next_state(kernel) # next state
            hprime = H[sprime, :].T # next feature

            # CRITIC for state value function learning
            r = reward[s] # in this example reward is only state s-dependent
            delta = r + (gamma*hprime-h).T@w
            w += mu*delta*h 

            #saved visited states, actions, rewards
            saved_features.append([h, hprime])
            saved_states.append([s, sprime])
            saved_rewards.append(r)
            saved_actions.append(act)

            s = sprime # set next sample
            counter += 1
            counter_transitions += 1 # plays role of N
    wo = w.copy() # learned CRITIC MODEL

    # ACTOR
    N = counter_transitions - 1
    sum_g = np.zeros(T)

    for nn in range(N):
        hx = saved_features[nn][0]
        hxprime = saved_features[nn][1]
        sx = saved_states[nn][0]
        sxprime = saved_states[nn][1]
        rx = saved_rewards[nn]
        ax = saved_actions[nn]

        deltax = rx + (gamma*hxprime - hx).T@wo 
        g_vec = compute_gradient_log_pi(F,Pi_theta,sx,ax,NA) # gradient of log pi
        sum_g += deltax*g_vec
    sum_g = sum_g/N 
    theta += mug*sum_g

In [17]:
thetao = theta 
Pi_theta = compute_policy(F,thetao,T,NA,NS,Pi) # optimal policy
max_values = Pi_theta.max(axis=0)
indexes = np.argmax(Pi_theta, axis=0)

act = [0]*NS
action_states = [0]*NS
for s in range(NS):
    act[s] = indexes[s] # indexes of the permissible action  
    action_states[s] = actions[act[s]]
    print(s+1, action_states[s])

print("policy AxS from ADVANTAGE (A2C) actor-critic algorithm")
print(Pi_theta)

1 up
2 left
3 left
4 stop
5 up
6 left
7 left
8 stop
9 up
10 up
11 stop
12 up
13 right
14 right
15 right
16 stop
17 stop


In [21]:
# Natural gradient actor-critic algorithm

E = 100 # number of episodes within each iteration
max_episode_duration = 50
delta = 0
gamma = 0.9
mu = 0.01 # step-size for critic (w)
mug = 0.001 # step-size for actor (theta)
iter_ = 10000 # repeat for this many iterations to learn theta; each iteration has E episodes
use_reduced_features = 0 # set to zero to use the one-hot encoded extended feature

if use_reduced_features == 1:
    H = Hr 
    M = Mr 
    F = Fr 
    T = Tr 
else:
    H = He 
    M = Me 
    F = Fe 
    T = Te 

c = np.zeros(T) # linear value function model
theta = np.zeros(T) # parameter for Gibbs distribution

kernel = np.zeros(NS)

for m in tqdm(range(iter_)): # each iteration involves multiple episodes
    # Current Gibbs distribution using current model theta
    Pi_theta = compute_policy(F,theta,T,NA,NS,Pi)

    for e in range(E): # iterates over episodes
        counter = 0
        sample = 1
        while sample == 1:
            idx = np.random.randint(NS-1)+1 # select a random non-exit state index
            if (idx != 4) and (idx != 11) and (idx != 17): # excluding the block locations and exit state
                s = states[idx-1]
                sample = 0

    h = H[s, :].T # initial state feature vector
    policy = Pi_theta[:, s] # pi(a|s; theta) at this state s
    act = select_action(policy) # initial conditions
    idx = (s-1)*NA+act 
    f = F[int(idx), :].T # initial f vector

    while (s!=17) and (counter < max_episode_duration): # state s different from EXIT state
        for j in range(NS):
            kernel[j] = P[s, act, j]
        sprime = select_next_state(kernel) # next state
        hprime = H[sprime, :].T # next feature
        policyprime = Pi_theta[:, sprime] # pi(a|sprime; theta) at state prime
        actprime = select_action(policyprime)
        idx_prime = (sprime-1)*NA + actprime
        fprime = F[int(idx_prime), :].T 

        # CRITIC for state-action value function learning
        r = reward[s] # in this example, reward is only state s-dependent
        beta = r + (gamma*fprime - f).T@c
        c += mu*beta*f 
        s = sprime # set next sample
        act = actprime 
        f = fprime 
        counter += 1
    co = c.copy() # learned CRITIC MODEL

    #ACTOR
    theta += mug*co 

thetao = theta 
Pi_theta = compute_policy(F,thetao,T,NA,NS,Pi) # optimal policy
max_values = Pi_theta.max(axis=0)
indexes = np.argmax(Pi_theta, axis=0)

act = [0]*NS
action_states = [0]*NS
for s in range(NS):
    act[s] = indexes[s] # indexes of the permissible action  
    action_states[s] = actions[act[s]]
    print(s+1, action_states[s])

print("policy AxS from NATURAL actor critic algorithm")
print(Pi_theta)

100%|██████████| 10000/10000 [00:21<00:00, 457.27it/s]

1 up
2 up
3 right
4 stop
5 up
6 right
7 up
8 stop
9 up
10 up
11 stop
12 up
13 right
14 right
15 right
16 stop
17 stop
policy AxS from NATURAL actor critic algorithm
[[2.50000000e-01 9.96811025e-01 7.61399945e-02 0.00000000e+00
  9.99995654e-01 6.33866948e-04 9.99985042e-01 0.00000000e+00
  1.00000000e+00 1.00000000e+00 0.00000000e+00 1.00000000e+00
  1.80143837e-15 4.34798524e-21 7.70863360e-26 0.00000000e+00
  0.00000000e+00]
 [2.50000000e-01 2.34082025e-04 5.84616955e-03 0.00000000e+00
  2.67094933e-07 1.02884545e-04 3.55168431e-14 0.00000000e+00
  3.68332691e-39 1.24565357e-26 0.00000000e+00 6.87469592e-14
  6.96842989e-18 1.97882261e-21 1.43232035e-30 0.00000000e+00
  0.00000000e+00]
 [2.50000000e-01 1.15305088e-04 4.13189217e-03 0.00000000e+00
  3.79221104e-06 1.03735388e-03 1.49584110e-05 0.00000000e+00
  4.21030226e-32 1.12328402e-21 0.00000000e+00 7.62091515e-12
  2.37084012e-18 1.04396673e-25 6.44393882e-33 0.00000000e+00
  0.00000000e+00]
 [2.50000000e-01 2.83958740e-03 9.138


