In [1]:
import gym
import numpy as np
import csv
import json

In [2]:
import torch
import torch.nn as nn
from scipy.special import softmax

In [None]:
# Notebook-wide switch: when truthy, the control loop and the episode loop
# call env.render() after every environment step.
RENDER = False

In [3]:
class ValueNet(nn.Module):
    """Fully connected network: Linear -> Tanh -> Linear -> Tanh -> Linear.

    Args:
        input_dim: size of the input feature vector.
        hidden: width of both hidden layers.
        output_dim: size of the output vector.
    """

    def __init__(self, input_dim, hidden, output_dim):
        super().__init__()

        # Consecutive pairs of widths define the three linear layers; a Tanh
        # is inserted between layers but not after the last one.
        widths = (input_dim, hidden, hidden, output_dim)
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            if layers:
                layers.append(nn.Tanh())
            layers.append(nn.Linear(fan_in, fan_out))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for input `x`."""
        return self.model(x)

In [4]:
class MLP(nn.Module):
    """Three-layer perceptron with tanh activations after the first two layers.

    All three weight matrices are re-initialised with Xavier-uniform values;
    biases keep the default `nn.Linear` initialisation.

    Args:
        input_dim: size of the input feature vector.
        hidden: width of both hidden layers.
        output_dim: size of the output vector.
    """

    def __init__(self,
                 input_dim : int,
                 hidden : int,
                 output_dim : int):
        super().__init__()
        self.l1 = nn.Linear(input_dim, hidden)
        self.l2 = nn.Linear(hidden, hidden)
        self.l3 = nn.Linear(hidden, output_dim)

        # Initialise after all layers exist, in layer order, so the random
        # number stream is consumed in a fixed sequence.
        for layer in (self.l1, self.l2, self.l3):
            nn.init.xavier_uniform_(layer.weight)

    def forward(self, inputs):
        """Return the (un-activated) output of the last linear layer."""
        first_hidden = torch.tanh(self.l1(inputs))
        second_hidden = torch.tanh(self.l2(first_hidden))
        return self.l3(second_hidden)

In [5]:
class ModelWithPrior(nn.Module):
    """Adds the output of a gradient-free prior network to a trainable network.

    The forward pass returns `base_model(x) + prior_scale * prior_model(x)`,
    where the prior term is evaluated without gradient tracking.

    Args:
        base_model: network whose output participates in autograd.
        prior_model: network evaluated under `torch.no_grad()`.
        prior_scale: multiplier applied to the prior output.
    """

    def __init__(self,
                 base_model : nn.Module,
                 prior_model : nn.Module,
                 prior_scale : float = 1.0):
        super().__init__()
        self.base_model = base_model
        self.prior_model = prior_model
        self.prior_scale = prior_scale

    def _frozen_prior(self, inputs):
        """Evaluate the prior network with gradient tracking disabled."""
        with torch.no_grad():
            return self.prior_model(inputs).detach()

    def forward(self, inputs):
        """Return base output plus the scaled, gradient-free prior output."""
        # The prior is evaluated first, then the trainable network.
        frozen = self._frozen_prior(inputs)
        trainable = self.base_model(inputs)
        return trainable + (self.prior_scale * frozen)

In [6]:
class POLO(object):
    """Sampling-based controller that uses an ensemble of value networks.

    Each control step rolls out `K` noisy copies of the current action
    sequence `U` (length `T`) in the simulator, adds a terminal value read
    from the value-network ensemble, re-weights the noise by exponentiated
    return and updates `U`. Visited observations and the per-network best
    returns are kept in a ring buffer and used by `learn` as regression
    targets.

    Simulator state is saved/restored only for the environment ids
    "Pendulum-v0" and "HumanoidStandup-v2"; for any other id the rollouts are
    not rewound.
    """

    # Scale applied to the ensemble values before the softmax weighting in
    # get_aggregated_value.
    SOFTMAX_SCALE = 0.01

    def __init__(self, env, K, T, U, lambda_, noise_mu,
                 noise_sigma, u_init, memory_size, observation_space, action_space, state_space,
                 net_hidden_layers, num_nets, state_samples, gradient_steps, gamma=0.99, log_file=None,
                 noise_gaussian=True):
        """Set up buffers, noise, logging and the value-network ensemble.

        Args:
            env: gym environment used both for planning rollouts and control.
            K: number of sampled rollouts per control step.
            T: planning horizon (number of actions in `U`).
            U: initial action sequence, indexed as `U[t]`.
            lambda_: temperature of the exponential rollout weighting.
            noise_mu: mean of the Gaussian action noise.
            noise_sigma: standard deviation of the Gaussian action noise.
            u_init: value written into the last slot of `U` after each shift.
            memory_size: capacity of the replay ring buffer.
            observation_space: number of leading observation entries fed to
                the value networks (also the width of the observation buffer).
            action_space: unused; kept for interface compatibility.
            state_space: unused; kept for interface compatibility.
            net_hidden_layers: hidden width of each value network.
            num_nets: number of value networks in the ensemble.
            state_samples: mini-batch size used by `learn`.
            gradient_steps: number of mini-batch updates per `learn` call.
            gamma: discount factor.
            log_file: optional writable text file; a TSV header is written
                immediately when given.
            noise_gaussian: when False the *initial* noise is the constant
                0.9; noise drawn after each control step is always Gaussian.
        """
        self.memory_size = memory_size
        self.obs_dim = observation_space
        self.obs_mem = np.zeros((self.memory_size, observation_space))
        self.state_mem = [None for _ in range(self.memory_size)]
        self.targets_mem = np.zeros((self.memory_size, num_nets))
        # Initialised eagerly so `learn` can be called before `store_state`.
        self.memory_counter = 0

        self.num_nets = num_nets

        self.K = K  # N_SAMPLES
        self.T = T  # TIMESTEPS
        self.lambda_ = lambda_
        self.noise_mu = noise_mu
        self.noise_sigma = noise_sigma
        self.U = U
        self.u_init = u_init
        self.reward_total = np.zeros(shape=(self.K))

        self.state_samples = state_samples
        self.gradient_steps = gradient_steps

        self.env = env
        self.x_init = self._read_env_state(default=None)

        self.gamma = gamma

        # `np.float` was removed from NumPy; the builtin float is equivalent.
        self.max_reward_for_net = np.full(self.num_nets, -np.inf, dtype=float)

        self.log_file = log_file
        self.writer = None
        if log_file is not None:
            self.writer = csv.writer(log_file, delimiter='\t')
            headers = ["timestamp", "reward", "action", "state"]
            self.writer.writerow(headers)

        if noise_gaussian:
            self.noise = self._sample_noise()
        else:
            # Same (K, T, action_dim) layout as the Gaussian noise so the
            # weighted update of U has the right shape.
            self.noise = np.full(shape=(self.K, self.T, self.env.action_space.shape[0]),
                                 fill_value=0.9)

        self._build_value_nets(observation_space, net_hidden_layers, 1)

    # ------------------------------------------------------------------
    # Environment helpers
    # ------------------------------------------------------------------
    def _env_id(self):
        """Return the id string of the wrapped environment's spec."""
        return self.env.unwrapped.spec.id

    def _read_env_state(self, default=None):
        """Return the current simulator state, or `default` for unknown envs."""
        env_id = self._env_id()
        if env_id == "Pendulum-v0":
            return self.env.env.state
        if env_id == "HumanoidStandup-v2":
            return self.env.sim.get_state()
        return default

    def _restore_env_state(self):
        """Rewind the simulator to `self.x_init` (no-op for unknown envs)."""
        env_id = self._env_id()
        if env_id == "Pendulum-v0":
            self.env.env.state = self.x_init
        elif env_id == "HumanoidStandup-v2":
            self.env.sim.set_state(self.x_init)

    def _sample_noise(self):
        """Draw fresh Gaussian action noise of shape (K, T, action_dim)."""
        return np.random.normal(loc=self.noise_mu, scale=self.noise_sigma,
                                size=(self.K, self.T, self.env.action_space.shape[0]))

    # ------------------------------------------------------------------
    # Value networks
    # ------------------------------------------------------------------
    def _build_value_nets(self, input_dim, hidden, output_dim):
        """Create `num_nets` prior-augmented MLPs with their loss and optimizer."""
        self.value_nets = []
        self.loss_funcs = []
        self.optimizers = []

        for _ in range(self.num_nets):
            self.value_nets.append(ModelWithPrior(MLP(input_dim, hidden, output_dim),
                                                  MLP(input_dim, hidden, output_dim)))
            self.loss_funcs.append(nn.MSELoss())
            self.optimizers.append(torch.optim.Adam(self.value_nets[-1].parameters(), lr=0.01))

    def _get_reward_from_state(self, s):
        """Reward based on `s[0]`: 1.0 above 1.1, otherwise `1.0 - (1.1 - s[0])`."""
        root_z = s[0]
        if root_z > 1.1:
            return 1.0
        else:
            return 1.0 - (1.1 - root_z)

    def learn(self, env):
        """Fit every value network to the stored targets.

        Runs `gradient_steps` updates. Each update samples (without
        replacement) up to `state_samples` stored transitions and takes one
        optimizer step per network on the MSE between the network's
        prediction and that network's stored target column.

        Args:
            env: unused; kept for interface compatibility.
        """
        available = min(getattr(self, 'memory_counter', 0), self.memory_size)
        if available == 0:
            return
        # Never request more distinct samples than are stored.
        batch_size = min(self.state_samples, available)

        for _ in range(self.gradient_steps):
            sampled_idx = np.random.choice(available, size=batch_size, replace=False)

            obs_batch = torch.tensor(self.obs_mem[sampled_idx, :], dtype=torch.float)
            target_batch = torch.tensor(self.targets_mem[sampled_idx, :], dtype=torch.float)

            for i in range(self.num_nets):
                net = self.value_nets[i]
                loss_func = self.loss_funcs[i]
                optimizer = self.optimizers[i]

                optimizer.zero_grad()

                preds = net(obs_batch)
                # Give the target the same shape as the predictions;
                # mismatched shapes would be broadcast against each other
                # and every prediction compared with every target.
                target = target_batch[:, i].reshape(preds.shape)
                loss = loss_func(preds, target)

                loss.backward()
                optimizer.step()

    def get_aggregated_value(self, values):
        """Softmax-weighted combination of the ensemble values.

        Args:
            values: numpy array of per-network values.

        Returns:
            `sum(values * softmax(SOFTMAX_SCALE * values))` along the first axis.
        """
        weights = softmax(self.SOFTMAX_SCALE * values)
        return np.sum(values * weights, axis=0)

    def get_network_values(self, s):
        """Evaluate every value network on observation `s`.

        Returns:
            numpy array with one entry (the network output) per network.
        """
        obs = torch.as_tensor(np.asarray(s), dtype=torch.float)
        # Inference only: no autograd graph is needed here.
        with torch.no_grad():
            values = [net(obs).tolist() for net in self.value_nets]
        return np.array(values)

    # ------------------------------------------------------------------
    # Planning
    # ------------------------------------------------------------------
    def _compute_total_reward(self, k):
        """Roll out noisy sequence `k` and accumulate its discounted return.

        Adds the discounted rewards of the rollout plus the discounted
        aggregated terminal value to `reward_total[k]`, and raises
        `max_reward_for_net[i]` whenever this rollout's return (using network
        `i` as terminal value) exceeds it.
        """
        discount = 1
        self._restore_env_state()
        uses_state_reward = self._env_id() == "HumanoidStandup-v2"

        for t in range(self.T):
            perturbed_action_t = self.U[t] + self.noise[k, t]
            s, reward, _, _ = self.env.step(np.array([perturbed_action_t]))
            if uses_state_reward:
                reward = self._get_reward_from_state(s)
            # Some environments return a size-1 array as reward.
            self.reward_total[k] += discount * float(np.squeeze(reward))
            discount *= self.gamma

        # One scalar per network.
        network_values = self.get_network_values(s[:self.obs_dim]).reshape(-1)

        for i in range(self.num_nets):
            reward_for_net = self.reward_total[k] + discount * network_values[i]
            if reward_for_net > self.max_reward_for_net[i]:
                self.max_reward_for_net[i] = reward_for_net

        self.reward_total[k] += discount * self.get_aggregated_value(network_values)

    def _ensure_non_zero(self, reward, beta, factor):
        """Return `exp(-factor * (beta - reward))`."""
        return np.exp(-factor * (beta - reward))

    def _update_control_sequence(self):
        """Add the return-weighted average of the sampled noise to `U`."""
        beta = np.max(self.reward_total)  # maximum reward of all trajectories
        reward_total_non_zero = self._ensure_non_zero(reward=self.reward_total, beta=beta,
                                                      factor=1 / self.lambda_)
        eta = np.sum(reward_total_non_zero)
        omega = 1 / eta * reward_total_non_zero

        # Contract over the rollout axis: result[t] = sum_k omega[k] * noise[k, t].
        self.U = self.U + np.tensordot(omega, self.noise, axes=(0, 0))

    def _shift_control_sequence(self):
        """Drop the executed action, append `u_init`, reset rollout returns."""
        self.U = np.roll(self.U, -1, axis=0)
        self.U[-1] = self.u_init
        self.reward_total[:] = 0

    def get_action_and_targets(self, env):
        """Plan one step from the environment's current state.

        The simulator is rewound to the starting state after every rollout,
        so the environment is left where it was on entry (for the supported
        environment ids).

        Args:
            env: unused; the environment passed to the constructor is used.

        Returns:
            (action, targets): the first action of the updated sequence and
            the per-network maximum return seen over the `K` rollouts.
        """
        self.max_reward_for_net = np.full(self.num_nets, -np.inf, dtype=float)
        self.x_init = self._read_env_state(default=getattr(self, 'x_init', None))

        for k in range(self.K):
            self._compute_total_reward(k)
            self._restore_env_state()
        print(self.max_reward_for_net)

        self._update_control_sequence()
        action = self.U[0].copy()
        self._shift_control_sequence()
        self.noise = self._sample_noise()

        return action, self.max_reward_for_net

    def control(self, iter=1000):
        """Run the plan-then-act loop for `iter` environment steps.

        Each step plans from `x_init`, executes the first action of the
        updated sequence, prints a progress line and, when a log file was
        given, appends a record for the step.
        """
        for timestamp in range(iter):
            for k in range(self.K):
                self._compute_total_reward(k)

            self._update_control_sequence()

            # Rewind the simulator to the real state before acting.
            self._restore_env_state()
            action = self.U[0].copy()
            s, r, _, _ = self.env.step(np.array([action]))
            # Unwrap array-valued rewards.
            if np.ndim(r) > 0:
                r = r[0]
            if self._env_id() == "HumanoidStandup-v2":
                r = self._get_reward_from_state(s)
            print("timestamp: {}, action taken: {} reward received: {}".format(timestamp, action, r))
            # RENDER is a notebook-level flag; treat it as off when undefined.
            if globals().get("RENDER", False):
                self.env.render()

            self._shift_control_sequence()
            self.x_init = self._read_env_state(default=self.x_init)

            if self.writer is not None:
                # Log the action that was actually executed this step.
                self.write_record(timestamp, r, action, s)

            self.noise = self._sample_noise()

    # ------------------------------------------------------------------
    # Logging and replay memory
    # ------------------------------------------------------------------
    def write_record(self, timestamp, reward, action, state):
        """Append one TSV row (action and state JSON-encoded) and flush.

        Does nothing when no log file was supplied.
        """
        if getattr(self, 'writer', None) is None:
            return
        action_json = json.dumps(action.tolist())
        state_json = json.dumps(state.reshape(len(state), 1).tolist())
        self.writer.writerow([timestamp, reward, action_json, state_json])
        self.log_file.flush()

    def store_state(self, obs, state, rewards):
        """Store one transition, overwriting the oldest slot when full.

        Args:
            obs: observation vector written to `obs_mem`.
            state: simulator state object written to `state_mem`.
            rewards: per-network targets written to `targets_mem`.
        """
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0

        # replace the old memory with new memory
        index = self.memory_counter % self.memory_size
        self.obs_mem[index] = np.array(obs)
        self.state_mem[index] = state
        self.targets_mem[index] = rewards

        self.memory_counter += 1

In [7]:
# Experiment configuration. None of these values changes between episodes, so
# they are defined once instead of on every pass through the episode loop.
ENV_NAME = "HumanoidStandup-v2"
TIMESTEPS = 64  # T: planning horizon
N_SAMPLES = 128  # K: rollouts per control step
ACTION_LOW = -1.0
ACTION_HIGH = 1.0

STATE_SAMPLES = 32  # mini-batch size for value-network updates
OBS_DIM = 22  # leading observation entries fed to the value networks
STEPS_PER_EPISODE = 1000

noise_mu = 0
noise_sigma = 0.2
lambda_ = 1.25
gamma = 0.99

Z = 16  # update the value networks every Z environment steps

num_epos = 1000
for idx in range(num_epos):
    print("Running for episode {}...".format(idx))

    env = gym.make(ENV_NAME)
    try:
        print(env.observation_space)
        print(env.action_space)

        # Random initial action sequence, one row per planning step.
        U = np.random.uniform(low=ACTION_LOW, high=ACTION_HIGH,
                              size=(TIMESTEPS, env.action_space.shape[0]))

        # The context manager closes the log even if the episode raises.
        with open("polo_record_multiple_{}.tsv".format(idx), "w") as log_file:
            s = env.reset()

            polo = POLO(env=env, K=N_SAMPLES, T=TIMESTEPS, U=U, lambda_=lambda_, noise_mu=noise_mu,
                        noise_sigma=noise_sigma, u_init=0, memory_size=512,
                        observation_space=OBS_DIM, action_space=env.action_space.shape[0],
                        state_space=env.observation_space.shape[0], net_hidden_layers=16,
                        num_nets=6, state_samples=STATE_SAMPLES, gradient_steps=64,
                        gamma=gamma, log_file=log_file, noise_gaussian=True)

            rewards = []
            for t in range(STEPS_PER_EPISODE):
                a, targets = polo.get_action_and_targets(env)
                # Store the observation/state the plan started from, with its targets.
                polo.store_state(s[:OBS_DIM], env.sim.get_state(), targets)
                s, r, _, _ = env.step(np.array([a]))
                rewards.append(r)
                # The logged/printed "reward" is s[0], the entry that
                # POLO._get_reward_from_state reads, not the environment reward r.
                polo.write_record(t, s[0], a, s[:OBS_DIM])
                print("episode: {}, timestamp: {}, action taken: {} reward received: {}".format(idx, t, a, s[0]))
                if RENDER:
                    env.render()

                if t != 0 and t % Z == 0 and t >= STATE_SAMPLES:
                    print("Updating networks...")
                    polo.learn(env)
    finally:
        # One environment is created per episode; release it when done.
        env.close()

Box(376,)
Box(17,)




[2.01591893 1.55738503 1.2574061  0.38289886 1.62152857 1.77763039]
timestamp: 0, action taken: [-0.16792404 -0.79324572  0.0767476   0.70023252  0.82098363  0.6449306
 -0.45213628 -0.83539893 -0.15609682 -0.21286734  0.09015993 -0.75705853
  0.48681135 -0.59001441  0.4226868  -0.99859737  0.23199166] reward received: 0.10428574874116879
Creating window glfw
[1.70541435 1.50653618 1.08298179 0.56956718 1.39005444 1.78909673]
timestamp: 1, action taken: [-0.73653857 -0.15586092 -0.14371088 -0.10432229  0.08831533 -0.17644435
  0.07719761 -0.40905663 -0.75561846 -0.64262816 -0.89719175 -0.92814119
 -0.00216634  0.05566629  0.55410835 -0.0583927  -0.89675527] reward received: 0.11031195072479733
[1.84548632 1.71966774 1.1410338  0.48808991 1.52469618 1.85159722]
timestamp: 2, action taken: [ 0.00850785 -0.57781229  0.17043837 -0.06046941  0.14776939  0.931453
  0.55530095  0.85722542 -0.15930335 -0.07451103  0.875281    0.25834101
 -0.08624199 -0.2592055   0.58925958  0.44680039 -0.807133

[2.67585928 2.68749792 2.52952067 1.43703231 2.38485526 2.79571279]
timestamp: 24, action taken: [ 0.63628766  0.36565061  0.5797599  -0.07844764 -0.69229112  0.61019248
  0.0433036  -0.08325643 -0.54813755  0.0387551   0.49713292  0.47695855
  0.44849545 -0.25297567  0.55693223  0.4765762  -0.36096028] reward received: 0.09480816462579031
[3.10581597 3.18449707 2.73391887 2.03846809 2.76515526 3.33187923]
timestamp: 25, action taken: [ 0.28495308 -0.11972451 -1.01428853 -0.01862636  0.28013808 -0.98573628
  0.74263978 -0.10594583  0.93239547  0.08418362 -0.8299873  -0.28278748
 -0.00860428  0.85080588 -0.17154282  0.01096771 -0.80330376] reward received: 0.10776870297067706
[3.7657999  3.72839835 3.00127209 2.94238013 3.49680385 4.28271108]
timestamp: 26, action taken: [ 0.78554173 -0.81084684  0.86116755 -1.0184369  -0.12641682 -0.91480023
  0.21989499  0.03933474 -0.39641892 -0.45655319 -0.93150301  0.18927837
  0.19280025 -1.09951455 -0.02787657 -0.27764047  0.14198105] reward rece

[3.76152551 4.09246638 4.08470006 3.20991087 3.7051543  3.77071073]
timestamp: 48, action taken: [-0.66777267 -0.6222651   0.75154058 -1.01840121  1.10399413  0.01345485
  0.97234803 -0.60099947  0.79754487 -0.80138453 -0.62525414 -0.10768991
  0.03277175 -0.79931113 -0.12893924  0.18707804  0.66541913] reward received: 0.23814392204097598
Updating networks...
[4.0951448  4.10900944 4.23084137 3.58606268 4.06954996 4.1112915 ]
timestamp: 49, action taken: [ 0.70414954  1.06930913 -0.89371902  0.94269735  0.5800998  -0.77450301
  1.13137425 -0.07077066 -0.06548481 -0.63855438  0.94175178  0.38980279
  0.00746451 -0.5664724   0.6490059   0.79822836  0.98648892] reward received: 0.2320329495907414
[3.81604337 3.84019675 3.82833046 3.0590693  3.80446442 3.93764159]
timestamp: 50, action taken: [-0.34090227  1.06025659 -0.04977758  0.39444739 -0.49577142 -0.52079475
  0.75862897 -0.08164882  1.23474222 -0.64612433 -0.82012856 -0.17972465
  0.70952192  0.56675177 -0.5257289  -0.72466948  1.1

[5.73421692 5.88703236 5.8268509  5.23617565 5.7847001  6.03374397]
timestamp: 72, action taken: [ 0.17331742  0.04607438  0.02341332  0.24455546  0.06254949  0.46574091
 -0.11779389  0.08123781  0.00573983  0.12543165  0.19564015 -0.16276256
 -0.06228229  0.19096734  0.1168957   0.11668913  0.16331911] reward received: 0.13069201665243782
[5.95493577 6.03636862 6.15436024 5.40797685 5.89037341 6.00581406]
timestamp: 73, action taken: [ 0.35667629  0.24803653 -0.07781221  0.18183309 -0.07044844  0.2272358
 -0.13758163  0.08468429 -0.12061156  0.01084639 -0.0866331  -0.08482799
  0.03058666  0.1450808  -0.31126357  0.23822002  0.03483771] reward received: 0.13211055404343727
[5.98005125 6.11691218 6.39539075 5.47867707 6.07842604 5.99006794]
timestamp: 74, action taken: [ 0.10156309  0.07373467 -0.14888487 -0.10225635 -0.2244782  -0.0887154
 -0.23876249  0.15850903  0.12337906 -0.06094071  0.09683458 -0.16519525
  0.04358775  0.2328049   0.04130939 -0.01480368 -0.01936184] reward receiv

[17.10402805 17.05893684 16.86624012 16.77567362 16.68465653 17.22014738]
timestamp: 96, action taken: [ 0.04484019  0.16005257  0.08771815  0.08606178  0.09767381 -0.35935747
  0.05327497  0.03404103  0.19738176 -0.11333805  0.06358948 -0.18015417
  0.01226054 -0.20609289  0.09093989 -0.08322857 -0.1315745 ] reward received: 0.2923321801000424
Updating networks...
[18.59491603 18.5109931  18.0653791  18.07731821 18.35710598 18.56837521]
timestamp: 97, action taken: [ 0.1043027   0.12309218  0.35874691  0.02650457 -0.02297198 -0.3980268
 -0.06395256  0.07032741  0.19182004 -0.42907129 -0.14266979 -0.04062846
 -0.1914604   0.20936055 -0.28396345 -0.17672094  0.21906974] reward received: 0.31271685880755684
[18.84928576 18.79932889 18.67866425 18.59822925 18.67700562 18.57586153]
timestamp: 98, action taken: [-0.09321081  0.22138912  0.2081287   0.15289371 -0.15098285 -0.3574182
 -0.12342633  0.4327706   0.0222358  -0.34138721 -0.00816262 -0.08715239
  0.40241003 -0.03185952 -0.14313776 

[20.38552552 20.19149613 19.96136431 19.85715528 20.14941973 20.1003436 ]
timestamp: 120, action taken: [-0.32357195 -0.10185295  0.0028714  -0.21641235 -0.0209587  -0.01736072
 -0.09981881 -0.06306755  0.11751158 -0.34271845 -0.19190304  0.03611292
  0.0578365  -0.38153995  0.06318992 -0.1065333   0.05995983] reward received: 0.4548823233278564
[20.29349385 20.07150219 19.95667583 19.87031575 20.04093559 20.175995  ]
timestamp: 121, action taken: [ 0.25085852 -0.25950847  0.03110311 -0.08747415 -0.53650565  0.02575488
 -0.21420786  0.11597624  0.31208339 -0.17174696  0.08423185 -0.10566528
 -0.04900371  0.05642869  0.2694314   0.09524191  0.00894921] reward received: 0.4550860099433381
[20.13450733 20.293012   19.9334949  19.87382793 20.0111631  20.17638693]
timestamp: 122, action taken: [ 0.11753101  0.01287629  0.27440645  0.08784029 -0.09398228 -0.08643096
 -0.07773583  0.10077276 -0.11845336  0.04101136  0.14344357 -0.17606137
 -0.14144082  0.17281155  0.08990805  0.2147133  -0.11

[21.04401499 21.37545353 20.73961809 20.84408375 20.8795499  20.90372924]
timestamp: 144, action taken: [ 0.23502836 -0.02088557  0.31150116  0.21275655 -0.01977046  0.06365559
  0.01838137 -0.27543154 -0.12502656  0.17970449  0.34219268 -0.01477446
  0.09295542  0.07582896 -0.02611796 -0.14205149  0.14326003] reward received: 0.4470775448409883
Updating networks...
[21.62062812 21.75570744 21.3775467  21.5396808  21.57524086 21.64204586]
timestamp: 145, action taken: [-0.01242736  0.04314402  0.15910716  0.08908193  0.35285588 -0.18370618
 -0.01471098  0.00452214  0.10817098 -0.22676956 -0.04605689 -0.20499397
 -0.13966476 -0.05010357  0.23537062  0.11174789  0.152146  ] reward received: 0.44482228171793875
[21.726513   21.97418672 21.3925562  21.46401626 21.54232656 21.6676254 ]
timestamp: 146, action taken: [ 0.11211382  0.09367012  0.10202678 -0.01224112 -0.24348519 -0.0646814
  0.09221364  0.13584619  0.45447099 -0.42680254  0.00803387  0.00888503
  0.19092132  0.16101288 -0.16515

[22.68104955 22.93729798 22.44396622 22.53623001 22.6911639  22.79466339]
timestamp: 168, action taken: [ 0.01897833  0.08964003  0.04496596 -0.05479968  0.03156415 -0.02169414
 -0.14495655  0.09115033 -0.07947747 -0.26069909 -0.0016761   0.18245216
  0.14535114 -0.01812502 -0.02823749 -0.19971738 -0.1013181 ] reward received: 0.4361739737052769
[22.72286051 22.92016176 22.33370739 22.48491263 22.36184082 22.83215417]
timestamp: 169, action taken: [ 0.0780376  -0.09214184 -0.00119208 -0.08776474 -0.14581187 -0.18023642
  0.02294599 -0.28190645 -0.03551073 -0.07061336 -0.10627805 -0.20355748
 -0.32847251  0.03892805  0.11353324 -0.23834783  0.01412143] reward received: 0.43537570812404486
[22.54842194 22.76052161 22.49654523 22.50993312 22.5409549  22.78300193]
timestamp: 170, action taken: [-0.32226842  0.21181071  0.03711963 -0.01831261  0.24667802 -0.36286466
 -0.0512426  -0.05960813 -0.02790728 -0.02171591  0.03949748 -0.11994371
  0.10083615  0.03405267 -0.08801813 -0.13454063 -0.2

[23.10599308 23.29202459 23.08029837 23.13420523 23.08912307 23.4136936 ]
timestamp: 192, action taken: [ 0.10928512  0.0305256   0.19784457 -0.15933803 -0.07575738  0.48219946
  0.27774527  0.35256676 -0.35431575 -0.03482189 -0.14760659  0.1992336
 -0.14573891  0.15010705  0.12018585  0.00665598  0.10309885] reward received: 0.4393877256040518
Updating networks...
[23.82264016 24.03240287 23.62049047 23.81264986 23.70779582 23.96682167]
timestamp: 193, action taken: [-0.1981699   0.28825663  0.16003548  0.26715125  0.08584414  0.13794411
  0.06229262  0.0412804  -0.05641275 -0.14232088  0.29587797 -0.06818587
 -0.08585641 -0.17172156 -0.14146766  0.29779916 -0.12571047] reward received: 0.4424338240740218
[23.8654886  23.99326269 23.72135521 23.70129649 23.64187833 24.00565219]
timestamp: 194, action taken: [-0.06676516  0.01816876  0.1115777   0.13358296 -0.10695894  0.18099086
  0.02777578 -0.1183271  -0.12560417  0.01724974  0.03645985  0.25842409
  0.14844691  0.01436427  0.200039

[24.72578798 24.67506771 24.42627383 24.48912229 24.73909711 24.61407737]
timestamp: 216, action taken: [-0.10134583  0.11959526  0.5387579   0.32432595  0.19176585  0.12009936
 -0.10041427  0.19951873 -0.15881173  0.09028337  0.00377226 -0.17351431
  0.10278414 -0.25253118 -0.10245702 -0.14283917 -0.2243611 ] reward received: 0.4628653235997343
[24.51178333 24.60650391 23.98342189 24.29856903 24.4402895  24.497192  ]
timestamp: 217, action taken: [-0.0784759  -0.06528037  0.14787235  0.03130316 -0.06976034  0.03429538
  0.23990067 -0.12133895  0.39395453  0.01354986  0.03319722 -0.0369784
 -0.15609483  0.1009219  -0.12548263  0.03968369 -0.11038719] reward received: 0.46409455780001596
[24.50160075 24.60618242 24.24877414 24.39367888 24.52542206 24.71795618]
timestamp: 218, action taken: [-0.1566657   0.12732749  0.11345261  0.10785534  0.40192389  0.06838701
 -0.01323572  0.11526347  0.2342687   0.04791471  0.07263091 -0.1248327
  0.19677293 -0.04102787 -0.00870592  0.01579295 -0.081

[25.63433507 25.62003096 25.27012987 25.37233231 25.5338063  25.77896363]
timestamp: 240, action taken: [-0.19054552 -0.26704962  0.21951166 -0.14339203 -0.19134315 -0.07702433
 -0.13584338 -0.01716567 -0.08489712 -0.06431453 -0.23953647  0.01283582
 -0.05664337  0.24581228 -0.11158369  0.00989002 -0.07756629] reward received: 0.4707178395027195
Updating networks...
[25.82460136 25.8641819  25.56570633 25.56642963 25.77813919 25.91308615]
timestamp: 241, action taken: [-0.07696646 -0.14842564  0.25460283  0.05896808 -0.0792746   0.02353084
  0.19297603  0.05712462  0.1241277  -0.33270702  0.24091398  0.01933569
  0.09350886  0.17327667 -0.00655411 -0.01284877 -0.05193046] reward received: 0.4712900265314846
[25.55568695 25.63776178 25.35961219 25.23426514 25.43082861 25.6562097 ]
timestamp: 242, action taken: [ 0.03885738 -0.22399521 -0.01930566  0.20929039 -0.15862413  0.06092012
  0.02096059 -0.2657443   0.3318735   0.02468817  0.14143678 -0.14313115
 -0.33795249 -0.30446474  0.19342

[26.70541395 26.97790332 26.65304157 26.731142   26.37120894 26.93688922]
timestamp: 264, action taken: [-0.19784299 -0.16282547  0.21539829 -0.11901834  0.02554207 -0.11412905
  0.1394081   0.02596937 -0.00364823 -0.39758287 -0.22828399  0.07884904
 -0.23245389 -0.22343314  0.10628316 -0.08532069 -0.0656059 ] reward received: 0.47632779623656457
[26.64476197 26.7848427  26.32790514 26.37423649 26.39822995 26.83932033]
timestamp: 265, action taken: [ 0.17316766  0.05808557 -0.04360236 -0.02933634  0.13570249  0.10836067
 -0.02644229  0.23818712 -0.08372298 -0.1074284   0.04582338  0.06090555
 -0.07478055  0.32264004  0.02591263 -0.2132979   0.07788192] reward received: 0.4754954154936987
[26.69892454 26.94436557 26.59233338 26.62159102 26.40659045 26.87876526]
timestamp: 266, action taken: [ 0.17070184 -0.07020367  0.19004063  0.23427541 -0.21220648  0.19134773
  0.05635981  0.2216222   0.20188653  0.07190775 -0.12147222 -0.04099008
  0.04407239 -0.08006347 -0.12658314  0.13016322 -0.2

[26.37096072 26.45279545 26.08150322 26.09067205 26.19174033 26.61448376]
timestamp: 288, action taken: [ 0.18067614 -0.1765509  -0.0713786   0.22548315 -0.30181062 -0.17003168
  0.00036152  0.03348404  0.30119248 -0.03643731  0.00097422 -0.14799333
 -0.10980485  0.02286215 -0.09994195 -0.11191654 -0.08979155] reward received: 0.4639627569812857
Updating networks...
[26.92986655 27.01377607 26.76456692 26.90591956 26.70886023 27.17705055]
timestamp: 289, action taken: [ 0.08877024  0.05808632  0.04184114  0.02892997 -0.10307045  0.03510309
  0.03162985 -0.03541733 -0.07039294  0.06369932 -0.02205294  0.05194531
  0.03258783  0.2047998  -0.21469068  0.07542127 -0.21319265] reward received: 0.4630690422439476
[26.90311901 27.23152759 26.78542313 26.80704182 26.77121964 27.17153524]
timestamp: 290, action taken: [-0.06803021 -0.3847771  -0.14186807  0.17046205  0.04202136  0.05306767
  0.06133159  0.32119338  0.05011079  0.04827621  0.03397534 -0.19585119
 -0.20867599 -0.16011452  0.09794

[27.39198771 27.53122809 27.1403713  27.24277925 27.15223272 27.56929412]
timestamp: 312, action taken: [ 0.15058991 -0.13395705  0.30932495  0.10462089 -0.14785571 -0.08643875
  0.19067784  0.05867272  0.33139373  0.08483875 -0.30612401 -0.18545684
  0.17780458 -0.14654956 -0.0879496  -0.01991208 -0.04946359] reward received: 0.47180827536747727
[27.42566841 27.5152013  27.00640464 27.23635112 27.10840457 27.6635005 ]
timestamp: 313, action taken: [ 0.01846708 -0.11775374  0.0875262   0.14820148  0.26189364  0.05630633
 -0.08569702 -0.10955632  0.32813014  0.03514251  0.02786127  0.15103869
 -0.04091415  0.22729289  0.07415274  0.12364914  0.02756532] reward received: 0.47209994140185224
[27.39143511 27.63302856 27.18017317 27.19438355 27.08317168 27.60698874]
timestamp: 314, action taken: [-0.32096162 -0.01579494  0.17936673  0.10270826 -0.33065066  0.10470557
 -0.08088415 -0.0516957   0.2715806  -0.04704111 -0.22654245 -0.12918528
 -0.12637558 -0.06172232 -0.00100988 -0.20183791 -0.

[27.20817604 27.34871354 27.13954432 27.05368302 27.02371069 27.35282947]
timestamp: 336, action taken: [ 0.28748226  0.11814092  0.2845336  -0.32975281  0.05434889 -0.17168158
 -0.1642782   0.22958862  0.02761809 -0.08476266  0.23692407  0.0005212
 -0.00491974  0.14232463 -0.19428541 -0.11457368 -0.10420284] reward received: 0.465218102897571
Updating networks...
[27.8215585  27.98204002 27.76092413 27.6688088  27.6325646  27.97033571]
timestamp: 337, action taken: [-0.14879321 -0.02618791 -0.07492913 -0.0515238   0.0327244  -0.06369476
 -0.06884203  0.25184656 -0.03517532 -0.13955685 -0.34342215  0.20177994
 -0.11419931 -0.08524838 -0.2251516  -0.08545955 -0.09090314] reward received: 0.4658955860691187
[27.73739022 27.93943421 27.59853554 27.74709237 27.51219204 28.00873333]
timestamp: 338, action taken: [-0.0606306   0.05130903 -0.11278981 -0.03485294  0.02429164 -0.14061532
 -0.12504958  0.10003966 -0.08742304 -0.26583096 -0.0086455   0.04589991
 -0.09530333  0.01524699 -0.0198237

[28.29931989 28.40515336 28.30460471 28.05060938 28.03817877 28.57536411]
timestamp: 360, action taken: [ 0.12460766 -0.20514232  0.05242717  0.05847227 -0.11566527  0.24737866
 -0.03232211  0.08440372  0.07641138  0.27403419  0.02225308  0.14281477
  0.06813782 -0.10717049  0.0904807   0.21750533  0.02598512] reward received: 0.4786775918564413
[28.18722269 28.40150214 28.17717311 28.16671074 28.0388347  28.50703831]
timestamp: 361, action taken: [-0.10288094 -0.27161373 -0.07804958 -0.13090469 -0.01002844  0.04341936
  0.08481536  0.10354772  0.0791552  -0.33992286 -0.28400658 -0.09967014
 -0.12866518 -0.03667623  0.19923115  0.00983596 -0.22403687] reward received: 0.4761689355076176
[28.22129184 28.35042254 28.0097414  28.12142766 27.93712285 28.39504641]
timestamp: 362, action taken: [ 9.61824388e-02  2.14829058e-02 -3.86710574e-04 -1.89728195e-01
 -1.85830599e-01 -1.19505573e-01  1.64915717e-01 -5.63974768e-02
  3.26774353e-01 -5.23674313e-02 -1.89658746e-02  4.25712987e-02
  1.4

[28.15163526 28.46452621 28.14137572 28.30234145 28.12423314 28.45522606]
timestamp: 384, action taken: [-0.00912667  0.14026842  0.07670405 -0.23847688 -0.10845739 -0.06393633
 -0.02813792 -0.00320812  0.16429417 -0.3158094   0.13769639  0.06259066
 -0.14726093  0.00895527 -0.0738869   0.13813194 -0.02316763] reward received: 0.47214141485349576
Updating networks...
[28.27342116 28.53594255 28.08237992 28.16319842 28.0732936  28.50537428]
timestamp: 385, action taken: [ 0.00801124 -0.00840901  0.33555246  0.0816247   0.10405618  0.06858432
 -0.12732992  0.01053556  0.0799147   0.20463422  0.13834988  0.06702518
  0.03970317 -0.15228372 -0.27447931 -0.01263752  0.21613943] reward received: 0.4735949901508631
[28.28415338 28.46016518 28.34553694 28.21208271 28.14052682 28.65000679]
timestamp: 386, action taken: [-0.11147858 -0.32522946  0.32199784 -0.25264047 -0.22387295  0.15489536
  0.04438085  0.03542296  0.17965552 -0.11492921 -0.46627355  0.0904189
  0.03518866 -0.06744595  0.09107

[28.36335282 28.69174136 28.20504929 28.28777423 28.10058265 28.58488935]
timestamp: 408, action taken: [-0.10179329  0.06155295  0.27345078 -0.07217448 -0.05835966 -0.24584449
  0.05588162 -0.12808131  0.00364694 -0.08604786 -0.08411278  0.05236911
 -0.16281133 -0.11827852 -0.16474947 -0.08990627 -0.06183617] reward received: 0.4713224392235934
[28.50450857 28.63930401 28.44661378 28.39918222 28.27594559 28.73649314]
timestamp: 409, action taken: [ 0.15863253 -0.30552675 -0.01049025 -0.00675861  0.12972662  0.08244281
 -0.12232952 -0.02364032  0.07519235  0.00606339  0.01135473 -0.03383201
  0.08693479  0.12789091 -0.06472068 -0.1007454   0.09767711] reward received: 0.4714305351919782
[28.35015238 28.6542052  28.29422916 28.3501833  28.12540473 28.75555247]
timestamp: 410, action taken: [ 0.0956543  -0.03587926  0.05014431  0.00849009 -0.17462391  0.38378177
 -0.01298028 -0.0159887   0.06128915 -0.06745379 -0.05694081 -0.06263605
 -0.12764059  0.0885223  -0.13755849  0.1612087  -0.21

[28.55802119 28.87600161 28.65918622 28.63242159 28.42010724 28.99025305]
timestamp: 432, action taken: [-0.05136058 -0.14293288  0.18446121  0.06649267 -0.19189057  0.16846634
  0.04552806 -0.18482705 -0.13579169  0.05813043  0.02749602  0.0022881
  0.33977821  0.11133648  0.00113268 -0.00588325  0.16143198] reward received: 0.4795808823886002
Updating networks...
[29.17207197 29.39919139 29.2123464  29.23501066 28.97740456 29.43243985]
timestamp: 433, action taken: [-0.08405306 -0.00397367  0.13953237  0.11752534 -0.3684149   0.13194735
 -0.16525404  0.05542282  0.1175552  -0.18528224  0.20269354  0.21134886
 -0.25154771 -0.07203401  0.34047409 -0.01229419 -0.20415154] reward received: 0.47868170850584923
[29.14467031 29.35749691 29.02039769 29.11350745 28.88310141 29.2476304 ]
timestamp: 434, action taken: [ 0.0662679  -0.14481534  0.1988605  -0.08234458 -0.09048001  0.01688136
  0.15280877  0.01748911  0.30405465 -0.19559266  0.00587366  0.10642216
 -0.01336724  0.09964653  0.03192

[28.86185006 29.11813713 28.86127851 28.95060879 28.64283371 29.27718983]
timestamp: 456, action taken: [ 0.25834752  0.09007809  0.0640164   0.11106734  0.2224531  -0.19308971
  0.06404552  0.10462686  0.3553458  -0.26879065 -0.01895276  0.32576111
  0.07469347  0.37277252 -0.10559103  0.17817459  0.09754984] reward received: 0.468910887636917
[28.97821448 29.11779826 28.75216461 28.99279667 28.86898354 29.28003376]
timestamp: 457, action taken: [-0.15653017 -0.13912263  0.11448024 -0.09974778 -0.08733723 -0.0123346
 -0.23661999 -0.0214868  -0.01226492 -0.07993093  0.13873814  0.11235514
 -0.08205398  0.17309415  0.04718454 -0.08285434  0.16762181] reward received: 0.4698677145916678
[28.88840031 29.17414218 28.81204496 28.90615726 28.61387762 29.23019573]
timestamp: 458, action taken: [ 0.09590569 -0.32053057  0.32442226  0.01078551 -0.14571538  0.09873947
 -0.05916537  0.07143564  0.14087127  0.03385552 -0.0585278   0.08144163
 -0.131343    0.02102625 -0.04308062  0.01182825 -0.4436

[28.91939015 29.33774907 29.05908332 29.06314468 28.89901852 29.34486378]
timestamp: 480, action taken: [ 0.15633536  0.12676646  0.15928105  0.21685004 -0.06310919 -0.19036398
 -0.06809051  0.05943153  0.0078397  -0.16000041  0.120743    0.09322121
 -0.09209615 -0.03230379 -0.19621978  0.12976017 -0.05593401] reward received: 0.47493492869432674
Updating networks...
[29.19934661 29.4293442  29.19825891 29.26858357 29.02978307 29.51884159]
timestamp: 481, action taken: [-0.12525098 -0.16088732  0.08007015 -0.00815936 -0.10267715 -0.0099429
 -0.11514567  0.10270021 -0.12056199 -0.18005555  0.07082484  0.10424647
 -0.03993817 -0.01376774  0.21058019  0.18445616  0.20195064] reward received: 0.47493457940781525
[29.27798478 29.52909693 29.42704387 29.43983471 29.18469576 29.64913076]
timestamp: 482, action taken: [ 0.15424501 -0.13754153  0.04903216 -0.01930664 -0.14854246  0.19589165
  0.00863995  0.16964419 -0.09528633 -0.24463499 -0.05254726 -0.01956971
  0.34318474 -0.29327034  0.2383

[29.40383902 29.60527652 29.37861859 29.46110359 29.25962299 29.5951894 ]
timestamp: 503, action taken: [-0.28375338  0.01263484  0.08787792 -0.42616271  0.04459563 -0.03558092
  0.06734126 -0.11405706 -0.02980373 -0.34290487  0.05563403 -0.18085311
  0.18866128  0.03910206 -0.33803199  0.05048974  0.04808417] reward received: 0.46960164554434686
[29.3494909  29.5743663  29.42537839 29.43317724 29.23518762 29.68007277]
timestamp: 504, action taken: [ 0.13915121 -0.004483   -0.20286187  0.03343317  0.06936201 -0.02268827
 -0.25927914 -0.10646822  0.04202894 -0.38998751  0.30614367 -0.06176964
 -0.16247979  0.00455395  0.11771207  0.16660223 -0.00583792] reward received: 0.4685728844252586
[29.39055376 29.58406852 29.29098187 29.50399718 29.20523741 29.67390317]
timestamp: 505, action taken: [ 0.05261598 -0.10965506  0.27723472  0.06006833 -0.29938965  0.31492244
  0.15539289 -0.11961224  0.14066348 -0.10057109  0.13205567 -0.05457177
  0.17723992 -0.03139131 -0.10475132 -0.2986573  -0.0

[29.47034195 29.78790993 29.63749093 29.80216389 29.34085536 29.81398933]
timestamp: 527, action taken: [-0.01598915  0.03855212  0.08368273  0.16556942 -0.29916889 -0.03546177
 -0.05068681 -0.07878055  0.37050598 -0.18839812 -0.21578811  0.07316007
  0.0490465  -0.15396873 -0.16053746  0.08467178 -0.02354327] reward received: 0.46697866310509595
[29.55072081 29.71698373 29.60747711 29.74716088 29.37392568 29.7928303 ]
timestamp: 528, action taken: [-0.13727806 -0.08841428 -0.03830555  0.1134973  -0.09840398 -0.20533548
 -0.04207439  0.01397435  0.10911709 -0.17282102 -0.01240839 -0.08649803
  0.36164087 -0.25774636 -0.04943393 -0.02859723 -0.02609724] reward received: 0.46798127314534066
Updating networks...
[30.26550804 30.72900994 30.43638143 30.36651249 30.09203016 30.72658891]
timestamp: 529, action taken: [ 0.13056678 -0.08394693  0.00433757 -0.03358689 -0.04046563 -0.1367993
  0.00237204  0.37282512 -0.33630263 -0.12384911  0.19622959  0.15172751
  0.01518152  0.05307656  0.1551

[30.5141753  30.90152362 30.86548189 30.6865374  30.36886918 30.84693973]
timestamp: 551, action taken: [-0.02527688 -0.15986653 -0.3388394  -0.05055735 -0.16289488  0.08669757
  0.02043276 -0.39704068  0.06569709 -0.03294889 -0.04818168  0.24920061
  0.03156656 -0.04012157  0.07351757  0.17211317 -0.20176188] reward received: 0.47776567659672803
[30.48764471 30.79075877 30.7103175  30.63812879 30.43427485 30.88549514]
timestamp: 552, action taken: [-0.1013533  -0.17071437 -0.00468136  0.1177376   0.07198613 -0.018176
 -0.10671301  0.03668071  0.17035538 -0.05882459  0.12860959 -0.05994508
 -0.0416618  -0.25973412 -0.0998924  -0.022258   -0.12049368] reward received: 0.4769213836383512
[30.64149582 30.98362155 30.78908324 30.6586385  30.29471249 30.91726335]
timestamp: 553, action taken: [-0.15648397 -0.1784884   0.02834946  0.1107054  -0.03540385  0.038158
  0.07176219  0.02989353  0.12590219 -0.12124703  0.16769893  0.0006508
  0.08686622 -0.18067386 -0.07320825 -0.0569638  -0.142107

[31.17710645 31.31417334 31.32968231 31.20826099 30.98793548 31.43205312]
timestamp: 575, action taken: [-0.17893686 -0.07771936  0.32375593 -0.02698699 -0.24053314  0.02686955
  0.30425426 -0.0123161   0.10412087  0.00827723  0.06500352  0.31491749
  0.05404108  0.03795709 -0.02341195 -0.03820143  0.28350679] reward received: 0.4720294250983745
[31.18701305 31.45150451 31.33095828 31.17076071 30.9540929  31.47695287]
timestamp: 576, action taken: [ 0.06403818  0.01292457  0.45209228 -0.04491407 -0.16924585  0.05201389
  0.3646441   0.2500186   0.21996299 -0.18425561  0.02593537  0.15923646
 -0.12292916 -0.18147926  0.02538525 -0.04486027  0.01126111] reward received: 0.47286216311923396
Updating networks...
[31.57430126 31.95082964 31.69472426 31.69038532 31.55021446 31.91180248]
timestamp: 577, action taken: [-0.13136464 -0.00544933  0.3499315   0.12670971  0.07033243  0.28816199
 -0.15996789 -0.15374493  0.07217902  0.30406508  0.17309582  0.10474111
 -0.05376684 -0.38865703 -0.1948

[31.87103736 32.2494394  31.86375022 31.8572244  31.66297739 32.19066107]
timestamp: 599, action taken: [-0.06253117 -0.22802688  0.04994055 -0.06793044 -0.00729068 -0.29771087
  0.01228844  0.07169968 -0.08730945 -0.30215497  0.15287567 -0.35648354
 -0.29631566 -0.27184288 -0.18679696 -0.34275247  0.11702166] reward received: 0.4729875814002584
[31.80075788 32.0852159  32.05162338 31.93977783 31.74572588 32.132963  ]
timestamp: 600, action taken: [-0.13439534  0.02498826 -0.08425141 -0.17490151  0.14181358 -0.10372314
 -0.07429942  0.18436275  0.3583342  -0.06809884 -0.1632369  -0.29548851
  0.08002604  0.08240121  0.09206798  0.04807983  0.06091142] reward received: 0.47043040570974
[31.7949872  32.28810483 32.06373124 31.94291046 31.75660163 32.22018574]
timestamp: 601, action taken: [ 0.07939458  0.02348175  0.31907324  0.05280871 -0.01128691  0.27606922
  0.1685557  -0.37639805  0.13063484  0.03735002  0.00676015  0.14060378
 -0.12225796 -0.01205636  0.09876183 -0.16410356  0.2527

[32.14398595 32.51574162 32.44908768 32.36125423 32.0373165  32.46007603]
timestamp: 623, action taken: [-0.2087107   0.13201623  0.22862758  0.12022712 -0.02977655  0.10139467
 -0.01063799  0.14070822 -0.08630132 -0.16855403  0.12936978 -0.17189722
  0.22335836 -0.11544655 -0.17838015 -0.14670134 -0.1413857 ] reward received: 0.4755219418923985
[31.96212586 32.23634454 32.15965662 32.18708089 32.06043782 32.4017413 ]
timestamp: 624, action taken: [ 0.43005179 -0.29402521  0.04226115 -0.08336737  0.08120135  0.03982784
 -0.05015812  0.11903651  0.18027913 -0.20281491 -0.12120776 -0.21783967
  0.10951918 -0.00192433  0.00688094 -0.15509884  0.00111402] reward received: 0.4755814636525054
Updating networks...
[32.22512419 32.61587397 32.35245021 32.43072508 32.21224696 32.6877449 ]
timestamp: 625, action taken: [ 0.00896559 -0.18964803  0.16655395  0.01922347  0.24733009  0.07328814
 -0.03354693  0.04987258  0.27405719  0.05232942 -0.17303825  0.10845139
 -0.09168236 -0.19295416 -0.08630

[32.17657457 32.45383883 32.2082959  32.37558272 32.00539966 32.50268343]
timestamp: 647, action taken: [ 0.04173305 -0.18350629 -0.13799367  0.32247359 -0.05643381  0.04469092
  0.26156551 -0.16290053  0.02239669  0.23097531  0.39976136 -0.09210537
  0.11637389 -0.0279916  -0.18782458 -0.06067315 -0.0117809 ] reward received: 0.4712885703201544
[32.22726841 32.5408722  32.29907824 32.45102747 32.10637143 32.66016718]
timestamp: 648, action taken: [-0.14314136 -0.35492566  0.24134525  0.24829095 -0.14406181 -0.19999604
 -0.18716423  0.05306547 -0.0276792   0.00277315 -0.13345212  0.05690838
  0.004527   -0.13951029 -0.25659831  0.12597802  0.17198801] reward received: 0.4714183049262693
[32.1913936  32.43761859 32.12946121 32.24771328 31.97303411 32.49011628]
timestamp: 649, action taken: [ 0.11790183 -0.19536298  0.27282433  0.0675945  -0.07848239 -0.03718837
 -0.10121231 -0.15802194  0.08058777 -0.05344046 -0.29413029 -0.24943235
  0.25929892  0.19381393 -0.1334379  -0.44702496 -0.10

[32.55608474 32.94355913 32.77015283 32.7534699  32.37767234 32.91337298]
timestamp: 671, action taken: [ 0.26794084 -0.35485001  0.2347325  -0.10100715 -0.15702441  0.06492229
  0.2079669   0.07129543 -0.30528153 -0.26115503 -0.17195238 -0.01272783
  0.08269238  0.04228239  0.00127902 -0.18221317  0.26713199] reward received: 0.47763115615051466
[32.67743873 33.12287365 32.94321539 32.94602037 32.55812269 33.09527695]
timestamp: 672, action taken: [-0.14451266 -0.02786576  0.21738693 -0.32325263 -0.21617917  0.0428664
  0.0269993   0.0513057   0.07934013 -0.21051093  0.07616211  0.02499469
 -0.09645867 -0.27375554  0.02084446 -0.20309004  0.15958952] reward received: 0.4774760930109158
Updating networks...
[32.73616765 33.11371128 32.84843091 32.94899025 32.61849548 33.1400469 ]
timestamp: 673, action taken: [-0.224537   -0.13477162  0.08751815  0.06518295  0.00268409  0.10321569
 -0.31419497  0.08275108  0.02108813 -0.10997002  0.02345461  0.27663137
  0.37194054 -0.26266936  0.01388

[32.75409548 33.08718824 32.99697565 33.03951504 32.67571687 33.10480069]
timestamp: 695, action taken: [ 0.28147452  0.15384721  0.08362174 -0.03998394 -0.06881131 -0.08258679
 -0.1664609   0.29777758 -0.20078934 -0.10733751 -0.00365607  0.11521045
  0.1105939  -0.04596262  0.02649322  0.05208044 -0.02728188] reward received: 0.4660529619374924
[32.73367593 32.92117984 32.76773953 32.92162539 32.61802401 33.0914367 ]
timestamp: 696, action taken: [ 0.0982179   0.04366646  0.14964937  0.05806524  0.29704592  0.21094066
  0.15292873  0.04491558  0.05861053 -0.00530567  0.1517193  -0.13223882
 -0.05765889  0.27003277  0.16166282 -0.0744112  -0.03908696] reward received: 0.46772454363546484
[32.68437068 32.95237288 32.95797275 33.19127056 32.69378443 33.11994838]
timestamp: 697, action taken: [-0.08437967  0.07946394  0.12424328 -0.09963252 -0.11743571  0.29172739
  0.02336711 -0.02646665  0.10837033 -0.09407272 -0.20305751  0.18909389
 -0.05842242  0.11768826 -0.13179088  0.10428324 -0.1

[32.82128739 33.11396745 33.02396595 33.05754856 32.78056823 33.20922758]
timestamp: 719, action taken: [-0.21087917 -0.08772768 -0.00062598 -0.09997223 -0.06419455 -0.12271097
 -0.06442158 -0.01824341  0.02802039 -0.05519774 -0.1492049   0.00515353
  0.21409682  0.29665562  0.13791706 -0.06289334 -0.25222107] reward received: 0.4671354998866782
[32.95696094 33.09977734 33.08450231 32.94863037 32.89562625 33.18759311]
timestamp: 720, action taken: [-0.23506823 -0.38037832  0.30642522 -0.02873721 -0.39500483 -0.02415272
  0.19473668  0.1223041   0.05834116 -0.22508039  0.23360999  0.03411835
 -0.06290954  0.00506463  0.05605829 -0.02193526 -0.11210516] reward received: 0.46777105004760433
Updating networks...
[32.78247709 33.30204032 33.11965863 33.29399766 32.83107254 33.27288404]
timestamp: 721, action taken: [ 0.09277087  0.05384814  0.08688997  0.1527288  -0.29977056 -0.08207991
 -0.05399057  0.04443679 -0.10379064  0.0298417  -0.00035651  0.14336421
  0.02837136 -0.12579074  0.1536

[33.22999767 33.59759852 33.46578452 33.38898016 33.0808535  33.61789705]
timestamp: 743, action taken: [ 0.22117857 -0.00970273 -0.2198539   0.05941234 -0.10178115 -0.29815115
  0.06406269  0.24844031  0.02011571 -0.12129308  0.11814614 -0.2319446
 -0.13856475  0.23148103 -0.08708511  0.04821555 -0.06959109] reward received: 0.4586048648153006
[33.22985751 33.60818838 33.56524848 33.50314988 33.01731859 33.52137445]
timestamp: 744, action taken: [ 0.05903761  0.15697575  0.1138979  -0.1313956   0.17044179  0.03619316
 -0.13357177  0.11121816  0.0352946   0.08995244 -0.26244592 -0.19296597
  0.08673232  0.06392342  0.33896133 -0.07556128  0.084399  ] reward received: 0.4595117035337448
[33.31290995 33.60716235 33.72294679 33.59555861 33.13549427 33.68634247]
timestamp: 745, action taken: [ 0.07639265  0.13288651  0.32597382 -0.3009804  -0.28608962  0.13117638
  0.09521316 -0.26707383 -0.06044913 -0.44304292 -0.06330932 -0.05769479
  0.21666719 -0.06992244  0.03605295  0.06376459 -0.117

[33.34123302 33.80077005 33.74583529 33.65134108 33.23902032 33.7841617 ]
timestamp: 767, action taken: [ 0.04924172 -0.39139041 -0.2771579   0.09320524  0.122038    0.1471861
 -0.13548688 -0.19890856  0.22096269 -0.21170864  0.00332841  0.04493825
 -0.01371888  0.10829841  0.11384011 -0.05541421 -0.07557807] reward received: 0.46385057010401604
[33.27240008 33.73028337 33.67846299 33.65528829 33.38142287 33.84483254]
timestamp: 768, action taken: [-0.27911683 -0.31242556  0.19447468 -0.16813775 -0.16288667  0.14870359
 -0.23508663 -0.04394293 -0.21744917  0.00221614  0.1252538  -0.18793054
  0.25954138 -0.01135755  0.16400119 -0.04745058  0.14692373] reward received: 0.4637103136613814
Updating networks...
[33.55812668 33.95940066 33.61786941 33.71687088 33.39609831 33.97690524]
timestamp: 769, action taken: [ 0.02037012 -0.12687283  0.23884708 -0.15427476 -0.18350846 -0.20488033
  0.1778226  -0.27406948  0.12712503 -0.19895381  0.16011299 -0.06051645
  0.03407503 -0.07002812  0.01310

[33.14657462 33.67246618 33.63859853 33.64066868 33.15073964 33.76394825]
timestamp: 791, action taken: [-0.06344194 -0.14453278  0.14285568  0.05919496 -0.08305673 -0.07475383
  0.07206427  0.24318899  0.02396186 -0.09059341  0.23308257  0.10303613
  0.16891499  0.07741044 -0.18453522 -0.430574   -0.12803526] reward received: 0.46585255491377625
[33.30061863 33.55795929 33.60903205 33.65577483 33.20024274 33.70095728]
timestamp: 792, action taken: [-0.11080646  0.00715418  0.28238398  0.09741193 -0.1168906  -0.101561
 -0.10036376 -0.08298763  0.1753568   0.07464486  0.11128004 -0.03357286
 -0.33705169  0.03938293  0.01852921  0.00866524 -0.01673126] reward received: 0.46592875992935906
[33.18551599 33.59262826 33.7305416  33.66326611 33.14499703 33.70787417]
timestamp: 793, action taken: [ 0.17683843 -0.3198439   0.15888929  0.02943691 -0.0177697  -0.17325309
  0.059041    0.13192959  0.06946098 -0.06758898  0.11790716 -0.13868945
 -0.03988045 -0.10985778 -0.13373354 -0.06859235  0.16

[33.54050121 34.04123687 33.99341582 34.04262734 33.4559548  34.02330924]
timestamp: 815, action taken: [-0.11385838 -0.00933819  0.25613384  0.41098629 -0.15204057  0.16396924
  0.116027    0.07354701 -0.13456134 -0.18144093 -0.0649869   0.0812605
 -0.03352222 -0.05848784 -0.14482756 -0.233441   -0.08240203] reward received: 0.4656368063637862
[33.61505549 34.05214363 33.97782202 33.96481663 33.57520067 34.11262776]
timestamp: 816, action taken: [-0.21555348 -0.0637339   0.32695838 -0.16206923 -0.00273332 -0.02200202
 -0.06355485  0.21486656  0.27808848 -0.04518557  0.12509123 -0.07951267
  0.08075226  0.2022024  -0.01997396 -0.00879342 -0.0095878 ] reward received: 0.46560011736248463
Updating networks...
[34.06749623 34.39269984 34.20319105 34.22593789 33.85211203 34.22673266]
timestamp: 817, action taken: [-0.02554037 -0.19628938  0.23160486 -0.40072523 -0.00713643  0.1447978
 -0.0332025  -0.05812354  0.12799077  0.19284334 -0.05434362 -0.09712289
 -0.01742343 -0.12609329  0.113668

[33.98542981 34.30050219 34.20576202 34.21616423 33.79757245 34.29381053]
timestamp: 839, action taken: [ 0.07793399  0.0611838   0.19256733 -0.18052729 -0.15938915  0.06140725
  0.22780393 -0.1665228  -0.07096051 -0.01659429  0.05441943 -0.22007647
  0.10367552  0.069983   -0.06402285  0.15684399  0.12829674] reward received: 0.4636733145880462
[33.8822717  34.28264036 34.11277749 34.19248799 33.76559387 34.33597922]
timestamp: 840, action taken: [ 0.08822552 -0.21964889 -0.09931572  0.00560374  0.00570378 -0.25491259
  0.18841151  0.14650797  0.06032533 -0.03051613  0.1666069  -0.26503212
  0.11209584 -0.11935588  0.12315154 -0.05220965  0.15687913] reward received: 0.46310371983803794
[33.83903463 34.37614251 34.26899678 34.31042591 33.72315513 34.36009556]
timestamp: 841, action taken: [ 0.03802219  0.16372408  0.07864162  0.22982704 -0.33151679  0.11369854
 -0.04308884  0.28498011 -0.07180587  0.09172987 -0.09254532 -0.04014808
  0.23127357 -0.10014029 -0.01781848  0.04482254  0.0

[34.27539886 34.71006297 34.58407755 34.65825299 34.18322534 34.6651652 ]
timestamp: 863, action taken: [-0.16242613 -0.05911173  0.06783089 -0.00077187  0.09700158  0.0385468
 -0.25169719 -0.06847834  0.13151006 -0.14141758  0.0794604   0.02855196
  0.02586151 -0.2164756  -0.54062543  0.05850095  0.11674246] reward received: 0.4681143157190357
[34.01469515 34.46100626 34.48772778 34.41778867 33.98718403 34.39007905]
timestamp: 864, action taken: [-0.16534605  0.03640645 -0.10785487 -0.07081234 -0.34470921  0.28047315
  0.09550346 -0.73396405  0.25681877 -0.38636669 -0.01013995 -0.04946097
  0.12464987 -0.07670037 -0.01977802 -0.31656206 -0.06892599] reward received: 0.46887454723087607
Updating networks...
[34.60410046 34.8581409  35.01953469 34.68220169 34.43218347 34.77413777]
timestamp: 865, action taken: [-0.09901575  0.12002373 -0.01515288  0.23848416 -0.24785541 -0.01903408
 -0.07613931  0.04505077  0.1288885  -0.0094151  -0.0943457   0.07612001
  0.03150339 -0.12475746 -0.01031

[34.23842995 34.59493949 34.76127281 34.62107779 34.1006516  34.64228058]
timestamp: 887, action taken: [-0.13750439 -0.11934978  0.21305587  0.05812401 -0.1139877   0.05102854
 -0.13908337  0.09455341  0.09612287 -0.05895155 -0.2508184  -0.24182167
  0.29551609 -0.06636233  0.17916477 -0.10659722  0.04166898] reward received: 0.4643710717106306
[34.09370355 34.64134994 34.68971835 34.6165444  34.05021482 34.64708643]
timestamp: 888, action taken: [-0.30232194 -0.00114376  0.01121848 -0.07381196  0.07603824  0.04548191
  0.08663485 -0.00264903  0.10256779  0.05158109  0.17683543  0.12374681
  0.05605391  0.11283451 -0.15032324 -0.0051728  -0.19953302] reward received: 0.4651637320132016
[34.11826834 34.65580857 34.53498176 34.51927967 34.26479613 34.60823914]
timestamp: 889, action taken: [ 0.00886783 -0.26092746  0.23575671  0.18912061 -0.25689209 -0.13827495
 -0.03634949  0.00623973  0.17324415  0.00345971 -0.02109388  0.05501415
 -0.02573937  0.00384113  0.04724001  0.13067602  0.11

[34.36731406 34.63221153 34.65966208 34.55235565 34.30763649 34.73394881]
timestamp: 911, action taken: [ 0.04329822  0.28462416  0.40330967  0.04114759 -0.07254778 -0.15394652
 -0.04138938 -0.11684829 -0.17113815  0.01828138  0.47527913  0.14130857
 -0.10397862 -0.24804279  0.03346965 -0.34064023  0.11975792] reward received: 0.46851136413655164
[34.02207055 34.64689607 34.56312201 34.44687411 34.05745264 34.53768419]
timestamp: 912, action taken: [-0.05425841 -0.24931775 -0.10303677  0.09613048 -0.10887525  0.03591656
 -0.19815454  0.01536874  0.30385234 -0.10616814  0.00992584 -0.04157547
  0.23336708 -0.18914667  0.21657509  0.13717914 -0.0988057 ] reward received: 0.46757993574264706
Updating networks...
[34.41194048 34.95855919 34.83435316 34.86432971 34.32649676 34.92216971]
timestamp: 913, action taken: [-0.11571219 -0.01176268  0.08844919 -0.16545964 -0.36420868  0.12850151
  0.15717825 -0.04638181  0.04564987  0.03733952  0.24058242 -0.15276189
 -0.05447551 -0.01170039  0.216

[34.58442859 35.16718134 35.14315824 35.05710731 34.54096656 35.00553892]
timestamp: 935, action taken: [-0.20241201  0.14200534  0.19616301  0.04309371 -0.15228875  0.22268912
  0.19855972  0.23725944  0.17379338 -0.15790489  0.12375668  0.1698247
  0.11911144 -0.21070749 -0.13799545 -0.11502613 -0.23253242] reward received: 0.4686119382871417
[34.55493527 35.07206782 35.12787968 34.91686928 34.4543649  35.06206278]
timestamp: 936, action taken: [ 0.03735453  0.08167658  0.36291437  0.00259414 -0.27946202 -0.05935084
  0.06345034  0.10294111 -0.07367792  0.18232994  0.13695162  0.05775352
 -0.20676071 -0.21502851  0.2247031   0.02797538 -0.19691834] reward received: 0.47104939961613623
[34.62313362 35.08585257 35.18169317 35.03160161 34.51473172 35.04352617]
timestamp: 937, action taken: [-0.19169735 -0.26149179 -0.10982748 -0.10452882 -0.07035872  0.07705097
  0.06366234 -0.1818998   0.04190332  0.03199961 -0.08080153 -0.09732045
  0.05407751  0.15425669  0.13511933  0.0737302   0.10

[34.81256588 35.26257118 35.37282165 35.14407418 34.80952431 35.05050323]
timestamp: 959, action taken: [-0.15665775 -0.16145386  0.24422694 -0.09052164  0.01470154  0.165444
 -0.09751538 -0.00745809  0.04800968 -0.14154274  0.14809171  0.13101416
 -0.32963947 -0.24762477 -0.05025326  0.11713764  0.17650499] reward received: 0.47461200160846656
[34.75156742 35.17616641 35.11600805 35.02672177 34.62055682 35.08789672]
timestamp: 960, action taken: [ 0.0470652   0.13121183  0.12508901  0.32435533 -0.2059983  -0.02552287
 -0.00774784 -0.01211011  0.05809629 -0.02981287  0.21457697 -0.01842778
  0.22686888 -0.13530427 -0.16371915 -0.16870302 -0.09957775] reward received: 0.4739434718719686
Updating networks...
[34.73304089 35.18927985 35.23198617 35.19088384 34.70971882 35.32545772]
timestamp: 961, action taken: [-0.0294771   0.17900912  0.32179969  0.1189613  -0.1744729   0.51042422
  0.10147901 -0.12648299  0.34270568  0.08652901  0.05584144  0.02335838
 -0.11354563  0.25126946  0.152005

[34.76285725 35.29911542 35.43298269 35.33411656 34.81060813 35.28455504]
timestamp: 983, action taken: [ 0.28291974  0.07160939  0.17989208 -0.29831524  0.12682996  0.02720592
  0.00862621  0.13483597  0.10442731 -0.07276998  0.22001815 -0.04876035
 -0.27162654 -0.15223411  0.02271712  0.0364282  -0.21986366] reward received: 0.46892792867832817
[34.91355906 35.25667546 35.26981818 35.28108401 34.7053028  35.20908722]
timestamp: 984, action taken: [-0.01874913  0.01696129  0.01294961  0.08083701 -0.15593807 -0.00592304
  0.03442687  0.05993822  0.06511687 -0.06968075  0.21316075 -0.04800088
 -0.07435007 -0.34435444  0.14742556 -0.24029656  0.25585478] reward received: 0.46870852053101925
[35.00267163 35.50659817 35.40102337 35.31448189 34.82979726 35.45121429]
timestamp: 985, action taken: [-0.26443713  0.22223703 -0.26793764 -0.21420712 -0.09843248  0.00510019
 -0.19135388 -0.04870028  0.03441107 -0.02506677  0.13829959  0.07214366
  0.00357379  0.21358773  0.25653136 -0.03884984  0.

[34.84527019 35.27662606 35.36023822 35.28580078 34.81836791 35.19285551]
timestamp: 1007, action taken: [ 0.01222582  0.10651738  0.26910328 -0.15871031  0.29917391 -0.05860226
  0.11081511  0.04497652  0.19045161  0.1616384   0.16742855 -0.17463578
  0.11439407 -0.09015542 -0.02387666  0.07832155 -0.02904342] reward received: 0.4651700852583338
[34.83328668 35.32406677 35.28715399 35.17402296 34.83138996 35.30051787]
timestamp: 1008, action taken: [ 0.1812254  -0.03499746  0.05622775  0.27409856 -0.11654816 -0.15713008
 -0.27145384  0.32479943 -0.089059   -0.23847114 -0.09143133 -0.13438788
  0.04362705  0.20739103  0.00446591  0.05723614 -0.14670797] reward received: 0.466071896243001
Updating networks...
[35.0645372  35.3656874  35.46931639 35.4306762  34.90792331 35.39477707]
timestamp: 1009, action taken: [-0.0096197  -0.34373559  0.09105226 -0.04419651  0.06345021 -0.16932205
 -0.02673252  0.19547654 -0.13758534 -0.15304092 -0.1240466   0.0320327
  0.12716704 -0.03804455  0.0433

[35.11723341 35.63973219 35.67828547 35.52474593 35.14155124 35.51261928]
timestamp: 1031, action taken: [-0.01058928 -0.41917037  0.15675932  0.07756174  0.28797342 -0.0522782
  0.07586368 -0.20465027  0.23137013  0.01332501  0.03978771  0.00438292
  0.04317988 -0.14709661  0.06127377 -0.0137927  -0.0493883 ] reward received: 0.4630987306073578
[35.26535778 35.79607013 35.68645925 35.61981734 35.24857373 35.70339228]
timestamp: 1032, action taken: [ 0.23852594  0.03465974  0.0833863  -0.08611197 -0.0688228  -0.03630631
  0.02332252  0.20924646  0.24917358 -0.04038974  0.26262668 -0.04253267
 -0.05427668 -0.20517259 -0.18198933 -0.15023174  0.0830704 ] reward received: 0.46334422353782634
[35.26028126 35.76506594 35.74649641 35.65476735 35.24211033 35.65813373]
timestamp: 1033, action taken: [ 0.00031754 -0.05854437  0.09709867 -0.12490172 -0.16964826  0.15130583
 -0.19794267  0.01736615 -0.14090011 -0.22037525  0.0381442  -0.15263498
  0.0785537  -0.01898674  0.07358414 -0.03772067 -0

[35.2105084  35.60934933 35.74599952 35.73276658 35.2713675  35.61606204]
timestamp: 1055, action taken: [-0.05962489 -0.28840808  0.27339275 -0.29113413  0.24624176  0.23112005
  0.14383928 -0.13577795  0.01284632 -0.13091713 -0.08698647 -0.05846622
  0.09975085 -0.19450689 -0.09682304 -0.00483723  0.08047789] reward received: 0.46962275260310815
[35.21381508 35.74104363 35.62014665 35.68486577 35.12094187 35.74349373]
timestamp: 1056, action taken: [-0.04146335  0.00185056  0.21009082  0.31046226  0.12257862  0.07910261
 -0.30913166 -0.25254552  0.01258213 -0.16445511 -0.01191093 -0.20961458
 -0.08857834 -0.13147723  0.04664247  0.0218339  -0.03748261] reward received: 0.46956720532678575
Updating networks...
[35.23297355 35.71564908 35.51973895 35.82005912 35.15924166 35.81243213]
timestamp: 1057, action taken: [ 0.16120724 -0.26505696  0.15811728  0.03383732 -0.07059157 -0.02667871
  0.26265977 -0.17235925  0.02476456 -0.34150644  0.13332457  0.20167548
  0.20900164 -0.19180302  0.

[35.27341079 35.75836673 35.83891863 35.79135324 35.36434115 35.75029601]
timestamp: 1079, action taken: [ 0.08728788  0.14610913  0.30313662 -0.23190504  0.22653352 -0.09340685
 -0.14508822 -0.08103856  0.17311369  0.13876316 -0.20020567 -0.02420797
  0.08068067 -0.04059016 -0.12085233 -0.19235707 -0.18793307] reward received: 0.46933836493956305
[35.15319201 35.71182238 35.63902123 35.66679237 35.15030857 35.70059677]
timestamp: 1080, action taken: [-0.14743086 -0.20344132  0.20971833 -0.06071808 -0.20426119  0.13978586
  0.01721794  0.04823037 -0.01021663 -0.33570555 -0.15955419 -0.16082105
  0.03794311 -0.14743432 -0.01508657  0.25258465  0.02578055] reward received: 0.46967842823819433
[35.19654635 35.84423679 35.93845334 35.85974941 35.21877047 35.79362078]
timestamp: 1081, action taken: [-0.05242222  0.04810879  0.08480299  0.17671216  0.25785098 -0.074804
 -0.22969056  0.05180244 -0.05976577  0.1503919  -0.08425058 -0.0049115
  0.38079414 -0.08910254  0.13796123 -0.02534234 -0.

[35.57488622 36.16999576 36.0637221  36.10999438 35.52703462 36.13660263]
timestamp: 1103, action taken: [ 0.11547918  0.099434    0.02351114  0.11381286  0.4350844   0.14462675
 -0.19127609  0.01704294  0.12585356 -0.03298168 -0.1319576   0.35873631
  0.01923934 -0.22847861  0.04263448  0.17672653 -0.0794393 ] reward received: 0.4680775215328412
[35.84802111 36.21785486 36.21926437 36.26372505 35.62378486 36.16226847]
timestamp: 1104, action taken: [-0.22577965 -0.0371825   0.07350384  0.23001866 -0.01630552 -0.12293439
 -0.08446863  0.11816287  0.02927766 -0.21573193 -0.06680465 -0.21878947
 -0.17316303 -0.19668374  0.17636808  0.08964997  0.05004446] reward received: 0.46927156934087383
Updating networks...
[35.73621714 36.14480589 36.19906297 36.17243619 35.87808032 36.20458483]
timestamp: 1105, action taken: [-0.19239954 -0.01052803  0.02176736  0.02990649 -0.24601396  0.2623951
 -0.24070891  0.16378746  0.27504339  0.14555126  0.13377576  0.15948296
  0.13869952 -0.01571588  0.12

[35.84791128 36.40460842 36.48955433 36.42711137 36.0503454  36.34717421]
timestamp: 1127, action taken: [-0.11908085 -0.07041623 -0.01699973  0.07267437 -0.26670082  0.11477807
  0.11142887  0.07220742 -0.03939875  0.30113949 -0.12526319 -0.3395102
  0.16998419  0.06797358 -0.04036467 -0.13987938  0.13691274] reward received: 0.48059229658749114
[35.95321947 36.38792368 36.50332498 36.52548815 36.07825441 36.34863717]
timestamp: 1128, action taken: [-0.02698901 -0.31452412  0.00687916 -0.04518502 -0.38168404  0.09514896
 -0.10216176 -0.05780261 -0.0043455  -0.17048017  0.07524024 -0.06364574
  0.21252169 -0.0310143  -0.02495014  0.21488593  0.11052815] reward received: 0.479554209195575
[35.9352793  36.38794021 36.26310562 36.31318429 35.9864547  36.31405773]
timestamp: 1129, action taken: [-0.01433188 -0.17728456 -0.11519754 -0.07480329  0.15798     0.09644905
 -0.19454141 -0.11778278  0.1222467   0.02051908  0.07002011  0.07244779
  0.21488453 -0.04277851 -0.03727434  0.0197224   0.

[35.79186171 36.29043292 36.18412119 36.25179833 35.96107698 36.32035902]
timestamp: 1151, action taken: [ 0.14144575 -0.04998143 -0.06690419  0.10557525 -0.11060919  0.17359926
 -0.0600078  -0.2426008   0.14235593  0.10676828  0.13975489  0.01589737
 -0.18414727  0.15348564  0.11848377 -0.19960435 -0.09628329] reward received: 0.47682169236310495
[35.77694108 36.31726599 36.27526943 36.24547978 35.76994341 36.2917447 ]
timestamp: 1152, action taken: [ 0.20030897  0.12536171  0.05236686 -0.00895522 -0.24532941  0.13607462
 -0.08637425 -0.13420864 -0.00762585  0.22211005 -0.0861291   0.09784579
 -0.00389794 -0.14534977 -0.01971132  0.10536217  0.05179234] reward received: 0.47540866945620536
Updating networks...
[35.80500284 36.37919861 36.291386   36.35096833 35.83040408 36.36547415]
timestamp: 1153, action taken: [-0.04265815 -0.03447314  0.101095    0.01105275 -0.26769024 -0.12934531
  0.10115162 -0.0110333   0.09914357 -0.16190184  0.04342586  0.06425382
 -0.16147916  0.07002436 -0.

NameError: name 'exit' is not defined