In [34]:
import sys
import time
import numpy as np

import torch
import torch.nn as nn


class RewardTracker:
    def __init__(self, writer, stop_reward):
        self.writer = writer
        self.stop_reward = stop_reward

    def __enter__(self):
        self.ts = time.time()
        self.ts_frame = 0
        self.total_rewards = []
        return self

    def __exit__(self, *args):
        self.writer.close()

    def reward(self, reward, frame, epsilon=None):
        self.total_rewards.append(reward)
        speed = (frame - self.ts_frame) / (time.time() - self.ts)
        self.ts_frame = frame
        self.ts = time.time()
        mean_reward = np.mean(self.total_rewards[-100:])
        epsilon_str = "" if epsilon is None else ", eps %.2f" % epsilon
        print("%d: done %d games, mean reward %.3f, speed %.2f f/s%s" % (
            frame, len(self.total_rewards), mean_reward, speed, epsilon_str
        ))
        sys.stdout.flush()
        if epsilon is not None:
            self.writer.add_scalar("epsilon", epsilon, frame)
        self.writer.add_scalar("speed", speed, frame)
        self.writer.add_scalar("reward_100", mean_reward, frame)
        self.writer.add_scalar("reward", reward, frame)
        if mean_reward > self.stop_reward:
            print("Solved in %d frames!" % frame)
            return True
        return False


class AtariPGN(nn.Module):
    def __init__(self, input_shape, n_actions):
        super(AtariPGN, self).__init__()

        self.conv = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )

        conv_out_size = self._get_conv_out(input_shape)
        self.fc = nn.Sequential(
            nn.Linear(conv_out_size, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions)
        )

    def _get_conv_out(self, shape):
        o = self.conv(torch.zeros(1, *shape))
        return int(np.prod(o.size()))

    def forward(self, x):
        fx = x.float() / 256
        conv_out = self.conv(fx).view(fx.size()[0], -1)
        return self.fc(conv_out)

In [35]:
#!/usr/bin/env python3
import gym
import ptan
import time
import random
import numpy as np
import argparse
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.nn.utils as nn_utils
import torch.nn.functional as F
import torch.optim as optim

#from lib import common

GAMMA = 0.99
LEARNING_RATE = 5e-4
ENTROPY_BETA = 0.01
NUM_ENVS = 16

REWARD_STEPS = 4
CLIP_GRAD = 0.1
IMG_SHAPE = (4, 84, 84)


class AtariA2C(nn.Module):
    def __init__(self, input_shape, n_actions):
        super(AtariA2C, self).__init__()

        self.conv = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )

        conv_out_size = self._get_conv_out(input_shape)
        self.policy = nn.Sequential(
            nn.Linear(conv_out_size, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions)
        )

        self.value = nn.Sequential(
            nn.Linear(conv_out_size, 512),
            nn.ReLU(),
            nn.Linear(512, 1)
        )

    def _get_conv_out(self, shape):
        o = self.conv(torch.zeros(1, *shape))
        return int(np.prod(o.size()))

    def forward(self, x):
        fx = x.float() / 256
        conv_out = self.conv(fx).view(fx.size()[0], -1)
        return self.policy(conv_out), self.value(conv_out)


def discount_with_dones(rewards, dones, gamma):
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma*r*(1.-done)
        discounted.append(r)
    return discounted[::-1]


def iterate_batches(envs, net, device="cpu"):
    n_actions = envs[0].action_space.n
    act_selector = ptan.actions.ProbabilityActionSelector()
    obs = [e.reset() for e in envs]
    batch_dones = [[False] for _ in range(NUM_ENVS)]
    total_reward = [0.0] * NUM_ENVS
    total_steps = [0] * NUM_ENVS
    mb_obs = np.zeros((NUM_ENVS, REWARD_STEPS) + IMG_SHAPE, dtype=np.uint8)
    mb_rewards = np.zeros((NUM_ENVS, REWARD_STEPS), dtype=np.float32)
    mb_values = np.zeros((NUM_ENVS, REWARD_STEPS), dtype=np.float32)
    mb_actions = np.zeros((NUM_ENVS, REWARD_STEPS), dtype=np.int32)
    mb_probs = np.zeros((NUM_ENVS, REWARD_STEPS, n_actions), dtype=np.float32)

    while True:
        print("batching here")
        batch_dones = [[dones[-1]] for dones in batch_dones]
        done_rewards = []
        done_steps = []
        for n in range(REWARD_STEPS):
            obs_v = ptan.agent.default_states_preprocessor(obs).to(device)
            mb_obs[:, n] = obs_v.data.cpu().numpy()
            logits_v, values_v = net(obs_v)
            probs_v = F.softmax(logits_v, dim=1)
            probs = probs_v.data.cpu().numpy()
            actions = act_selector(probs)
            mb_probs[:, n] = probs
            mb_actions[:, n] = actions
            mb_values[:, n] = values_v.squeeze().data.cpu().numpy()
            for e_idx, e in enumerate(envs):
                o, r, done, _ = e.step(actions[e_idx])
                total_reward[e_idx] += r
                total_steps[e_idx] += 1
                if done:
                    o = e.reset()
                    done_rewards.append(total_reward[e_idx])
                    done_steps.append(total_steps[e_idx])
                    total_reward[e_idx] = 0.0
                    total_steps[e_idx] = 0
                obs[e_idx] = o
                mb_rewards[e_idx, n] = r
                batch_dones[e_idx].append(done)
        # obtain values for the last observation
        obs_v = ptan.agent.default_states_preprocessor(obs).to(device)
        _, values_v = net(obs_v)
        values_last = values_v.squeeze().data.cpu().numpy()

        for e_idx, (rewards, dones, value) in enumerate(zip(mb_rewards, batch_dones, values_last)):
            rewards = rewards.tolist()
            if not dones[-1]:
                rewards = discount_with_dones(rewards + [value], dones[1:] + [False], GAMMA)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones[1:], GAMMA)
            mb_rewards[e_idx] = rewards

        out_mb_obs = mb_obs.reshape((-1,) + IMG_SHAPE)
        out_mb_rewards = mb_rewards.flatten()
        out_mb_actions = mb_actions.flatten()
        out_mb_values = mb_values.flatten()
        out_mb_probs = mb_probs.flatten()
        yield out_mb_obs, out_mb_rewards, out_mb_actions, out_mb_values, out_mb_probs, \
              np.array(done_rewards), np.array(done_steps)


def train_a2c(net, mb_obs, mb_rewards, mb_actions, mb_values, optimizer, tb_tracker, step_idx, device="cpu"):
    print("training here")
    optimizer.zero_grad()
    mb_adv = mb_rewards - mb_values

    adv_v = torch.FloatTensor(mb_adv).to(device)
    obs_v = torch.FloatTensor(mb_obs).to(device)
    rewards_v = torch.FloatTensor(mb_rewards).to(device)
    actions_t = torch.LongTensor(mb_actions).to(device)
    logits_v, values_v = net(obs_v)

    loss_value_v = F.mse_loss(values_v.squeeze(-1), rewards_v)
    print('loss_value_v', loss_value_v.dtype, loss_value_v.shape, loss_value_v)
    
    log_prob_v = F.log_softmax(logits_v, dim=1)
    print('log_prob_v', log_prob_v.dtype, log_prob_v.shape, log_prob_v)
    log_prob_actions_v = adv_v * log_prob_v[range(len(mb_actions)), actions_t]
    print('log_prob_actions_v', log_prob_actions_v.dtype, log_prob_actions_v.shape, log_prob_actions_v)
    loss_policy_v = -log_prob_actions_v.mean()
    print('loss_policy_v', loss_policy_v.dtype, loss_policy_v.shape, loss_policy_v)

    prob_v = F.softmax(logits_v, dim=1)
    print('prob_v', prob_v.dtype, prob_v.shape, prob_v)
    entropy_loss_v = (prob_v * log_prob_v).sum(dim=1).mean()
    print('entropy_loss_v', entropy_loss_v.dtype, entropy_loss_v.shape, entropy_loss_v)
    loss_v = ENTROPY_BETA * entropy_loss_v + loss_value_v + loss_policy_v
    print('loss_v', loss_v.dtype, loss_v.shape, loss_v)
    loss_v.backward()
    nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
    optimizer.step()

    return obs_v


def set_seed(seed, envs=None, cuda=False):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed(seed)

    if envs:
        for idx, env in enumerate(envs):
            env.seed(seed + idx)


if __name__ == "__main__":

    device = torch.device("cpu")

    make_env = lambda: ptan.common.wrappers.wrap_dqn(gym.make("BreakoutNoFrameskip-v4"))
    envs = [make_env() for _ in range(NUM_ENVS)]
    writer = SummaryWriter(comment="-pong-a2c-r2_")
    set_seed(20, envs)

    net = AtariA2C(envs[0].observation_space.shape, envs[0].action_space.n).to(device)
    #print(net)

    optimizer = optim.RMSprop(net.parameters(), lr=LEARNING_RATE, eps=1e-5)

    step_idx = 0
    total_steps = 0
    best_reward = None
    ts_start = time.time()

    with RewardTracker(writer, stop_reward=18) as tracker:
        with ptan.common.utils.TBMeanTracker(writer, batch_size=10) as tb_tracker:
            for mb_obs, mb_rewards, mb_actions, mb_values, _, done_rewards, done_steps in iterate_batches(envs, net, device=device):
                if len(done_rewards) > 0:
                    total_steps += sum(done_steps)
                    speed = total_steps / (time.time() - ts_start)
                    if best_reward is None:
                        best_reward = done_rewards.max()
                    elif best_reward < done_rewards.max():
                        best_reward = done_rewards.max()
                    tb_tracker.track("total_reward_max", best_reward, step_idx)
                    tb_tracker.track("total_reward", done_rewards, step_idx)
                    tb_tracker.track("total_steps", done_steps, step_idx)
                    print("%d: done %d episodes, mean_reward=%.2f, best_reward=%.2f, speed=%.2f" % (
                        step_idx, len(done_rewards), done_rewards.mean(), best_reward, speed))

                train_a2c(net, mb_obs, mb_rewards, mb_actions, mb_values,
                          optimizer, tb_tracker, step_idx, device=device)
                step_idx += 1

batching here
training here
loss_value_v torch.float32 torch.Size([]) tensor(1.2038e-06, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3535, -1.3975, -1.4256, -1.3701],
        [-1.3534, -1.3976, -1.4256, -1.3702],
        [-1.3535, -1.3972, -1.4257, -1.3702],
        [-1.3535, -1.3975, -1.4255, -1.3702],
        [-1.3537, -1.3974, -1.4256, -1.3699],
        [-1.3535, -1.3975, -1.4256, -1.3701],
        [-1.3536, -1.3972, -1.4258, -1.3701],
        [-1.3535, -1.3972, -1.4258, -1.3701],
        [-1.3535, -1.3975, -1.4256, -1.3701],
        [-1.3534, -1.3976, -1.4256, -1.3702],
        [-1.3535, -1.3972, -1.4257, -1.3702],
        [-1.3536, -1.3975, -1.4255, -1.3701],
        [-1.3537, -1.3974, -1.4256, -1.3699],
        [-1.3537, -1.3975, -1.4257, -1.3699],
        [-1.3536, -1.3974, -1.4256, -1.3701],
        [-1.3537, -1.3974, -1.4258, -1.3699],
        [-1.3537, -1.3974, -1.4256, -1.3699],
        [-1.3535, -1.3974, -1.4257, -1.3700],
        [-1

batching here
training here
loss_value_v torch.float32 torch.Size([]) tensor(3.1505e-07, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3606, -1.3925, -1.4202, -1.3729],
        [-1.3605, -1.3924, -1.4202, -1.3731],
        [-1.3606, -1.3924, -1.4201, -1.3730],
        [-1.3606, -1.3924, -1.4201, -1.3731],
        [-1.3607, -1.3923, -1.4201, -1.3731],
        [-1.3607, -1.3925, -1.4200, -1.3730],
        [-1.3605, -1.3927, -1.4201, -1.3728],
        [-1.3606, -1.3924, -1.4202, -1.3731],
        [-1.3607, -1.3924, -1.4201, -1.3729],
        [-1.3607, -1.3924, -1.4201, -1.3730],
        [-1.3607, -1.3924, -1.4201, -1.3730],
        [-1.3604, -1.3926, -1.4201, -1.3731],
        [-1.3604, -1.3927, -1.4203, -1.3728],
        [-1.3607, -1.3923, -1.4205, -1.3728],
        [-1.3607, -1.3922, -1.4204, -1.3730],
        [-1.3603, -1.3925, -1.4204, -1.3730],
        [-1.3606, -1.3925, -1.4201, -1.3729],
        [-1.3606, -1.3925, -1.4200, -1.3731],
        [-1

training here
loss_value_v torch.float32 torch.Size([]) tensor(1.4474e-08, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3661, -1.3930, -1.4146, -1.3721],
        [-1.3661, -1.3931, -1.4146, -1.3722],
        [-1.3661, -1.3930, -1.4146, -1.3721],
        [-1.3661, -1.3930, -1.4146, -1.3722],
        [-1.3660, -1.3932, -1.4146, -1.3721],
        [-1.3661, -1.3932, -1.4144, -1.3722],
        [-1.3661, -1.3930, -1.4146, -1.3722],
        [-1.3661, -1.3930, -1.4145, -1.3723],
        [-1.3660, -1.3930, -1.4145, -1.3723],
        [-1.3662, -1.3931, -1.4146, -1.3721],
        [-1.3661, -1.3930, -1.4147, -1.3722],
        [-1.3661, -1.3930, -1.4146, -1.3722],
        [-1.3661, -1.3933, -1.4145, -1.3720],
        [-1.3661, -1.3931, -1.4145, -1.3722],
        [-1.3661, -1.3930, -1.4145, -1.3723],
        [-1.3661, -1.3931, -1.4145, -1.3723],
        [-1.3660, -1.3932, -1.4145, -1.3721],
        [-1.3661, -1.3933, -1.4145, -1.3721],
        [-1.3661, -1.3930

training here
loss_value_v torch.float32 torch.Size([]) tensor(1.3824e-06, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3666, -1.3901, -1.4151, -1.3741],
        [-1.3667, -1.3898, -1.4152, -1.3742],
        [-1.3666, -1.3900, -1.4151, -1.3741],
        [-1.3669, -1.3896, -1.4153, -1.3741],
        [-1.3668, -1.3901, -1.4151, -1.3738],
        [-1.3669, -1.3899, -1.4150, -1.3740],
        [-1.3669, -1.3898, -1.4153, -1.3739],
        [-1.3670, -1.3898, -1.4151, -1.3739],
        [-1.3668, -1.3898, -1.4152, -1.3740],
        [-1.3668, -1.3895, -1.4156, -1.3740],
        [-1.3666, -1.3899, -1.4152, -1.3741],
        [-1.3668, -1.3899, -1.4153, -1.3739],
        [-1.3665, -1.3900, -1.4152, -1.3741],
        [-1.3667, -1.3897, -1.4153, -1.3742],
        [-1.3666, -1.3900, -1.4152, -1.3741],
        [-1.3669, -1.3896, -1.4153, -1.3741],
        [-1.3665, -1.3901, -1.4151, -1.3741],
        [-1.3667, -1.3898, -1.4152, -1.3742],
        [-1.3667, -1.3899

training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0007, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.2955, -1.2911, -1.3609, -1.6356],
        [-1.2954, -1.2910, -1.3612, -1.6355],
        [-1.2957, -1.2911, -1.3610, -1.6352],
        [-1.2954, -1.2910, -1.3611, -1.6355],
        [-1.2955, -1.2912, -1.3611, -1.6351],
        [-1.2955, -1.2912, -1.3610, -1.6353],
        [-1.2954, -1.2912, -1.3612, -1.6352],
        [-1.2956, -1.2910, -1.3608, -1.6357],
        [-1.2955, -1.2911, -1.3610, -1.6354],
        [-1.2956, -1.2913, -1.3607, -1.6354],
        [-1.2955, -1.2909, -1.3610, -1.6356],
        [-1.2956, -1.2911, -1.3610, -1.6354],
        [-1.2954, -1.2914, -1.3608, -1.6355],
        [-1.2953, -1.2912, -1.3610, -1.6356],
        [-1.2954, -1.2914, -1.3609, -1.6353],
        [-1.2954, -1.2910, -1.3612, -1.6354],
        [-1.2957, -1.2912, -1.3607, -1.6355],
        [-1.2956, -1.2909, -1.3610, -1.6355],
        [-1.2957, -1.2912, -1

training here
loss_value_v torch.float32 torch.Size([]) tensor(1.6317e-07, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3586, -1.3695, -1.3878, -1.4308],
        [-1.3585, -1.3695, -1.3879, -1.4308],
        [-1.3586, -1.3695, -1.3879, -1.4308],
        [-1.3585, -1.3695, -1.3879, -1.4308],
        [-1.3585, -1.3695, -1.3879, -1.4309],
        [-1.3585, -1.3694, -1.3879, -1.4308],
        [-1.3585, -1.3695, -1.3879, -1.4308],
        [-1.3586, -1.3694, -1.3878, -1.4308],
        [-1.3585, -1.3695, -1.3878, -1.4308],
        [-1.3585, -1.3694, -1.3879, -1.4309],
        [-1.3586, -1.3695, -1.3878, -1.4308],
        [-1.3586, -1.3694, -1.3879, -1.4308],
        [-1.3586, -1.3695, -1.3878, -1.4308],
        [-1.3585, -1.3695, -1.3879, -1.4308],
        [-1.3586, -1.3695, -1.3879, -1.4308],
        [-1.3585, -1.3695, -1.3878, -1.4309],
        [-1.3586, -1.3695, -1.3878, -1.4308],
        [-1.3585, -1.3695, -1.3879, -1.4308],
        [-1.3585, -1.3695

12: done 2 episodes, mean_reward=1.00, best_reward=1.00, speed=174.61
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0001, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3568, -1.3778, -1.3902, -1.4214],
        [-1.3569, -1.3778, -1.3901, -1.4215],
        [-1.3570, -1.3781, -1.3898, -1.4214],
        [-1.3568, -1.3778, -1.3900, -1.4216],
        [-1.3569, -1.3779, -1.3900, -1.4215],
        [-1.3567, -1.3780, -1.3900, -1.4215],
        [-1.3568, -1.3779, -1.3902, -1.4214],
        [-1.3568, -1.3778, -1.3900, -1.4216],
        [-1.3570, -1.3778, -1.3900, -1.4214],
        [-1.3569, -1.3778, -1.3900, -1.4216],
        [-1.3569, -1.3778, -1.3902, -1.4214],
        [-1.3568, -1.3780, -1.3901, -1.4214],
        [-1.3568, -1.3778, -1.3902, -1.4214],
        [-1.3569, -1.3778, -1.3901, -1.4214],
        [-1.3570, -1.3781, -1.3898, -1.4214],
        [-1.3568, -1.3778, -1.3900, -1.4216],
        [-1.3567, -1.3778, -1.3901, -1.4216],
     

training here
loss_value_v torch.float32 torch.Size([]) tensor(7.0718e-06, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3800, -1.3056, -1.4198, -1.4454],
        [-1.3800, -1.3055, -1.4198, -1.4455],
        [-1.3800, -1.3057, -1.4199, -1.4453],
        [-1.3801, -1.3056, -1.4197, -1.4454],
        [-1.3800, -1.3054, -1.4200, -1.4454],
        [-1.3800, -1.3055, -1.4199, -1.4453],
        [-1.3799, -1.3055, -1.4200, -1.4454],
        [-1.3800, -1.3054, -1.4200, -1.4454],
        [-1.3799, -1.3056, -1.4199, -1.4454],
        [-1.3799, -1.3057, -1.4198, -1.4455],
        [-1.3800, -1.3056, -1.4198, -1.4454],
        [-1.3801, -1.3054, -1.4198, -1.4455],
        [-1.3800, -1.3056, -1.4198, -1.4454],
        [-1.3800, -1.3055, -1.4198, -1.4455],
        [-1.3800, -1.3057, -1.4198, -1.4453],
        [-1.3801, -1.3056, -1.4197, -1.4455],
        [-1.3800, -1.3055, -1.4199, -1.4454],
        [-1.3799, -1.3056, -1.4199, -1.4454],
        [-1.3800, -1.3055

training here
loss_value_v torch.float32 torch.Size([]) tensor(2.6492e-05, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.4134, -1.2116, -1.4599, -1.4841],
        [-1.4134, -1.2114, -1.4601, -1.4841],
        [-1.4134, -1.2112, -1.4601, -1.4843],
        [-1.4135, -1.2114, -1.4601, -1.4841],
        [-1.4134, -1.2115, -1.4601, -1.4841],
        [-1.4134, -1.2115, -1.4601, -1.4841],
        [-1.4134, -1.2114, -1.4602, -1.4841],
        [-1.4134, -1.2113, -1.4602, -1.4841],
        [-1.4134, -1.2114, -1.4601, -1.4842],
        [-1.4134, -1.2114, -1.4601, -1.4842],
        [-1.4134, -1.2113, -1.4602, -1.4842],
        [-1.4135, -1.2113, -1.4601, -1.4843],
        [-1.4134, -1.2113, -1.4601, -1.4843],
        [-1.4134, -1.2112, -1.4601, -1.4843],
        [-1.4134, -1.2112, -1.4601, -1.4843],
        [-1.4134, -1.2112, -1.4601, -1.4843],
        [-1.4134, -1.2114, -1.4601, -1.4842],
        [-1.4134, -1.2114, -1.4601, -1.4842],
        [-1.4134, -1.2113

18: done 1 episodes, mean_reward=1.00, best_reward=1.00, speed=169.28
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.1211, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.4231, -1.2724, -1.3784, -1.4833],
        [-1.4230, -1.2725, -1.3785, -1.4832],
        [-1.4232, -1.2723, -1.3784, -1.4833],
        [-1.4232, -1.2724, -1.3784, -1.4832],
        [-1.4231, -1.2723, -1.3784, -1.4834],
        [-1.4231, -1.2723, -1.3784, -1.4834],
        [-1.4231, -1.2723, -1.3784, -1.4833],
        [-1.4231, -1.2723, -1.3784, -1.4834],
        [-1.4231, -1.2725, -1.3784, -1.4832],
        [-1.4232, -1.2726, -1.3784, -1.4829],
        [-1.4232, -1.2723, -1.3784, -1.4832],
        [-1.4232, -1.2724, -1.3783, -1.4833],
        [-1.4231, -1.2725, -1.3784, -1.4832],
        [-1.4231, -1.2724, -1.3785, -1.4832],
        [-1.4231, -1.2724, -1.3784, -1.4832],
        [-1.4231, -1.2723, -1.3783, -1.4834],
        [-1.4230, -1.2726, -1.3784, -1.4831],
     

training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0600, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.4408, -1.2897, -1.3477, -1.4780],
        [-1.4409, -1.2897, -1.3477, -1.4781],
        [-1.4408, -1.2897, -1.3477, -1.4781],
        [-1.4408, -1.2897, -1.3477, -1.4780],
        [-1.4409, -1.2895, -1.3478, -1.4781],
        [-1.4410, -1.2896, -1.3477, -1.4780],
        [-1.4409, -1.2896, -1.3478, -1.4780],
        [-1.4409, -1.2897, -1.3478, -1.4779],
        [-1.4408, -1.2896, -1.3479, -1.4780],
        [-1.4408, -1.2897, -1.3478, -1.4779],
        [-1.4409, -1.2897, -1.3477, -1.4780],
        [-1.4409, -1.2896, -1.3477, -1.4782],
        [-1.4408, -1.2897, -1.3477, -1.4780],
        [-1.4408, -1.2897, -1.3477, -1.4780],
        [-1.4409, -1.2896, -1.3477, -1.4781],
        [-1.4408, -1.2897, -1.3477, -1.4781],
        [-1.4408, -1.2896, -1.3478, -1.4781],
        [-1.4408, -1.2897, -1.3478, -1.4780],
        [-1.4408, -1.2897, -1

22: done 7 episodes, mean_reward=0.00, best_reward=1.00, speed=168.45
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.1067, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3898, -1.3119, -1.3852, -1.4641],
        [-1.3897, -1.3121, -1.3852, -1.4639],
        [-1.3897, -1.3121, -1.3852, -1.4640],
        [-1.3898, -1.3121, -1.3852, -1.4639],
        [-1.3897, -1.3121, -1.3852, -1.4639],
        [-1.3897, -1.3122, -1.3852, -1.4639],
        [-1.3897, -1.3122, -1.3852, -1.4639],
        [-1.3896, -1.3122, -1.3853, -1.4639],
        [-1.3896, -1.3122, -1.3853, -1.4639],
        [-1.3896, -1.3122, -1.3852, -1.4639],
        [-1.3896, -1.3122, -1.3852, -1.4639],
        [-1.3896, -1.3122, -1.3852, -1.4639],
        [-1.3896, -1.3121, -1.3852, -1.4639],
        [-1.3896, -1.3121, -1.3852, -1.4639],
        [-1.3896, -1.3121, -1.3852, -1.4639],
        [-1.3896, -1.3121, -1.3852, -1.4639],
        [-1.3896, -1.3122, -1.3852, -1.4639],
     

24: done 4 episodes, mean_reward=1.00, best_reward=2.00, speed=189.16
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0617, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.4216, -1.2672, -1.3891, -1.4795],
        [-1.4216, -1.2673, -1.3891, -1.4794],
        [-1.4216, -1.2673, -1.3891, -1.4794],
        [-1.4216, -1.2674, -1.3891, -1.4794],
        [-1.4215, -1.2674, -1.3892, -1.4793],
        [-1.4216, -1.2675, -1.3891, -1.4792],
        [-1.4216, -1.2674, -1.3892, -1.4792],
        [-1.4214, -1.2676, -1.3892, -1.4791],
        [-1.4215, -1.2677, -1.3892, -1.4790],
        [-1.4215, -1.2674, -1.3892, -1.4792],
        [-1.4214, -1.2676, -1.3891, -1.4793],
        [-1.4214, -1.2676, -1.3891, -1.4792],
        [-1.4215, -1.2677, -1.3891, -1.4791],
        [-1.4215, -1.2674, -1.3892, -1.4792],
        [-1.4214, -1.2676, -1.3891, -1.4792],
        [-1.4215, -1.2676, -1.3890, -1.4791],
        [-1.4214, -1.2677, -1.3890, -1.4792],
     

training here
loss_value_v torch.float32 torch.Size([]) tensor(2.8302e-05, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.4369, -1.2452, -1.3503, -1.5357],
        [-1.4368, -1.2453, -1.3504, -1.5356],
        [-1.4369, -1.2452, -1.3504, -1.5357],
        [-1.4369, -1.2452, -1.3504, -1.5357],
        [-1.4370, -1.2453, -1.3503, -1.5356],
        [-1.4368, -1.2453, -1.3503, -1.5357],
        [-1.4368, -1.2454, -1.3503, -1.5356],
        [-1.4369, -1.2453, -1.3503, -1.5357],
        [-1.4367, -1.2454, -1.3504, -1.5355],
        [-1.4368, -1.2453, -1.3503, -1.5357],
        [-1.4368, -1.2453, -1.3504, -1.5357],
        [-1.4367, -1.2454, -1.3504, -1.5356],
        [-1.4368, -1.2454, -1.3504, -1.5355],
        [-1.4368, -1.2453, -1.3503, -1.5357],
        [-1.4368, -1.2453, -1.3503, -1.5358],
        [-1.4368, -1.2454, -1.3503, -1.5357],
        [-1.4367, -1.2454, -1.3503, -1.5356],
        [-1.4368, -1.2453, -1.3503, -1.5357],
        [-1.4368, -1.2454

28: done 6 episodes, mean_reward=0.17, best_reward=2.00, speed=187.43
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0138, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3802, -1.2773, -1.3672, -1.5378],
        [-1.3802, -1.2773, -1.3672, -1.5378],
        [-1.3802, -1.2773, -1.3672, -1.5378],
        [-1.3802, -1.2773, -1.3672, -1.5377],
        [-1.3802, -1.2774, -1.3672, -1.5377],
        [-1.3802, -1.2774, -1.3673, -1.5376],
        [-1.3802, -1.2774, -1.3672, -1.5376],
        [-1.3802, -1.2774, -1.3672, -1.5377],
        [-1.3802, -1.2774, -1.3671, -1.5377],
        [-1.3802, -1.2775, -1.3671, -1.5376],
        [-1.3802, -1.2775, -1.3671, -1.5376],
        [-1.3801, -1.2774, -1.3672, -1.5377],
        [-1.3801, -1.2774, -1.3671, -1.5377],
        [-1.3801, -1.2774, -1.3671, -1.5377],
        [-1.3801, -1.2774, -1.3671, -1.5377],
        [-1.3802, -1.2775, -1.3671, -1.5376],
        [-1.3801, -1.2774, -1.3672, -1.5377],
     

30: done 2 episodes, mean_reward=1.00, best_reward=2.00, speed=199.20
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0310, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3833, -1.3007, -1.3709, -1.5005],
        [-1.3833, -1.3009, -1.3708, -1.5003],
        [-1.3833, -1.3009, -1.3708, -1.5003],
        [-1.3833, -1.3009, -1.3708, -1.5003],
        [-1.3833, -1.3010, -1.3708, -1.5003],
        [-1.3833, -1.3010, -1.3708, -1.5002],
        [-1.3833, -1.3011, -1.3708, -1.5001],
        [-1.3832, -1.3011, -1.3708, -1.5002],
        [-1.3832, -1.3010, -1.3708, -1.5003],
        [-1.3832, -1.3011, -1.3708, -1.5002],
        [-1.3832, -1.3011, -1.3708, -1.5002],
        [-1.3833, -1.3010, -1.3708, -1.5002],
        [-1.3833, -1.3011, -1.3707, -1.5001],
        [-1.3833, -1.3010, -1.3707, -1.5003],
        [-1.3833, -1.3010, -1.3707, -1.5003],
        [-1.3833, -1.3012, -1.3708, -1.5001],
        [-1.3832, -1.3012, -1.3708, -1.5001],
     

training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0305, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.4415, -1.2697, -1.3175, -1.5385],
        [-1.4414, -1.2697, -1.3175, -1.5386],
        [-1.4414, -1.2697, -1.3175, -1.5385],
        [-1.4414, -1.2698, -1.3175, -1.5385],
        [-1.4413, -1.2699, -1.3175, -1.5384],
        [-1.4413, -1.2699, -1.3175, -1.5384],
        [-1.4413, -1.2699, -1.3175, -1.5385],
        [-1.4413, -1.2699, -1.3175, -1.5385],
        [-1.4414, -1.2698, -1.3175, -1.5385],
        [-1.4414, -1.2699, -1.3175, -1.5384],
        [-1.4413, -1.2699, -1.3175, -1.5384],
        [-1.4414, -1.2699, -1.3175, -1.5384],
        [-1.4414, -1.2698, -1.3174, -1.5385],
        [-1.4414, -1.2698, -1.3176, -1.5384],
        [-1.4414, -1.2699, -1.3175, -1.5383],
        [-1.4413, -1.2699, -1.3175, -1.5384],
        [-1.4413, -1.2699, -1.3175, -1.5385],
        [-1.4413, -1.2699, -1.3176, -1.5384],
        [-1.4414, -1.2698, -1

34: done 6 episodes, mean_reward=0.17, best_reward=2.00, speed=197.96
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0449, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.4198, -1.2254, -1.3454, -1.5889],
        [-1.4198, -1.2254, -1.3454, -1.5889],
        [-1.4198, -1.2254, -1.3454, -1.5888],
        [-1.4198, -1.2253, -1.3454, -1.5889],
        [-1.4198, -1.2255, -1.3453, -1.5889],
        [-1.4198, -1.2255, -1.3453, -1.5888],
        [-1.4198, -1.2255, -1.3453, -1.5888],
        [-1.4198, -1.2255, -1.3453, -1.5888],
        [-1.4198, -1.2255, -1.3453, -1.5889],
        [-1.4199, -1.2254, -1.3453, -1.5888],
        [-1.4198, -1.2254, -1.3453, -1.5888],
        [-1.4198, -1.2254, -1.3454, -1.5888],
        [-1.4198, -1.2255, -1.3453, -1.5889],
        [-1.4198, -1.2255, -1.3453, -1.5889],
        [-1.4198, -1.2255, -1.3452, -1.5889],
        [-1.4198, -1.2256, -1.3452, -1.5889],
        [-1.4197, -1.2255, -1.3454, -1.5888],
     

training here
loss_value_v torch.float32 torch.Size([]) tensor(5.6596e-06, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.4340, -1.2500, -1.3415, -1.5431],
        [-1.4340, -1.2501, -1.3415, -1.5431],
        [-1.4341, -1.2500, -1.3415, -1.5431],
        [-1.4341, -1.2499, -1.3415, -1.5432],
        [-1.4340, -1.2502, -1.3415, -1.5430],
        [-1.4339, -1.2503, -1.3416, -1.5430],
        [-1.4339, -1.2502, -1.3415, -1.5430],
        [-1.4339, -1.2502, -1.3415, -1.5430],
        [-1.4340, -1.2500, -1.3415, -1.5432],
        [-1.4341, -1.2500, -1.3415, -1.5431],
        [-1.4341, -1.2500, -1.3414, -1.5432],
        [-1.4341, -1.2500, -1.3415, -1.5432],
        [-1.4339, -1.2502, -1.3415, -1.5431],
        [-1.4339, -1.2502, -1.3415, -1.5430],
        [-1.4339, -1.2503, -1.3415, -1.5430],
        [-1.4339, -1.2503, -1.3415, -1.5430],
        [-1.4339, -1.2502, -1.3416, -1.5429],
        [-1.4339, -1.2503, -1.3416, -1.5429],
        [-1.4339, -1.2503

training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0457, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.4353, -1.2537, -1.3409, -1.5376],
        [-1.4353, -1.2537, -1.3409, -1.5375],
        [-1.4353, -1.2537, -1.3409, -1.5376],
        [-1.4353, -1.2537, -1.3409, -1.5376],
        [-1.4351, -1.2540, -1.3410, -1.5373],
        [-1.4352, -1.2539, -1.3409, -1.5374],
        [-1.4352, -1.2539, -1.3410, -1.5373],
        [-1.4352, -1.2538, -1.3409, -1.5375],
        [-1.4352, -1.2537, -1.3410, -1.5375],
        [-1.4353, -1.2537, -1.3409, -1.5377],
        [-1.4353, -1.2537, -1.3410, -1.5375],
        [-1.4353, -1.2537, -1.3410, -1.5375],
        [-1.4352, -1.2540, -1.3409, -1.5374],
        [-1.4351, -1.2540, -1.3409, -1.5374],
        [-1.4351, -1.2539, -1.3410, -1.5374],
        [-1.4352, -1.2539, -1.3410, -1.5374],
        [-1.4353, -1.2538, -1.3409, -1.5374],
        [-1.4352, -1.2539, -1.3409, -1.5374],
        [-1.4352, -1.2539, -1

40: done 7 episodes, mean_reward=0.14, best_reward=2.00, speed=210.33
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0335, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.4676, -1.2014, -1.3731, -1.5351],
        [-1.4674, -1.2017, -1.3731, -1.5350],
        [-1.4674, -1.2017, -1.3731, -1.5350],
        [-1.4675, -1.2016, -1.3731, -1.5350],
        [-1.4675, -1.2017, -1.3730, -1.5350],
        [-1.4675, -1.2016, -1.3730, -1.5350],
        [-1.4674, -1.2017, -1.3731, -1.5349],
        [-1.4675, -1.2016, -1.3730, -1.5350],
        [-1.4677, -1.2013, -1.3730, -1.5352],
        [-1.4674, -1.2018, -1.3731, -1.5348],
        [-1.4675, -1.2017, -1.3731, -1.5349],
        [-1.4674, -1.2017, -1.3731, -1.5349],
        [-1.4675, -1.2016, -1.3731, -1.5350],
        [-1.4675, -1.2017, -1.3731, -1.5349],
        [-1.4675, -1.2016, -1.3731, -1.5350],
        [-1.4674, -1.2017, -1.3731, -1.5349],
        [-1.4675, -1.2017, -1.3730, -1.5349],
     

42: done 2 episodes, mean_reward=1.00, best_reward=2.00, speed=221.73
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0007, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.5654, -1.0454, -1.4061, -1.6381],
        [-1.5654, -1.0454, -1.4061, -1.6380],
        [-1.5654, -1.0454, -1.4061, -1.6382],
        [-1.5654, -1.0453, -1.4061, -1.6382],
        [-1.5656, -1.0452, -1.4062, -1.6383],
        [-1.5655, -1.0452, -1.4061, -1.6383],
        [-1.5655, -1.0453, -1.4061, -1.6382],
        [-1.5655, -1.0452, -1.4061, -1.6383],
        [-1.5654, -1.0454, -1.4061, -1.6381],
        [-1.5655, -1.0452, -1.4062, -1.6382],
        [-1.5657, -1.0450, -1.4062, -1.6384],
        [-1.5654, -1.0453, -1.4062, -1.6382],
        [-1.5655, -1.0453, -1.4061, -1.6382],
        [-1.5655, -1.0452, -1.4062, -1.6382],
        [-1.5657, -1.0450, -1.4062, -1.6383],
        [-1.5655, -1.0452, -1.4062, -1.6382],
        [-1.5655, -1.0452, -1.4061, -1.6383],
     

training here
loss_value_v torch.float32 torch.Size([]) tensor(3.7584e-06, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.5566, -1.0775, -1.3935, -1.6070],
        [-1.5566, -1.0774, -1.3935, -1.6070],
        [-1.5565, -1.0775, -1.3935, -1.6070],
        [-1.5566, -1.0775, -1.3935, -1.6070],
        [-1.5565, -1.0775, -1.3935, -1.6070],
        [-1.5565, -1.0776, -1.3935, -1.6069],
        [-1.5566, -1.0774, -1.3935, -1.6071],
        [-1.5565, -1.0776, -1.3935, -1.6070],
        [-1.5565, -1.0775, -1.3935, -1.6069],
        [-1.5566, -1.0774, -1.3936, -1.6070],
        [-1.5566, -1.0775, -1.3935, -1.6070],
        [-1.5566, -1.0774, -1.3935, -1.6071],
        [-1.5565, -1.0776, -1.3935, -1.6069],
        [-1.5566, -1.0775, -1.3935, -1.6070],
        [-1.5566, -1.0775, -1.3936, -1.6069],
        [-1.5566, -1.0774, -1.3935, -1.6070],
        [-1.5566, -1.0774, -1.3935, -1.6070],
        [-1.5566, -1.0775, -1.3936, -1.6070],
        [-1.5567, -1.0774

46: done 1 episodes, mean_reward=0.00, best_reward=2.00, speed=213.41
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0001, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.4853, -1.1372, -1.4095, -1.5675],
        [-1.4853, -1.1370, -1.4095, -1.5676],
        [-1.4853, -1.1371, -1.4095, -1.5675],
        [-1.4853, -1.1371, -1.4095, -1.5676],
        [-1.4854, -1.1372, -1.4094, -1.5675],
        [-1.4853, -1.1371, -1.4095, -1.5675],
        [-1.4854, -1.1371, -1.4095, -1.5676],
        [-1.4854, -1.1371, -1.4095, -1.5675],
        [-1.4854, -1.1371, -1.4095, -1.5675],
        [-1.4854, -1.1370, -1.4095, -1.5676],
        [-1.4853, -1.1371, -1.4095, -1.5676],
        [-1.4854, -1.1370, -1.4095, -1.5676],
        [-1.4852, -1.1371, -1.4095, -1.5675],
        [-1.4852, -1.1371, -1.4095, -1.5676],
        [-1.4853, -1.1371, -1.4095, -1.5676],
        [-1.4853, -1.1371, -1.4095, -1.5675],
        [-1.4853, -1.1371, -1.4095, -1.5676],
     

48: done 1 episodes, mean_reward=0.00, best_reward=2.00, speed=214.01
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0008, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.4211, -1.2059, -1.3953, -1.5542],
        [-1.4212, -1.2058, -1.3953, -1.5542],
        [-1.4211, -1.2060, -1.3952, -1.5542],
        [-1.4211, -1.2060, -1.3952, -1.5541],
        [-1.4210, -1.2064, -1.3951, -1.5538],
        [-1.4211, -1.2059, -1.3952, -1.5543],
        [-1.4212, -1.2059, -1.3952, -1.5542],
        [-1.4212, -1.2060, -1.3952, -1.5542],
        [-1.4211, -1.2059, -1.3953, -1.5542],
        [-1.4211, -1.2059, -1.3953, -1.5541],
        [-1.4211, -1.2058, -1.3953, -1.5543],
        [-1.4211, -1.2059, -1.3953, -1.5541],
        [-1.4211, -1.2058, -1.3953, -1.5543],
        [-1.4211, -1.2060, -1.3952, -1.5541],
        [-1.4211, -1.2057, -1.3953, -1.5544],
        [-1.4212, -1.2057, -1.3953, -1.5544],
        [-1.4211, -1.2059, -1.3953, -1.5541],
     

training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0306, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.4216, -1.2301, -1.3712, -1.5483],
        [-1.4215, -1.2302, -1.3712, -1.5482],
        [-1.4216, -1.2300, -1.3712, -1.5484],
        [-1.4216, -1.2300, -1.3712, -1.5484],
        [-1.4216, -1.2300, -1.3712, -1.5483],
        [-1.4216, -1.2301, -1.3712, -1.5483],
        [-1.4216, -1.2301, -1.3713, -1.5483],
        [-1.4216, -1.2301, -1.3712, -1.5483],
        [-1.4216, -1.2300, -1.3712, -1.5484],
        [-1.4216, -1.2300, -1.3712, -1.5484],
        [-1.4216, -1.2300, -1.3712, -1.5484],
        [-1.4216, -1.2301, -1.3712, -1.5483],
        [-1.4217, -1.2298, -1.3713, -1.5485],
        [-1.4216, -1.2299, -1.3713, -1.5484],
        [-1.4216, -1.2300, -1.3713, -1.5483],
        [-1.4216, -1.2299, -1.3713, -1.5484],
        [-1.4215, -1.2302, -1.3712, -1.5482],
        [-1.4215, -1.2302, -1.3712, -1.5482],
        [-1.4216, -1.2302, -1

52: done 3 episodes, mean_reward=0.00, best_reward=2.00, speed=208.73
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0494, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3935, -1.2754, -1.3880, -1.5010],
        [-1.3935, -1.2755, -1.3879, -1.5010],
        [-1.3935, -1.2756, -1.3879, -1.5009],
        [-1.3935, -1.2755, -1.3879, -1.5009],
        [-1.3935, -1.2754, -1.3879, -1.5011],
        [-1.3936, -1.2754, -1.3880, -1.5010],
        [-1.3936, -1.2754, -1.3880, -1.5010],
        [-1.3936, -1.2755, -1.3879, -1.5009],
        [-1.3935, -1.2756, -1.3880, -1.5008],
        [-1.3936, -1.2755, -1.3879, -1.5009],
        [-1.3936, -1.2755, -1.3879, -1.5009],
        [-1.3936, -1.2755, -1.3879, -1.5009],
        [-1.3935, -1.2754, -1.3880, -1.5010],
        [-1.3936, -1.2754, -1.3880, -1.5010],
        [-1.3936, -1.2753, -1.3879, -1.5010],
        [-1.3936, -1.2754, -1.3880, -1.5010],
        [-1.3935, -1.2755, -1.3880, -1.5010],
     

54: done 1 episodes, mean_reward=1.00, best_reward=2.00, speed=206.97
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0624, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3536, -1.2802, -1.4073, -1.5190],
        [-1.3536, -1.2802, -1.4072, -1.5191],
        [-1.3537, -1.2802, -1.4073, -1.5190],
        [-1.3536, -1.2802, -1.4073, -1.5190],
        [-1.3537, -1.2801, -1.4073, -1.5191],
        [-1.3537, -1.2802, -1.4073, -1.5191],
        [-1.3536, -1.2801, -1.4073, -1.5193],
        [-1.3537, -1.2801, -1.4073, -1.5192],
        [-1.3538, -1.2803, -1.4072, -1.5188],
        [-1.3537, -1.2801, -1.4073, -1.5191],
        [-1.3536, -1.2802, -1.4073, -1.5190],
        [-1.3537, -1.2801, -1.4073, -1.5192],
        [-1.3537, -1.2802, -1.4072, -1.5191],
        [-1.3538, -1.2802, -1.4073, -1.5189],
        [-1.3537, -1.2802, -1.4073, -1.5189],
        [-1.3537, -1.2801, -1.4073, -1.5191],
        [-1.3536, -1.2802, -1.4072, -1.5192],
     

training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0451, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3690, -1.2672, -1.3762, -1.5537],
        [-1.3690, -1.2671, -1.3761, -1.5538],
        [-1.3691, -1.2671, -1.3761, -1.5538],
        [-1.3690, -1.2671, -1.3761, -1.5538],
        [-1.3691, -1.2673, -1.3761, -1.5535],
        [-1.3691, -1.2671, -1.3762, -1.5537],
        [-1.3692, -1.2671, -1.3761, -1.5537],
        [-1.3691, -1.2672, -1.3761, -1.5537],
        [-1.3691, -1.2671, -1.3761, -1.5538],
        [-1.3691, -1.2671, -1.3761, -1.5537],
        [-1.3690, -1.2671, -1.3762, -1.5537],
        [-1.3690, -1.2671, -1.3761, -1.5537],
        [-1.3691, -1.2670, -1.3762, -1.5538],
        [-1.3691, -1.2671, -1.3762, -1.5536],
        [-1.3691, -1.2671, -1.3762, -1.5537],
        [-1.3691, -1.2670, -1.3762, -1.5538],
        [-1.3690, -1.2671, -1.3762, -1.5537],
        [-1.3690, -1.2671, -1.3761, -1.5537],
        [-1.3690, -1.2671, -1

58: done 4 episodes, mean_reward=0.50, best_reward=2.00, speed=213.44
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0638, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3684, -1.2533, -1.3747, -1.5749],
        [-1.3685, -1.2535, -1.3746, -1.5748],
        [-1.3684, -1.2534, -1.3747, -1.5747],
        [-1.3684, -1.2533, -1.3748, -1.5748],
        [-1.3685, -1.2536, -1.3747, -1.5744],
        [-1.3685, -1.2535, -1.3747, -1.5747],
        [-1.3685, -1.2535, -1.3747, -1.5746],
        [-1.3685, -1.2534, -1.3747, -1.5747],
        [-1.3685, -1.2535, -1.3746, -1.5747],
        [-1.3685, -1.2536, -1.3746, -1.5746],
        [-1.3685, -1.2535, -1.3746, -1.5747],
        [-1.3685, -1.2535, -1.3747, -1.5746],
        [-1.3686, -1.2532, -1.3747, -1.5748],
        [-1.3686, -1.2533, -1.3747, -1.5748],
        [-1.3685, -1.2533, -1.3747, -1.5748],
        [-1.3686, -1.2533, -1.3747, -1.5748],
        [-1.3685, -1.2534, -1.3747, -1.5748],
     

60: done 4 episodes, mean_reward=1.50, best_reward=2.00, speed=226.72
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0077, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3373, -1.2235, -1.3993, -1.6272],
        [-1.3374, -1.2238, -1.3993, -1.6266],
        [-1.3373, -1.2236, -1.3993, -1.6271],
        [-1.3374, -1.2237, -1.3993, -1.6268],
        [-1.3374, -1.2236, -1.3993, -1.6270],
        [-1.3374, -1.2236, -1.3992, -1.6269],
        [-1.3374, -1.2237, -1.3992, -1.6269],
        [-1.3375, -1.2237, -1.3992, -1.6268],
        [-1.3374, -1.2236, -1.3992, -1.6270],
        [-1.3374, -1.2237, -1.3993, -1.6268],
        [-1.3374, -1.2238, -1.3992, -1.6266],
        [-1.3374, -1.2239, -1.3992, -1.6266],
        [-1.3374, -1.2235, -1.3992, -1.6272],
        [-1.3374, -1.2234, -1.3993, -1.6272],
        [-1.3374, -1.2235, -1.3993, -1.6271],
        [-1.3374, -1.2235, -1.3992, -1.6271],
        [-1.3373, -1.2236, -1.3992, -1.6271],
     

training here
loss_value_v torch.float32 torch.Size([]) tensor(2.2450e-05, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3456, -1.1874, -1.4101, -1.6581],
        [-1.3457, -1.1874, -1.4101, -1.6581],
        [-1.3457, -1.1873, -1.4101, -1.6583],
        [-1.3457, -1.1874, -1.4101, -1.6581],
        [-1.3457, -1.1876, -1.4101, -1.6577],
        [-1.3457, -1.1873, -1.4101, -1.6581],
        [-1.3458, -1.1875, -1.4101, -1.6579],
        [-1.3457, -1.1874, -1.4101, -1.6580],
        [-1.3457, -1.1877, -1.4100, -1.6577],
        [-1.3457, -1.1876, -1.4101, -1.6579],
        [-1.3457, -1.1878, -1.4101, -1.6575],
        [-1.3457, -1.1875, -1.4101, -1.6578],
        [-1.3457, -1.1874, -1.4101, -1.6581],
        [-1.3457, -1.1873, -1.4101, -1.6581],
        [-1.3457, -1.1875, -1.4101, -1.6578],
        [-1.3457, -1.1873, -1.4101, -1.6581],
        [-1.3457, -1.1875, -1.4101, -1.6579],
        [-1.3457, -1.1875, -1.4101, -1.6579],
        [-1.3457, -1.1874

64: done 3 episodes, mean_reward=0.00, best_reward=2.00, speed=222.90
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0012, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3424, -1.2247, -1.4024, -1.6149],
        [-1.3423, -1.2247, -1.4025, -1.6148],
        [-1.3423, -1.2244, -1.4025, -1.6153],
        [-1.3422, -1.2245, -1.4024, -1.6152],
        [-1.3422, -1.2247, -1.4025, -1.6150],
        [-1.3422, -1.2246, -1.4025, -1.6151],
        [-1.3422, -1.2247, -1.4025, -1.6150],
        [-1.3423, -1.2247, -1.4024, -1.6150],
        [-1.3423, -1.2249, -1.4024, -1.6146],
        [-1.3422, -1.2246, -1.4025, -1.6151],
        [-1.3423, -1.2250, -1.4024, -1.6145],
        [-1.3423, -1.2248, -1.4024, -1.6148],
        [-1.3423, -1.2247, -1.4024, -1.6150],
        [-1.3423, -1.2247, -1.4024, -1.6150],
        [-1.3422, -1.2246, -1.4024, -1.6150],
        [-1.3422, -1.2246, -1.4024, -1.6151],
        [-1.3423, -1.2246, -1.4025, -1.6150],
     

66: done 3 episodes, mean_reward=0.33, best_reward=2.00, speed=224.34
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0321, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3528, -1.2001, -1.4117, -1.6266],
        [-1.3529, -1.2002, -1.4117, -1.6264],
        [-1.3528, -1.2002, -1.4116, -1.6265],
        [-1.3528, -1.2001, -1.4117, -1.6266],
        [-1.3529, -1.2003, -1.4116, -1.6263],
        [-1.3529, -1.2003, -1.4116, -1.6263],
        [-1.3529, -1.2003, -1.4116, -1.6262],
        [-1.3528, -1.2002, -1.4117, -1.6264],
        [-1.3529, -1.2003, -1.4116, -1.6263],
        [-1.3529, -1.2007, -1.4116, -1.6257],
        [-1.3528, -1.2002, -1.4117, -1.6264],
        [-1.3529, -1.2004, -1.4116, -1.6261],
        [-1.3528, -1.2003, -1.4116, -1.6263],
        [-1.3528, -1.2003, -1.4116, -1.6263],
        [-1.3529, -1.2004, -1.4116, -1.6262],
        [-1.3528, -1.2003, -1.4116, -1.6263],
        [-1.3528, -1.2002, -1.4117, -1.6265],
     

68: done 3 episodes, mean_reward=0.33, best_reward=2.00, speed=224.04
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0186, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3394, -1.2154, -1.3946, -1.6426],
        [-1.3394, -1.2155, -1.3946, -1.6425],
        [-1.3394, -1.2154, -1.3946, -1.6427],
        [-1.3394, -1.2153, -1.3946, -1.6427],
        [-1.3394, -1.2154, -1.3946, -1.6427],
        [-1.3395, -1.2155, -1.3946, -1.6424],
        [-1.3394, -1.2154, -1.3946, -1.6426],
        [-1.3395, -1.2155, -1.3946, -1.6424],
        [-1.3394, -1.2156, -1.3946, -1.6422],
        [-1.3395, -1.2156, -1.3946, -1.6422],
        [-1.3394, -1.2153, -1.3946, -1.6428],
        [-1.3394, -1.2154, -1.3946, -1.6426],
        [-1.3394, -1.2152, -1.3945, -1.6430],
        [-1.3395, -1.2153, -1.3946, -1.6427],
        [-1.3394, -1.2154, -1.3946, -1.6426],
        [-1.3394, -1.2153, -1.3946, -1.6428],
        [-1.3394, -1.2153, -1.3946, -1.6428],
     

70: done 3 episodes, mean_reward=0.67, best_reward=2.00, speed=227.71
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0160, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3490, -1.2598, -1.4028, -1.5565],
        [-1.3490, -1.2597, -1.4028, -1.5565],
        [-1.3489, -1.2596, -1.4029, -1.5567],
        [-1.3490, -1.2598, -1.4029, -1.5564],
        [-1.3490, -1.2598, -1.4029, -1.5564],
        [-1.3490, -1.2599, -1.4029, -1.5563],
        [-1.3490, -1.2599, -1.4029, -1.5563],
        [-1.3490, -1.2599, -1.4028, -1.5564],
        [-1.3490, -1.2599, -1.4029, -1.5563],
        [-1.3490, -1.2599, -1.4029, -1.5563],
        [-1.3490, -1.2599, -1.4028, -1.5563],
        [-1.3490, -1.2599, -1.4028, -1.5564],
        [-1.3490, -1.2598, -1.4028, -1.5564],
        [-1.3490, -1.2599, -1.4029, -1.5563],
        [-1.3490, -1.2600, -1.4028, -1.5563],
        [-1.3490, -1.2599, -1.4028, -1.5563],
        [-1.3490, -1.2599, -1.4029, -1.5563],
     

72: done 2 episodes, mean_reward=0.00, best_reward=2.00, speed=230.33
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0461, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3550, -1.2313, -1.4087, -1.5816],
        [-1.3550, -1.2312, -1.4086, -1.5816],
        [-1.3550, -1.2312, -1.4086, -1.5817],
        [-1.3550, -1.2313, -1.4086, -1.5815],
        [-1.3550, -1.2313, -1.4086, -1.5816],
        [-1.3550, -1.2312, -1.4086, -1.5816],
        [-1.3549, -1.2312, -1.4087, -1.5817],
        [-1.3550, -1.2313, -1.4086, -1.5815],
        [-1.3550, -1.2311, -1.4087, -1.5817],
        [-1.3551, -1.2314, -1.4086, -1.5814],
        [-1.3550, -1.2309, -1.4087, -1.5820],
        [-1.3550, -1.2312, -1.4086, -1.5816],
        [-1.3550, -1.2313, -1.4086, -1.5816],
        [-1.3549, -1.2310, -1.4086, -1.5820],
        [-1.3551, -1.2317, -1.4086, -1.5809],
        [-1.3550, -1.2313, -1.4086, -1.5816],
        [-1.3550, -1.2314, -1.4086, -1.5814],
     

74: done 2 episodes, mean_reward=0.00, best_reward=2.00, speed=226.68
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0013, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3665, -1.2608, -1.3841, -1.5557],
        [-1.3665, -1.2609, -1.3841, -1.5557],
        [-1.3664, -1.2609, -1.3842, -1.5557],
        [-1.3664, -1.2608, -1.3842, -1.5558],
        [-1.3666, -1.2608, -1.3842, -1.5556],
        [-1.3666, -1.2607, -1.3842, -1.5558],
        [-1.3666, -1.2608, -1.3842, -1.5557],
        [-1.3665, -1.2607, -1.3842, -1.5559],
        [-1.3666, -1.2607, -1.3841, -1.5558],
        [-1.3665, -1.2607, -1.3842, -1.5558],
        [-1.3665, -1.2606, -1.3842, -1.5558],
        [-1.3666, -1.2607, -1.3842, -1.5557],
        [-1.3665, -1.2610, -1.3842, -1.5554],
        [-1.3665, -1.2609, -1.3842, -1.5556],
        [-1.3665, -1.2609, -1.3842, -1.5556],
        [-1.3665, -1.2609, -1.3842, -1.5555],
        [-1.3665, -1.2609, -1.3842, -1.5556],
     

76: done 1 episodes, mean_reward=0.00, best_reward=2.00, speed=225.42
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0310, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3759, -1.2223, -1.3989, -1.5801],
        [-1.3759, -1.2223, -1.3989, -1.5800],
        [-1.3759, -1.2225, -1.3989, -1.5797],
        [-1.3759, -1.2223, -1.3989, -1.5800],
        [-1.3759, -1.2221, -1.3989, -1.5802],
        [-1.3760, -1.2221, -1.3989, -1.5802],
        [-1.3760, -1.2222, -1.3989, -1.5801],
        [-1.3760, -1.2222, -1.3989, -1.5801],
        [-1.3759, -1.2223, -1.3989, -1.5800],
        [-1.3760, -1.2223, -1.3989, -1.5800],
        [-1.3759, -1.2223, -1.3989, -1.5801],
        [-1.3760, -1.2223, -1.3989, -1.5800],
        [-1.3759, -1.2223, -1.3989, -1.5801],
        [-1.3759, -1.2224, -1.3989, -1.5800],
        [-1.3759, -1.2222, -1.3989, -1.5802],
        [-1.3759, -1.2223, -1.3989, -1.5801],
        [-1.3759, -1.2224, -1.3989, -1.5800],
     

78: done 2 episodes, mean_reward=0.00, best_reward=2.00, speed=224.46
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0008, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3985, -1.1945, -1.3895, -1.6045],
        [-1.3985, -1.1946, -1.3895, -1.6044],
        [-1.3985, -1.1946, -1.3895, -1.6044],
        [-1.3985, -1.1946, -1.3895, -1.6045],
        [-1.3985, -1.1945, -1.3896, -1.6045],
        [-1.3986, -1.1946, -1.3895, -1.6042],
        [-1.3985, -1.1945, -1.3896, -1.6045],
        [-1.3986, -1.1945, -1.3896, -1.6044],
        [-1.3986, -1.1945, -1.3895, -1.6044],
        [-1.3985, -1.1946, -1.3896, -1.6043],
        [-1.3985, -1.1944, -1.3896, -1.6046],
        [-1.3986, -1.1944, -1.3896, -1.6046],
        [-1.3985, -1.1947, -1.3895, -1.6044],
        [-1.3985, -1.1946, -1.3895, -1.6044],
        [-1.3985, -1.1946, -1.3895, -1.6044],
        [-1.3985, -1.1948, -1.3895, -1.6042],
        [-1.3985, -1.1946, -1.3895, -1.6045],
     

80: done 4 episodes, mean_reward=0.25, best_reward=2.00, speed=225.66
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0517, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3833, -1.1974, -1.3989, -1.6074],
        [-1.3833, -1.1974, -1.3989, -1.6074],
        [-1.3833, -1.1974, -1.3990, -1.6073],
        [-1.3833, -1.1974, -1.3990, -1.6073],
        [-1.3834, -1.1971, -1.3990, -1.6077],
        [-1.3834, -1.1976, -1.3989, -1.6069],
        [-1.3833, -1.1972, -1.3990, -1.6075],
        [-1.3834, -1.1973, -1.3989, -1.6074],
        [-1.3834, -1.1973, -1.3990, -1.6074],
        [-1.3834, -1.1973, -1.3990, -1.6074],
        [-1.3834, -1.1973, -1.3990, -1.6074],
        [-1.3834, -1.1971, -1.3990, -1.6077],
        [-1.3834, -1.1975, -1.3989, -1.6072],
        [-1.3834, -1.1975, -1.3989, -1.6071],
        [-1.3833, -1.1972, -1.3990, -1.6077],
        [-1.3833, -1.1972, -1.3990, -1.6077],
        [-1.3833, -1.1974, -1.3989, -1.6074],
     

training here
loss_value_v torch.float32 torch.Size([]) tensor(2.0996e-05, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3993, -1.1879, -1.3904, -1.6126],
        [-1.3992, -1.1882, -1.3904, -1.6121],
        [-1.3992, -1.1879, -1.3904, -1.6126],
        [-1.3992, -1.1881, -1.3904, -1.6123],
        [-1.3994, -1.1878, -1.3904, -1.6125],
        [-1.3994, -1.1878, -1.3904, -1.6125],
        [-1.3994, -1.1877, -1.3904, -1.6126],
        [-1.3994, -1.1877, -1.3904, -1.6126],
        [-1.3994, -1.1875, -1.3904, -1.6129],
        [-1.3992, -1.1883, -1.3904, -1.6118],
        [-1.3993, -1.1879, -1.3904, -1.6124],
        [-1.3993, -1.1879, -1.3904, -1.6125],
        [-1.3993, -1.1876, -1.3904, -1.6129],
        [-1.3993, -1.1877, -1.3904, -1.6128],
        [-1.3993, -1.1877, -1.3904, -1.6128],
        [-1.3993, -1.1877, -1.3904, -1.6127],
        [-1.3992, -1.1880, -1.3904, -1.6124],
        [-1.3993, -1.1879, -1.3904, -1.6126],
        [-1.3993, -1.1881

training here
loss_value_v torch.float32 torch.Size([]) tensor(9.8078e-06, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.4010, -1.2102, -1.3829, -1.5863],
        [-1.4010, -1.2105, -1.3830, -1.5859],
        [-1.4010, -1.2103, -1.3829, -1.5862],
        [-1.4010, -1.2103, -1.3829, -1.5862],
        [-1.4010, -1.2101, -1.3829, -1.5864],
        [-1.4011, -1.2103, -1.3830, -1.5861],
        [-1.4010, -1.2100, -1.3829, -1.5866],
        [-1.4010, -1.2101, -1.3829, -1.5864],
        [-1.4011, -1.2104, -1.3829, -1.5860],
        [-1.4011, -1.2103, -1.3829, -1.5860],
        [-1.4010, -1.2104, -1.3830, -1.5860],
        [-1.4011, -1.2102, -1.3829, -1.5862],
        [-1.4011, -1.2102, -1.3829, -1.5863],
        [-1.4011, -1.2102, -1.3829, -1.5862],
        [-1.4010, -1.2101, -1.3830, -1.5864],
        [-1.4011, -1.2103, -1.3830, -1.5861],
        [-1.4010, -1.2102, -1.3830, -1.5862],
        [-1.4010, -1.2104, -1.3829, -1.5862],
        [-1.4010, -1.2102

86: done 4 episodes, mean_reward=0.00, best_reward=2.00, speed=230.09
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0022, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.4015, -1.1904, -1.3972, -1.5976],
        [-1.4015, -1.1904, -1.3972, -1.5976],
        [-1.4015, -1.1904, -1.3972, -1.5976],
        [-1.4015, -1.1903, -1.3972, -1.5978],
        [-1.4014, -1.1905, -1.3972, -1.5976],
        [-1.4014, -1.1903, -1.3973, -1.5978],
        [-1.4014, -1.1903, -1.3972, -1.5978],
        [-1.4014, -1.1903, -1.3972, -1.5978],
        [-1.4015, -1.1904, -1.3972, -1.5976],
        [-1.4015, -1.1903, -1.3972, -1.5976],
        [-1.4015, -1.1901, -1.3973, -1.5978],
        [-1.4015, -1.1903, -1.3972, -1.5977],
        [-1.4014, -1.1905, -1.3972, -1.5975],
        [-1.4015, -1.1904, -1.3972, -1.5976],
        [-1.4014, -1.1905, -1.3972, -1.5975],
        [-1.4014, -1.1904, -1.3972, -1.5977],
        [-1.4014, -1.1904, -1.3972, -1.5976],
     

88: done 3 episodes, mean_reward=0.67, best_reward=2.00, speed=235.94
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0016, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3999, -1.2226, -1.3967, -1.5536],
        [-1.3999, -1.2225, -1.3968, -1.5536],
        [-1.3999, -1.2223, -1.3968, -1.5538],
        [-1.3998, -1.2224, -1.3967, -1.5538],
        [-1.3998, -1.2223, -1.3968, -1.5539],
        [-1.3998, -1.2223, -1.3968, -1.5540],
        [-1.3998, -1.2224, -1.3968, -1.5539],
        [-1.3998, -1.2224, -1.3968, -1.5538],
        [-1.4000, -1.2222, -1.3968, -1.5540],
        [-1.4000, -1.2220, -1.3968, -1.5541],
        [-1.3999, -1.2222, -1.3968, -1.5539],
        [-1.4000, -1.2225, -1.3967, -1.5536],
        [-1.3999, -1.2225, -1.3967, -1.5537],
        [-1.3999, -1.2224, -1.3967, -1.5537],
        [-1.3999, -1.2224, -1.3967, -1.5537],
        [-1.3999, -1.2224, -1.3967, -1.5537],
        [-1.3999, -1.2223, -1.3968, -1.5539],
     

training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0604, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.3996, -1.2361, -1.3939, -1.5386],
        [-1.3996, -1.2361, -1.3939, -1.5386],
        [-1.3996, -1.2362, -1.3939, -1.5386],
        [-1.3996, -1.2362, -1.3939, -1.5385],
        [-1.3995, -1.2364, -1.3939, -1.5382],
        [-1.3996, -1.2362, -1.3939, -1.5385],
        [-1.3996, -1.2362, -1.3939, -1.5385],
        [-1.3996, -1.2363, -1.3939, -1.5384],
        [-1.3996, -1.2359, -1.3939, -1.5388],
        [-1.3996, -1.2361, -1.3939, -1.5386],
        [-1.3996, -1.2363, -1.3939, -1.5384],
        [-1.3997, -1.2359, -1.3939, -1.5388],
        [-1.3996, -1.2363, -1.3939, -1.5384],
        [-1.3996, -1.2364, -1.3939, -1.5383],
        [-1.3996, -1.2362, -1.3939, -1.5385],
        [-1.3996, -1.2363, -1.3939, -1.5383],
        [-1.3995, -1.2363, -1.3938, -1.5384],
        [-1.3996, -1.2362, -1.3938, -1.5385],
        [-1.3996, -1.2363, -1

92: done 1 episodes, mean_reward=0.00, best_reward=2.00, speed=232.81
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0003, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.4132, -1.2088, -1.3813, -1.5759],
        [-1.4133, -1.2087, -1.3813, -1.5759],
        [-1.4133, -1.2086, -1.3813, -1.5760],
        [-1.4133, -1.2088, -1.3813, -1.5758],
        [-1.4132, -1.2087, -1.3813, -1.5760],
        [-1.4132, -1.2087, -1.3813, -1.5760],
        [-1.4132, -1.2087, -1.3813, -1.5760],
        [-1.4132, -1.2087, -1.3813, -1.5760],
        [-1.4134, -1.2084, -1.3812, -1.5763],
        [-1.4134, -1.2084, -1.3812, -1.5763],
        [-1.4134, -1.2085, -1.3812, -1.5762],
        [-1.4134, -1.2085, -1.3812, -1.5762],
        [-1.4133, -1.2086, -1.3812, -1.5762],
        [-1.4132, -1.2092, -1.3813, -1.5754],
        [-1.4133, -1.2086, -1.3813, -1.5761],
        [-1.4132, -1.2088, -1.3812, -1.5759],
        [-1.4132, -1.2088, -1.3813, -1.5759],
     

94: done 2 episodes, mean_reward=0.00, best_reward=2.00, speed=232.60
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0006, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.4115, -1.2142, -1.3803, -1.5713],
        [-1.4115, -1.2139, -1.3803, -1.5716],
        [-1.4115, -1.2141, -1.3803, -1.5715],
        [-1.4115, -1.2139, -1.3803, -1.5716],
        [-1.4115, -1.2140, -1.3804, -1.5716],
        [-1.4115, -1.2140, -1.3803, -1.5716],
        [-1.4114, -1.2141, -1.3804, -1.5714],
        [-1.4114, -1.2140, -1.3804, -1.5717],
        [-1.4115, -1.2140, -1.3803, -1.5716],
        [-1.4116, -1.2138, -1.3803, -1.5717],
        [-1.4115, -1.2142, -1.3804, -1.5712],
        [-1.4115, -1.2140, -1.3803, -1.5715],
        [-1.4115, -1.2141, -1.3803, -1.5715],
        [-1.4115, -1.2141, -1.3803, -1.5715],
        [-1.4115, -1.2141, -1.3803, -1.5715],
        [-1.4115, -1.2141, -1.3803, -1.5715],
        [-1.4115, -1.2140, -1.3803, -1.5716],
     

training here
loss_value_v torch.float32 torch.Size([]) tensor(7.8272e-06, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.4032, -1.1910, -1.3916, -1.6014],
        [-1.4032, -1.1910, -1.3916, -1.6014],
        [-1.4032, -1.1910, -1.3916, -1.6014],
        [-1.4032, -1.1910, -1.3916, -1.6014],
        [-1.4032, -1.1908, -1.3916, -1.6018],
        [-1.4032, -1.1907, -1.3916, -1.6018],
        [-1.4032, -1.1909, -1.3916, -1.6015],
        [-1.4032, -1.1911, -1.3916, -1.6013],
        [-1.4034, -1.1905, -1.3916, -1.6020],
        [-1.4033, -1.1907, -1.3916, -1.6018],
        [-1.4033, -1.1911, -1.3916, -1.6011],
        [-1.4033, -1.1908, -1.3916, -1.6016],
        [-1.4032, -1.1911, -1.3916, -1.6013],
        [-1.4033, -1.1909, -1.3916, -1.6015],
        [-1.4032, -1.1910, -1.3916, -1.6014],
        [-1.4033, -1.1909, -1.3916, -1.6016],
        [-1.4032, -1.1910, -1.3916, -1.6014],
        [-1.4032, -1.1909, -1.3917, -1.6015],
        [-1.4032, -1.1909

98: done 2 episodes, mean_reward=0.00, best_reward=2.00, speed=228.33
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.0314, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.4023, -1.1950, -1.3892, -1.5995],
        [-1.4023, -1.1951, -1.3892, -1.5994],
        [-1.4022, -1.1950, -1.3893, -1.5994],
        [-1.4023, -1.1950, -1.3892, -1.5994],
        [-1.4022, -1.1951, -1.3893, -1.5994],
        [-1.4022, -1.1949, -1.3893, -1.5997],
        [-1.4022, -1.1950, -1.3892, -1.5996],
        [-1.4022, -1.1950, -1.3893, -1.5995],
        [-1.4023, -1.1949, -1.3893, -1.5996],
        [-1.4023, -1.1949, -1.3893, -1.5996],
        [-1.4023, -1.1947, -1.3893, -1.5998],
        [-1.4023, -1.1948, -1.3893, -1.5997],
        [-1.4022, -1.1956, -1.3893, -1.5987],
        [-1.4023, -1.1950, -1.3893, -1.5995],
        [-1.4023, -1.1952, -1.3892, -1.5993],
        [-1.4023, -1.1951, -1.3892, -1.5994],
        [-1.4023, -1.1951, -1.3892, -1.5993],
     

100: done 2 episodes, mean_reward=0.00, best_reward=2.00, speed=226.43
training here
loss_value_v torch.float32 torch.Size([]) tensor(0.1075, grad_fn=<MseLossBackward>)
log_prob_v torch.float32 torch.Size([64, 4]) tensor([[-1.4444, -1.0773, -1.4226, -1.7011],
        [-1.4444, -1.0774, -1.4226, -1.7009],
        [-1.4443, -1.0774, -1.4226, -1.7009],
        [-1.4443, -1.0774, -1.4226, -1.7009],
        [-1.4443, -1.0774, -1.4226, -1.7009],
        [-1.4444, -1.0772, -1.4226, -1.7011],
        [-1.4443, -1.0775, -1.4226, -1.7007],
        [-1.4444, -1.0772, -1.4226, -1.7011],
        [-1.4445, -1.0771, -1.4226, -1.7011],
        [-1.4446, -1.0769, -1.4227, -1.7014],
        [-1.4446, -1.0770, -1.4226, -1.7012],
        [-1.4444, -1.0773, -1.4226, -1.7010],
        [-1.4444, -1.0774, -1.4226, -1.7008],
        [-1.4443, -1.0774, -1.4226, -1.7008],
        [-1.4443, -1.0774, -1.4226, -1.7008],
        [-1.4443, -1.0774, -1.4226, -1.7008],
        [-1.4444, -1.0775, -1.4225, -1.7008],
    

KeyboardInterrupt: 

In [None]:
mb_actions = np.zeros((6, 3), dtype=np.int32)

In [None]:
mb_actions

In [None]:
device = torch.device("cpu")

In [49]:
test = -36.6

In [50]:
test1 = torch.FloatTensor(test).to(device)

TypeError: new(): data must be a sequence (got float)

In [48]:
test1

tensor([0.1784, 0.1803, 0.1821, 0.1839, 0.1785, 0.1803, 0.1821, 0.1839, 0.1786,
        0.1804, 0.1823, 0.1841, 0.1783, 0.1801, 0.1819, 0.1837, 0.1783, 0.1801,
        0.1820, 0.1838, 1.1684, 1.1802, 0.1820, 0.1838, 0.1786, 0.1804, 0.1822,
        0.1840, 0.1784, 0.1802, 0.1820, 0.1838, 0.1784, 0.1802, 0.1820, 0.1838,
        0.1783, 0.1801, 0.1820, 0.1838, 0.1787, 0.1805, 0.1823, 0.1842, 0.1783,
        0.1801, 0.1819, 0.1837, 0.1783, 0.1801, 0.1819, 0.1838, 0.1782, 0.1800,
        0.1818, 0.1836, 0.1784, 0.1802, 0.1820, 0.1839, 0.1784, 0.1802, 0.1821,
        0.1839])

In [39]:
adv_v = torch.tensor(test).to(device)

In [40]:
adv_v.dtype

torch.float32

In [41]:
adv_v.long()

tensor(-36)

In [44]:
test2 = test1 + 35
test2

tensor(-1.6000)