In [1]:
import preonpy
import numpy as np
from copy import deepcopy

from utils.options import Options
from env.preon_env import Preon_env
from agent.ddpg import DDPG
from utils.memory import ReplayMemory
from agent.evaluator import Evaluator
from utils.util import *

In [3]:
# Build the objects shared by all following cells.
# `opt` is the project's option container; its `env_params` and `agent_params`
# attributes configure the environment and the agent respectively.
opt = Options()
# Environment object; later cells call its reset() and step() methods.
env = Preon_env(opt.env_params)
# DDPG agent; later cells use its actor, target networks and replay memory.
agent = DDPG(opt.agent_params)
# The evaluator receives the whole options object. It is not used by any of
# the cells visible in this part of the notebook.
evaluate = Evaluator(opt)

In [13]:
def generate_new_goal(args, volumes=None):
    """Draw a random pouring goal expressed in normalized units.

    A target poured volume is picked uniformly at random from ``volumes`` and
    mapped linearly so that a volume of 0 becomes -1.0 and ``args.max_volume``
    becomes +1.0. The target spilled volume is always 0, which is -1.0 after
    the same mapping.

    Args:
        args: object exposing a non-zero numeric ``max_volume`` attribute
            (the notebook passes ``opt.env_params``).
        volumes: optional non-empty sequence of candidate poured volumes, in
            the same unit as ``args.max_volume``. Defaults to 100, 150, ...,
            450, the set that used to be hard-coded here.

    Returns:
        list: ``[desired_poured_vol_norm, desired_spilled_vol_norm]`` as plain
        Python floats.

    Raises:
        ValueError: raised by ``np.random.choice`` when ``volumes`` is empty.
    """
    if volumes is None:
        volumes = (100, 150, 200, 250, 300, 350, 400, 450)

    half_range = args.max_volume / 2.0

    def _normalize(volume):
        # Linear map from [0, max_volume] onto [-1, 1].
        return float((volume - half_range) / half_range)

    desired_poured_vol = np.random.choice(list(volumes))
    return [_normalize(desired_poured_vol), _normalize(0.0)]

In [4]:
# Mark the agent as training by setting the flag directly on the object, then
# draw the goal used by the rest of this walkthrough and show it.
# NOTE(review): the saved output of this cell has 0.0 as its second entry,
# whereas generate_new_goal as written in this notebook always produces -1.0
# there ((0 - max/2) / (max/2)). The function definition also carries a later
# execution count than this cell, so the saved output looks stale - re-run
# from a fresh kernel to confirm.
agent.is_training = True
goal = generate_new_goal(opt.env_params)
print(goal)

[-0.35897435897435898, 0.0]


In [5]:
# Reset the environment. reset() must return exactly two items for this unpack
# to succeed; only the first (the initial observation) is kept. deepcopy gives
# this cell its own copy, independent of the object returned by the environment.
observation, _ = deepcopy(env.reset())
# Hand the initial observation to the agent before the first action is chosen.
agent.reset(observation)
print(observation)

Started simulation from frame 0 to frame 2 with 8 thread(s).
Done simulating  2  frames.
(0.0, 0.20000000000000001, -1.0, -1.0, -1.0)


In [6]:
# Actor forward pass on a batch of one: observation and goal each get a
# leading batch axis, are converted to tensors, and the batch axis is dropped
# again from the resulting array.
obs_input = to_tensor(np.array([observation]))
goal_input = to_tensor(np.array([goal]))
actor_output = agent.actor(obs_input, goal_input)
action = to_numpy(actor_output).squeeze(0)

# Let the agent add its noise to the action, then clamp every component
# back into [-1, 1].
action = np.clip(agent.add_noise_to_action(action), -1., 1.)

# Store the chosen action on the agent.
# NOTE(review): presumably observe() reads a_t back when it builds the
# transition - confirm in agent/ddpg.py.
agent.a_t = action
print("action:", action)

('action:', array([ 0.25083561, -0.25995295,  0.24535936]))


### Execute action in environment

In [7]:
# Advance the environment by one step with the chosen action and the current
# goal. step() returns four items, unpacked here as next observation, reward,
# done flag and info; `info` is not used by the cells visible here.
observation2, reward, done, info = env.step(action, goal)
# Keep an independent copy of the next observation.
observation2 = deepcopy(observation2)

Started simulation from frame 2 to frame 3 with 8 thread(s).
Done simulating  1  frames.


In [12]:
# Report the outcome of the step to the agent. The replay-memory dump in the
# following cell lists Transition records, so observe() appears to append one
# record per call.
# NOTE(review): that dump holds three transitions although only one env.step
# call is shown, and the execution counter jumps from 7 to 12 at this cell -
# it was probably executed several times. Confirm before trusting the buffer.
agent.observe(goal, reward, observation2, done)

In [13]:
# Display the raw contents of the agent's replay memory (bare last expression,
# so the notebook renders its repr as the cell output).
agent.memory.memory

[Transition(state=(0.0, 0.20000000000000001, -1.0, -1.0, -1.0), goal=[-0.35897435897435898, 0.0], action=array([ 0.25083561, -0.25995295,  0.24535936]), next_state=(0.041666666666666664, 0.14800000000000005, -0.98773203161027701, -1.0, -1.0), reward=1.0, terminal=False),
 Transition(state=(0.041666666666666664, 0.14800000000000005, -0.98773203161027701, -1.0, -1.0), goal=[-0.35897435897435898, 0.0], action=array([ 0.25083561, -0.25995295,  0.24535936]), next_state=(0.041666666666666664, 0.14800000000000005, -0.98773203161027701, -1.0, -1.0), reward=1.0, terminal=False),
 Transition(state=(0.041666666666666664, 0.14800000000000005, -0.98773203161027701, -1.0, -1.0), goal=[-0.35897435897435898, 0.0], action=array([ 0.25083561, -0.25995295,  0.24535936]), next_state=(0.041666666666666664, 0.14800000000000005, -0.98773203161027701, -1.0, -1.0), reward=1.0, terminal=True)]

In [14]:
# Request a batch of 3 from the replay memory. sample() returns six items,
# unpacked in the order: state, goal, action, reward, next state, terminal.
state_batch, goal_batch, action_batch, reward_batch, \
next_state_batch, terminal_batch = agent.memory.sample(3)

In [15]:
# Convert the sampled goals and next states with the project's to_tensor
# helper, passing volatile=True, and print both for inspection.
# NOTE(review): `volatile` looks like the pre-0.4 PyTorch Variable flag that
# disables gradient tracking (the saved output prints "Variable containing:");
# confirm against utils.util.to_tensor.
var_goal = to_tensor(goal_batch, volatile=True)
var_next_state = to_tensor(next_state_batch, volatile=True)
print(var_goal)
print(var_next_state)

Variable containing:
-0.3590  0.0000
-0.3590  0.0000
-0.3590  0.0000
[torch.FloatTensor of size 3x2]

Variable containing:
 0.0417  0.1480 -0.9877 -1.0000 -1.0000
 0.0417  0.1480 -0.9877 -1.0000 -1.0000
 0.0417  0.1480 -0.9877 -1.0000 -1.0000
[torch.FloatTensor of size 3x5]



In [16]:
# Target-network estimate for the sampled next states: the target actor
# produces an action for every (next state, goal) pair, and the target critic
# evaluates that action for the same pair.
target_actions = agent.actor_target(var_next_state, var_goal)
next_q_values = agent.critic_target(var_next_state, var_goal, target_actions)
# Clear the volatile flag on the result before it is used further.
next_q_values.volatile = False
# NOTE(review): the saved output of this cell is 3x3, i.e. three values per
# sample - confirm this is the intended output width of the critic.
print(next_q_values)

Variable containing:
-0.0674  0.0743 -0.2082
-0.0674  0.0743 -0.2082
-0.0674  0.0743 -0.2082
[torch.FloatTensor of size 3x3]

