In [1]:
from collections import namedtuple
from copy import deepcopy

import preonpy
import numpy as np

from utils.options import Options
from env.preon_env import Preon_env
from agent.ddpg import DDPG
from utils.memory import ReplayMemory
from agent.evaluator import Evaluator
from utils.util import *

In [2]:
# Fill a small replay memory with three hand-written transitions.
# The constructor argument (100) is presumably the buffer capacity -- TODO confirm.
# Positional order of each record, as shown by the sampled output further down:
# state, goal, action, next_state, reward, terminal.
episode_memory = ReplayMemory(100)
for toy_transition in (
    ([0,0],[0,0],[1,1],[1,1],0,True),
    ([0,1],[0,0],[10,1],[0,1],1,True),
    ([0,2],[1,0],[1,10],[1,0],0,False),
):
    episode_memory.push(*toy_transition)

In [3]:
# Ask the replay memory for 2 stored transitions (presumably drawn at random --
# verify against ReplayMemory.sample) and display them as the cell output.
transitions = episode_memory.sample(2)
transitions

[Transition(state=[0, 0], goal=[0, 0], action=[1, 1], next_state=[1, 1], reward=0, terminal=True),
 Transition(state=[0, 2], goal=[1, 0], action=[1, 10], next_state=[1, 0], reward=0, terminal=False)]

In [4]:
# `namedtuple` comes from the notebook's top import cell (imports are kept in
# one place so a fresh "Restart & Run All" shows every dependency up front).
# Record type with the same six fields as the transitions sampled above.
Transition = namedtuple('Transition', ('state', 'goal', 'action', 'next_state', 'reward', 'terminal'))
# zip(*transitions) transposes the list of per-step records into one sequence
# per field, so `batch.state` holds every sampled state, `batch.goal` every goal, etc.
batch = Transition(*zip(*transitions))
batch

Transition(state=([0, 0], [0, 2]), goal=([0, 0], [1, 0]), action=([1, 1], [1, 10]), next_state=([1, 1], [1, 0]), reward=(0, 0), terminal=(True, False))

In [5]:
# Stack the per-transition states into one 2-D array (one row per sampled transition).
np.array(batch.state)

array([[0, 0],
       [0, 2]])

In [8]:
# Convert the stacked states to a torch tensor and wrap it in a Variable.
# The integer toy states produce a LongTensor, as the output below shows.
# NOTE(review): `Variable` and `torch` are not imported explicitly in the visible
# cells -- presumably they arrive via `from utils.util import *`; confirm.
Variable(torch.from_numpy(np.array(batch.state)))

Variable containing:
 0  0
 0  2
[torch.LongTensor of size 2x2]

In [2]:
# Build the experiment objects from a single Options instance:
# the environment is configured from opt.env_params, while the DDPG agent and
# the evaluator are both configured from opt.agent_params.
opt = Options()
env = Preon_env(opt.env_params)
agent = DDPG(opt.agent_params)
evaluate = Evaluator(opt.agent_params)

In [3]:
def generate_new_goal(args):
    """Draw a new goal for an episode.

    An integer target volume is sampled uniformly from 0..args.max_volume
    (both ends included) using NumPy's global random state and converted
    to a float.

    Args:
        args: object exposing an integer ``max_volume`` attribute.

    Returns:
        A two-element list ``[target_volume, 0.0]``. The second entry is
        always 0.0 (its meaning is not visible here -- TODO confirm).
    """
    # randint excludes its upper bound, hence the +1 to make max_volume reachable.
    upper_exclusive = args.max_volume + 1
    target_volume = float(np.random.randint(0, upper_exclusive))
    return [target_volume, 0.0]

In [4]:
# Flag the agent as training (what this toggles inside DDPG is not visible
# here -- TODO confirm), then draw the goal used for this episode.
agent.is_training = True
goal = generate_new_goal(opt.agent_params)
print(goal)

[183.0, 0.0]


In [5]:
# env.reset() returns a pair; the whole pair is deep-copied and only its first
# element (the initial observation) is kept, then handed to the agent.
# NOTE(review): binding the discarded value to `_` also overwrites IPython's
# "last output" shortcut for the rest of the session.
observation, _ = deepcopy(env.reset())
agent.reset(observation)
print(observation)

Started simulation from frame 0 to frame 2 with 8 thread(s).
Done simulating  2  frames.
(0.0, 12.0, 0.0, 0.0, 0.0)


In [6]:
# Select an action for the current observation/goal pair.
# Both inputs are wrapped in a list first so the actor receives a batch of one.
state_batch = to_tensor(np.array([observation]))
goal_batch = to_tensor(np.array([goal]))
# Run the actor, convert back to NumPy and drop the batch axis again.
raw_action = to_numpy(agent.actor(state_batch, goal_batch)).squeeze(0)
# Perturb the actor's output via the agent's noise helper, then clamp every
# component to [-1, 1].
noisy_action = agent.add_noise_to_action(raw_action)
action = np.clip(noisy_action, -1., 1.)
# Store the chosen action on the agent.
agent.a_t = action
print("action:", action)

('action:', array([ 1., -1.,  1.]))


### Execute action in environment

In [7]:
# Apply the chosen action in the environment; the goal is passed to step() as well.
# step() returns four values: next observation, reward, done flag and info.
observation2, reward, done, info = env.step(action, goal)
# Keep an independent copy of the returned observation (presumably so later
# environment updates cannot alias it -- TODO confirm this is needed).
observation2 = deepcopy(observation2)

In [8]:
# Report the outcome of the step to the agent. Judging by the memory dump in the
# next cell, this appears to store one Transition in agent.memory -- verify in DDPG.observe.
agent.observe(goal, reward, observation2, done)

In [9]:
# Inspect the raw list of transitions held by the agent's replay memory.
# NOTE(review): in the output below next_state equals state -- confirm whether the
# simulation really did not change the observation or whether observe() stores the wrong one.
agent.memory.memory

[Transition(state=(0.0, 12.0, 0.0, 0.0, 0.0), goal=[183.0, 0.0], action=array([ 1., -1.,  1.]), next_state=(0.0, 12.0, 0.0, 0.0, 0.0), reward=-1.0, terminal=False)]

In [16]:
# Display the action currently stored on the agent (assigned in the action-selection cell).
agent.a_t