In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal
import numpy as np
import matplotlib.pyplot as plt
import gym

In [3]:
eps=1e-10
DTYPE = torch.float32

In [212]:
class Net(nn.Module):
    def __init__(self, state_size, action_size, is_continuous=False):
        super().__init__()
        self.is_continuous = is_continuous
        
        pi_model = nn.Sequential(
            nn.Linear(state_size, 2),
            nn.ReLU(),
            nn.Linear(2, 2),
            nn.ReLU(),
            nn.Linear(2, 1),
            nn.ReLU(),
            nn.Linear(1, action_size)
        )
        
        value_model = nn.Sequential(
            nn.Linear(state_size, 50),
            nn.ReLU(),
            nn.Linear(50, 50),
            nn.ReLU(),
            nn.Linear(50, 50),
            nn.ReLU(),
            nn.Linear(50, 1)
        )
        
        self.pi = pi_model
        self.ve = value_model
        
    def forward(self, x):
        pass
    
    def getValue(self, x):
        x = torch.tensor(x, dtype=DTYPE)
        return self.ve(x)
    
    def getPi(self, x):
        x = torch.tensor(x, dtype=DTYPE)
        probs = self.pi(x)
        return nn.Softmax(dim=-1)(probs) if not self.is_continuous else probs 
    
    def setContinuous(self):
        self.is_continuous = True

In [213]:
class ReplayBuffer():
    def __init__(self, minl, maxl,alpha,sample_size=400):
        self.max_length = maxl
        self.length = 0
        self.buffer = []
        self.weights = []
        self.alpha = alpha
        self.min_length = minl
        self.sample_size = sample_size
        
    def add(self, ep):
        
        weight = np.exp(self.alpha*np.array(ep['rewards']).sum())+eps
#         if self.length>0 and weight < 0.75*np.min(self.weights) + 0.25 * np.mean(self.weights) and self.trainable(): return

        self.buffer.append(ep)
        self.length += 1

        self.weights.append(weight)
        
        if self.length > self.max_length:
            idx = np.random.randint(self.length)
            del self.buffer[idx]
            del self.weights[idx]
            self.length-=1
            
    def sample(self):
        wts = np.array(self.weights)
        
        sample_size = min(self.sample_size, self.length)
        sample = np.random.choice(self.buffer,size=(sample_size,), p=wts/wts.sum(), replace=False)
        out = sample if self.length>1 else [sample]
        return out
    
    def trainable(self):
        return self.length>self.min_length
    

In [276]:
class PCL(object):
    def __init__(self, env, replay_buffer, state_size, action_size, epoch=1000, off_policy_rate=20, pi_lr=7e-4, 
                 ve_lr=3.5e-4, entropy_tau=0.15, rollout_d=20, gamma=1):
        self.epoch = epoch
        self.state_size =  state_size
        self.action_size = action_size
        self.net = Net(self.state_size, self.action_size)
        self.net.is_continuous = False
        
        self.replay_buffer = replay_buffer
        self.env = env
        
        self.pi_optimiser = optim.Adam(self.net.pi.parameters(), lr=pi_lr)
        self.ve_optimiser = optim.Adam(self.net.ve.parameters(), lr=ve_lr)
        self.off_policy_rate = off_policy_rate
        self.tau = entropy_tau
        self.rollout_d = rollout_d
        self.gamma = gamma
    
    def selectProb(self, pi, a):
        a = np.eye(self.action_size, dtype=np.int32)[a]
        pi_selected = torch.sum(pi * torch.tensor(a, dtype=DTYPE), dim=1)
        return torch.log(pi_selected+eps)

        
    def optimise(self, episodes):
        net = self.net
        
        self.pi_optimiser.zero_grad()
        self.ve_optimiser.zero_grad()
        
#         print("before loop", episodes)
        for episode in episodes:
            rollout_d = min(self.rollout_d, len(episode['states']))
            ma = len(episode['states'])-rollout_d+1
            
            for i in range(ma):
                states = episode['states'][i:i+rollout_d]
                a = episode['actions'][i:i+rollout_d]  
                R = episode['rewards'][i:i+rollout_d]

                ve_init = net.getValue(states[0])
                ve_end = net.getValue(states[-1])

                pi_all_states = net.getPi(states).squeeze(0)
                log_pi_selected = self.selectProb(pi_all_states, a)

                discount = torch.tensor(self.gamma**np.arange(0,rollout_d))

                c = -ve_init + self.gamma**rollout_d*ve_end\
                + torch.sum(discount*(torch.tensor(R) - self.tau*log_pi_selected))

                loss = c**2
                loss.backward()
                
#                 print("----------------------------", loss)
#         print("after loop")
        
        self.ve_optimiser.step()
        self.pi_optimiser.step()
        
    def getStateEnc(self,x):
        return x
    def getStateDec(self,x):
        return x
    def getActionEnc(self,a):
        return a
    def getActionDec(self,a):
        return a
    
    def getAction(self, state):
        enc_state = self.getStateEnc(state)
        
        pi = self.net.getPi(enc_state).squeeze(0).detach().numpy()
        return self.getActionDec(np.random.choice(np.arange(self.action_size), p=pi))
    
    def rollout(self, max_ep_length = -1):
        states = []
        actions = []
        rewards = []
        env = self.env
        
        state = env.reset()
        is_terminated = False
        timestep = 0
        
        while not is_terminated and timestep!=max_ep_length:
            action = self.getAction(state)
            next_state, reward, is_terminated, _ = env.step(action)
            
            states.append(self.getStateEnc(state))
            actions.append(self.getActionEnc(action))
            rewards.append(reward)
            
            state = next_state
            timestep += 1

        
        rewards = np.array(rewards)
#         print(np.mean(rewards))
#         rewards = (rewards - np.mean(rewards))
        
#         print(len(states))
#         print("")
        return dict(
            states = states,
            actions = actions,
            rewards = list(rewards)
        )
    
    def train(self, max_ep_length=-1):
        tot_rewards = []
        for i in range(self.epoch):
            episode = self.rollout(max_ep_length)        
            self.optimise([episode])
            
            r = np.array(episode['rewards'])
            tot_rewards.append(r.sum())
            
#             if i>20: 
#                 print("I: {},    B:{}       R: {}                    ".format(i, self.replay_buffer.length, np.mean(tot_rewards[-20:])), end='\r')
#             else:
#                 print("I: {},    B:{}       R: {}                    ".format(i, self.replay_buffer.length, tot_rewards[-1]), end='\r')
                
            self.replay_buffer.add(episode)
            
            ########off policy
            if self.replay_buffer.trainable():
                for _ in range(self.off_policy_rate):
                    episodes = self.replay_buffer.sample()
                    self.optimise(episodes)
            
#             print(i)
#             for x in self.net.pi.parameters():
#                 print(x)

In [277]:
class ContinuousPCL(PCL):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.net.setContinuous()
        
    def selectProb(self, params, a):
        """
        params: [mean, var]
        """
        
        mean = params[:, 0]
        std = params[:, 1]
        a = torch.tensor(a, dtype=DTYPE)
        
        return -(a-mean)**2/(std**2+eps) - std  
    
    def getAction(self, state):
        enc_state = self.getStateEnc(state)
        params = self.net.getPi(enc_state).squeeze(0).detach()
        action = Normal(params[0], params[1]).sample().numpy()
        return np.array([action])
    
    def getActionEnc(self, action):
        return torch.tensor(action, dtype=DTYPE)

In [278]:
class CopyPCL(PCL):
    def __init__(self,*args, **kwargs):
        super().__init__(*args, **kwargs)
    
    def getStateEnc(self,x):
        return np.array([x])
    
    def getStateDec(self,x):
        return x[0]
    
    def getActionEnc(self,a):
        return a[0]*10 + a[1]*5 + a[2]
    
    def getActionDec(self,a):
        b = ((a//10)%5, 1-(a//10), (a%10)%5)
        return b
    

In [279]:
env = gym.make('Copy-v0')
buffer = ReplayBuffer(32, 10000, 1)
agent = CopyPCL(env, buffer, 1, 11, epoch=100000, off_policy_rate=2, pi_lr=0.005, ve_lr=0.0025)
# agent.getStateEnc = lambda x: np.eye(6)[int(x)]
# agent.getStateDec = lambda x: np.argmax(x)

In [280]:
agent.train()

before loop [{'states': [array([2]), array([4]), array([2])], 'actions': [10, 7, 5], 'rewards': [0.0, 1.0, -0.5]}]
---------------------------- tensor([3.2150], grad_fn=<PowBackward0>)
after loop
before loop [{'states': [array([0])], 'actions': [7], 'rewards': [-0.5]}]
---------------------------- tensor([0.0064], grad_fn=<PowBackward0>)
after loop
before loop [{'states': [array([0]), array([5])], 'actions': [5, 6], 'rewards': [1.0, -0.5]}]
---------------------------- tensor([1.2767], grad_fn=<PowBackward0>)
after loop
before loop [{'states': [array([1]), array([5])], 'actions': [6, 8], 'rewards': [1.0, -0.5]}]
---------------------------- tensor([0.8658], grad_fn=<PowBackward0>)
after loop
before loop [{'states': [array([4]), array([5])], 'actions': [9, 7], 'rewards': [1.0, -0.5]}]
---------------------------- tensor([1.9284], grad_fn=<PowBackward0>)
after loop
before loop [{'states': [array([2])], 'actions': [9], 'rewards': [-0.5]}]
---------------------------- tensor([0.0087], grad

---------------------------- tensor([0.0091], grad_fn=<PowBackward0>)
after loop
before loop [{'states': [array([4])], 'actions': [7], 'rewards': [-0.5]}
 {'states': [array([4]), array([5])], 'actions': [9, 7], 'rewards': [1.0, -0.5]}
 {'states': [array([3]), array([5]), array([3])], 'actions': [8, 10, 7], 'rewards': [1.0, 0.0, -0.5]}
 {'states': [array([1])], 'actions': [7], 'rewards': [-0.5]}
 {'states': [array([4]), array([5])], 'actions': [9, 5], 'rewards': [1.0, -0.5]}
 {'states': [array([1]), array([5])], 'actions': [6, 5], 'rewards': [1.0, 1.0]}
 {'states': [array([2]), array([5])], 'actions': [7, 5], 'rewards': [1.0, -0.5]}
 {'states': [array([4])], 'actions': [5], 'rewards': [-0.5]}
 {'states': [array([0])], 'actions': [6], 'rewards': [-0.5]}
 {'states': [array([2]), array([4]), array([2])], 'actions': [10, 7, 5], 'rewards': [0.0, 1.0, -0.5]}
 {'states': [array([1])], 'actions': [8], 'rewards': [-0.5]}
 {'states': [array([3])], 'actions': [9], 'rewards': [-0.5]}
 {'states': [a

---------------------------- tensor([0.0111], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0096], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0049], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0096], grad_fn=<PowBackward0>)
after loop
before loop [{'states': [array([1]), array([5])], 'actions': [6, 8], 'rewards': [1.0, -0.5]}]
---------------------------- tensor([3.5881e-06], grad_fn=<PowBackward0>)
after loop
before loop [{'states': [array([4]), array([5])], 'actions': [9, 7], 'rewards': [1.0, -0.5]}
 {'states': [array([3]), array([5]), array([3])], 'actions': [8, 10, 7], 'rewards': [1.0, 0.0, -0.5]}
 {'states': [array([3])], 'actions': [5], 'rewards': [-0.5]}
 {'states': [array([4])], 'actions': [8], 'rewards': [-0.5]}
 {'states': [array([0])], 'actions': [7], 'rewards': [-0.5]}
 {'states': [array([4])], 'actions': [5], 'rewards': [-0.5]}
 {'states': [array([2]), array([4]), array([2])], 'actions': [10, 7, 5], 'rewards': [0.0, 1

---------------------------- tensor([0.0121], grad_fn=<PowBackward0>)
---------------------------- tensor([1.3538], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0114], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0853], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0165], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0117], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0123], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0110], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0165], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0122], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0169], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0032], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0108], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0036], grad_fn=<PowBackward0>)
after loop
before lo

---------------------------- tensor([0.0117], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0022], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0137], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0122], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0160], grad_fn=<PowBackward0>)
---------------------------- tensor([2.8635], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0135], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0135], grad_fn=<PowBackward0>)
---------------------------- tensor([0.1024], grad_fn=<PowBackward0>)
---------------------------- tensor([0.2841], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0082], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0958], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0160], grad_fn=<PowBackward0>)
---------------------------- tensor([1.0830], grad_fn=<PowBackward0>)
--------------------

before loop [{'states': [array([4])], 'actions': [5], 'rewards': [-0.5]}]
---------------------------- tensor([0.0147], grad_fn=<PowBackward0>)
after loop
before loop [{'states': [array([1]), array([5])], 'actions': [6, 5], 'rewards': [1.0, 1.0]}
 {'states': [array([4])], 'actions': [5], 'rewards': [-0.5]}
 {'states': [array([4])], 'actions': [6], 'rewards': [-0.5]}
 {'states': [array([1])], 'actions': [8], 'rewards': [-0.5]}
 {'states': [array([4]), array([5])], 'actions': [9, 7], 'rewards': [1.0, -0.5]}
 {'states': [array([3]), array([5]), array([3])], 'actions': [8, 10, 7], 'rewards': [1.0, 0.0, -0.5]}
 {'states': [array([4]), array([5])], 'actions': [9, 7], 'rewards': [1.0, -0.5]}
 {'states': [array([1])], 'actions': [7], 'rewards': [-0.5]}
 {'states': [array([4]), array([5])], 'actions': [9, 5], 'rewards': [1.0, -0.5]}
 {'states': [array([4])], 'actions': [8], 'rewards': [-0.5]}
 {'states': [array([2])], 'actions': [5], 'rewards': [-0.5]}
 {'states': [array([4])], 'actions': [8], 

---------------------------- tensor([0.0107], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0008], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0155], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0095], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0155], grad_fn=<PowBackward0>)
---------------------------- tensor([0.9429], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0006], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0133], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0005], grad_fn=<PowBackward0>)
---------------------------- tensor([0.3837], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0149], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0139], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0095], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0138], grad_fn=<PowBackward0>)
--------------------

---------------------------- tensor([0.0002], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0135], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0157], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0139], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0162], grad_fn=<PowBackward0>)
after loop
before loop [{'states': [array([1]), array([5])], 'actions': [6, 9], 'rewards': [1.0, -0.5]}
 {'states': [array([4]), array([5])], 'actions': [9, 7], 'rewards': [1.0, -0.5]}
 {'states': [array([4])], 'actions': [8], 'rewards': [-0.5]}
 {'states': [array([1]), array([5])], 'actions': [6, 8], 'rewards': [1.0, -0.5]}
 {'states': [array([1]), array([5])], 'actions': [6, 5], 'rewards': [1.0, 1.0]}
 {'states': [array([4])], 'actions': [5], 'rewards': [-0.5]}
 {'states': [array([4]), array([5])], 'actions': [9, 7], 'rewards': [1.0, -0.5]}
 {'states': [array([3])], 'actions': [5], 'rewards': [-0.5]}
 {'states': [array([0])], 'actions': [6]

---------------------------- tensor([0.0173], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0165], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0093], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0843], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0959], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0114], grad_fn=<PowBackward0>)
---------------------------- tensor([0.9448], grad_fn=<PowBackward0>)
---------------------------- tensor([1.1421], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0093], grad_fn=<PowBackward0>)
---------------------------- tensor([2.7340], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0159], grad_fn=<PowBackward0>)
---------------------------- tensor([2.6857], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0114], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0133], grad_fn=<PowBackward0>)
--------------------

---------------------------- tensor([0.8494], grad_fn=<PowBackward0>)
---------------------------- tensor([1.0474], grad_fn=<PowBackward0>)
---------------------------- tensor([0.1499], grad_fn=<PowBackward0>)
---------------------------- tensor([2.5448e-07], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0002], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0088], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0122], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0958], grad_fn=<PowBackward0>)
---------------------------- tensor([2.7514e-05], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0170], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0088], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0107], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0122], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0117], grad_fn=<PowBackward0>)
------------

---------------------------- tensor([0.0168], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0168], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0176], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0001], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0112], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0003], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0117], grad_fn=<PowBackward0>)
---------------------------- tensor([5.1771e-05], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0102], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0117], grad_fn=<PowBackward0>)
after loop
before loop [{'states': [array([4])], 'actions': [7], 'rewards': [-0.5]}]
---------------------------- tensor([0.0145], grad_fn=<PowBackward0>)
after loop
before loop [{'states': [array([2]), array([5])], 'actions': [7, 5], 'rewards': [1.0, -0.5]}
 {'states': [array([1]), array([5])],

---------------------------- tensor([0.0008], grad_fn=<PowBackward0>)
after loop
before loop [{'states': [array([4]), array([5])], 'actions': [9, 7], 'rewards': [1.0, -0.5]}
 {'states': [array([3]), array([5]), array([3])], 'actions': [8, 10, 7], 'rewards': [1.0, 0.0, -0.5]}
 {'states': [array([2])], 'actions': [6], 'rewards': [-0.5]}
 {'states': [array([1])], 'actions': [7], 'rewards': [-0.5]}
 {'states': [array([0])], 'actions': [6], 'rewards': [-0.5]}
 {'states': [array([4])], 'actions': [5], 'rewards': [-0.5]}
 {'states': [array([0]), array([5])], 'actions': [5, 6], 'rewards': [1.0, -0.5]}
 {'states': [array([3])], 'actions': [5], 'rewards': [-0.5]}
 {'states': [array([4]), array([5])], 'actions': [9, 9], 'rewards': [1.0, -0.5]}
 {'states': [array([4]), array([5])], 'actions': [9, 7], 'rewards': [1.0, -0.5]}
 {'states': [array([4])], 'actions': [7], 'rewards': [-0.5]}
 {'states': [array([1]), array([5])], 'actions': [6, 5], 'rewards': [1.0, 1.0]}
 {'states': [array([4]), array([5])

---------------------------- tensor([0.0107], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0179], grad_fn=<PowBackward0>)
---------------------------- tensor([0.1309], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0199], grad_fn=<PowBackward0>)
---------------------------- tensor([2.6393], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0010], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0104], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0695], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0060], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0008], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0004], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0847], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0006], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0008], grad_fn=<PowBackward0>)
--------------------

---------------------------- tensor([0.9353], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0243], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0206], grad_fn=<PowBackward0>)
---------------------------- tensor([0.7540], grad_fn=<PowBackward0>)
---------------------------- tensor([0.7478], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0820], grad_fn=<PowBackward0>)
---------------------------- tensor([2.4700], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0901], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0074], grad_fn=<PowBackward0>)
---------------------------- tensor([0.1280], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0097], grad_fn=<PowBackward0>)
---------------------------- tensor([0.7540], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0105], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0011], grad_fn=<PowBackward0>)
--------------------

---------------------------- tensor([2.4228], grad_fn=<PowBackward0>)
---------------------------- tensor([0.7004], grad_fn=<PowBackward0>)
---------------------------- tensor([2.5730], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0224], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0097], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0011], grad_fn=<PowBackward0>)
---------------------------- tensor([0.7004], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0219], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0011], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0012], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0067], grad_fn=<PowBackward0>)
---------------------------- tensor([0.2020], grad_fn=<PowBackward0>)
---------------------------- tensor([0.1602], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0219], grad_fn=<PowBackward0>)
--------------------

---------------------------- tensor([0.0098], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0070], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0070], grad_fn=<PowBackward0>)
---------------------------- tensor([2.9879], grad_fn=<PowBackward0>)
---------------------------- tensor([0.1764], grad_fn=<PowBackward0>)
---------------------------- tensor([0.1314], grad_fn=<PowBackward0>)
---------------------------- tensor([2.3780], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0063], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0239], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0101], grad_fn=<PowBackward0>)
---------------------------- tensor([0.6516], grad_fn=<PowBackward0>)
---------------------------- tensor([0.8485], grad_fn=<PowBackward0>)
---------------------------- tensor([2.5397], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0011], grad_fn=<PowBackward0>)
--------------------

 {'states': [array([1])], 'actions': [9], 'rewards': [-0.5]}]
---------------------------- tensor([0.1777], grad_fn=<PowBackward0>)
---------------------------- tensor([2.6183], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0107], grad_fn=<PowBackward0>)
---------------------------- tensor([0.8452], grad_fn=<PowBackward0>)
---------------------------- tensor([2.4931], grad_fn=<PowBackward0>)
---------------------------- tensor([0.2788], grad_fn=<PowBackward0>)
---------------------------- tensor([0.1039], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0274], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0295], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0295], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0107], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0274], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0110], grad_fn=<PowBackward0>)
----------------------------

---------------------------- tensor([0.0110], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0301], grad_fn=<PowBackward0>)
after loop
before loop [{'states': [array([0])], 'actions': [6], 'rewards': [-0.5]}]
---------------------------- tensor([0.0069], grad_fn=<PowBackward0>)
after loop
before loop [{'states': [array([1]), array([5])], 'actions': [6, 8], 'rewards': [1.0, -0.5]}
 {'states': [array([1]), array([5])], 'actions': [6, 5], 'rewards': [1.0, 1.0]}
 {'states': [array([2]), array([4]), array([2])], 'actions': [10, 7, 5], 'rewards': [0.0, 1.0, -0.5]}
 {'states': [array([3]), array([5]), array([5])], 'actions': [8, 6, 8], 'rewards': [1.0, 1.0, -0.5]}
 {'states': [array([4])], 'actions': [6], 'rewards': [-0.5]}
 {'states': [array([4])], 'actions': [5], 'rewards': [-0.5]}
 {'states': [array([2])], 'actions': [9], 'rewards': [-0.5]}
 {'states': [array([1])], 'actions': [7], 'rewards': [-0.5]}
 {'states': [array([0])], 'actions': [9], 'rewards': [-0.5]}
 {'states': [

---------------------------- tensor([0.0068], grad_fn=<PowBackward0>)
---------------------------- tensor([0.5405], grad_fn=<PowBackward0>)
---------------------------- tensor([0.1696], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0320], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0068], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0256], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0308], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0122], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0256], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0317], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0257], grad_fn=<PowBackward0>)
after loop
before loop [{'states': [array([3])], 'actions': [7], 'rewards': [-0.5]}]
---------------------------- tensor([0.0319], grad_fn=<PowBackward0>)
after loop
before loop [{'states': [array([3]), array([5]), array([5])], 'a

---------------------------- tensor([0.0333], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0263], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0264], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0067], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0337], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0136], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0263], grad_fn=<PowBackward0>)
---------------------------- tensor([0.2727], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0137], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0067], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0016], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0067], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0263], grad_fn=<PowBackward0>)
---------------------------- tensor([0.4305], grad_fn=<PowBackward0>)
--------------------

---------------------------- tensor([0.0269], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0017], grad_fn=<PowBackward0>)
---------------------------- tensor([2.2118], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0066], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0357], grad_fn=<PowBackward0>)
---------------------------- tensor([0.3706], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0066], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0017], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0151], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0066], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0269], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0269], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0151], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0357], grad_fn=<PowBackward0>)
--------------------

---------------------------- tensor([0.0018], grad_fn=<PowBackward0>)
---------------------------- tensor([2.2882], grad_fn=<PowBackward0>)
---------------------------- tensor([0.5205], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0066], grad_fn=<PowBackward0>)
---------------------------- tensor([0.3209], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0165], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0017], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0166], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0377], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0166], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0539], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0017], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0378], grad_fn=<PowBackward0>)
---------------------------- tensor([0.0378], grad_fn=<PowBackward0>)
--------------------

before loop [{'states': [array([3]), array([5])], 'actions': [8, 9], 'rewards': [1.0, 1.0]}
 {'states': [array([4]), array([5])], 'actions': [9, 7], 'rewards': [1.0, -0.5]}
 {'states': [array([1]), array([5])], 'actions': [6, 8], 'rewards': [1.0, -0.5]}
 {'states': [array([2])], 'actions': [8], 'rewards': [-0.5]}
 {'states': [array([1])], 'actions': [7], 'rewards': [-0.5]}
 {'states': [array([4]), array([5])], 'actions': [9, 9], 'rewards': [1.0, -0.5]}
 {'states': [array([0]), array([5])], 'actions': [5, 6], 'rewards': [1.0, -0.5]}
 {'states': [array([3])], 'actions': [5], 'rewards': [-0.5]}
 {'states': [array([3])], 'actions': [5], 'rewards': [-0.5]}
 {'states': [array([0])], 'actions': [7], 'rewards': [-0.5]}
 {'states': [array([4])], 'actions': [5], 'rewards': [-0.5]}
 {'states': [array([3]), array([5]), array([5])], 'actions': [8, 6, 8], 'rewards': [1.0, 1.0, -0.5]}
 {'states': [array([1]), array([5])], 'actions': [6, 5], 'rewards': [1.0, 1.0]}
 {'states': [array([4])], 'actions': 

KeyboardInterrupt: 

In [245]:
agent.getAction(0)

(0, 1, 3)