In [2]:
import torch
import torch.optim as optim
from torch.distributions import Normal
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from gym.wrappers.monitoring.video_recorder import VideoRecorder
import warnings
from typing import Union
from utils import ReplayBuffer, get_env, run_episode

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)



In [3]:
class NeuralNetwork(nn.Module):
    '''
    Fully connected feed-forward network with a configurable width and depth.

    The layer stack is ``Linear -> activation`` repeated for every hidden layer,
    followed by a final ``Linear`` output layer with no activation. It can be used
    to parametrize both policy and critic networks.

    :param input_dim: size of the last dimension of the input tensor.
    :param output_dim: size of the last dimension of the output tensor.
    :param hidden_size: number of units in every hidden layer.
    :param hidden_layers: number of hidden layers. At least one hidden layer is
        always created, so values below 1 behave like 1.
    :param activation: either an ``nn.Module`` instance (e.g. ``nn.ReLU()``), which is
        reused after every hidden layer, or the case-insensitive name of one of the
        activations listed in ``_ACTIVATIONS`` (e.g. ``'relu'``).
    :raises ValueError: if ``activation`` is a string that is not a known name.
    :raises TypeError: if ``activation`` is neither a string nor an ``nn.Module``.
    '''

    # Activation names accepted when ``activation`` is passed as a string.
    _ACTIVATIONS = {
        'relu': nn.ReLU,
        'tanh': nn.Tanh,
        'sigmoid': nn.Sigmoid,
        'elu': nn.ELU,
        'leaky_relu': nn.LeakyReLU,
        'softplus': nn.Softplus,
    }

    def __init__(self, input_dim: int, output_dim: int, hidden_size: int,
                                hidden_layers: int, activation: Union[str, nn.Module]):
        super(NeuralNetwork, self).__init__()

        # Resolve ``activation`` into a factory so that both supported forms go
        # through the same layer-building code below.
        if isinstance(activation, str):
            key = activation.lower()
            if key not in self._ACTIVATIONS:
                raise ValueError(
                    "Unknown activation '{}'. Expected one of: {}".format(
                        activation, sorted(self._ACTIVATIONS)))
            make_activation = self._ACTIVATIONS[key]
        elif isinstance(activation, nn.Module):
            # A module instance is shared between layers, exactly as passed in.
            make_activation = lambda: activation
        else:
            raise TypeError(
                "activation must be a str or an nn.Module instance, got {}".format(
                    type(activation).__name__))

        # First hidden layer maps the input to the hidden width.
        layers = [nn.Linear(input_dim, hidden_size), make_activation()]

        # Remaining hidden layers keep the hidden width.
        for _ in range(hidden_layers - 1):
            layers.append(nn.Linear(hidden_size, hidden_size))
            layers.append(make_activation())

        # Output layer: no activation, callers apply their own output transform.
        layers.append(nn.Linear(hidden_size, output_dim))

        self.net = nn.Sequential(*layers)

    def forward(self, s: torch.Tensor) -> torch.Tensor:
        '''
        Run the input through the layer stack.

        :param s: input tensor whose last dimension equals ``input_dim``.
        :return: tensor whose last dimension equals ``output_dim``.
        '''
        return self.net(s)

In [4]:
# Smoke test for NeuralNetwork: build a small network and display its layer stack.
# NOTE: the globals defined here (hidden_size, hidden_layers, activation) are reused
# by later scratch cells, so they must stay defined at module level.
input_dim = 3
output_dim = 2
hidden_size = 64
hidden_layers = 2
activation = nn.ReLU()  # passed as a module instance, not as a string

net = NeuralNetwork(input_dim, output_dim, hidden_size, hidden_layers, activation)
net

NeuralNetwork(
  (net): Sequential(
    (0): Linear(in_features=3, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=2, bias=True)
  )
)

In [5]:


# Scratch cell: builds the same layer stack as NeuralNetwork.__init__ by hand,
# using the globals from the previous cell. It rebinds the global `net` to a plain
# nn.Sequential.
layers = []
layers.append(nn.Linear(input_dim, hidden_size))
layers.append(activation)

# One (Linear, activation) pair per additional hidden layer.
for i in range(hidden_layers-1):
    layers.append(nn.Linear(hidden_size, hidden_size))
    layers.append(activation)

# Output layer without activation.
layers.append(nn.Linear(hidden_size, output_dim))

net = nn.Sequential(*layers)
net

Sequential(
  (0): Linear(in_features=3, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=2, bias=True)
)

In [63]:
class Actor:
    '''
    Stochastic policy modelled as a tanh-squashed diagonal Gaussian.

    The policy network maps a state to ``2 * action_dim`` values: the first half is
    the mean and the second half the log standard deviation of a Gaussian over the
    pre-squash action ``u``. The returned action is ``tanh(u)``, so it lies in [-1, 1].

    :param hidden_size: number of units per hidden layer of the policy network.
    :param hidden_layers: number of hidden layers of the policy network.
    :param actor_lr: learning rate of the Adam optimizer for the policy network.
    :param state_dim: dimensionality of a state.
    :param action_dim: dimensionality of an action.
    :param device: device the policy network is placed on.
    '''
    def __init__(self,hidden_size: int, hidden_layers: int, actor_lr: float,
                state_dim: int = 3, action_dim: int = 1, device: torch.device = torch.device('cpu')):
        super(Actor, self).__init__()

        self.hidden_size = hidden_size
        self.hidden_layers = hidden_layers
        self.actor_lr = actor_lr
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.device = device
        # Bounds applied to the predicted log-std to keep exp() numerically sane.
        self.LOG_STD_MIN = -20
        self.LOG_STD_MAX = 2
        self.setup_actor()

    def setup_actor(self):
        '''
        Create the policy network ``self.p`` and its optimizer ``self.p_optimizer``.
        '''
        activation = nn.ReLU()

        # One mean and one log-std per action dimension.
        self.p = NeuralNetwork(self.state_dim, 2 * self.action_dim, self.hidden_size,
                               self.hidden_layers, activation).to(self.device)
        self.p_optimizer = optim.Adam(self.p.parameters(), lr=self.actor_lr)

    def clamp_log_std(self, log_std: torch.Tensor) -> torch.Tensor:
        '''
        :param log_std: torch.Tensor, log_std of the policy.
        Returns:
        :param log_std: torch.Tensor, log_std of the policy clamped between LOG_STD_MIN and LOG_STD_MAX.
        '''
        return torch.clamp(log_std, self.LOG_STD_MIN, self.LOG_STD_MAX)

    def get_action_and_log_prob(self, state: torch.Tensor,
                                deterministic: bool) -> (torch.Tensor, torch.Tensor):
        '''
        Compute an action and its log-probability under the current policy.

        :param state: torch.Tensor, either a batch of states with shape
                      (n, state_dim) or a single state with shape (state_dim,).
        :param deterministic: boolean, if true return the squashed mean action,
                                otherwise sample from the policy distribution
                                (reparameterized, so gradients flow to the policy).
        Returns:
        :param action: torch.Tensor in [-1, 1], shape (n, action_dim) for a batch
                       or (action_dim,) for a single state.
        :param log_prob: torch.Tensor, log-probability of the squashed action summed
                         over action dimensions; shape (n, 1) for a batch or (1,)
                         for a single state.
        '''
        # Work on a batch internally; remember whether to strip the batch dim again.
        single_state = state.dim() == 1
        if single_state:
            state = state.unsqueeze(0)
        assert state.dim() == 2 and state.shape[1] == self.state_dim, \
            'State passed to this method has a wrong shape'
        state = state.to(self.device)

        out = self.p(state)
        mu = out[:, :self.action_dim]                               # (n, action_dim)
        log_std = self.clamp_log_std(out[:, self.action_dim:])     # (n, action_dim)
        ndists = Normal(mu, torch.exp(log_std))

        # rsample() keeps the sample differentiable w.r.t. the policy parameters;
        # in deterministic mode no sample is drawn at all.
        u = mu if deterministic else ndists.rsample()
        action = torch.tanh(u)  # squash into [-1, +1]

        # Change of variables for the tanh squashing; 1e-6 guards against log(0)
        # when the action saturates at the bounds.
        log_prob = ndists.log_prob(u) - torch.log(1 - action ** 2 + 1e-6)
        # Joint log-probability over (independent) action dimensions.
        log_prob = log_prob.sum(dim=-1, keepdim=True)

        if single_state:
            action = action.squeeze(0)
            log_prob = log_prob.squeeze(0)

        lead = () if single_state else (state.shape[0],)
        assert tuple(action.shape) == lead + (self.action_dim,) and \
            tuple(log_prob.shape) == lead + (1,), 'Incorrect shape for action or log_prob.'

        return action, log_prob

In [7]:
prev_state = state

NameError: name 'state' is not defined

In [8]:
state = state[0].unsqueeze(0)

NameError: name 'state' is not defined

In [9]:


assert tuple(state.shape) == (3,) or state.shape[1] == state_dim, 'State passed to this method has a wrong shape'
action , log_prob = torch.zeros(state.shape[0]), torch.ones(state.shape[0])
# TODO: Implement this function which returns an action and its log probability.
# If working with stochastic policies, make sure that its log_std are clamped 
# using the clamp_log_std function.

out = policy(state)
mu, std = out[:,0], out[:,1] # (n,), (n,)
# unsqueeze to obtain the same dimensionality
mu = mu.unsqueeze(1)
std = std.unsqueeze(1)

# std = self.clamp_log_std(std) # clamp log_std => numerical

std = torch.exp(std)
ndists = Normal(mu, std)
u = ndists.rsample() # (n,) : rsample => backprop, sample => no backprop
action = torch.tanh(u) # squash : [-1,+1]

if deterministic:
    action = mu
# appendix c: enforcing action bounds: jacobian
log_prob = ndists.log_prob(action) - torch.log(1-torch.tanh(u)**2) # torch.sum dim=1, broadcasting issue if action space>1

assert ((tuple(action.shape) == (action_dim,)) and \
    tuple(log_prob.shape) == (action_dim, )) or (tuple(action.shape) == (state.shape[0], 1) and \
                                                tuple(log_prob.shape) == (state.shape[0],1)), 'Incorrect shape for action or log_prob.'

NameError: name 'state' is not defined

In [10]:
log_prob.shape

NameError: name 'log_prob' is not defined

In [11]:
action.shape

NameError: name 'action' is not defined

In [12]:
batch = agent.memory.sample(agent.batch_size)
s_batch, a_batch, r_batch, s_prime_batch = batch

NameError: name 'agent' is not defined

In [13]:
# Scratch setup used by the debugging cells below, which mirror
# Actor.get_action_and_log_prob step by step.
state_dim = 3
action_dim = 1
deterministic = False
# Those cells read out[:, 0] as the mean and out[:, 1] as the log-std, so the
# policy needs two outputs per action dimension (a single output would make
# out[:, 1] raise an IndexError).
policy = NeuralNetwork(state_dim, 2 * action_dim, hidden_size, hidden_layers, activation)

In [14]:
s_batch.shape

NameError: name 's_batch' is not defined

In [15]:
out = policy(s_batch)
mu, std = out[:,0], out[:,1] # (n,), (n,)
mu = mu.unsqueeze(1)
std = std.unsqueeze(1)

std = torch.exp(std)
ndists = Normal(mu, std)
u = ndists.rsample() # (n,)
action = torch.tanh(u)

if deterministic:
    action = mu

log_prob = ndists.log_prob(action) - torch.sum(torch.log(1-torch.tanh(u)**2))


NameError: name 's_batch' is not defined

In [16]:
torch.sum(torch.log(1-torch.tanh(u)**2),dim=1).shape

NameError: name 'u' is not defined

In [17]:
torch.log(1-torch.tanh(u)**2).shape

NameError: name 'u' is not defined

In [18]:
log_prob = ndists.log_prob(action) - torch.sum(torch.log(1-torch.tanh(u)**2))
log_prob.shape

NameError: name 'ndists' is not defined

In [19]:
mu.shape

NameError: name 'mu' is not defined

In [20]:
ndists.log_prob(action).shape

NameError: name 'ndists' is not defined

In [21]:
torch.log(1-torch.tanh(u)**2).shape

NameError: name 'u' is not defined

In [22]:
torch.sum(torch.log(1-torch.tanh(u)**2), dim=1).shape

NameError: name 'u' is not defined

In [23]:
u.shape

NameError: name 'u' is not defined

In [24]:
state = s_batch

NameError: name 's_batch' is not defined

In [25]:
assert state.shape == (3,) or state.shape[1] == state_dim, 'State passed to this method has a wrong shape'
action , log_prob = torch.zeros(state.shape[0]), torch.ones(state.shape[0])

NameError: name 'state' is not defined

In [26]:
state.shape[1] == (3,)

NameError: name 'state' is not defined

In [27]:
state.shape[1] == state_dim

NameError: name 'state' is not defined

In [28]:
state_dim

3

In [29]:
state.shape

NameError: name 'state' is not defined

In [30]:
(ndists.log_prob(action) - torch.sum(torch.log(1-torch.tanh(u)**2), dim=1)).shape

NameError: name 'ndists' is not defined

In [31]:
ndists.log_prob(action).shape

NameError: name 'ndists' is not defined

In [32]:
torch.log(1-torch.tanh(u)**2).shape

NameError: name 'u' is not defined

In [33]:


assert tuple(state.shape) == (3,) or state.shape[1] == state_dim, 'State passed to this method has a wrong shape'
action , log_prob = torch.zeros(state.shape[0]), torch.ones(state.shape[0])
# TODO: Implement this function which returns an action and its log probability.
# If working with stochastic policies, make sure that its log_std are clamped 
# using the clamp_log_std function.

out = policy(state)
mu, std = out[:,0], out[:,1] # (n,), (n,)
# unsqueeze to obtain the same dimensionality
mu = mu.unsqueeze(1)
std = std.unsqueeze(1)

# std = self.clamp_log_std(std) # clamp log_std => numerical

std = torch.exp(std)
ndists = Normal(mu, std)
u = ndists.rsample() # (n,) : rsample => backprop, sample => no backprop
action = torch.tanh(u) # squash : [-1,+1]

if deterministic:
    action = mu
# appendix c: enforcing action bounds: jacobian
log_prob = ndists.log_prob(action) - torch.log(1-torch.tanh(u)**2) # torch.sum dim=1, broadcasting issue if action space>1

assert ((tuple(action.shape) == (action_dim,)) and \
    tuple(log_prob.shape) == (action_dim, )) or (tuple(action.shape) == (state.shape[0], 1) and \
                                                tuple(log_prob.shape) == (state.shape[0],1)), 'Incorrect shape for action or log_prob.'

NameError: name 'state' is not defined

In [34]:
assert ((tuple(action.shape) == (action_dim,)) and \
    tuple(log_prob.shape) == (action_dim, )) or (tuple(action.shape) == (state.shape[0], 1) and \
                                                tuple(log_prob.shape) == (state.shape[0],1)), 'Incorrect shape for action or log_prob.'

NameError: name 'action' is not defined

In [35]:
tuple(log_prob.shape)

NameError: name 'log_prob' is not defined

In [36]:
(state.shape[0],1)

NameError: name 'state' is not defined

In [37]:
((action.shape == (action_dim,)) and \
    log_prob.shape == (action_dim, ))

NameError: name 'action' is not defined

In [38]:
(action.shape == (state.shape[0], 1) and \
                                                log_prob.shape == (state.shape[0],1))

NameError: name 'action' is not defined

In [39]:
action.shape

NameError: name 'action' is not defined

In [40]:
tuple((state.shape[0], 1))

NameError: name 'state' is not defined

In [41]:
(state.shape[0], 1)

NameError: name 'state' is not defined

In [42]:
state.shape

NameError: name 'state' is not defined

In [60]:
class Critic:
    '''
    Twin Q-function critics with their target copies.

    Holds two online Q-networks (``q1``, ``q2``) that map a concatenated
    (state, action) vector to a scalar value, one Adam optimizer per online network,
    and two frozen target networks (``q1_trg``, ``q2_trg``) initialised with the
    online weights.

    :param hidden_size: number of units per hidden layer of every Q-network.
    :param hidden_layers: number of hidden layers of every Q-network.
    :param critic_lr: learning rate of the Adam optimizers.
    :param state_dim: dimensionality of a state.
    :param action_dim: dimensionality of an action.
    :param device: device all Q-networks are placed on.
    '''
    def __init__(self, hidden_size: int,
                 hidden_layers: int, critic_lr: float, state_dim: int = 3,
                    action_dim: int = 1,device: torch.device = torch.device('cpu')):
        super(Critic, self).__init__()
        self.hidden_size = hidden_size
        self.hidden_layers = hidden_layers
        self.critic_lr = critic_lr
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.device = device
        self.setup_critic()

    def setup_critic(self):
        '''
        Create the online critics, their frozen target copies and the optimizers.
        '''
        activation = nn.ReLU()
        input_dim = self.state_dim + self.action_dim

        def make_q():
            # Q(s, a): concatenated (s, a) -> R^1, placed on the configured device
            # so it can consume batches that live on that device.
            return NeuralNetwork(input_dim, 1, self.hidden_size,
                                 self.hidden_layers, activation).to(self.device)

        # Online critics.
        self.q1 = make_q()
        self.q2 = make_q()

        # Target critics start as exact copies of the online critics.
        self.q1_trg = make_q()
        self.q2_trg = make_q()
        self.q1_trg.load_state_dict(self.q1.state_dict())
        self.q2_trg.load_state_dict(self.q2.state_dict())

        # Targets are never trained by gradient descent: put them in eval mode
        # and exclude their parameters from autograd.
        for target in (self.q1_trg, self.q2_trg):
            target.eval()
            for param in target.parameters():
                param.requires_grad = False

        # One optimizer per online critic.
        self.q1_optimizer = optim.Adam(self.q1.parameters(), lr=self.critic_lr)
        self.q2_optimizer = optim.Adam(self.q2.parameters(), lr=self.critic_lr)

In [44]:
class TrainableParameter:
    '''
    A single scalar parameter that is stored and optimized in log-space.

    Keeping the logarithm as the underlying tensor means the exposed value,
    ``exp(log_param)``, is always positive. This can be useful for quantities such
    as the entropy temperature parameter of the SAC algorithm.

    :param init_param: initial value of the parameter (its log is stored).
    :param lr_param: learning rate of the Adam optimizer attached to the parameter.
    :param train_param: whether the stored tensor requires gradients.
    :param device: device the stored tensor is created on.
    '''
    def __init__(self, init_param: float, lr_param: float,
                 train_param: bool, device: torch.device = torch.device('cpu')):

        initial_log_value = np.log(init_param)
        self.log_param = torch.tensor(
            initial_log_value,
            device=device,
            requires_grad=train_param,
        )
        self.optimizer = optim.Adam(params=[self.log_param], lr=lr_param)

    def get_param(self) -> torch.Tensor:
        '''Return the parameter value, i.e. the exponential of the stored log.'''
        return self.log_param.exp()

    def get_log_param(self) -> torch.Tensor:
        '''Return the stored log-space tensor itself.'''
        return self.log_param

In [78]:
a_batch.shape
s_batch.shape

torch.Size([200, 3])

In [None]:
# Scratch cell: module-level replica of the setup done in Agent.__init__ /
# Agent.setup_agent, so that individual training steps can be tried out interactively.
state_dim = 3  # [cos(theta), sin(theta), theta_dot]
action_dim = 1  # [torque] in[-1,1]
batch_size = 200
min_buffer_size = 1000
max_buffer_size = 100000
# If your PC possesses a GPU, you should be able to use it for training, 
# as self.device should be 'cuda' in that case.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device: {}".format(device))
# ReplayBuffer comes from utils; NOTE(review): presumably it stores transitions on
# `device` -- confirm against utils.py.
memory = ReplayBuffer(min_buffer_size, max_buffer_size, device)

# These rebind the globals hidden_size / hidden_layers set by earlier cells.
hidden_size = 64
hidden_layers = 3
gamma = 0.99  # discount factor

alpha = 1 # learnt temperature

critic_lr = 1e-4
critic = Critic(hidden_size, hidden_layers, critic_lr, state_dim, action_dim, device)

actor_lr = 1e-4
actor = Actor(hidden_size, hidden_layers, actor_lr, state_dim, action_dim, device)

In [None]:
state.unsqueeze

In [137]:
class Agent:
    '''
    Off-policy actor-critic agent (SAC style) for the pendulum task.

    Combines a stochastic policy (``Actor``), twin Q-critics with target networks
    (``Critic``) and a replay buffer. ``train_agent`` performs one update of the
    critics, the policy and the target networks from a sampled batch.
    '''
    def __init__(self):
        # Environment variables. You don't need to change this.
        self.state_dim = 3  # [cos(theta), sin(theta), theta_dot]
        self.action_dim = 1  # [torque] in[-1,1]
        self.batch_size = 200
        self.min_buffer_size = 1000
        self.max_buffer_size = 100000
        # If your PC possesses a GPU, you should be able to use it for training,
        # as self.device should be 'cuda' in that case.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print("Using device: {}".format(self.device))
        self.memory = ReplayBuffer(self.min_buffer_size, self.max_buffer_size, self.device)

        self.setup_agent()

    def setup_agent(self):
        '''
        Instantiate the critics, the policy and the hyper-parameters used in training.
        '''
        self.tau = 0.005    # soft target-update rate
        self.gamma = 0.99   # discount factor
        # Fixed entropy temperature used by train_agent.
        # TODO: make learnable via self.temp (its optimizer is currently never stepped).
        self.alpha = 0.2

        hidden_size = 256
        hidden_layers = 2

        critic_lr = 3e-4
        self.critic = Critic(hidden_size, hidden_layers, critic_lr, self.state_dim, self.action_dim, self.device)

        actor_lr = 3e-4
        self.actor = Actor(hidden_size, hidden_layers, actor_lr, self.state_dim, self.action_dim, self.device)

        temp_lr = 3e-4
        init_param = 1
        self.temp = TrainableParameter(init_param, temp_lr, True, self.device)

    def get_action(self, s: np.ndarray, train: bool) -> np.ndarray:
        """
        :param s: np.ndarray, state of the pendulum. shape (3, )
        :param train: boolean to indicate if you are in eval or train mode.
                    In train mode the action is sampled from the policy; in eval
                    mode the deterministic (mean) action is returned.
        :return: np.ndarray,, action to apply on the environment, shape (1,)
        """
        # Pin the dtype so the policy network always receives float32, whatever
        # float precision the environment hands out; add a batch dimension.
        state = torch.as_tensor(s, dtype=torch.float32).unsqueeze(0)
        action, _ = self.actor.get_action_and_log_prob(state, not train)
        # Drop the batch dimension, cut the autograd graph (the simulator must not
        # receive gradients) and move to host memory before converting to NumPy.
        action = action.squeeze(0).detach().cpu().numpy()

        assert action.shape == (1,), 'Incorrect action shape.'
        assert isinstance(action, np.ndarray ), 'Action dtype must be np.ndarray'
        return action

    @staticmethod
    def run_gradient_update_step(optimizer, loss: torch.Tensor):
        '''
        Run one gradient step: clear old gradients, back-propagate the mean of
        ``loss`` and let ``optimizer`` update its parameters.
        :param optimizer: optimizer holding the parameters to update
        :param loss: torch.Tensor, loss (scalar or per-sample) to minimize
        '''
        optimizer.zero_grad()
        loss.mean().backward()
        optimizer.step()

    def critic_target_update(self, base_net: nn.Module, target_net: nn.Module,
                             tau: float, soft_update: bool):
        '''
        This method updates the target network parameters using the source network parameters.
        If soft_update is True, then perform a soft update, otherwise a hard update (copy).
        :param base_net: source network
        :param target_net: target network
        :param tau: soft update parameter
        :param soft_update: boolean to indicate whether to perform a soft update or not
        '''
        # Parameter surgery must not be recorded by autograd.
        with torch.no_grad():
            for param_target, param in zip(target_net.parameters(), base_net.parameters()):
                if soft_update:
                    # Polyak averaging: target <- (1 - tau) * target + tau * source
                    param_target.copy_(param_target * (1.0 - tau) + param * tau)
                else:
                    param_target.copy_(param)

    def train_agent(self):
        '''
        This function represents one training iteration for the agent. It samples a batch
        from the replay buffer,and then updates the policy and critic networks
        using the sampled batch.
        '''
        # Batch sampling
        s_batch, a_batch, r_batch, s_prime_batch = self.memory.sample(self.batch_size)

        alpha = self.alpha

        # Soft Bellman target for the Q-functions; no gradients flow through it.
        with torch.no_grad():
            act_prime, logp_prime = self.actor.get_action_and_log_prob(s_prime_batch, False)

            sact_prime_batch = torch.cat([s_prime_batch, act_prime], dim=1)
            # Element-wise minimum of the two target critics (clipped double-Q).
            q_prime = torch.min(self.critic.q1_trg(sact_prime_batch),
                                self.critic.q2_trg(sact_prime_batch))
            y_label = r_batch + self.gamma * (q_prime - alpha * logp_prime)

        # Critic update: regress both online critics onto the shared target.
        sact_batch = torch.cat([s_batch, a_batch], dim=1)
        loss_q1 = F.mse_loss(self.critic.q1(sact_batch), y_label)
        loss_q2 = F.mse_loss(self.critic.q2(sact_batch), y_label)

        self.run_gradient_update_step(self.critic.q1_optimizer, loss_q1)
        self.run_gradient_update_step(self.critic.q2_optimizer, loss_q2)

        # Policy update: fresh reparameterized actions so the gradient reaches the policy.
        act_hat, logp_hat = self.actor.get_action_and_log_prob(s_batch, False)

        sacthat_batch = torch.cat([s_batch, act_hat], dim=1)
        q_hat = torch.min(self.critic.q1(sacthat_batch), self.critic.q2(sacthat_batch))
        loss_p = (alpha * logp_hat - q_hat).mean()
        # Only the policy optimizer steps here; gradients this leaves on the critics
        # are cleared by zero_grad() at the start of their next update.
        self.run_gradient_update_step(self.actor.p_optimizer, loss_p)

        # TODO: temperature update, e.g. minimise
        #   -self.temp.get_param() * (logp_hat - self.action_dim).detach()
        # with self.temp.optimizer, then read alpha from self.temp instead of self.alpha.

        # Soft update of the target critics towards the online critics.
        self.critic_target_update(self.critic.q1, self.critic.q1_trg, self.tau, True)
        self.critic_target_update(self.critic.q2, self.critic.q2_trg, self.tau, True)

        

In [117]:
agent.temp.get_param()

tensor(31.0197, dtype=torch.float64, grad_fn=<ExpBackward0>)

In [119]:
# This main function is provided here to enable some basic testing. 
# ANY changes here WON'T take any effect while grading.
# if __name__ == '__main__':

TRAIN_EPISODES = 50
TEST_EPISODES = 300

# You may set the save_video param to output the video of one of the evalution episodes, or
# you can disable console printing during training and testing by setting verbose to False.
save_video = True
verbose = True

agent = Agent()

# --- Training ---
env = get_env(g=10.0, train=True)

for EP in range(TRAIN_EPISODES):
    run_episode(env, agent, None, verbose, train=True)

if verbose:
    print('\n')

# --- Evaluation ---
test_returns = []
env = get_env(g=10.0, train=False)

video_rec = VideoRecorder(env, "pendulum_episode.mp4") if save_video else None

try:
    for EP in range(TEST_EPISODES):
        # Only the last evaluation episode is recorded.
        rec = video_rec if EP == TEST_EPISODES - 1 else None
        with torch.no_grad():
            episode_return = run_episode(env, agent, rec, verbose, train=False)
        test_returns.append(episode_return)
finally:
    # Always release the recorder, even if an evaluation episode raises.
    if video_rec is not None:
        video_rec.close()

avg_test_return = np.mean(np.array(test_returns))

print("\n AVG_TEST_RETURN:{:.1f} \n".format(avg_test_return))


Using device: cpu
MODE: TRAIN, RETURN: -1441.0
MODE: TRAIN, RETURN: -1319.3
MODE: TRAIN, RETURN: -1473.2
MODE: TRAIN, RETURN: -1636.6
MODE: TRAIN, RETURN: -1645.3
MODE: TRAIN, RETURN: -1801.6
MODE: TRAIN, RETURN: -1703.8
MODE: TRAIN, RETURN: -1700.8
MODE: TRAIN, RETURN: -1801.9
MODE: TRAIN, RETURN: -1703.4
MODE: TRAIN, RETURN: -1624.5
MODE: TRAIN, RETURN: -1485.9
MODE: TRAIN, RETURN: -1483.4
MODE: TRAIN, RETURN: -1341.2
MODE: TRAIN, RETURN: -1272.9
MODE: TRAIN, RETURN: -1275.7
MODE: TRAIN, RETURN: -1200.6
MODE: TRAIN, RETURN: -1119.4
MODE: TRAIN, RETURN: -1089.5
MODE: TRAIN, RETURN: -1016.1
MODE: TRAIN, RETURN: -979.2
MODE: TRAIN, RETURN: -910.7
MODE: TRAIN, RETURN: -884.1
MODE: TRAIN, RETURN: -979.2
MODE: TRAIN, RETURN: -637.1
MODE: TRAIN, RETURN: -881.2
MODE: TRAIN, RETURN: -509.8
MODE: TRAIN, RETURN: -399.9
MODE: TRAIN, RETURN: -354.9
MODE: TRAIN, RETURN: -462.9
MODE: TRAIN, RETURN: -509.9
MODE: TRAIN, RETURN: -463.2
MODE: TRAIN, RETURN: -375.2
MODE: TRAIN, RETURN: -368.2
MODE: TRAI

                                                               

Moviepy - Done !
Moviepy - video ready pendulum_episode.mp4


In [122]:
agent.temp.get_param()

tensor(0.1601, dtype=torch.float64, grad_fn=<ExpBackward0>)

In [121]:
episode_return

-355.60176212866827

In [126]:
import os

dirr = 'exp/'

if not os.path.exists(dirr):
    os.makedirs(dirr)

In [None]:
%ls

In [138]:
import os

# Gravity sweep: for each of N repetitions, train a fresh agent per gravity value
# and record its average evaluation return. res_list[i][j] is the average test
# return of repetition i for grav_list[j].
N = 5
res_list = []
dirr = 'exp/'
grav_list = [15.0, 10.0, 1.0, 5.0]

TRAIN_EPISODES = 50
TEST_EPISODES = 300

# You may set the save_video param to output the video of one of the evalution episodes, or
# you can disable console printing during training and testing by setting verbose to False.
save_video = True
verbose = True

if save_video:
    # Create the output directory once; exist_ok avoids the check-then-create race.
    os.makedirs(dirr, exist_ok=True)

for i in range(N):
    acc_gravlist = []
    for grav in grav_list:
        # Fresh agent for every (repetition, gravity) pair.
        agent = Agent()

        # --- Training ---
        env = get_env(g=grav, train=True)

        for EP in range(TRAIN_EPISODES):
            run_episode(env, agent, None, verbose, train=True)

        if verbose:
            print('\n')

        # --- Evaluation ---
        test_returns = []
        env = get_env(g=grav, train=False)

        video_rec = None
        if save_video:
            video_rec = VideoRecorder(env, dirr + "pendulum_episode_" + str(grav) + "_" + str(i) + ".mp4")

        try:
            for EP in range(TEST_EPISODES):
                # Only the last evaluation episode is recorded.
                rec = video_rec if EP == TEST_EPISODES - 1 else None
                with torch.no_grad():
                    episode_return = run_episode(env, agent, rec, verbose, train=False)
                test_returns.append(episode_return)
        finally:
            # Always release the recorder, even if an evaluation episode raises.
            if video_rec is not None:
                video_rec.close()

        avg_test_return = np.mean(np.array(test_returns))

        print("\n AVG_TEST_RETURN:{:.1f} \n".format(avg_test_return))

        # save gravity experiment results
        acc_gravlist.append(avg_test_return)

    # append gravity results => different experiment iterations
    res_list.append(acc_gravlist)


Using device: cpu
MODE: TRAIN, RETURN: -1702.3
MODE: TRAIN, RETURN: -1762.6
MODE: TRAIN, RETURN: -1736.3
MODE: TRAIN, RETURN: -1726.6
MODE: TRAIN, RETURN: -1815.0
MODE: TRAIN, RETURN: -1774.7
MODE: TRAIN, RETURN: -1858.2
MODE: TRAIN, RETURN: -1811.3
MODE: TRAIN, RETURN: -1692.1
MODE: TRAIN, RETURN: -1674.6
MODE: TRAIN, RETURN: -1487.9
MODE: TRAIN, RETURN: -1438.2
MODE: TRAIN, RETURN: -1319.3
MODE: TRAIN, RETURN: -1241.6
MODE: TRAIN, RETURN: -1103.5
MODE: TRAIN, RETURN: -1166.5
MODE: TRAIN, RETURN: -1137.9
MODE: TRAIN, RETURN: -1147.4
MODE: TRAIN, RETURN: -1251.2
MODE: TRAIN, RETURN: -1140.7
MODE: TRAIN, RETURN: -1238.8
MODE: TRAIN, RETURN: -1120.6
MODE: TRAIN, RETURN: -1215.6
MODE: TRAIN, RETURN: -1003.6
MODE: TRAIN, RETURN: -1134.1
MODE: TRAIN, RETURN: -1146.2
MODE: TRAIN, RETURN: -1075.3
MODE: TRAIN, RETURN: -964.8
MODE: TRAIN, RETURN: -821.4
MODE: TRAIN, RETURN: -1041.2
MODE: TRAIN, RETURN: -932.2
MODE: TRAIN, RETURN: -931.8
MODE: TRAIN, RETURN: -924.8
MODE: TRAIN, RETURN: -563.3
MO

                                                               

Moviepy - Done !
Moviepy - video ready exp/pendulum_episode_15.0_0.mp4
Using device: cpu
MODE: TRAIN, RETURN: -1500.5
MODE: TRAIN, RETURN: -1731.2
MODE: TRAIN, RETURN: -1809.2
MODE: TRAIN, RETURN: -1506.4
MODE: TRAIN, RETURN: -1734.7
MODE: TRAIN, RETURN: -1548.7
MODE: TRAIN, RETURN: -1759.7
MODE: TRAIN, RETURN: -1797.1
MODE: TRAIN, RETURN: -1706.0
MODE: TRAIN, RETURN: -1496.5
MODE: TRAIN, RETURN: -1455.9
MODE: TRAIN, RETURN: -1304.6
MODE: TRAIN, RETURN: -1202.3
MODE: TRAIN, RETURN: -1305.8
MODE: TRAIN, RETURN: -1225.8
MODE: TRAIN, RETURN: -1300.8
MODE: TRAIN, RETURN: -899.4
MODE: TRAIN, RETURN: -881.0
MODE: TRAIN, RETURN: -881.4
MODE: TRAIN, RETURN: -984.0
MODE: TRAIN, RETURN: -611.7
MODE: TRAIN, RETURN: -984.3
MODE: TRAIN, RETURN: -976.0
MODE: TRAIN, RETURN: -756.9
MODE: TRAIN, RETURN: -606.7
MODE: TRAIN, RETURN: -471.5
MODE: TRAIN, RETURN: -584.6
MODE: TRAIN, RETURN: -485.4
MODE: TRAIN, RETURN: -478.9
MODE: TRAIN, RETURN: -377.7
MODE: TRAIN, RETURN: -499.2
MODE: TRAIN, RETURN: -499.3

                                                               

Moviepy - Done !
Moviepy - video ready exp/pendulum_episode_10.0_0.mp4
Using device: cpu
MODE: TRAIN, RETURN: -1069.5
MODE: TRAIN, RETURN: -1424.1
MODE: TRAIN, RETURN: -1347.9
MODE: TRAIN, RETURN: -1370.3
MODE: TRAIN, RETURN: -959.0
MODE: TRAIN, RETURN: -1219.8
MODE: TRAIN, RETURN: -1636.8
MODE: TRAIN, RETURN: -652.5
MODE: TRAIN, RETURN: -900.9
MODE: TRAIN, RETURN: -666.0
MODE: TRAIN, RETURN: -756.1
MODE: TRAIN, RETURN: -377.0
MODE: TRAIN, RETURN: -388.7
MODE: TRAIN, RETURN: -240.1
MODE: TRAIN, RETURN: -162.5
MODE: TRAIN, RETURN: -155.0
MODE: TRAIN, RETURN: -138.8
MODE: TRAIN, RETURN: -147.8
MODE: TRAIN, RETURN: -137.1
MODE: TRAIN, RETURN: -141.0
MODE: TRAIN, RETURN: -132.3
MODE: TRAIN, RETURN: -136.6
MODE: TRAIN, RETURN: -133.2
MODE: TRAIN, RETURN: -139.1
MODE: TRAIN, RETURN: -132.5
MODE: TRAIN, RETURN: -135.2
MODE: TRAIN, RETURN: -137.7
MODE: TRAIN, RETURN: -135.1
MODE: TRAIN, RETURN: -137.0
MODE: TRAIN, RETURN: -130.7
MODE: TRAIN, RETURN: -126.3
MODE: TRAIN, RETURN: -137.9
MODE: TRA

                                                               

Moviepy - Done !
Moviepy - video ready exp/pendulum_episode_1.0_0.mp4
Using device: cpu
MODE: TRAIN, RETURN: -1657.4
MODE: TRAIN, RETURN: -1726.9
MODE: TRAIN, RETURN: -1695.0
MODE: TRAIN, RETURN: -1770.0
MODE: TRAIN, RETURN: -1444.1
MODE: TRAIN, RETURN: -1028.3
MODE: TRAIN, RETURN: -1501.5
MODE: TRAIN, RETURN: -1699.6
MODE: TRAIN, RETURN: -1511.8
MODE: TRAIN, RETURN: -1408.0
MODE: TRAIN, RETURN: -1176.7
MODE: TRAIN, RETURN: -1252.7
MODE: TRAIN, RETURN: -1209.6
MODE: TRAIN, RETURN: -1023.6
MODE: TRAIN, RETURN: -1137.5
MODE: TRAIN, RETURN: -821.8
MODE: TRAIN, RETURN: -1094.2
MODE: TRAIN, RETURN: -899.5
MODE: TRAIN, RETURN: -944.8
MODE: TRAIN, RETURN: -497.2
MODE: TRAIN, RETURN: -333.1
MODE: TRAIN, RETURN: -337.7
MODE: TRAIN, RETURN: -317.8
MODE: TRAIN, RETURN: -330.7
MODE: TRAIN, RETURN: -969.5
MODE: TRAIN, RETURN: -1489.4
MODE: TRAIN, RETURN: -1461.7
MODE: TRAIN, RETURN: -1434.3
MODE: TRAIN, RETURN: -175.6
MODE: TRAIN, RETURN: -168.9
MODE: TRAIN, RETURN: -160.4
MODE: TRAIN, RETURN: -165

                                                               

Moviepy - Done !
Moviepy - video ready exp/pendulum_episode_5.0_0.mp4
Using device: cpu
MODE: TRAIN, RETURN: -1806.9
MODE: TRAIN, RETURN: -1708.6
MODE: TRAIN, RETURN: -1570.7
MODE: TRAIN, RETURN: -1680.1
MODE: TRAIN, RETURN: -1692.9
MODE: TRAIN, RETURN: -1784.3
MODE: TRAIN, RETURN: -1823.9
MODE: TRAIN, RETURN: -1844.8
MODE: TRAIN, RETURN: -1669.0
MODE: TRAIN, RETURN: -1575.1
MODE: TRAIN, RETURN: -1522.0
MODE: TRAIN, RETURN: -1392.5
MODE: TRAIN, RETURN: -1265.2
MODE: TRAIN, RETURN: -1278.0
MODE: TRAIN, RETURN: -1199.2
MODE: TRAIN, RETURN: -1161.1
MODE: TRAIN, RETURN: -1063.3
MODE: TRAIN, RETURN: -1115.3
MODE: TRAIN, RETURN: -1032.0
MODE: TRAIN, RETURN: -1079.9
MODE: TRAIN, RETURN: -1134.6
MODE: TRAIN, RETURN: -1037.2
MODE: TRAIN, RETURN: -1004.9
MODE: TRAIN, RETURN: -1042.7
MODE: TRAIN, RETURN: -1116.9
MODE: TRAIN, RETURN: -1174.5
MODE: TRAIN, RETURN: -1019.8
MODE: TRAIN, RETURN: -936.3
MODE: TRAIN, RETURN: -1041.9
MODE: TRAIN, RETURN: -795.3
MODE: TRAIN, RETURN: -558.6
MODE: TRAIN, RET

                                                               

Moviepy - Done !
Moviepy - video ready exp/pendulum_episode_15.0_1.mp4
Using device: cpu
MODE: TRAIN, RETURN: -1505.5
MODE: TRAIN, RETURN: -1568.5
MODE: TRAIN, RETURN: -1475.2
MODE: TRAIN, RETURN: -1685.9
MODE: TRAIN, RETURN: -1681.9
MODE: TRAIN, RETURN: -1795.2
MODE: TRAIN, RETURN: -1702.4
MODE: TRAIN, RETURN: -1749.1
MODE: TRAIN, RETURN: -1631.2
MODE: TRAIN, RETURN: -1761.5
MODE: TRAIN, RETURN: -1518.0
MODE: TRAIN, RETURN: -1408.8
MODE: TRAIN, RETURN: -1242.0
MODE: TRAIN, RETURN: -1078.4
MODE: TRAIN, RETURN: -1241.9
MODE: TRAIN, RETURN: -1215.1
MODE: TRAIN, RETURN: -1105.7
MODE: TRAIN, RETURN: -1182.3
MODE: TRAIN, RETURN: -965.7
MODE: TRAIN, RETURN: -757.7
MODE: TRAIN, RETURN: -659.8
MODE: TRAIN, RETURN: -732.5
MODE: TRAIN, RETURN: -629.6
MODE: TRAIN, RETURN: -383.6
MODE: TRAIN, RETURN: -361.7
MODE: TRAIN, RETURN: -354.4
MODE: TRAIN, RETURN: -475.9
MODE: TRAIN, RETURN: -461.6
MODE: TRAIN, RETURN: -462.1
MODE: TRAIN, RETURN: -451.2
MODE: TRAIN, RETURN: -482.2
MODE: TRAIN, RETURN: -478

                                                               

Moviepy - Done !
Moviepy - video ready exp/pendulum_episode_10.0_1.mp4
Using device: cpu
MODE: TRAIN, RETURN: -1259.9
MODE: TRAIN, RETURN: -1193.2
MODE: TRAIN, RETURN: -1320.3
MODE: TRAIN, RETURN: -1213.6
MODE: TRAIN, RETURN: -1452.9
MODE: TRAIN, RETURN: -861.6
MODE: TRAIN, RETURN: -946.3
MODE: TRAIN, RETURN: -1044.3
MODE: TRAIN, RETURN: -900.1
MODE: TRAIN, RETURN: -436.9
MODE: TRAIN, RETURN: -709.3
MODE: TRAIN, RETURN: -263.3
MODE: TRAIN, RETURN: -176.7
MODE: TRAIN, RETURN: -194.4
MODE: TRAIN, RETURN: -175.2
MODE: TRAIN, RETURN: -165.7
MODE: TRAIN, RETURN: -146.8
MODE: TRAIN, RETURN: -136.7
MODE: TRAIN, RETURN: -143.5
MODE: TRAIN, RETURN: -131.1
MODE: TRAIN, RETURN: -132.5
MODE: TRAIN, RETURN: -138.3
MODE: TRAIN, RETURN: -141.6
MODE: TRAIN, RETURN: -143.2
MODE: TRAIN, RETURN: -131.6
MODE: TRAIN, RETURN: -130.2
MODE: TRAIN, RETURN: -132.5
MODE: TRAIN, RETURN: -132.1
MODE: TRAIN, RETURN: -133.5
MODE: TRAIN, RETURN: -131.5
MODE: TRAIN, RETURN: -138.2
MODE: TRAIN, RETURN: -136.9
MODE: TRA

                                                               

Moviepy - Done !
Moviepy - video ready exp/pendulum_episode_1.0_1.mp4
Using device: cpu
MODE: TRAIN, RETURN: -1718.4
MODE: TRAIN, RETURN: -1621.3
MODE: TRAIN, RETURN: -1673.3
MODE: TRAIN, RETURN: -1378.9
MODE: TRAIN, RETURN: -1608.0
MODE: TRAIN, RETURN: -1500.7
MODE: TRAIN, RETURN: -1811.0
MODE: TRAIN, RETURN: -1465.8
MODE: TRAIN, RETURN: -1598.7
MODE: TRAIN, RETURN: -1349.1
MODE: TRAIN, RETURN: -1117.9
MODE: TRAIN, RETURN: -1250.1
MODE: TRAIN, RETURN: -1199.2
MODE: TRAIN, RETURN: -1138.8
MODE: TRAIN, RETURN: -1112.2
MODE: TRAIN, RETURN: -1108.8
MODE: TRAIN, RETURN: -1519.4
MODE: TRAIN, RETURN: -1401.7
MODE: TRAIN, RETURN: -1481.2
MODE: TRAIN, RETURN: -1523.9
MODE: TRAIN, RETURN: -1497.5
MODE: TRAIN, RETURN: -1335.4
MODE: TRAIN, RETURN: -1264.6
MODE: TRAIN, RETURN: -1114.0
MODE: TRAIN, RETURN: -709.8
MODE: TRAIN, RETURN: -1391.2
MODE: TRAIN, RETURN: -1504.4
MODE: TRAIN, RETURN: -1284.2
MODE: TRAIN, RETURN: -1506.3
MODE: TRAIN, RETURN: -1503.7
MODE: TRAIN, RETURN: -1467.1
MODE: TRAIN, R

                                                               

Moviepy - Done !
Moviepy - video ready exp/pendulum_episode_5.0_1.mp4
Using device: cpu
MODE: TRAIN, RETURN: -1794.7
MODE: TRAIN, RETURN: -1712.6
MODE: TRAIN, RETURN: -1659.7
MODE: TRAIN, RETURN: -1658.7
MODE: TRAIN, RETURN: -1793.2
MODE: TRAIN, RETURN: -1751.5
MODE: TRAIN, RETURN: -1879.6
MODE: TRAIN, RETURN: -1738.0
MODE: TRAIN, RETURN: -1747.7
MODE: TRAIN, RETURN: -1610.5
MODE: TRAIN, RETURN: -1506.1
MODE: TRAIN, RETURN: -1383.8
MODE: TRAIN, RETURN: -1282.4
MODE: TRAIN, RETURN: -1267.3
MODE: TRAIN, RETURN: -1166.1
MODE: TRAIN, RETURN: -1064.3
MODE: TRAIN, RETURN: -1238.3
MODE: TRAIN, RETURN: -1210.2
MODE: TRAIN, RETURN: -1137.2
MODE: TRAIN, RETURN: -1161.2
MODE: TRAIN, RETURN: -1105.5
MODE: TRAIN, RETURN: -1131.6
MODE: TRAIN, RETURN: -1048.7
MODE: TRAIN, RETURN: -1032.3
MODE: TRAIN, RETURN: -1066.2
MODE: TRAIN, RETURN: -1031.4
MODE: TRAIN, RETURN: -1045.6
MODE: TRAIN, RETURN: -1052.3
MODE: TRAIN, RETURN: -963.5
MODE: TRAIN, RETURN: -907.3
MODE: TRAIN, RETURN: -673.7
MODE: TRAIN, RET

                                                               

Moviepy - Done !
Moviepy - video ready exp/pendulum_episode_15.0_2.mp4
Using device: cpu
MODE: TRAIN, RETURN: -1413.3
MODE: TRAIN, RETURN: -1467.7
MODE: TRAIN, RETURN: -1713.9
MODE: TRAIN, RETURN: -1424.3
MODE: TRAIN, RETURN: -1429.2
MODE: TRAIN, RETURN: -1830.1
MODE: TRAIN, RETURN: -1850.4
MODE: TRAIN, RETURN: -1739.5
MODE: TRAIN, RETURN: -1528.3
MODE: TRAIN, RETURN: -1517.9
MODE: TRAIN, RETURN: -1458.8
MODE: TRAIN, RETURN: -1437.4
MODE: TRAIN, RETURN: -1330.4
MODE: TRAIN, RETURN: -1178.8
MODE: TRAIN, RETURN: -1098.7
MODE: TRAIN, RETURN: -1309.9
MODE: TRAIN, RETURN: -1174.6
MODE: TRAIN, RETURN: -1018.6
MODE: TRAIN, RETURN: -893.6
MODE: TRAIN, RETURN: -751.7
MODE: TRAIN, RETURN: -750.6
MODE: TRAIN, RETURN: -790.0
MODE: TRAIN, RETURN: -628.8
MODE: TRAIN, RETURN: -627.2
MODE: TRAIN, RETURN: -602.3
MODE: TRAIN, RETURN: -490.1
MODE: TRAIN, RETURN: -355.6
MODE: TRAIN, RETURN: -357.4
MODE: TRAIN, RETURN: -347.0
MODE: TRAIN, RETURN: -345.8
MODE: TRAIN, RETURN: -478.3
MODE: TRAIN, RETURN: -372

                                                               

Moviepy - Done !
Moviepy - video ready exp/pendulum_episode_10.0_2.mp4
Using device: cpu
MODE: TRAIN, RETURN: -1142.9
MODE: TRAIN, RETURN: -1198.4
MODE: TRAIN, RETURN: -1122.7
MODE: TRAIN, RETURN: -1163.7
MODE: TRAIN, RETURN: -995.6
MODE: TRAIN, RETURN: -1549.0
MODE: TRAIN, RETURN: -613.1
MODE: TRAIN, RETURN: -995.3
MODE: TRAIN, RETURN: -1218.2
MODE: TRAIN, RETURN: -583.3
MODE: TRAIN, RETURN: -325.7
MODE: TRAIN, RETURN: -314.8
MODE: TRAIN, RETURN: -188.9
MODE: TRAIN, RETURN: -181.0
MODE: TRAIN, RETURN: -145.9
MODE: TRAIN, RETURN: -163.1
MODE: TRAIN, RETURN: -157.2
MODE: TRAIN, RETURN: -158.0
MODE: TRAIN, RETURN: -142.6
MODE: TRAIN, RETURN: -147.9
MODE: TRAIN, RETURN: -133.1
MODE: TRAIN, RETURN: -138.9
MODE: TRAIN, RETURN: -138.2
MODE: TRAIN, RETURN: -138.2
MODE: TRAIN, RETURN: -131.1
MODE: TRAIN, RETURN: -135.4
MODE: TRAIN, RETURN: -132.2
MODE: TRAIN, RETURN: -134.0
MODE: TRAIN, RETURN: -129.2
MODE: TRAIN, RETURN: -134.7
MODE: TRAIN, RETURN: -134.0
MODE: TRAIN, RETURN: -136.7
MODE: TRA

                                                               

Moviepy - Done !
Moviepy - video ready exp/pendulum_episode_1.0_2.mp4
Using device: cpu
MODE: TRAIN, RETURN: -1645.5
MODE: TRAIN, RETURN: -1648.1
MODE: TRAIN, RETURN: -1563.4
MODE: TRAIN, RETURN: -1499.4
MODE: TRAIN, RETURN: -1646.3
MODE: TRAIN, RETURN: -1683.9
MODE: TRAIN, RETURN: -1608.2
MODE: TRAIN, RETURN: -1589.4
MODE: TRAIN, RETURN: -1566.1
MODE: TRAIN, RETURN: -1428.1
MODE: TRAIN, RETURN: -1268.8
MODE: TRAIN, RETURN: -1250.4
MODE: TRAIN, RETURN: -1168.0
MODE: TRAIN, RETURN: -1191.4
MODE: TRAIN, RETURN: -1119.6
MODE: TRAIN, RETURN: -1431.0
MODE: TRAIN, RETURN: -1076.4
MODE: TRAIN, RETURN: -1400.0
MODE: TRAIN, RETURN: -1486.3
MODE: TRAIN, RETURN: -1402.7
MODE: TRAIN, RETURN: -1321.1
MODE: TRAIN, RETURN: -1113.0
MODE: TRAIN, RETURN: -835.9
MODE: TRAIN, RETURN: -593.9
MODE: TRAIN, RETURN: -626.1
MODE: TRAIN, RETURN: -532.4
MODE: TRAIN, RETURN: -689.2
MODE: TRAIN, RETURN: -524.6
MODE: TRAIN, RETURN: -364.9
MODE: TRAIN, RETURN: -506.9
MODE: TRAIN, RETURN: -200.9
MODE: TRAIN, RETURN: -

                                                               

Moviepy - Done !
Moviepy - video ready exp/pendulum_episode_5.0_2.mp4
Using device: cpu
MODE: TRAIN, RETURN: -1831.1
MODE: TRAIN, RETURN: -1692.2
MODE: TRAIN, RETURN: -1880.5
MODE: TRAIN, RETURN: -1792.4
MODE: TRAIN, RETURN: -1827.8
MODE: TRAIN, RETURN: -1789.4
MODE: TRAIN, RETURN: -1745.6
MODE: TRAIN, RETURN: -1747.2
MODE: TRAIN, RETURN: -1715.6
MODE: TRAIN, RETURN: -1589.0
MODE: TRAIN, RETURN: -1604.9
MODE: TRAIN, RETURN: -1489.5
MODE: TRAIN, RETURN: -1295.5
MODE: TRAIN, RETURN: -1180.6
MODE: TRAIN, RETURN: -1170.3
MODE: TRAIN, RETURN: -1181.8
MODE: TRAIN, RETURN: -1154.2
MODE: TRAIN, RETURN: -1232.8
MODE: TRAIN, RETURN: -1253.4
MODE: TRAIN, RETURN: -1140.0
MODE: TRAIN, RETURN: -1141.9
MODE: TRAIN, RETURN: -1021.7
MODE: TRAIN, RETURN: -1006.9
MODE: TRAIN, RETURN: -1143.2
MODE: TRAIN, RETURN: -1148.3
MODE: TRAIN, RETURN: -1017.3
MODE: TRAIN, RETURN: -1101.1
MODE: TRAIN, RETURN: -1055.2
MODE: TRAIN, RETURN: -1070.7
MODE: TRAIN, RETURN: -933.8
MODE: TRAIN, RETURN: -949.1
MODE: TRAIN, RE

                                                               

Moviepy - Done !
Moviepy - video ready exp/pendulum_episode_15.0_3.mp4
Using device: cpu
MODE: TRAIN, RETURN: -1656.2
MODE: TRAIN, RETURN: -1649.2
MODE: TRAIN, RETURN: -1640.8
MODE: TRAIN, RETURN: -1616.5
MODE: TRAIN, RETURN: -1719.3
MODE: TRAIN, RETURN: -1728.1
MODE: TRAIN, RETURN: -1812.8
MODE: TRAIN, RETURN: -1710.5
MODE: TRAIN, RETURN: -1549.6
MODE: TRAIN, RETURN: -1553.7
MODE: TRAIN, RETURN: -1392.2
MODE: TRAIN, RETURN: -1201.1
MODE: TRAIN, RETURN: -1327.4
MODE: TRAIN, RETURN: -1215.0
MODE: TRAIN, RETURN: -778.7
MODE: TRAIN, RETURN: -921.5
MODE: TRAIN, RETURN: -946.7
MODE: TRAIN, RETURN: -993.4
MODE: TRAIN, RETURN: -970.7
MODE: TRAIN, RETURN: -1057.2
MODE: TRAIN, RETURN: -1070.1
MODE: TRAIN, RETURN: -850.5
MODE: TRAIN, RETURN: -840.1
MODE: TRAIN, RETURN: -631.4
MODE: TRAIN, RETURN: -479.4
MODE: TRAIN, RETURN: -482.3
MODE: TRAIN, RETURN: -373.8
MODE: TRAIN, RETURN: -470.9
MODE: TRAIN, RETURN: -358.1
MODE: TRAIN, RETURN: -475.3
MODE: TRAIN, RETURN: -361.8
MODE: TRAIN, RETURN: -391.6

                                                               

Moviepy - Done !
Moviepy - video ready exp/pendulum_episode_10.0_3.mp4
Using device: cpu
MODE: TRAIN, RETURN: -1193.7
MODE: TRAIN, RETURN: -1147.0
MODE: TRAIN, RETURN: -1071.3
MODE: TRAIN, RETURN: -1203.3
MODE: TRAIN, RETURN: -1158.8
MODE: TRAIN, RETURN: -819.9
MODE: TRAIN, RETURN: -1332.4
MODE: TRAIN, RETURN: -1225.7
MODE: TRAIN, RETURN: -877.8
MODE: TRAIN, RETURN: -724.8
MODE: TRAIN, RETURN: -326.0
MODE: TRAIN, RETURN: -359.7
MODE: TRAIN, RETURN: -389.9
MODE: TRAIN, RETURN: -271.9
MODE: TRAIN, RETURN: -230.0
MODE: TRAIN, RETURN: -167.9
MODE: TRAIN, RETURN: -175.5
MODE: TRAIN, RETURN: -155.1
MODE: TRAIN, RETURN: -155.5
MODE: TRAIN, RETURN: -143.2
MODE: TRAIN, RETURN: -135.1
MODE: TRAIN, RETURN: -135.2
MODE: TRAIN, RETURN: -134.9
MODE: TRAIN, RETURN: -140.7
MODE: TRAIN, RETURN: -133.5
MODE: TRAIN, RETURN: -126.7
MODE: TRAIN, RETURN: -136.1
MODE: TRAIN, RETURN: -138.8
MODE: TRAIN, RETURN: -136.7
MODE: TRAIN, RETURN: -131.1
MODE: TRAIN, RETURN: -130.5
MODE: TRAIN, RETURN: -130.4
MODE: TR

                                                               

Moviepy - Done !
Moviepy - video ready exp/pendulum_episode_1.0_3.mp4
Using device: cpu
MODE: TRAIN, RETURN: -1795.7
MODE: TRAIN, RETURN: -1633.9
MODE: TRAIN, RETURN: -1665.3
MODE: TRAIN, RETURN: -1689.7
MODE: TRAIN, RETURN: -1545.9
MODE: TRAIN, RETURN: -1755.7
MODE: TRAIN, RETURN: -1442.5
MODE: TRAIN, RETURN: -1731.7
MODE: TRAIN, RETURN: -1444.8
MODE: TRAIN, RETURN: -1285.9
MODE: TRAIN, RETURN: -1269.7
MODE: TRAIN, RETURN: -1205.0
MODE: TRAIN, RETURN: -1173.3
MODE: TRAIN, RETURN: -1122.7
MODE: TRAIN, RETURN: -1174.5
MODE: TRAIN, RETURN: -1387.3
MODE: TRAIN, RETURN: -1080.5
MODE: TRAIN, RETURN: -1047.1
MODE: TRAIN, RETURN: -879.8
MODE: TRAIN, RETURN: -741.4
MODE: TRAIN, RETURN: -451.8
MODE: TRAIN, RETURN: -614.8
MODE: TRAIN, RETURN: -610.6
MODE: TRAIN, RETURN: -322.7
MODE: TRAIN, RETURN: -302.9
MODE: TRAIN, RETURN: -310.9
MODE: TRAIN, RETURN: -305.5
MODE: TRAIN, RETURN: -311.9
MODE: TRAIN, RETURN: -310.7
MODE: TRAIN, RETURN: -313.2
MODE: TRAIN, RETURN: -181.3
MODE: TRAIN, RETURN: -312.

                                                               

Moviepy - Done !
Moviepy - video ready exp/pendulum_episode_5.0_3.mp4
Using device: cpu
MODE: TRAIN, RETURN: -1730.1
MODE: TRAIN, RETURN: -1649.8
MODE: TRAIN, RETURN: -1617.7
MODE: TRAIN, RETURN: -1825.0
MODE: TRAIN, RETURN: -1703.7
MODE: TRAIN, RETURN: -1804.3
MODE: TRAIN, RETURN: -1734.7
MODE: TRAIN, RETURN: -1789.9
MODE: TRAIN, RETURN: -1680.6
MODE: TRAIN, RETURN: -1700.3
MODE: TRAIN, RETURN: -1512.6
MODE: TRAIN, RETURN: -1448.7
MODE: TRAIN, RETURN: -1378.6
MODE: TRAIN, RETURN: -1209.5
MODE: TRAIN, RETURN: -1115.7
MODE: TRAIN, RETURN: -1027.4
MODE: TRAIN, RETURN: -1048.3
MODE: TRAIN, RETURN: -1151.5
MODE: TRAIN, RETURN: -1159.2
MODE: TRAIN, RETURN: -1091.5
MODE: TRAIN, RETURN: -958.7
MODE: TRAIN, RETURN: -971.6
MODE: TRAIN, RETURN: -1119.4
MODE: TRAIN, RETURN: -897.5
MODE: TRAIN, RETURN: -958.5
MODE: TRAIN, RETURN: -1017.3
MODE: TRAIN, RETURN: -686.4
MODE: TRAIN, RETURN: -696.7
MODE: TRAIN, RETURN: -736.1
MODE: TRAIN, RETURN: -583.0
MODE: TRAIN, RETURN: -580.2
MODE: TRAIN, RETURN: -

                                                               

Moviepy - Done !
Moviepy - video ready exp/pendulum_episode_15.0_4.mp4
Using device: cpu
MODE: TRAIN, RETURN: -1662.1
MODE: TRAIN, RETURN: -1817.7
MODE: TRAIN, RETURN: -1744.3
MODE: TRAIN, RETURN: -1747.7
MODE: TRAIN, RETURN: -1768.8
MODE: TRAIN, RETURN: -1778.3
MODE: TRAIN, RETURN: -1829.3
MODE: TRAIN, RETURN: -1836.0
MODE: TRAIN, RETURN: -1645.1
MODE: TRAIN, RETURN: -1562.2
MODE: TRAIN, RETURN: -1469.7
MODE: TRAIN, RETURN: -1273.8
MODE: TRAIN, RETURN: -1302.7
MODE: TRAIN, RETURN: -1095.2
MODE: TRAIN, RETURN: -1222.2
MODE: TRAIN, RETURN: -913.1
MODE: TRAIN, RETURN: -1149.9
MODE: TRAIN, RETURN: -1004.8
MODE: TRAIN, RETURN: -978.5
MODE: TRAIN, RETURN: -1058.2
MODE: TRAIN, RETURN: -1077.6
MODE: TRAIN, RETURN: -1041.5
MODE: TRAIN, RETURN: -1083.1
MODE: TRAIN, RETURN: -1075.2
MODE: TRAIN, RETURN: -836.7
MODE: TRAIN, RETURN: -711.1
MODE: TRAIN, RETURN: -588.7
MODE: TRAIN, RETURN: -583.6
MODE: TRAIN, RETURN: -610.6
MODE: TRAIN, RETURN: -490.2
MODE: TRAIN, RETURN: -521.9
MODE: TRAIN, RETURN: 

                                                               

Moviepy - Done !
Moviepy - video ready exp/pendulum_episode_10.0_4.mp4
Using device: cpu
MODE: TRAIN, RETURN: -1297.5
MODE: TRAIN, RETURN: -1548.2
MODE: TRAIN, RETURN: -1213.5
MODE: TRAIN, RETURN: -1427.0
MODE: TRAIN, RETURN: -1208.9
MODE: TRAIN, RETURN: -790.1
MODE: TRAIN, RETURN: -1140.8
MODE: TRAIN, RETURN: -654.1
MODE: TRAIN, RETURN: -461.3
MODE: TRAIN, RETURN: -241.4
MODE: TRAIN, RETURN: -814.5
MODE: TRAIN, RETURN: -348.0
MODE: TRAIN, RETURN: -160.2
MODE: TRAIN, RETURN: -182.8
MODE: TRAIN, RETURN: -188.3
MODE: TRAIN, RETURN: -146.0
MODE: TRAIN, RETURN: -155.8
MODE: TRAIN, RETURN: -140.9
MODE: TRAIN, RETURN: -130.5
MODE: TRAIN, RETURN: -137.6
MODE: TRAIN, RETURN: -133.1
MODE: TRAIN, RETURN: -134.4
MODE: TRAIN, RETURN: -132.5
MODE: TRAIN, RETURN: -135.1
MODE: TRAIN, RETURN: -132.4
MODE: TRAIN, RETURN: -130.2
MODE: TRAIN, RETURN: -128.5
MODE: TRAIN, RETURN: -130.6
MODE: TRAIN, RETURN: -135.8
MODE: TRAIN, RETURN: -126.6
MODE: TRAIN, RETURN: -133.3
MODE: TRAIN, RETURN: -131.8
MODE: TRA

                                                               

Moviepy - Done !
Moviepy - video ready exp/pendulum_episode_1.0_4.mp4
Using device: cpu
MODE: TRAIN, RETURN: -1338.3
MODE: TRAIN, RETURN: -1378.9
MODE: TRAIN, RETURN: -1748.5
MODE: TRAIN, RETURN: -1768.1
MODE: TRAIN, RETURN: -1791.3
MODE: TRAIN, RETURN: -1171.5
MODE: TRAIN, RETURN: -1618.8
MODE: TRAIN, RETURN: -1767.6
MODE: TRAIN, RETURN: -1557.7
MODE: TRAIN, RETURN: -1404.9
MODE: TRAIN, RETURN: -1140.0
MODE: TRAIN, RETURN: -1186.2
MODE: TRAIN, RETURN: -1203.7
MODE: TRAIN, RETURN: -880.4
MODE: TRAIN, RETURN: -1142.4
MODE: TRAIN, RETURN: -1033.6
MODE: TRAIN, RETURN: -940.2
MODE: TRAIN, RETURN: -794.2
MODE: TRAIN, RETURN: -395.6
MODE: TRAIN, RETURN: -322.3
MODE: TRAIN, RETURN: -367.2
MODE: TRAIN, RETURN: -336.5
MODE: TRAIN, RETURN: -329.4
MODE: TRAIN, RETURN: -323.9
MODE: TRAIN, RETURN: -322.3
MODE: TRAIN, RETURN: -317.2
MODE: TRAIN, RETURN: -308.7
MODE: TRAIN, RETURN: -311.1
MODE: TRAIN, RETURN: -302.3
MODE: TRAIN, RETURN: -314.9
MODE: TRAIN, RETURN: -196.8
MODE: TRAIN, RETURN: -305.4
M

                                                               

Moviepy - Done !
Moviepy - video ready exp/pendulum_episode_5.0_4.mp4




In [140]:
# Stack the collected results into one array so they can be summarised column-wise.
d2 = np.array(res_list)

# Minimum, mean and standard deviation over axis 0, printed one after another.
for column_stat in (np.min, np.mean, np.std):
    print(column_stat(d2, axis=0))

# Print grav_list next to the statistics (presumably the per-column gravity settings — confirm).
print(grav_list)

[-446.89678006 -394.98422963 -141.3584432  -159.40303028]
[-439.84273093 -365.04690555 -137.89686523 -155.5607243 ]
[ 5.41903111 16.37749229  1.77636234  2.13970934]
[15.0, 10.0, 1.0, 5.0]


In [141]:
# Display the full results array built from res_list.
# Presumably one row per repetition and one column per entry of grav_list — confirm.
d2

array([[-442.2114593 , -369.58425799, -137.15052739, -159.40303028],
       [-432.42482314, -355.66805178, -141.3584432 , -156.20418064],
       [-446.89678006, -349.22658912, -137.00023502, -154.59047858],
       [-434.69156735, -394.98422963, -137.61212289, -153.25816152],
       [-442.98902481, -355.77139921, -136.36299767, -154.34777047]])

: 

In [None]:
# alpha learned

In [136]:
# Stack the collected results into one array so they can be summarised column-wise.
d = np.array(res_list)

# Minimum, mean and standard deviation over axis 0, in that order.
for stat_name in ("min", "mean", "std"):
    print(getattr(d, stat_name)(axis=0))

[-400.24582961 -139.13192027 -153.85457099]
[-375.12828482 -138.04483418 -151.43922686]
[14.45417123  1.05961822  1.33309955]


In [134]:
# Display the full results array built from res_list.
# Presumably one row per repetition and one column per evaluated setting — confirm.
d

array([[-400.24582961, -137.32470152, -151.14079856],
       [-382.45110993, -136.34019285, -153.85457099],
       [-361.83404713, -138.4758756 , -151.35271726],
       [-365.35956456, -139.13192027, -149.75541824],
       [-365.7508729 , -138.95148066, -151.09262923]])

Using device: cpu
MODE: TRAIN, RETURN: -1609.6
MODE: TRAIN, RETURN: -1689.6
MODE: TRAIN, RETURN: -1651.1
MODE: TRAIN, RETURN: -1602.7
MODE: TRAIN, RETURN: -1689.1
MODE: TRAIN, RETURN: -1808.6
MODE: TRAIN, RETURN: -1732.4
MODE: TRAIN, RETURN: -1692.3
MODE: TRAIN, RETURN: -1624.1
MODE: TRAIN, RETURN: -1644.2
MODE: TRAIN, RETURN: -1558.0
MODE: TRAIN, RETURN: -1596.3
MODE: TRAIN, RETURN: -1546.6
MODE: TRAIN, RETURN: -1566.3
MODE: TRAIN, RETURN: -1528.7
MODE: TRAIN, RETURN: -1434.1
MODE: TRAIN, RETURN: -1197.5
MODE: TRAIN, RETURN: -1436.5
MODE: TRAIN, RETURN: -1253.0
MODE: TRAIN, RETURN: -1266.0
MODE: TRAIN, RETURN: -1369.5
MODE: TRAIN, RETURN: -1469.6
MODE: TRAIN, RETURN: -1489.1
MODE: TRAIN, RETURN: -1442.8
MODE: TRAIN, RETURN: -1521.1
MODE: TRAIN, RETURN: -1535.1
MODE: TRAIN, RETURN: -1598.8
MODE: TRAIN, RETURN: -1711.3
MODE: TRAIN, RETURN: -1670.0
MODE: TRAIN, RETURN: -1589.2
MODE: TRAIN, RETURN: -1669.3
MODE: TRAIN, RETURN: -1441.3
MODE: TRAIN, RETURN: -1614.2
MODE: TRAIN, RETURN: -166

                                                               

Moviepy - Done !
Moviepy - video ready pendulum_episode.mp4




In [160]:
# Check the shape obtained by concatenating the state and action batches along dim=1.
torch.cat([s_batch, a_batch], dim=1).shape

torch.Size([200, 4])

In [158]:
# Inspect the shape of the state batch.
s_batch.shape

torch.Size([200, 3])

In [159]:
# Inspect the shape of the action batch.
a_batch.shape

torch.Size([200, 1])

In [13]:
class RandomAgent:
    '''
    Baseline agent that ignores the observed state and returns uniformly random actions.
    It carries the same skeleton as a learning agent (replay buffer, gradient-step helper,
    target-network update, training hook), but the learning parts are still TODO stubs.
    '''

    def __init__(self):
        # Environment variables. You don't need to change this.
        self.min_buffer_size = 1000
        self.max_buffer_size = 100000
        self.batch_size = 200
        self.state_dim = 3  # [cos(theta), sin(theta), theta_dot]
        self.action_dim = 1  # [torque] in[-1,1]

        # If your PC possesses a GPU, you should be able to use it for training,
        # as self.device should be 'cuda' in that case.
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)
        print("Using device: {}".format(self.device))

        self.memory = ReplayBuffer(self.min_buffer_size, self.max_buffer_size, self.device)

        self.setup_agent()

    def setup_agent(self):
        '''
        Hook called at the end of __init__; it currently does nothing.
        '''
        # TODO: Setup off-policy agent with policy and critic classes.
        # Feel free to instantiate any other parameters you feel you might need.

    def get_action(self, s: np.ndarray, train: bool) -> np.ndarray:
        """
        Return an action for the given state. This baseline draws the action
        uniformly at random and uses neither argument.

        :param s: np.ndarray, state of the pendulum. shape (3, )
        :param train: boolean to indicate if you are in eval or train mode.
                    You can find it useful if you want to sample from deterministic policy.
        :return: np.ndarray, action to apply on the environment, shape (1,)
        """
        # TODO: Implement a function that returns an action from the policy for the state s.
        action = np.random.uniform(low=-1, high=1, size=(1,))

        assert action.shape == (1,), 'Incorrect action shape.'
        assert isinstance(action, np.ndarray), 'Action dtype must be np.ndarray'
        return action

    @staticmethod
    def run_gradient_update_step(object: Union[Actor, Critic], loss: torch.Tensor):
        '''
        Perform a single optimizer step on `object` using the mean of `loss`.
        Useful during training if the trainable parameters and their optimizer
        are set up inside the object.

        :param object: object containing trainable parameters and an optimizer
        :param loss: loss tensor; it is reduced with .mean() before backpropagation
        '''
        optimizer = object.optimizer
        optimizer.zero_grad()
        mean_loss = loss.mean()
        mean_loss.backward()
        optimizer.step()

    def critic_target_update(self, base_net: NeuralNetwork, target_net: NeuralNetwork,
                             tau: float, soft_update: bool):
        '''
        Move the target network parameters towards (or onto) the source network parameters.
        A soft update blends each target parameter as (1 - tau) * target + tau * source;
        a hard update copies the source parameters over unchanged.

        :param base_net: source network
        :param target_net: target network
        :param tau: soft update parameter
        :param soft_update: boolean to indicate whether to perform a soft update or not
        '''
        paired_params = zip(target_net.parameters(), base_net.parameters())
        if soft_update:
            for target_param, source_param in paired_params:
                blended = target_param.data * (1.0 - tau) + source_param.data * tau
                target_param.data.copy_(blended)
        else:
            for target_param, source_param in paired_params:
                target_param.data.copy_(source_param.data)

    def train_agent(self):
        '''
        One training iteration for the agent. At the moment it only draws a batch
        from the replay buffer; the critic and policy updates are still to be written.
        '''
        # TODO: Implement one step of training for the agent.
        # Hint: You can use the run_gradient_update_step for each policy and critic.
        # Example: self.run_gradient_update_step(self.policy, policy_loss)

        # Batch sampling
        s_batch, a_batch, r_batch, s_prime_batch = self.memory.sample(self.batch_size)

        # TODO: Implement Critic(s) update here.

        # TODO: Implement Policy update here

In [75]:
# This main function is provided here to enable some basic testing.
# ANY changes here WON'T take any effect while grading.

TRAIN_EPISODES = 50
TEST_EPISODES = 300

# You may set the save_video param to output the video of one of the evaluation episodes, or
# you can disable console printing during training and testing by setting verbose to False.
save_video = True
verbose = True

agent = Agent()

# Training phase: every episode is run with train=True on the g=10.0 environment.
env = get_env(g=10.0, train=True)

for EP in range(TRAIN_EPISODES):
    run_episode(env, agent, None, verbose, train=True)

if verbose:
    print('\n')

# Evaluation phase on a new environment created with train=False.
test_returns = []
env = get_env(g=10.0, train=False)

# Always bind video_rec, so this cell never picks up a recorder left over from an earlier run.
video_rec = VideoRecorder(env, "pendulum_episode.mp4") if save_video else None

try:
    for EP in range(TEST_EPISODES):
        # Only the final evaluation episode is handed the recorder.
        rec = video_rec if EP == TEST_EPISODES - 1 else None
        # Gradient tracking is switched off while evaluating.
        with torch.no_grad():
            episode_return = run_episode(env, agent, rec, verbose, train=False)
        test_returns.append(episode_return)

    avg_test_return = np.mean(np.array(test_returns))

    print("\n AVG_TEST_RETURN:{:.1f} \n".format(avg_test_return))
finally:
    # Close the recorder even when an evaluation episode raises, so it is not leaked.
    if video_rec is not None:
        video_rec.close()


Using device: cpu
MODE: TRAIN, RETURN: -1511.1
MODE: TRAIN, RETURN: -1820.2
MODE: TRAIN, RETURN: -1637.2
MODE: TRAIN, RETURN: -1629.7


RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

In [66]:
# Inspect the return of the most recently completed episode.
# NOTE(review): this reads whatever value is currently in the kernel — confirm which run it belongs to.
episode_return

-907.1563821302733

In [77]:
# This main function is provided here to enable some basic testing.
# ANY changes here WON'T take any effect while grading.

TRAIN_EPISODES = 50
TEST_EPISODES = 300

# You may set the save_video param to output the video of one of the evaluation episodes, or
# you can disable console printing during training and testing by setting verbose to False.
save_video = True
verbose = True

agent = Agent()

# Training phase: every episode is run with train=True on the g=10.0 environment.
env = get_env(g=10.0, train=True)

for EP in range(TRAIN_EPISODES):
    run_episode(env, agent, None, verbose, train=True)

if verbose:
    print('\n')

# Evaluation phase on a new environment created with train=False.
test_returns = []
env = get_env(g=10.0, train=False)

# Always bind video_rec, so this cell never picks up a recorder left over from an earlier run.
video_rec = VideoRecorder(env, "pendulum_episode.mp4") if save_video else None

try:
    for EP in range(TEST_EPISODES):
        # Only the final evaluation episode is handed the recorder.
        rec = video_rec if EP == TEST_EPISODES - 1 else None
        # Gradient tracking is switched off while evaluating.
        with torch.no_grad():
            episode_return = run_episode(env, agent, rec, verbose, train=False)
        test_returns.append(episode_return)

    avg_test_return = np.mean(np.array(test_returns))

    print("\n AVG_TEST_RETURN:{:.1f} \n".format(avg_test_return))
finally:
    # Close the recorder even when an evaluation episode raises, so it is not leaked.
    if video_rec is not None:
        video_rec.close()


Using device: cpu
MODE: TRAIN, RETURN: -1631.3
MODE: TRAIN, RETURN: -1693.8
MODE: TRAIN, RETURN: -1780.6
MODE: TRAIN, RETURN: -1757.8


RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

In [69]:
# Inspect the return of the most recently completed episode.
# NOTE(review): this reads whatever value is currently in the kernel — confirm which run it belongs to.
episode_return

-361.7752793997678

In [215]:
# Dump the state batch for visual inspection.
# NOTE(review): this prints the whole tensor; a slice such as s_batch[:5] would keep the output short.
s_batch

tensor([[-0.9203, -0.3912,  0.1553],
        [-0.9942,  0.1076, -1.5829],
        [-0.9973, -0.0733,  0.8567],
        [-0.7763,  0.6303, -0.8624],
        [-1.0000, -0.0089,  0.2573],
        [-0.9996,  0.0284,  0.4442],
        [-0.8889, -0.4580, -0.4302],
        [-0.9916, -0.1296,  1.3949],
        [-0.9933,  0.1151, -0.7247],
        [-0.9402, -0.3407,  1.2617],
        [-0.9678, -0.2515,  1.0344],
        [-0.9759,  0.2183, -0.0223],
        [-0.9955, -0.0946, -1.4615],
        [-0.9886, -0.1505, -1.2979],
        [-0.9774, -0.2115,  1.4232],
        [-0.9370, -0.3493,  1.6744],
        [-0.7656,  0.6433,  0.0234],
        [-0.9472, -0.3206, -1.2272],
        [-0.9530, -0.3028,  0.2201],
        [-0.9937,  0.1117,  0.3962],
        [-0.9925, -0.1226,  1.3897],
        [-0.9500, -0.3123,  0.1983],
        [-0.8749,  0.4843,  0.1830],
        [-0.9967, -0.0812, -0.5633],
        [-0.9435,  0.3313, -0.4998],
        [-0.9998, -0.0216, -1.7625],
        [-0.9876,  0.1569,  0.9114],
 

In [186]:
# Two independent random columns of shape (16, 1); q1 is drawn first, then q2.
q1, q2 = torch.randn(16, 1), torch.randn(16, 1)

# Side by side the two columns form a (16, 2) tensor.
catq1q2 = torch.cat((q1, q2), dim=1)

# Row-wise minimum across the two columns, keeping only the values (not the argmin indices).
catq1q2.min(dim=1).values

torch.Size([16, 2])

In [193]:
# With a dim argument torch.min returns both the per-row minimum values and their column indices.
torch.min(catq1q2, dim=1)

torch.return_types.min(
values=tensor([ 1.2743e-01, -9.3533e-01, -1.1035e+00, -7.3926e-01, -4.9917e-01,
        -1.6889e+00,  1.3162e+00, -5.3926e-01, -6.3668e-01,  1.3035e-01,
         2.9842e-01, -6.8639e-01, -3.4316e-01, -1.7422e+00, -9.2666e-01,
        -1.6996e-03]),
indices=tensor([1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0]))

In [190]:
# Element [1] of the result holds the indices, i.e. which column held the smaller value in each row.
torch.min(catq1q2, dim=1)[1]

tensor([1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0])

In [89]:
# Draw one batch of size agent.batch_size from the agent's replay memory.
batch = agent.memory.sample(agent.batch_size)
# Unpack the four components; going by the names these are states, actions, rewards
# and next states — confirm against ReplayBuffer.sample.
s_batch, a_batch, r_batch, s_prime_batch = batch

In [90]:
# Print the shape of each component of the sampled batch, in unpacking order.
for sampled in (s_batch, a_batch, r_batch, s_prime_batch):
    print(sampled.shape)

torch.Size([200, 3])
torch.Size([200, 1])
torch.Size([200, 1])
torch.Size([200, 3])
