In [1]:
import ptan
import torch
import torch.nn as nn

In [2]:
class DQNNet(nn.Module):
    """Stub Q-network used to demonstrate agents without any training.

    The network has no parameters and ignores the observation values. For a
    batch of ``B`` observations it returns the ``(B, actions)`` rectangular
    identity matrix, so row ``i`` has its single 1 in column ``i`` (rows with
    ``i >= actions`` are all zeros).

    Args:
        actions: number of discrete actions, i.e. the width of the output.
    """

    def __init__(self, actions: int):
        super().__init__()
        self.actions = actions

    def forward(self, x):
        """Return a ``(batch_size, actions)`` identity-like tensor.

        Only the size of the first dimension of ``x`` and its device are used.
        """
        # Build the result on the input's device so the stub can be used
        # wherever the caller has placed its observations.
        return torch.eye(x.size(0), self.actions, device=x.device)

In [3]:
class PolicyNet(nn.Module):
    """Stub policy network used to demonstrate agents without any training.

    The network has no parameters and ignores the observation values. Every
    row of its output holds logit 1 for the first two actions and logit 0 for
    all remaining ones.

    Args:
        actions: number of discrete actions, i.e. the width of the output.
    """

    def __init__(self, actions: int):
        super().__init__()
        self.actions = actions

    def forward(self, x):
        """Return a float32 ``(batch_size, actions)`` tensor of fixed logits.

        Only the size of the first dimension of ``x`` and its device are used.
        """
        # Allocate on the input's device so the stub can be used wherever the
        # caller has placed its observations.
        res = torch.zeros(
            (x.size(0), self.actions), dtype=torch.float32, device=x.device
        )
        # The first two actions share the same logit score. A slice is used
        # instead of indexing columns 0 and 1 separately so that a network
        # with fewer than two actions does not raise IndexError.
        res[:, :2] = 1
        return res

In [4]:
net = DQNNet(actions=3)
# input: a batch of two observations, each having ten values
# (DQNNet.forward only reads the batch size, so the values themselves do not matter)
net_out = net(torch.zeros(2, 10))
print('dqn_net:')
print(net_out)

dqn_net:
tensor([[1., 0., 0.],
        [0., 1., 0.]])


In [5]:
# Wrap the DQNNet stub bound to `net` in an agent that uses the argmax selector.
selector = ptan.actions.ArgmaxActionSelector()
agent = ptan.agent.DQNAgent(dqn_model=net, action_selector=selector)
# the agent returns a tuple of two objects: an array with one action per batch
# item and a list with one entry per batch item (all None in the recorded run;
# presumably per-item agent state — confirm against the ptan docs)
ag_out = agent(torch.zeros(2, 5))
print("Argmax: ", ag_out)

Argmax:  (array([0, 1], dtype=int64), [None, None])


In [6]:
# Rebinds `selector` and `agent` (previously the argmax versions) to an
# epsilon-greedy setup. With epsilon=1.0 the recorded actions no longer follow
# the argmax pattern of the network output, i.e. they look fully random
# (presumably epsilon is the probability of taking a random action — confirm
# against the ptan docs).
# NOTE(review): no random seed is set in the visible cells, so a re-run will
# most likely print a different sequence.
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1.0)
agent = ptan.agent.DQNAgent(dqn_model=net, action_selector=selector)
# keep only the first element of the returned tuple (the actions)
ag_out = agent(torch.zeros(10, 5))[0]
print("eps=1.0: ", ag_out)

eps=1.0:  [2 0 1 0 2 1 2 2 0 0]


In [7]:
# Change epsilon in place on the selector object that was passed to `agent`;
# the agent itself is not re-created. The cell relies on `selector` and `agent`
# from the previous cell.
selector.epsilon = 0.5
# keep only the first element of the returned tuple (the actions)
ag_out = agent(torch.zeros(10, 5))[0]
print("eps=0.5: ", ag_out)

eps=0.5:  [0 1 1 0 0 0 1 2 1 2]


In [8]:
# Lower epsilon further on the same selector object; the cell relies on
# `selector` and `agent` created in an earlier cell. In the recorded run the
# actions coincide with the row-wise argmax of the DQNNet output.
selector.epsilon = 0.1
# keep only the first element of the returned tuple (the actions)
ag_out = agent(torch.zeros(10, 5))[0]
print("eps=0.1: ", ag_out)

eps=0.1:  [0 1 2 0 0 0 0 0 0 0]


In [11]:
# NOTE: `net` is rebound here — from this cell on it refers to the PolicyNet
# stub, not the DQNNet instance used by the earlier cells.
net = PolicyNet(actions=5)
# a batch of six observations with ten values each; PolicyNet.forward only
# reads the batch size, so the values themselves do not matter
net_out = net(torch.zeros(6, 10))
print("policy_net:")
print(net_out)

policy_net:
tensor([[1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.]])


In [13]:
# Rebinds `selector` and `agent` to a policy-based setup around the PolicyNet
# stub bound to `net`. apply_softmax=True is passed because the network emits
# raw scores (presumably the agent converts them to probabilities before the
# selector samples from them — confirm against the ptan docs).
# NOTE(review): no random seed is set in the visible cells, so the sampled
# actions will most likely differ on a re-run.
selector = ptan.actions.ProbabilityActionSelector()
agent = ptan.agent.PolicyAgent(model=net, action_selector=selector, apply_softmax=True)
# keep only the first element of the returned tuple (the actions)
ag_out = agent(torch.zeros(6, 5))[0]
print(ag_out)

[2 3 3 0 2 3]


In [14]:
# Softmax over the action dimension of the PolicyNet scores for one observation:
# the two actions with logit 1 get equal, larger probabilities than the three
# actions with logit 0, yet the latter remain non-zero.
torch.nn.functional.softmax(net(torch.zeros(1, 10)), dim=1)

tensor([[0.3222, 0.3222, 0.1185, 0.1185, 0.1185]])