In [1]:
import gym
import gvgai
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import torchvision.transforms as T
import torch.nn.functional as F


from tqdm import tqdm

from matplotlib import pyplot as plt

In [2]:
from agent.NNagent import NNagent
from torch.autograd import Variable

In [3]:
# Discount factor applied when folding per-step rewards into returns.
GAMMA = 0.99

# Per-episode reward totals collected during training. `total_rewards` is the
# log filled by the training loop; `total_estimate_rewards` is only referenced
# from commented-out baseline code in this notebook.
total_rewards, total_estimate_rewards = [], []

In [4]:
# Policy agent whose network (`policy.nn`) is optimised by the training loop.
# actions=6 and depth=13 line up with the 6 output features of fc3 and the 13
# input channels of conv1 shown in the network repr printed by the next cell.
# NOTE(review): GG=None passes no game object — confirm NNagent supports that
# for the methods used in this notebook (rl_get_action, compass_info).
policy = NNagent(GG=None, actions=6, depth=13)

In [5]:
# Display the policy network's layer structure (the cell output is its repr).
policy.nn

Net(
  (conv1): Conv2d(13, 8, kernel_size=(3, 3), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(8, 32, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=100, out_features=48, bias=True)
  (fc2): Linear(in_features=48, out_features=24, bias=True)
  (fc3): Linear(in_features=24, out_features=6, bias=True)
)

In [6]:
# Adam optimiser over the policy network's parameters, step size 0.002.
optimizer = optim.Adam(params=policy.nn.parameters(), lr=2e-3)

In [7]:
# float32 machine epsilon as a plain Python float.
# Not referenced by any of the cells visible in this notebook section.
eps = float(np.finfo(np.float32).eps)

In [8]:
class Value(nn.Module):
    """Convolutional state-value estimator.

    The network maps a stacked grid observation plus a 4-element "compass"
    vector to a single scalar per sample.

    Args:
        depth: Number of input channels of the grid observation.

    Shape requirements (fixed by the layer sizes below):
        * ``x`` is ``(batch, depth, H, W)`` where ``H`` and ``W`` must leave
          exactly 3 spatial positions after conv(3) -> pool(2) -> conv(3),
          e.g. a 9x13 grid becomes 1x3.
        * ``compass`` is ``(batch, 4)``.
    """

    def __init__(self, depth):
        super(Value, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=depth, out_channels=8, kernel_size=3)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(in_channels=8, out_channels=32, kernel_size=3)
        # 32 channels x 3 spatial positions from conv2, plus 4 compass features.
        self.fc = nn.Linear((32 * 3) + 4, 1)

    def forward(self, x, compass):
        """Return a ``(batch, 1)`` tensor of unbounded value estimates."""
        x = self.pool(F.relu(self.conv1(x)))
        x = F.relu(self.conv2(x))
        # Flatten the conv features; fails loudly if the input grid size does
        # not produce the expected 32 * 3 features per sample.
        x = x.view(len(x), 32 * 3)
        x = torch.cat([x, compass], dim=1)
        # No activation on the output: this is a regression head. A ReLU here
        # would clamp estimates to >= 0 (negative returns unrepresentable) and
        # zero the gradient whenever the pre-activation is negative.
        return self.fc(x)

In [9]:
# Value network for 13-channel observations, cast to float64 so it accepts the
# DoubleTensor inputs built in the training loop. The trailing bare name shows
# the module repr as the cell output.
V = Value(depth=13).double()
V

Value(
  (conv1): Conv2d(13, 8, kernel_size=(3, 3), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(8, 32, kernel_size=(3, 3), stride=(1, 1))
  (fc): Linear(in_features=100, out_features=1, bias=True)
)

In [10]:
# Mean-squared-error objective for the value network.
v_loss = nn.MSELoss()
# Plain SGD over the value network's parameters, step size 0.01.
v_optimizer = optim.SGD(params=V.parameters(), lr=1e-2)

In [11]:
from generator.env_gen_wrapper import GridGame

# Environment the agent is trained on.
# NOTE(review): `path` and `lvl_name` presumably locate the level file and
# `play_length` presumably caps the episode length — confirm against GridGame.
env = GridGame(
    game='dzelda',
    play_length=1000,
    path='./levels',
    lvl_name='1.txt',
    # Original note here read "monsters, key, door, wall"; how the two symbols
    # below map onto those is not evident from this notebook — verify.
    mechanics=['+', 'g'],
    images=False,
)

In [None]:
# REINFORCE training loop: one full episode per iteration, followed by one
# gradient step on the policy network and one on the value network V.
for i in tqdm(range(10000)):
    state = env.reset()

    rewards = []
    states = [state]      # states the agent acted from (one per step)
    compass_info = []     # policy.compass_info captured at each step
    log_probs = []        # log-probability of each sampled action

    terminal = False

    while not terminal:
        # Action selection: sample from the categorical distribution defined
        # by the policy's action probabilities.
        probs = policy.rl_get_action(state)
        compass_info.append(policy.compass_info)
        c = Categorical(probs)
        action = c.sample()
        log_probs.append(c.log_prob(action))

        # Carry out the action
        state, reward, terminal, _ = env.step(action.item())

        # The terminal state is never acted from, so it is not stored; this
        # keeps `states`, `compass_info`, `rewards` and `log_probs` aligned.
        if not terminal:
            states.append(state)

        rewards.append(reward)

    # Log the undiscounted episode total for the learning-curve plot.
    total_rewards.append(sum(rewards))

    # Discounted return for every step, accumulated backwards through the
    # episode. Appending and reversing once avoids the quadratic cost of
    # repeatedly inserting at the front of the list.
    R = 0
    returns = []
    for r in reversed(rewards):
        R = r + GAMMA * R
        returns.append(R)
    returns.reverse()
    returns = torch.tensor(returns)

    # Value estimates for the visited states, shape (steps, 1).
    values = V(torch.DoubleTensor(states),
               torch.DoubleTensor(np.vstack(compass_info)))

    # The value estimates are NOT subtracted from the returns here, so the
    # policy gradient below uses raw returns (no baseline); V is only fitted.
    policy_loss = [-log_prob * step_return
                   for log_prob, step_return in zip(log_probs, returns)]

    # Backprop for the NN
    optimizer.zero_grad()
    policy_loss = torch.stack(policy_loss, dim=0).sum()
    policy_loss.backward()
    optimizer.step()

    # Backprop for the value estimator
    v_optimizer.zero_grad()
    # squeeze(1) removes only the trailing feature axis. A bare squeeze() would
    # also drop the batch axis for a one-step episode, leaving a 0-d prediction
    # against a 1-element target (size-mismatch broadcasting in MSELoss).
    estimate_loss = v_loss(values.squeeze(1), returns.double())
    estimate_loss.backward()
    v_optimizer.step()

  4%|▍         | 426/10000 [05:52<1:44:55,  1.52it/s]

In [None]:
# Plot a length-N moving average of the per-episode reward totals.
# mode='valid' keeps only the windows that fully overlap the data.
N = 20
plt.figure(figsize=(15,10))
# NOTE(review): the label below is only rendered if plt.legend() is called; no
# such call (nor a title or axis labels) is visible in this cell as shown.
plt.plot(np.convolve(total_rewards, np.ones((N,))/N, mode='valid'), label='No Baseline')