In [36]:
import gym
import torch as T
import torch.multiprocessing as mp
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import numpy as np
import nle

# Helper functions
def concat_state(state):
    state = np.array(state['glyphs'])
    state = np.expand_dims(state, axis=0)
    return state

class SharedAdam(T.optim.Adam):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.99), eps=1e-8,
            weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps,
                weight_decay=weight_decay)

        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = 0
                state['exp_avg'] = T.zeros_like(p.data)
                state['exp_avg_sq'] = T.zeros_like(p.data)

                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()
# Class ActorCritic Represents the grunt work needed by each Agent. 
# Is responsible for an Agents Neural Networks to update the Policy and
# Values based on what it sees on the screen. 
class ActorCritic(nn.Module):
    def __init__(self, input_dims, n_actions, gamma=0.99):
        super(ActorCritic, self).__init__()
        
        # Just for readibility
        x, y, z = input_dims
        self.gamma = gamma
        # Conv2d for Policy. 
        self.pi1 = nn.Sequential(
            # Increase filters by a factor of 4
            nn.Conv2d(z, z*4, kernel_size=3, padding='same'),
            nn.BatchNorm2d(z*4),
            nn.ReLU(), # 21 * 79 * 16
            nn.MaxPool2d(3, padding=(0,1)), # 7 * 27 * 16
            nn.Flatten(),
            nn.Linear(7*27*16, z*4),
            nn.ReLU(),
            nn.Linear(3*z*4, n_actions),
        )
        #Conv2d for Value. 
        self.v1 = nn.Sequential(
            # Increase filters by a factor of 4
            nn.Conv2d(z, z*4, kernel_size=3, padding='same'),
            nn.BatchNorm2d(z*4),
            nn.ReLU(), # 21 * 79 * 16
            nn.MaxPool2d(3, padding=(0,1)), # 7 * 27 * 16
            nn.Flatten(),
            nn.Linear(7*27*16, z*4),
            nn.ReLU(),
            nn.Linear(3*z*4, 1),
        )

        self.rewards = []
        self.actions = []
        self.states = []

    def remember(self, state, action, reward):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)

    def clear_memory(self):
        self.states = []
        self.actions = []
        self.rewards = []

    def forward(self, state):
        # Run the current state into each of their respective network
        pi = self.pi1(state)
        v = self.v1(state)

        return pi, v

    def calc_R(self, done):
        states = T.tensor(np.array(self.states), dtype=T.float)
        _, v = self.forward(states)

        R = v[-1]*(1-int(done))

        batch_return = []
        for reward in self.rewards[::-1]:
            R = reward + self.gamma*R
            batch_return.append(R)
        batch_return.reverse()
        batch_return = T.tensor(batch_return, dtype=T.float)

        return batch_return

    def calc_loss(self, done):
        states = T.tensor(np.array(self.states), dtype=T.float)
        actions = T.tensor(np.array(self.actions), dtype=T.float)

        returns = self.calc_R(done)

        pi, values = self.forward(states)
        values = values.squeeze()
        critic_loss = (returns-values)**2

        probs = T.softmax(pi, dim=1)
        dist = Categorical(probs)
        log_probs = dist.log_prob(actions)
        actor_loss = -log_probs*(returns-values)

        total_loss = (critic_loss + actor_loss).mean()
    
        return total_loss

    def choose_action(self, observation):
        observation = concat_state(observation)
        state = T.tensor(np.array([observation]), dtype=T.float)
        pi, v = self.forward(state)
        probs = T.softmax(pi, dim=1)
        
        # Use softmax in order to determine the next action to take
        dist = Categorical(probs)
        
        # Sample the action with the highest probability
        action = dist.sample().numpy()[0]
        return action

class Agent(mp.Process):
    def __init__(self, global_actor_critic, optimizer, input_dims, n_actions, 
                gamma, lr, name, global_ep_idx, env_id):
        super(Agent, self).__init__()
        self.local_actor_critic = ActorCritic(input_dims, n_actions, gamma)
        self.global_actor_critic = global_actor_critic
        self.name = 'w%02i' % name
        self.episode_idx = global_ep_idx
        self.env = gym.make(env_id)
        self.optimizer = optimizer

    def run(self):
        t_step = 1
        while self.episode_idx.value < N_GAMES:
            done = False
            observation = self.env.reset()
            score = 0
            self.local_actor_critic.clear_memory()
            while not done:
                action = self.local_actor_critic.choose_action(observation)
                observation_, reward, done, info = self.env.step(action)
                score += reward
                self.local_actor_critic.remember(observation, action, reward)
                if t_step % T_MAX == 0 or done:
                    loss = self.local_actor_critic.calc_loss(done)
                    self.optimizer.zero_grad()
                    loss.backward()
                    for local_param, global_param in zip(
                            self.local_actor_critic.parameters(),
                            self.global_actor_critic.parameters()):
                        global_param._grad = local_param.grad
                    self.optimizer.step()
                    self.local_actor_critic.load_state_dict(
                            self.global_actor_critic.state_dict())
                    self.local_actor_critic.clear_memory()
                t_step += 1
                observation = observation_
            with self.episode_idx.get_lock():
                self.episode_idx.value += 1
                print(self.name, 'episode ', self.episode_idx.value, 'reward %.1f' % score)

if __name__ == '__main__':
    lr = 1e-4
    # Environment string name to be used to make 
    # a new environment in gym. One per thread.
    env_id = 'NetHack-v0'
    # Number of actions in the environment env_id (NetHack-v0 has 79)
    n_actions = 79
    # Size of the input layer, i.e. size of the world in nethack
    # (Eventually with more of a third dimension as it gets more data
    # from the state dictionary)
    input_dims = (21, 79, 1)
    # Steps to play
    N_GAMES = 3000
    T_MAX = 5
    
    # The main Network connecting all of the Agents. Holds the 
    # Most current parameters.
    global_actor_critic = ActorCritic(input_dims, n_actions)
    global_actor_critic.share_memory()
    optim = SharedAdam(global_actor_critic.parameters(), lr=lr, 
                        betas=(0.92, 0.999))
    global_ep = mp.Value('i', 0)
    
    # Dedi
    workers = [Agent(global_actor_critic,
                    optim,
                    input_dims,
                    n_actions,
                    gamma=0.99,
                    lr=lr,
                    name=i,
                    global_ep_idx=global_ep,
                    env_id=env_id) for i in range(mp.cpu_count())]
    [w.start() for w in workers]
    [w.join() for w in workers]

Process w00:
Process w01:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/cs230/anaconda3/envs/snowbunnies/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/tmp/ipykernel_7273/143053832.py", line 153, in run
    action = self.local_actor_critic.choose_action(observation)
  File "/home/cs230/anaconda3/envs/snowbunnies/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/tmp/ipykernel_7273/143053832.py", line 124, in choose_action
    pi, v = self.forward(state)
  File "/tmp/ipykernel_7273/143053832.py", line 82, in forward
    pi = self.pi1(state)
  File "/home/cs230/anaconda3/envs/snowbunnies/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
Process w03:
  File "/home/cs230/anaconda3/envs/snowbunnies/lib/python3.9/site-packages/torch/nn/modules/container.py", line 141, in forward
    input = module(i

In [15]:
env_test = gym.make('NetHack-v0', savedir=None)

In [17]:
len(env_test._actions)

79

In [35]:
env_test.step(1)[0]['glyphs'].shape


(21, 79)