In [1]:
import numpy as np
import psutil
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.optim import Adam
from torch.distributions import Categorical
from tqdm import tqdm_notebook as tqdm

import sys; sys.path.append("../screeps_rl_env")
from screeps_rl_env import ScreepsEnv
from screeps_rl_env.utils import kill_backend_processes

In [2]:
class SimplePolicy(nn.Module):

    def __init__(self, env, H = 30, gamma = 0.99):
        super().__init__()

        in_dim = 4  # two set of (x, y) coords
        out_dim = 8  # can move in 8 directions

        self.linear1 = torch.nn.Linear(in_dim, H)
        self.linear2 = torch.nn.Linear(H, out_dim)

        self.gamma = gamma

        # Episode policy and reward history
        self.policy_history = Variable(torch.Tensor())
        self.reward_episode = []
        
        # Overall reward and loss history
        self.reward_history = []
        self.loss_history = []
        
    def select_action(self, state):
#         state = torch.from_numpy(state).type(torch.FloatTensor)
        state = torch.FloatTensor(state)
        state = self.forward(state)
        c = Categorical(state)
        action = c.sample()
        log_prob = c.log_prob(action).unsqueeze(0)
        
        # Add log probability of our chosen action to our history    
        if torch.numel(self.policy_history) == 0:
            self.policy_history = log_prob
        else:
            self.policy_history = torch.cat((self.policy_history, log_prob))
        return action
        
    def update_policy(self):
        R = 0
        rewards = []
        
        # Discount future rewards back to the present using gamma
        for r in self.reward_episode[::-1]:
            R = r + self.gamma * R
            rewards.insert(0,R)
            
        # Scale rewards
        rewards = torch.FloatTensor(rewards)
        rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
        
        # Calculate loss
        loss = torch.sum(torch.mul(self.policy_history, Variable(rewards)).mul(-1), -1)
        
        # Update network weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        #Save and intialize episode history counters
        self.loss_history.append(loss.item())
        self.reward_history.append(np.sum(self.reward_episode))
        self.policy_history = Variable(torch.Tensor())
        self.reward_episode = []
    
    def forward(self, x):
        """Returns a size-8 vector of one-hot probabilities to move in whichever direction"""
        out = self.linear1(x)
        out = nn.ReLU()(out)
        out = self.linear2(out)
        out = nn.Softmax(dim=0)(out)
        return out


In [3]:
def train(episodes):
    running_reward = 10
    
    for episode in range(episodes):
        
        print(f"Starting episode {episode}")
        
        state = env.reset() # Reset environment and record the starting state
    
        for time in tqdm(range(500)):
            
            action = policy.select_action(state)
            
            # Step through environment using chosen action
            state, reward, done, _ = env.step(action.item())
#             print(state)
            # Save reward
            policy.reward_episode.append(reward)
            if done:
                break
        
        # Used to determine when the environment is solved.
        running_reward = (running_reward * 0.99) + (time * 0.01)

        policy.update_policy()

        if episode % 50 == 0:
            print('Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'.format(episode, time, running_reward))

        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and the last episode runs to {} time steps!".format(running_reward, time))
            break

In [4]:
# Hyperparameters
learning_rate = 0.01

env = ScreepsEnv(0)
policy = SimplePolicy(env)
optimizer = Adam(policy.parameters(), lr=learning_rate)
train(10)

Starting remote server at 21025...
Connected; response: [None]
Starting episode 0
Resetting training environment


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Episode 0	Last length:   499	Average length: 14.89


AttributeError: 'NoneType' object has no attribute 'reward_threshold'

In [5]:
loss

NameError: name 'loss' is not defined

In [6]:
%debug

> [0;32m<ipython-input-5-de191f53719d>[0m(1)[0;36m<module>[0;34m()[0m
[0;32m----> 1 [0;31m[0mloss[0m[0;34m[0m[0;34m[0m[0m
[0m
*** NameError: name 'back' is not defined

Documented commands (type help <topic>):
EOF    cl         disable  interact  next    psource  rv         unt   
a      clear      display  j         p       q        s          until 
alias  commands   down     jump      pdef    quit     source     up    
args   condition  enable   l         pdoc    r        step       w     
b      cont       exit     list      pfile   restart  tbreak     whatis
break  continue   h        ll        pinfo   return   u          where 
bt     d          help     longlist  pinfo2  retval   unalias  
c      debug      ignore   n         pp      run      undisplay

Miscellaneous help topics:
exec  pdb



In [15]:
env.step(1)

((42, 37, 34, 46), 37.9584054212077, False, {})

In [18]:
def kill_backend_processes(servers=(0,1,2), verbose=True):
    
    def on_terminate(proc):
        if verbose: print("Process {} terminated with exit code {}".format(proc, proc.returncode))
    
    look_ports = []
    for server in servers:
        look_ports.extend([21025 + 5 * server + 0, 21025 + 5 * server + 1, 21025 + 5 * server + 2])
    
    kill_pids = []
    for conn in psutil.net_connections():
        if (len(conn.laddr) > 0 and conn.laddr.port in look_ports) \
        or (len(conn.raddr) > 0 and conn.raddr.port in look_ports):
            kill_pids.append(conn.pid)
        
    print("Killing processes with pids {}".format(kill_pids))
    processes = [psutil.Process(pid = pid) for pid in kill_pids]

    for pid in kill_pids:
        process = psutil.Process(pid = pid)
        process.kill()
        
    dead, alive = psutil.wait_procs(processes, timeout=3, callback=on_terminate)
    if verbose: print("Terminated processes: ", dead)
    if verbose: print("Remaining processes: ", alive)

In [20]:
kill_backend_processes()

Killing processes with pids []
Terminated processes:  []
Remaining processes:  []


In [17]:
env.interface.tick()