In [1]:
import environment as gw

In [8]:
import numpy as np

import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.distributions import Categorical

from collections import namedtuple

SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])

In [9]:
class convAC(torch.nn.Module):
    def __init__(self):
        super(convAC, self).__init__()
        self.conv1 = nn.Conv2d(3,3, kernel_size=(5,5), stride=(1,1), padding=(1,1))
        self.mp1 = nn.MaxPool2d(kernel_size=5, stride=1, padding=1, dilation=1)
        self.conv2 = nn.Conv2d(3,3, kernel_size=(5,5), stride=(1,1), padding=(1,1))
        self.mp2 = nn.MaxPool2d(kernel_size=5, stride=1, padding=1, dilation=1)
        self.phi = nn.Linear(in_features=432, out_features=1000, bias=True)
        self.psi = nn.Linear(in_features=1000, out_features=1000, bias=True)

        self.actor = nn.Linear(in_features=1000, out_features=4, bias=True)
        self.critic = nn.Linear(in_features=1000, out_features=1, bias=True)

        self.gamma = 0.98
        self.eta = 5e-4
        self.saved_actions = []
        self.rewards = []

    def forward(self,x):
        x = F.relu(self.conv1(x))
        x = self.mp1(x)
        x = F.relu(self.conv2(x))
        x = self.mp2(x)
        x = x.view(x.shape[0],-1)
        phi = F.relu(self.phi(x))
        psi = F.relu(self.psi(phi))

        p = F.softmax(self.actor(psi), dim=1)
        v = self.critic(psi)
        return p, v, phi, psi

class linAC(torch.nn.Module):
    def __init__(self):
        super(linAC, self).__init__()
        self.layer1 = nn.Linear(400, 100)
        self.layer2 = nn.Linear(100, 50)
        #self.layer3 = nn.Linear(2000, 2000)
        #self.phi = nn.Linear(50, 50)
        #self.psi = nn.Linear(50, 50)

        self.actor = nn.Linear(50, 4)
        self.critic =nn.Linear(50,1)

        self.gamma = 0.98
        self.eta = 5e-4
        self.saved_actions = []
        self.rewards = []

    def forward(self,x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        #x = F.relu(self.layer3(x))
        #x = F.relu(self.phi(x))
        #x = F.relu(self.psi(x))

        p = F.softmax(self.actor(x), dim=1)
        v = self.critic(x)

        return p, v

class basic_env_params():
    def __init__(self):
        self.rows, self.columns = 20,20
        self.shape = (self.rows,self.columns)
        self.env_type = None
        self.rho = 0.0
        self.penalty = -0.01
        self.reward_location = (5,5)
        self.reward_mag = 10
        self.actionlist = ['Down', 'Up', 'Right', 'Left']
        self.rewarded_action = None

In [10]:
def select_action(obs):
    obs = np.expand_dims(obs, axis=0)
    state = torch.Tensor(obs).float()
    p, v, _, __ = agent(state)
    
    m = Categorical(p)
    action = m.sample()
    
    agent.saved_actions.append(SavedAction(m.log_prob(action),v))
    return p,v, action.item()


def finish_ep():
    R = 0
    saved_actions = agent.saved_actions
    p_loss = []
    v_loss = []
    
    returns = []
    for r in agent.rewards[::-1]:
        R = r+ agent.gamma*R
        returns.insert(0,R)
        
    returns = torch.tensor(returns).float()
    
    for (log_prob, value), R in zip(saved_actions, returns):
        advantage = R - value.item()

        # calculate actor (policy) loss 
        pl = (-log_prob * advantage)
        p_loss.append(pl)

        # calculate critic (value) loss using L1 smooth loss
        rval = Variable(torch.tensor([R])).unsqueeze(-1).unsqueeze(-1)
        vl = F.smooth_l1_loss(value, rval)
        v_loss.append(vl)

    # reset gradients
    optimizer.zero_grad()

    # sum up all the values of policy_losses and value_losses
    ploss, vloss = torch.stack(p_loss).sum() , torch.stack(v_loss).sum()
    loss = ploss + vloss
    

    # perform backprop
    loss.backward(retain_graph=True)
    optimizer.step()
    
    
    # reset rewards and action buffer
    del agent.rewards[:]
    del agent.saved_actions[:]
    return ploss, vloss

In [11]:
## make environment

ep = basic_env_params()
env = gw.GridWorld(rows=20, cols=20, env_type=ep.env_type,
                           rewards={ep.reward_location: ep.reward_mag},
                           step_penalization=ep.penalty,
                           rho=ep.rho,
                           actionlist=ep.actionlist,
                           rewarded_action=ep.rewarded_action)

In [12]:
#agent = linAC()
agent = convAC()
optimizer = optim.Adam(agent.parameters(), lr=1e-1)

In [14]:
ntrials = 1000
nevents = 250

PL,VL = [], []
rt = []

for trial in range(ntrials):
    env.resetEnvironment()
    ep_reward = 0
    for event in range(nevents):
        state = env.get_observation()
        p, v, choice = select_action(state)
        action = env.action_list[choice][0]
        next_state, reward, done = env.move(action)
        
        agent.rewards.append(reward)
        ep_reward += reward
        if done:
            break
    rt.append(ep_reward)
    if trial%10==0:
        print(f'trial{trial}: R={ep_reward} in {event+1} steps')
    # finish episode
    pl, vl = finish_ep()
    PL.append(pl)
    VL.append(vl)

trial0: R=-2.4999999999999907 in 250 steps


  vl = F.smooth_l1_loss(value, rval)


KeyboardInterrupt: 