In [1]:
from __future__ import print_function

import os
import random
import time
from collections import namedtuple
from time import gmtime, strftime

import matplotlib as mpl
import matplotlib.cm as cmx
import matplotlib.colors as colors
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
from matplotlib.collections import PatchCollection
from torch import autograd, optim, nn
from torch.autograd import Variable
from torch.distributions import Categorical

from utils import *
%matplotlib inline

In [2]:
# set up network
class AC_Net(nn.Module):
    """Actor-critic network: a shared ReLU hidden stack feeding two linear heads.

    Parameters
    ----------
    dims : list of int
        Layer sizes ``[input_dim, hidden_1, ..., hidden_N, output_dim]``.
        With exactly two entries there are no hidden layers and both heads
        read the input directly.

    Attributes
    ----------
    layers : the ``dims`` list as passed in.
    hidden / h_layers : the hidden ``nn.Linear`` layers, as a plain list and
        as the registered ``nn.ModuleList`` (both empty when there are none).
    p_in : policy head, ``dims[-2] -> dims[-1]``.
    v_in : value head, ``dims[-2] -> 1``.
    saved_actions, rewards : per-trial buffers filled by the training code.

    Raises
    ------
    ValueError
        If ``dims`` has fewer than two entries.
    """

    def __init__(self, dims):
        super(AC_Net, self).__init__()
        if len(dims) < 2:
            raise ValueError(
                "dims must contain at least an input and an output size, got {!r}".format(dims))
        self.layers = dims

        # Always create the containers (possibly empty) so forward() needs no special case.
        self.hidden = [nn.Linear(dims[i], dims[i + 1]) for i in range(len(dims) - 2)]
        self.h_layers = nn.ModuleList(self.hidden)

        self.p_in = nn.Linear(dims[-2], dims[-1])
        self.v_in = nn.Linear(dims[-2], 1)

        # Small-variance weights and zero biases for every linear layer.
        for m in self.modules():
            if isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.001)
                m.bias.data.zero_()

        self.saved_actions = []
        self.rewards = []

    def forward(self, x):
        """Return ``(policy, value)`` for input ``x``.

        ``policy`` is a softmax over the last axis of the policy head's output,
        so both batched ``(N, input_dim)`` and unbatched ``(input_dim,)`` inputs
        are accepted. ``value`` is the raw output of the value head.
        """
        for layer in self.h_layers:
            x = F.relu(layer(x))
        # dim=-1 equals the previous dim=1 for 2-D input and also handles 1-D input.
        pol = F.softmax(self.p_in(x), dim=-1)
        val = self.v_in(x)

        return pol, val

# Per-step record kept for the end-of-trial update: the log-probability of the
# sampled action and the value estimate produced for that state.
SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])


def select_action(state):
    """Sample an action from the module-level ``model`` for ``state``.

    Runs ``model(state)``, samples from a Categorical distribution over the
    returned policy, and appends a ``SavedAction`` (log-probability of the
    sample, value output) to ``model.saved_actions``.

    Returns a 3-tuple holding the first entry of the sampled action's data,
    of the policy's data and of the value's data, in that order.
    """
    probs, value_estimate = model(state)
    dist = Categorical(probs)
    sampled = dist.sample()
    record = SavedAction(log_prob=dist.log_prob(sampled), value=value_estimate)
    model.saved_actions.append(record)
    return sampled.data[0], probs.data[0], value_estimate.data[0]


def finish_trial():
    """Run one optimisation step from the actions and rewards saved on ``model``.

    Reads the module-level ``model``, ``optimizer`` and ``discount_factor`` and
    calls ``discount_rwds`` (not defined in this cell — presumably from
    ``utils``) on ``model.rewards``. For every saved step the policy loss is
    ``-log_prob * (return - value)`` with the value treated as a constant, and
    the value loss is the smooth L1 loss between the value output and the
    return. The summed loss is backpropagated, the optimizer is stepped, and
    both buffers on ``model`` are cleared.

    Does nothing except clear the buffers when no actions were saved.
    Returns None.
    """
    saved_actions = model.saved_actions
    if not saved_actions:
        # Nothing to learn from; avoid stacking an empty list below.
        del model.rewards[:]
        return

    returns_ = discount_rwds(np.asarray(model.rewards), gamma=discount_factor)
    returns_ = torch.Tensor(returns_)

    policy_losses = []
    value_losses = []
    for (log_prob, value), r in zip(saved_actions, returns_):
        r = float(r)
        # .data keeps the value estimate out of the policy-gradient graph.
        rpe = r - value.data[0, 0]
        policy_losses.append(-log_prob * rpe)
        # Give the target the same shape as the value output instead of relying on broadcasting.
        target = Variable(torch.Tensor([r])).expand_as(value)
        value_losses.append(F.smooth_l1_loss(value, target))

    optimizer.zero_grad()
    # stack (not cat): smooth_l1_loss returns 0-dim tensors, which cat rejects.
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
    loss.backward()
    optimizer.step()
    del model.rewards[:]
    del model.saved_actions[:]

In [3]:
#========================
# Environment Parameters
#========================
# grid dimensions handed to gridworld as [height, width]
height, width = 15, 15

# maze layout name (passed as maze_type) and obstacle density (passed as rho)
mazetype, obs_rho = 'none', 0

# place cells: how many, and their full width half max (must be <1)
place_cells, fwhm = 700, .2

# make environment
maze = gridworld(
    [height, width],
    rho=obs_rho,
    num_pc=place_cells,
    pc_fwhm=fwhm,
    maze_type=mazetype,
)

# NOTE(review): the meaning of the positional flags (1, 0, 0) is defined by
# make_env_plots, which is not visible in this notebook.
make_env_plots(maze, 1, 0, 0)

TypeError: object of type 'zip' has no len()

In [None]:
#==================================
# Set up Data Recording Structures
#==================================
# NOTE(review): NUM_TRIALS, NUM_EVENTS, model, optimizer and discount_factor are
# not defined in the cells above — presumably they come from `utils` or a setup
# cell that is missing from this notebook; confirm before running.
print_freq = 1./10  # progress is printed every print_freq * NUM_TRIALS trials
total_reward = [] #track total reward achieved in trial
total_loss = [[],[]] #[actor loss, critic loss]

# data frames for value and policy maps
val_maps = []
pol_maps = np.zeros((height, width), dtype=[('action_taken', 'i4'),('taken_prob', 'f4'), ('likely_action', 'i4'),('likely_prob', 'f4'), ('timestep', 'i4')])

# record current time before beginning of trial
# gmtime() so the printed "+0000" offset is actually true
print("Run started: ", strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()))
runtime= time.time()
blocktime = time.time()
for trial in range(NUM_TRIALS):
    start_time = time.time()

    reward_sum = 0

    # reset the environment, initialize agent position and reset value map
    maze.start_trial()
    # NOTE(review): this binds value_map to the same object as init_value_map, so the
    # writes below also land in init_value_map and nothing is reset from the second
    # trial on. If a fresh map per trial is intended, copy init_value_map here.
    maze.value_map = maze.init_value_map

    # get inital state
    state = Variable(torch.FloatTensor(maze.net_state))
    for event in range(NUM_EVENTS):
        # get policy and value estimate in order to select next action
        # (select_action reads the module-level `model`; it takes only the state)
        choice, policy, value = select_action(state)

        # select action
        action = maze.actionlist[choice]
        choice_prob = policy[choice]

        # most probable action under the current policy
        policy_list = list(policy)
        likely_prob = max(policy_list)
        likely_action = policy_list.index(likely_prob)

        maze.value_map[maze.cur_state[1]][maze.cur_state[0]] = value
        pol_maps[maze.cur_state[1]][maze.cur_state[0]] = (choice, choice_prob, likely_action, likely_prob, trial)

        # get new state of the environment and reward from action
        next_state = maze.move(action)

        model.rewards.append(maze.rwd)
        # update state
        state = Variable(torch.FloatTensor(next_state))

        reward_sum += maze.rwd

    total_reward.append(reward_sum)

    # finish_trial reads model, optimizer and discount_factor from module scope
    finish_trial()
    val_maps.append(maze.value_map.copy())
    # print reward measure
    if trial==1 or trial%(print_freq*NUM_TRIALS)==0 or trial == NUM_TRIALS-1:
        print ("Trial {0} finished w total reward = {1} (Avg {2:.3f})".format(trial, reward_sum, float(reward_sum)/float(NUM_EVENTS)), "Block took {0:.3f}".format(time.time()-blocktime))
        blocktime = time.time()

print("Run took {0:.3f}".format(time.time()-runtime))

In [None]:
# Learning curve: total_reward holds one summed reward per trial.
fig, ax = plt.subplots()
ax.plot(total_reward)
ax.set_xlabel('Trial')
ax.set_ylabel('Total reward')
ax.set_title('Total reward per trial')
plt.show()

In [None]:
# Append a copy of the maze's current value map to val_maps.
# NOTE(review): the training cell already appends a copy after every trial, and
# this cell adds one more entry each time it is executed — confirm that is intended.
val_maps.append(maze.value_map.copy())

In [None]:
# Grid of value-map snapshots sampled evenly across the recorded maps.
plotrows = 4
plotcols = 5
fig, axes = plt.subplots(nrows=plotrows, ncols=plotcols, sharex=True, sharey =True)
# evenly spaced (fractional) positions in val_maps; truncated to int when used
items = np.linspace(0, len(val_maps)-1, plotrows*plotcols)

for i, ax in enumerate(axes.flat):
    snapshot_idx = int(items[i])
    data = val_maps[snapshot_idx]
    # fixed colour range so all panels share one scale
    im = ax.pcolor(data, cmap= 'Spectral_r', vmin=0, vmax=50)
    im.cmap.set_under('white')
    ax.axis('off')
    ax.annotate('*', np.add(maze.rwd_loc, (-0, 1.5)), color='k')
    ax.set_title('{}'.format(snapshot_idx))

# the y axis is shared, so inverting one panel inverts them all
axes[0,0].invert_yaxis()

fig.subplots_adjust(right=0.8)
cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
fig.colorbar(im, cax=cbar_ax)

# savefig does not create missing directories, so make sure the target exists
fig_dir = './figures'
if not os.path.isdir(fig_dir):
    os.makedirs(fig_dir)
if mazetype == 'none':
    fig_path = '{}/grid_obs{}_valuemap.svg'.format(fig_dir, obs_rho)
else:
    fig_path = '{}/{}_valuemap.svg'.format(fig_dir, mazetype)
plt.savefig(fig_path, format='svg')
plt.show()


# Last recorded value map, colour range scaled to its own (NaN-ignoring) min and max
data = val_maps[-1]
plt.imshow(data, vmin=np.nanmin(data), vmax=np.nanmax(data), cmap='jet', interpolation='none')

plt.colorbar()
plt.show()

In [None]:
# Build the model and an Adam optimizer (learning rate 3e-2) over its parameters.
# NOTE(review): `Policy` is not defined in any cell shown in this notebook —
# presumably it comes from `utils` or from the script this cell was adapted from;
# confirm before running. Running this cell rebinds the global `model` and `optimizer`.
model = Policy()
optimizer = optim.Adam(model.parameters(), lr=3e-2)


def select_action(state):
    """Sample an action for a numpy ``state`` using the module-level ``model``.

    The state is converted to a float tensor with a leading axis of size one,
    passed through ``model``, and an action is sampled from a Categorical
    distribution over the returned probabilities. A ``SavedAction`` (log-prob
    of the sample, state value) is appended to ``model.saved_actions``.

    Returns the first entry of the sampled action's data.

    NOTE: this rebinds the name ``select_action`` that an earlier cell defines
    with a tensor argument and a three-value return.
    """
    batch = torch.from_numpy(state).float().unsqueeze(0)
    action_probs, value_estimate = model(Variable(batch))
    dist = Categorical(action_probs)
    chosen = dist.sample()
    model.saved_actions.append(SavedAction(dist.log_prob(chosen), value_estimate))
    return chosen.data[0]


def finish_episode():
    """Run one optimisation step from the episode stored on the global ``model``.

    Computes discounted returns from ``model.rewards`` using ``args.gamma``,
    standardises them (subtract mean, divide by std plus float32 epsilon), and
    forms for each saved step a policy loss ``-log_prob * (return - value)``
    (value treated as a constant) and a smooth L1 value loss. The summed loss
    is backpropagated, the module-level ``optimizer`` is stepped, and both
    buffers on ``model`` are cleared.

    Does nothing except clear the buffers when no actions were saved.
    Returns None.
    """
    saved_actions = model.saved_actions
    if not saved_actions:
        # Nothing to learn from; avoid stacking an empty list below.
        del model.rewards[:]
        return

    # Accumulate returns back-to-front, then flip once (avoids repeated insert(0, ...)).
    R = 0
    discounted = []
    for r in reversed(model.rewards):
        R = r + args.gamma * R
        discounted.append(R)
    discounted.reverse()

    rewards = torch.Tensor(discounted)
    rewards = (rewards - rewards.mean()) / (rewards.std() + float(np.finfo(np.float32).eps))

    policy_losses = []
    value_losses = []
    for (log_prob, value), r in zip(saved_actions, rewards):
        r = float(r)
        # .data keeps the value estimate out of the policy-gradient graph.
        reward = r - value.data[0, 0]
        policy_losses.append(-log_prob * reward)
        # Give the target the same shape as the value output instead of relying on broadcasting.
        target = Variable(torch.Tensor([r])).expand_as(value)
        value_losses.append(F.smooth_l1_loss(value, target))

    optimizer.zero_grad()
    # stack (not cat): smooth_l1_loss returns 0-dim tensors, which cat rejects.
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
    loss.backward()
    optimizer.step()
    del model.rewards[:]
    del model.saved_actions[:]


def main():
    """Train episode after episode until the running reward passes the env threshold.

    Uses the module-level ``env``, ``args`` and ``model``. Each episode runs at
    most 10000 steps, appending every reward to ``model.rewards``, then calls
    ``finish_episode()``. ``running_reward`` is an exponential moving average
    (weights 0.99 / 0.01) of the index of the last step of each episode.
    Progress is printed every ``args.log_interval`` episodes.
    """
    running_reward = 10
    # `count` is presumably itertools.count — it is not imported in the visible cells; TODO confirm
    for episode in count(1):
        state = env.reset()
        for step in range(10000):  # Don't infinite loop while learning
            state, reward, done, _ = env.step(select_action(state))
            if args.render:
                env.render()
            model.rewards.append(reward)
            if done:
                break

        running_reward = 0.99 * running_reward + 0.01 * step
        finish_episode()

        should_log = episode % args.log_interval == 0
        if should_log:
            print('Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'.format(
                episode, step, running_reward))

        if not (running_reward > env.spec.reward_threshold):
            continue
        print("Solved! Running reward is now {} and "
              "the last episode runs to {} time steps!".format(running_reward, step))
        return


# Script-style entry point: calls main() when __name__ is '__main__'.
# NOTE(review): inside a notebook kernel __name__ is normally '__main__', so
# executing this cell would start main() immediately — confirm that is wanted.
if __name__ == '__main__':
    main()

In [None]:
# set up network
# Scratch check: push one random input vector through a single linear layer
# and turn the outputs into a probability distribution.
n_in = 500
n_out = 6
dims = [n_in, n_out]

in_layer = Variable(torch.rand(n_in))
W = nn.Linear(n_in, n_out)
# in_layer is 1-D, so normalise over its only axis; passing dim explicitly
# avoids the implicit-dimension deprecation warning.
policy = F.softmax(W(in_layer), dim=0)
# the result is bound to `policy`; `out_layer` was never defined
print(policy)