In [None]:
# import packages
import os
from os import listdir
from os.path import isfile, join

import time
from time import gmtime, strftime

import numpy as np
import random

import matplotlib.pyplot as plt
%matplotlib inline 

import imageio

# for more info check http://pytorch.org/docs/master/torch.html
import torch 
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

import pandas as pd

from utils import *

import tensorflow as tf



In [None]:
#### Need to change from TF to torch initialization
# Functions for making tensorflow graph objects for each layer
def add_layer(namescope, inputs, in_size, out_size, activation_function=None):
    """Create one fully connected layer inside the name scope `namescope`.

    `inputs` is cast to float32, multiplied by an [in_size, out_size] weight
    matrix and offset by a bias of length `out_size`.  Both parameters are
    created as zero-initialised `tf.Variable`s named 'weights' and 'biases'.

    Returns the affine result itself when `activation_function` is None,
    otherwise `activation_function` applied to that result.
    """
    layer_input = tf.cast(inputs, tf.float32)
    with tf.name_scope(namescope):
        # Alternative initialisation left by the author:
        #   tf.random_normal([in_size, out_size], mean=0, stddev=0.000001)
        kernel = tf.Variable(tf.zeros([in_size, out_size]), dtype=tf.float32, name='weights')
        offset = tf.Variable(tf.zeros([out_size]), dtype=tf.float32, name='biases')
        affine = tf.matmul(layer_input, kernel) + offset
        output = affine if activation_function is None else activation_function(affine)

    return output

# Define Agent 
class agent(object):
    """Actor-critic network expressed as a TensorFlow 1.x graph.

    A state placeholder feeds an optional stack of sigmoid hidden layers, and
    the last of those (or the input itself when there are none) feeds two
    heads: a softmax `policy` over actions and a linear, single-unit `value`.
    The constructor also builds the actor and critic losses and, per head, an
    op that computes gradients and an op that applies gradients supplied
    through placeholders.

    Parameters
    ----------
    lr : float
        Learning rate of the policy optimiser.
    dims : list of int
        [dimension_of_input, dim_of_hiddenlayer1, ..., dim_of_hiddenlayerN,
        dimension_of_output].  With only two entries the heads are connected
        directly to the input.
    value_lr_scale : float, optional
        The value optimiser uses `value_lr_scale * lr` as its learning rate.
        Defaults to 0.001.

    Attributes used by training code
    --------------------------------
    input_layer, return_holder, action_holder : placeholders to feed.
    policy, value : network outputs ([batch, n_actions] and [batch, 1]).
    actor_loss, critic_loss : scalar losses.
    policy_vars, value_vars : variables updated by each optimiser; both lists
        start with the hidden-layer variables.
    get_pol_grads, get_val_grads : gradient ops for the two losses.
    p_gradient_holders, v_gradient_holders : placeholders for the gradients
        consumed by `update_pol` / `update_val`.
    """

    def __init__(self, lr, dims, value_lr_scale=0.001):
        in_dim = dims[0]
        out_dim = dims[-1]

        # Feed-forward part of the network: the agent takes a state and produces an action.
        with tf.name_scope("Input"):
            self.input_layer = tf.placeholder(dtype=tf.float32, shape=[None, in_dim], name='state')

        # Trainable variables are appended to the graph collection in creation order, so
        # recording the collection length around each group of layers lets us recover
        # exactly the variables this agent created, whatever their scope names are.
        n_before = len(tf.trainable_variables())

        # Hidden layers exist only if dims has entries between input and output.
        last_layer = self.input_layer
        last_dim = in_dim
        for i in range(len(dims) - 2):
            last_layer = add_layer("H_" + str(i + 1), last_layer, dims[i], dims[i + 1],
                                   activation_function=tf.nn.sigmoid)
            last_dim = dims[i + 1]
        n_after_hidden = len(tf.trainable_variables())

        self.policy = add_layer("Policy", last_layer, last_dim, out_dim, activation_function=tf.nn.softmax)
        n_after_policy = len(tf.trainable_variables())
        self.value = add_layer("Value", last_layer, last_dim, 1, activation_function=None)

        # Variable lists telling TensorFlow which variables each loss updates.  The hidden
        # layers feed both heads, so their variables belong to both lists.
        created = tf.trainable_variables()
        hidden_vars = created[n_before:n_after_hidden]
        self.policy_vars = hidden_vars + created[n_after_hidden:n_after_policy]
        self.value_vars = hidden_vars + created[n_after_policy:]
        # list of all variables, not used -- may take out later
        self.tvars = tf.trainable_variables()

        # Training procedure: the return and the chosen action are fed in to compute the
        # losses used to update the network.
        with tf.name_scope("Loss"):
            self.return_holder = tf.placeholder(shape=[None], dtype=tf.float32)
            self.action_holder = tf.placeholder(shape=[None], dtype=tf.int32)

            # Pick, for every row of the policy, the probability of the action actually taken.
            self.indexes = tf.range(0, tf.shape(self.policy)[0]) * tf.shape(self.policy)[1] + self.action_holder
            self.responsible_outputs = tf.gather(tf.reshape(self.policy, [-1]), self.indexes)

            # Reward prediction errors.  `value` is [batch, 1] while the returns are [batch];
            # subtracting them directly would broadcast to a [batch, batch] matrix, so the
            # value is flattened first to get one error per time step.
            self.rpe = self.return_holder - tf.reshape(self.value, [-1])

            # The error only weights the policy gradient; stop_gradient keeps the actor loss
            # from back-propagating into the value estimate through shared hidden layers.
            self.actor_loss = -tf.reduce_mean(tf.log(self.responsible_outputs) * tf.stop_gradient(self.rpe))
            self.critic_loss = 0.5 * tf.reduce_mean(tf.square(self.rpe))

        with tf.name_scope("Training"):
            # Placeholders to carry gradients into the apply-gradients ops, rather than
            # wiring the gradient ops to them directly.
            self.p_gradient_holders = [
                tf.placeholder(tf.float32, name=str(idx) + '_p_holder')
                for idx, _var in enumerate(self.policy_vars)
            ]
            self.v_gradient_holders = [
                tf.placeholder(tf.float32, name=str(idx) + '_v_holder')
                for idx, _var in enumerate(self.value_vars)
            ]

            # Policy: plain gradient descent on the actor loss, policy variables only.
            pol_trainer = tf.train.GradientDescentOptimizer(learning_rate=lr)
            self.get_pol_grads = tf.gradients(self.actor_loss, self.policy_vars)
            self.update_pol = pol_trainer.apply_gradients(zip(self.p_gradient_holders, self.policy_vars))

            # Value: same optimiser type with a scaled learning rate, value variables only.
            val_trainer = tf.train.GradientDescentOptimizer(learning_rate=value_lr_scale * lr)
            self.get_val_grads = tf.gradients(self.critic_loss, self.value_vars)
            self.update_val = val_trainer.apply_gradients(zip(self.v_gradient_holders, self.value_vars))

In [None]:
#========================
# Environment Parameters
#========================
# grid size, handed to gridworld as [height, width]
height, width = 10, 10

# maze layout name (gridworld's maze_type argument)
mazetype = 'room'
# obstacle density (gridworld's rho argument)
obs_rho = 0.2

# place cells (gridworld's num_pc argument)
place_cells = 1000
# place cell full width half max (must be <1)
fwhm = 0.25

# build the environment from the settings above
maze = gridworld(
    [height, width],
    rho=obs_rho,
    num_pc=place_cells,
    pc_fwhm=fwhm,
    maze_type=mazetype,
)

# draw the freshly built maze
plot_grid(maze)
plt.show()


In [None]:
# Show the environment's current state and its start location.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(maze.cur_state)
print(maze.start_loc)


# Scatter the place cells at their (x, y) positions, coloured by their activity
# for the current state; the colour scale is fixed to [0, 1].
plt.scatter(maze.pcs.x, maze.pcs.y, c=maze.pcs.activity(maze.cur_state), cmap = 'jet', vmin = 0, vmax =1, s = 100)#a.field_width)
# y limits are given as [1, 0], i.e. the y axis is drawn inverted
plt.ylim([1,0])
plt.colorbar()
plt.show()

#plt.pcolor(maze.pcs.activity(maze.cur_state).reshape(1,maze.pcs.activity(maze.cur_state).shape[0]), vmin=0, vmax=1, cmap='jet')
#plt.colorbar()

In [None]:
#=====================
# Training Parameters
#=====================
# number of trials in a run, and number of events (steps) in each trial
NUM_TRIALS, NUM_EVENTS = 6000, 300

# discount applied when turning rewards into returns
discount_factor = 0.98
port_shift = 'none'

# learning rate used for gradient descent
eta = 1e-2

In [None]:
# Single run of NUM_TRIALS each with NUM_EVENTS
#
# Builds a fresh graph and agent, then for every trial: rolls the agent through the
# maze for NUM_EVENTS steps, computes discounted returns, evaluates losses and
# gradients on the whole episode, and applies one gradient step per head.
# Results are left in total_reward, total_loss ([actor, critic]) and val_maps.
tf.reset_default_graph()

# input size = length of the state vector, output size = number of actions
dims = [len(maze.net_state[0]),len(maze.actionlist)]

myAgent = agent(lr=eta, dims=dims)
#tf.summary.FileWriter('./outputs/maze3/', graph=tf.get_default_graph())

# fraction of NUM_TRIALS between progress reports
print_freq = 1./10
init = tf.global_variables_initializer()

saveplots = False
if saveplots:
    if mazetype=='none':
        pathvar = './valplots/grid/{}/'.format(obs_rho)
    else:
        pathvar = './valplots/{}/'.format(mazetype)
    if not os.path.exists(pathvar):
        os.makedirs(pathvar)


with tf.Session() as sess:
    steps_rwd = []

    sess.run(init)

    # Evaluating the variables yields arrays with the right shapes; they serve only as
    # templates for the gradient buffers and are zeroed at the start of every trial.
    pGrad_buf = sess.run(myAgent.policy_vars)
    vGrad_buf = sess.run(myAgent.value_vars)

    total_reward = []
    trialtime = []
    total_loss = [[],[]]

    blocktime = time.time()
    val_maps = []

    # The format string advertises a +0000 offset, so the timestamp is taken in UTC.
    print(strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()))
    for i in range(NUM_TRIALS):
        start_time = time.time()

        _ep_buffer = []
        reward_sum = 0

        # reset gradient buffers
        for ix,grad in enumerate(pGrad_buf):
            pGrad_buf[ix] = grad * 0
        for ix,grad in enumerate(vGrad_buf):
            vGrad_buf[ix] = grad * 0

        # reset the environment
        maze.start_trial()
        # NOTE(review): this binds value_map to the same object as init_value_map, so the
        # in-place writes below also change init_value_map and values carry over between
        # trials.  Left as is because later plots appear to rely on that -- confirm intent.
        maze.value_map = maze.init_value_map

        state = maze.net_state

        for j in range(NUM_EVENTS):
            # get policy and value estimate in order to select next action
            tfprob_, val_ = sess.run([myAgent.policy, myAgent.value], feed_dict={myAgent.input_layer:state})

            # both outputs come back with a leading batch axis of length 1
            tfprob = tfprob_[0]
            val = val_[0][0]
            maze.value_map[maze.cur_state[1]][maze.cur_state[0]] = val

            # sample an action index from the policy distribution
            choice = np.random.choice(np.arange(len(tfprob)), 1, p=tfprob)[0]
            action = maze.actionlist[choice]

            # get new state of the environment and reward from action
            next_state = maze.move(action)
            rwd = maze.rwd

            # store the agent's experience so returns can be computed backwards through time
            _ep_buffer.append([state,choice,rwd,next_state, val])

            # update state
            state = next_state

            reward_sum += rwd

            # (early termination on maze.done / filling trialtime is currently disabled)

        # Rows mix arrays and scalars, so the buffer must be an object array; recent NumPy
        # refuses to build such ragged arrays unless dtype=object is given explicitly.
        _ep_buffer = np.array(_ep_buffer, dtype=object)
        # compute returns
        _returns = discount_rwds(_ep_buffer[:,2], gamma=discount_factor)

        feed_dict = {myAgent.return_holder:_returns, myAgent.action_holder:_ep_buffer[:,1], myAgent.input_layer:np.vstack(_ep_buffer[:,0])}

        # Losses and gradients for the whole episode in a single pass over the graph:
        # the returns, the actions taken and the visited states are fed back in so the
        # policy and value are recomputed for the gradient calculation.
        a_loss, c_loss, p_grads, v_grads = sess.run(
            [myAgent.actor_loss, myAgent.critic_loss, myAgent.get_pol_grads, myAgent.get_val_grads],
            feed_dict=feed_dict)

        total_loss[0].append(a_loss)
        total_loss[1].append(c_loss)

        # store gradients in gradient buffers -- not necessary for the current formulation
        # but keeps the update step flexible
        for idx, grad in enumerate(p_grads):
            pGrad_buf[idx] += grad

        for idx, grad in enumerate(v_grads):
            vGrad_buf[idx] += grad

        # list(...) so the two pairings can be concatenated on Python 3 as well
        feed_dict = dict(list(zip(myAgent.p_gradient_holders, pGrad_buf)) + list(zip(myAgent.v_gradient_holders, vGrad_buf)))

        # run gradient update operations
        sess.run([myAgent.update_pol, myAgent.update_val], feed_dict = feed_dict)

        total_reward.append(reward_sum)
        val_maps.append(maze.value_map.copy())
        if saveplots:
            if (i%100 == 0):
                plt.clf()
                current_cmap = plt.cm.get_cmap()
                current_cmap.set_bad(color='white')
                plt.imshow(maze.value_map.copy(), vmin = 0, vmax=36, cmap = 'jet', interpolation='none')

                plt.annotate('*', np.add(maze.rwd_loc, (0, -0)), color='w')
                plt.title('{}'.format(i))
                #plt.gca().invert_yaxis()
                plt.colorbar()
                # saved under the bare trial number (no extension)
                plt.savefig(pathvar+str(i),format='png')

        # print reward measure
        if i==1 or i%(print_freq*NUM_TRIALS)==0 or i == NUM_TRIALS-1:
            print("Trial {0} finished in {1:.3f}. Total reward = {2} (Avg {3:.3f}) Block took {4:.3f}".format(
                i, time.time()-start_time, reward_sum, float(reward_sum)/float(NUM_EVENTS), time.time()-blocktime))
            blocktime = time.time()


In [None]:
# Stitch the saved value-map frames of the current maze into an animated GIF.
# Frames are read from mypath and the animation is written to gifname.
if mazetype =='none':
    mypath = './valplots/grid/{}/'.format(obs_rho)
    gifname = './valplots/gifs/grid{}.gif'.format(obs_rho)
else:
    mypath = './valplots/{}/'.format(mazetype)
    gifname = './valplots/gifs/{}.gif'.format(mazetype)

# Both the frame directory and the GIF's directory must exist before use.
for needed_dir in (mypath, os.path.dirname(gifname)):
    if not os.path.exists(needed_dir):
        os.makedirs(needed_dir)

onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
# Frames are ordered numerically by file name; names that are not plain
# integers (e.g. hidden system files) are skipped instead of crashing int().
intfiles = sorted(int(f) for f in onlyfiles if f.isdigit())

with imageio.get_writer(gifname, mode='I', duration=0.5) as writer:
    for filename in intfiles:
        image = imageio.imread(mypath+str(filename))
        writer.append_data(image)

In [None]:
# Total reward collected in each trial, with a horizontal reference line at
# 0.983 * NUM_EVENTS.  The legend call is what makes the line's label visible.
plt.plot(total_reward, 'b-', label='Total reward')
plt.axhline(y=NUM_EVENTS*(0.983), color='r', linestyle='-', label='Minimum \nPerfect Score')
plt.xlabel('Trial')
plt.ylabel('Total reward')
plt.legend()
#plt.xlim([0,1000])
plt.show()

# Actor loss (blue) and critic loss (red) recorded once per trial.
plt.plot(total_loss[0], 'b', label='Actor loss')
plt.title('gamma = {}'.format(discount_factor))
plt.plot(total_loss[1], 'r', label='Critic loss')
plt.xlabel('Trial')
plt.ylabel('Loss')
plt.legend()
#plt.xlim([5000,6000])
#plt.ylim([0,100])
plt.show()

In [None]:
# Grid of value-map snapshots taken at evenly spaced trials across the run.
plotrows = 4
plotcols = 5
fig, axes = plt.subplots(nrows=plotrows, ncols=plotcols, sharex=True, sharey =True)
# one (fractional) trial index per panel, from the first to the last stored map
items = np.linspace(0, len(val_maps)-1, plotrows*plotcols)

for i, ax in enumerate(axes.flat):
    trial_idx = int(items[i])
    data = val_maps[trial_idx]
    im = ax.imshow(data, vmin = 0, cmap= 'jet', interpolation='None')
    ax.annotate('*', np.add(maze.rwd_loc, (-.5, 1)), color='w')
    ax.set_title('{}'.format(trial_idx))


# leave room on the right for a single colorbar (it uses the last panel's image)
fig.subplots_adjust(right=0.8)
cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
fig.colorbar(im, cax=cbar_ax)
# savefig does not create missing directories
if not os.path.exists('./valplots'):
    os.makedirs('./valplots')
plt.savefig('./valplots/obs{}_valuemap'.format(obs_rho), format='svg')
plt.show()

print(np.nanmax(val_maps))

# Difference between the final value map and the one from a reference trial.
# The reference index is clamped so that runs shorter than 3158 trials still work.
reference_trial = min(3157, len(val_maps)-1)
data = val_maps[-1]- val_maps[reference_trial]
plt.imshow(data, vmin=0, vmax=30, cmap='jet', interpolation='none')
plt.colorbar()
plt.show()

In [None]:
# Functions for making tensorflow graph objects for each layer
def add_layer(inputs, in_size, out_size, activation_function=None, namescope="Layer"):
    """Create one fully connected TensorFlow layer.

    `inputs` is cast to float32, multiplied by a zero-initialised
    [in_size, out_size] 'weights' variable and offset by a zero-initialised
    'biases' variable of length `out_size`.  The result is passed through
    `activation_function` when one is given, otherwise returned as is.

    `namescope` names the tf.name_scope the layer is built in.  The body always
    needed this name but the signature did not provide it, so it is accepted
    here as a trailing keyword with a default.

    NOTE: running this cell rebinds `add_layer` with `inputs` as the first
    argument; code that passes the scope name first (as the `agent` class in
    this notebook does) will not work with this version.
    """
    inputs = tf.cast(inputs, tf.float32)
    with tf.name_scope(namescope):
        #w = tf.random_normal([in_size, out_size], mean=0, stddev=0.000001)
        w = tf.zeros([in_size, out_size])
        Weights = tf.Variable(w, dtype=tf.float32, name='weights')
        biases = tf.Variable(tf.zeros([out_size]), dtype=tf.float32, name='biases')
        Wx_plus_b = tf.matmul(inputs , Weights)+biases
        if activation_function is None :
            output = Wx_plus_b
        else:
            output = activation_function(Wx_plus_b)

    return output


class Net(nn.Module):
    """Fully connected network whose layer widths are listed in `dims`.

    Parameters
    ----------
    dims : list of int, optional
        [input_dim, hidden_1, ..., hidden_N, output_dim].  Defaults to
        [500, 200, 200, 6].  One `nn.Linear` is created for every consecutive
        pair of entries.

    Raises
    ------
    ValueError
        If `dims` has fewer than two entries.
    """

    def __init__(self, dims=None):
        super(Net, self).__init__()
        if dims is None:
            dims = [500, 200, 200, 6]
        if len(dims) < 2:
            raise ValueError("dims must contain at least an input and an output size")

        # An nn.ModuleList (rather than a plain Python list) is required for the
        # layers' parameters to be registered with this module.
        # an affine operation per layer: y = Wx + b
        self.layers = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(dims[:-1], dims[1:])]
        )

    def forward(self, x):
        """Map a batch `x` to the network output.

        `x` must have a leading batch dimension; all remaining dimensions are
        flattened.  Every layer except the last is followed by a ReLU; the
        last layer's output is returned without an activation.
        """
        x = x.view(-1, self.num_flat_features(x))
        all_layers = list(self.layers)
        for layer in all_layers[:-1]:
            x = F.relu(layer(x))
        return all_layers[-1](x)

    def num_flat_features(self, x):
        """Return the number of elements per sample, i.e. the product of all
        dimensions of `x` except the first (batch) one."""
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features


net = Net()
print(net)

In [None]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Fix the seed so the random data, the initial weights and therefore the
# reported losses are the same on every run.
torch.manual_seed(0)

# Random tensors holding the inputs and the regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# nn.Sequential applies the contained modules in order; each Linear module
# holds its own weight and bias.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# Summed (not averaged) squared error; reduction='sum' is the current spelling
# of the deprecated size_average=False.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
num_steps = 500
# report progress only every report_every steps to keep the cell output short
report_every = 100
# loss value of every step, kept for inspection or plotting
loss_history = []
for t in range(num_steps):
    # Forward pass: calling the model on x produces the predictions.
    y_pred = model(x)

    # loss is a 0-dimensional tensor; .item() extracts its Python float
    # (indexing it with [0] is an error on current PyTorch).
    loss = loss_fn(y_pred, y)
    loss_history.append(loss.item())
    if t % report_every == 0 or t == num_steps - 1:
        print("{} {}".format(t, loss_history[-1]))

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: gradients of the loss with respect to all learnable
    # parameters of the model.
    loss.backward()

    # Plain gradient-descent step; no_grad keeps the in-place update itself
    # out of the autograd graph.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

In [None]:
# Layer sizes in order: input, hidden layers, output.
dims = [500, 200, 200, 6]
in_dim = dims[0]
out_dim = dims[-1]
# Everything strictly between the first and last entry.  Slicing yields an
# empty list when there are no hidden layers, so hidden_dims is always defined.
hidden_dims = dims[1:-1]

In [None]:
# Bare expression: displays the hidden-layer sizes as this cell's output.
hidden_dims