# Equations

$ \delta = [R_{t+1} + \gamma Q(s_{t+1}, a_{t+1})] - Q(s_t,a_t) $

$ Q(s_t,a_t) = Q(s_t,a_t) + \alpha\delta $

## Working Memory With SARSA

In [43]:
#from plotly.graph_objs import Scatter, Layout
import matplotlib.pyplot as plt
import plotly
import numpy as np
import random
import hrr
from plotly.graph_objs import Scatter, Layout, Surface
plotly.offline.init_notebook_mode(connected=True)

In [132]:
def TD(nstates,nepisodes,lrate,gamma,td_lambda): 
    n = 1024
    #nstates = 50
    nactions = 2
    nslots = 2
    
    #goal for red is at 0, green at middle
    goal = [0,nstates//2]
    reward = np.zeros((nslots,nstates))
    
    # reward matrix for each context
    for x in range(nslots):
        reward[x,goal[x]] = 1
    
    # basic actions are left and right
    states = hrr.hrrs(n,nstates)
    actions = hrr.hrrs(n,nactions)
    
    # identity vector
    hrr_i = np.zeros(n)
    hrr_i[0] = 1
    
    # WorkingMemory
    wm_slots = hrr.hrrs(n,nslots)
    
    # External representation of color
    ncolors = 2
    colors = hrr.hrrs(n,ncolors)
    
    # add identity vector to colors matrix
    colors = np.row_stack((colors,hrr_i))
    #print("color shape:",colors.shape)
    
    # add identity vector to wm_slots matrix
    wm_slots = np.row_stack((wm_slots,hrr_i))
    
    # precomputed state/action/working_memory triplet
    stateactions = hrr.oconvolve(actions,states)
    s_a_wm = hrr.oconvolve(stateactions,wm_slots)
    #s_a_wm = np.reshape(s_a_wm,(nslots,nstates,nactions,n))
    s_s_a_wm = hrr.oconvolve(s_a_wm,colors)
    s_s_a_wm = np.reshape(s_s_a_wm,(ncolors+1,nslots+1,nstates,nactions,n))
    
    
    # weight vector
    W = hrr.hrr(n)
    bias = 1

    #lrate = 0.1
    eligibility = np.zeros(n)
    #gamma = 0.9
    #td_lambda = 0.5
    epsilon = 0.05
    #nepisodes = 10000
    nsteps = 100
   
    
    for episode in range(nepisodes):
        state = random.randrange(0,nstates)
        #initialize current Working memory
        current_wm = nslots - 1
        
        # cue to signal context
        color_signal = random.randrange(0,ncolors)
        #values = np.dot(s_a_wm[:,state,:,:],W) + bias
        values = np.dot(s_s_a_wm[color_signal,:,state,:,:],W) + bias
        # returns index (row,col) of max value
       
        color_action = np.unravel_index(values.argmax(), values.shape)
        #print(color_action.shape)
        #action = values.argmax()
    
        color = color_action[0]
        action = color_action[1]
        if random.random() < epsilon:
            action = random.randrange(0,nactions)
            
        eligibility = np.zeros(n)
        
        for step in range(nsteps):
            r = reward[color_signal,state]
            if state == goal[color_signal]:
                #eligibility = s_a_wm[color,state,action,:] + td_lambda*gamma*eligibility
                eligibility = s_s_a_wm[color_signal,current_wm,state,action,:] + td_lambda*gamma*eligibility
                error = r - values[current_wm,action]
                W += lrate*error*eligibility
                break
                
            pstate = state
            pvalues = values
            paction = action
            
            previous_wm = current_wm
            
            #eligibility = s_a_wm[color,state,action,:] + td_lambda*gamma*eligibility
            eligibility = s_s_a_wm[color_signal,current_wm,state,action,:] + td_lambda*gamma*eligibility
            state = ((state+np.array([-1,1]))%nstates)[action]
            
            #values = np.dot(s_a_wm[:,state,:,:],W) + bias 
            values = np.dot(s_s_a_wm[color_signal,:,state,:,:],W) + bias
            color_action = np.unravel_index(values.argmax(), values.shape)
            current_wm = color_action[0]
            action = color_action[1]
            #action = values.argmax()
            if random.random() < epsilon:
                action = random.randrange(0,nactions)
                
            error = (r+gamma*values[current_wm,action])-pvalues[paction]
            W += lrate*error*eligibility
            
        
    V1 = list(map(lambda x: np.dot(x,W)+bias, s_s_a_wm[0,0,0,:,:]))
    V2 = list(map(lambda x: np.dot(x,W)+bias, s_s_a_wm[0,0,1,:,:]))
    V3 = list(map(lambda x: np.dot(x,W)+bias, s_s_a_wm[1,1,0,:,:]))
    V4 = list(map(lambda x: np.dot(x,W)+bias, s_s_a_wm[1,1,1,:,:]))
    
    plotly.offline.iplot([
    dict(x=[x for x in range(len(V1))] , y=V1, type='scatter',name='left and red'),
    dict(x=[x for x in range(len(V1))] , y=V2, type='scatter',name='right and red'),
    dict(x=[x for x in range(len(V1))] , y=V3, type='scatter',name='left and green'),
    dict(x=[x for x in range(len(V1))] , y=V4, type='scatter',name='right and green')
    ])

In [133]:
TD(50,10000,.1,.9,.5)
#inputs: nstates,nepisodes,lrate,gamma,td_lambda

color shape: (3, 1024)


ValueError: operands could not be broadcast together with shapes (2,) (1024,) 

# Testing Stuff Below

In [31]:
nslots = 3
nstates = 5
ncolors = 2
n = 10
nactions = 2
bias = 1
# basic actions are left and right
states = hrr.hrrs(n,nstates)
actions = hrr.hrrs(n,nactions)
wm_slots = hrr.hrrs(n,nslots)
colors = hrr.hrrs(n,ncolors)
    
stateactions = hrr.oconvolve(actions,states)
s_a_wm = hrr.oconvolve(stateactions,wm_slots)
s_s_a_wm = hrr.oconvolve(s_a_wm,colors)
s_s_a_wm = np.reshape(s_s_a_wm,(ncolors,nslots,nstates,nactions,n))

W = hrr.hrr(n)
print(s_s_a_wm)

[[[[[  2.77710361e-01  -1.38923364e-01  -3.64956344e-01   9.02234529e-02
       3.44268711e-01  -2.37548228e-02  -9.45448047e-02   5.05515983e-02
      -2.92461853e-01  -2.10651761e-02]
    [ -1.52650195e-01  -2.19966194e-01   1.48672286e-01   2.37240793e-01
      -9.66869392e-02  -1.86226126e-01  -2.02194123e-03  -3.72009952e-02
       1.40017725e-01   2.83722774e-01]]

   [[ -2.91782726e-01  -2.83811969e-01   5.03339004e-01   1.80637922e-01
      -2.82913357e-01  -2.79541901e-01   2.58913565e-01   4.89037539e-02
       9.98663404e-02   2.01095109e-01]
    [ -6.48620230e-02   4.15745746e-01   8.44468283e-02  -3.85951194e-01
      -1.19222796e-01   1.94544048e-01   1.69909313e-01  -7.56270999e-02
      -2.45165645e-02  -2.97245343e-01]]

   [[  1.69800880e-01   2.20687373e-02   1.46427733e-01  -5.35918251e-02
      -2.32499142e-01   1.67990348e-01  -9.38813959e-02  -2.23785553e-01
      -5.68122021e-02   1.34537756e-01]
    [ -1.23430593e-01  -9.18559403e-02  -6.63351615e-02  -7.754293

In [None]:
s_a_wm = np.reshape(s_a_wm,(nslots,nstates,nactions,n))
print(s_a_wm)

In [None]:
state = 3
values = np.dot(s_a_wm[:,state,:,],W) + bias
print(values)
#action = np.argmax(values)
action = np.unravel_index(values.argmax(), values.shape)
print(action)
(nslots,nactions)

In [None]:
s_a_wm.shape

In [None]:
s_a_wm[0]

In [None]:
goal = [0,nstates//2]

In [None]:
goal

In [None]:
reward = np.zeros((2,3))
print (reward)

In [None]:
s_a_wm[0,0,:,:]

In [142]:
states = hrr.hrrs(n,nstates)
actions = hrr.hrrs(n,nactions)

    # identity vector
hrr_i = np.zeros(n)
hrr_i[0] = 1
    
    # WorkingMemory
wm_slots = hrr.hrrs(n,nslots)
    
    # External representation of color
ncolors = 2
colors = hrr.hrrs(n,ncolors)
nslots = 2

    # add identity vector to colors matrix
colors = np.row_stack((colors,hrr_i))
    #print("color shape:",colors.shape)
    
    # add identity vector to wm_slots matrix
wm_slots = np.row_stack((wm_slots,hrr_i))
    
    # precomputed state/action/working_memory triplet
stateactions = hrr.oconvolve(actions,states)
s_a_wm = hrr.oconvolve(stateactions,wm_slots)
    #s_a_wm = np.reshape(s_a_wm,(nslots,nstates,nactions,n))
s_s_a_wm = hrr.oconvolve(s_a_wm,colors)
s_s_a_wm = np.reshape(s_s_a_wm,(ncolors+1,nslots+1,nstates,nactions,n))
    
    
    # weight vector
W = hrr.hrr(n)
bias = 1

In [143]:
color_signal = 0
state = 0
values = np.dot(s_s_a_wm[color_signal,:,state,:,:],W) + bias
        # returns index (row,col) of max value
       
color_action = np.unravel_index(values.argmax(), values.shape)
        #print(color_action.shape)
        #action = values.argmax()
    
color = color_action[0]
action = color_action[1]

In [139]:
print(color_action)
print(values)

(2, 1)
[[ 0.94648173  0.73398013]
 [ 0.75742677  0.99427962]
 [ 0.89761643  1.2950496 ]
 [ 0.8058958   0.91645998]]


In [144]:
s_s_a_wm.shape

(3, 3, 5, 2, 10)