In [1]:
# Create a folder to store results
import pathlib
# exist_ok=True keeps this cell re-runnable: a bare mkdir() raises
# FileExistsError as soon as the folder already exists.
pathlib.Path('./Full_SR').mkdir(exist_ok=True)

In [1]:
import numpy as np
import random as rd

# Print floats in plain (non-scientific) notation and never wrap array rows.
np.set_printoptions(suppress=True, linewidth=np.inf)

# The simulations below draw from NumPy's global random generator
# (np.random.choice / uniform / randint). Seeding it here makes a fresh
# "Restart & Run All" reproduce the same numbers; change SEED for another run.
SEED = 0
np.random.seed(SEED)

In [2]:
def list_flatten(list):
    """Concatenate the rows of a nested (possibly ragged) list into one flat list.

    Row order and the order of items within each row are preserved.
    """
    flat = []
    for row in list:
        flat.extend(row)
    return flat

def get_flattened_index(list, row, item):
    """Return the position that ``list[row][item]`` has in the flattened list.

    The result is the combined length of all rows before ``row`` plus ``item``.
    """
    # Number of entries contributed by the rows that precede `row`.
    offset = sum(len(list[r]) for r in range(row))
    return offset + item

In [35]:
# function for one episode
def fullSR1(gamma, alpha, explore_chance, end_states, start_states, rewards, transitions, num_pairs, init_sr, init_weight, state_list, action_list, RPE_list, weight_list): # weight_list=[[],[],[], ..., []]
    """Run one episode of successor-representation learning over (state, action) pairs.

    States are 1-based in the arguments and converted to 0-based indices
    internally. Each (state, action) pair is addressed by its position in the
    row-wise flattening of ``transitions`` / ``rewards`` (see
    ``get_flattened_index``).

    Parameters
    ----------
    gamma : float
        Discount applied to the successor pair's SR row in the SR update.
    alpha : float
        Step size used for both the reward-weight update and the SR update.
    explore_chance : float
        Probability of picking a random action, and (drawn independently) of
        bootstrapping the SR from a random successor action instead of the
        highest-valued one.
    end_states, start_states : list of int
        1-based terminal states and candidate start states.
    rewards, transitions : list of list
        Per state, the reward and the 1-based next state of every action.
    num_pairs : int
        Total number of (state, action) pairs.
    init_sr, init_weight : array-like
        Starting SR matrix (num_pairs x num_pairs) and reward weights
        (num_pairs). They are copied, not modified.
    state_list, action_list, RPE_list : list
        Logs appended to in place, one entry per step: the 1-based current
        state, the 1-based next state, and the reward prediction error.
    weight_list : list of list
        ``weight_list[k]`` receives the value of weight ``k`` after every step.

    Returns
    -------
    tuple
        ``(weight, feat, state_list, action_list, RPE_list, timestep_list,
        weight_list)`` where ``weight`` and ``feat`` are the updated copies and
        ``timestep_list`` holds the 1-based step numbers of this episode.
    """
    # Work on private copies so the caller's starting arrays are not modified
    # in place; the learned values are handed back through the return value.
    weight = np.copy(init_weight)
    feat = np.copy(init_sr)  # feature matrix: one Successor Representation row per pair

    # Value of every pair: its SR row weighted by the learned rewards.
    v_state = [np.sum(weight * feat[k]) for k in range(num_pairs)]

    current_state = np.random.choice(start_states) - 1
    end_states_adjusted = [i - 1 for i in end_states]
    timestep_list = []
    time_step = 1

    while current_state not in end_states_adjusted:
        # Choose the action: a random one with probability explore_chance,
        # otherwise the action whose pair currently has the highest value.
        if np.random.uniform() < explore_chance:
            next_move = np.random.randint(len(transitions[current_state]))
        else:
            first_pair = get_flattened_index(transitions, current_state, 0)
            next_values = v_state[first_pair:(first_pair + len(transitions[current_state]))]
            next_move = np.argmax(next_values)

        next_state = transitions[current_state][next_move] - 1

        # Pick the successor pair (an action taken from the NEXT state) that the
        # SR update bootstraps from. Occasionally a random successor is assumed
        # instead of the best one, so the SR reflects all possible successors
        # while still weighting the highest-value ones more strongly. This is
        # important for the policy revaluation condition.
        succ_first = get_flattened_index(transitions, next_state, 0)
        succ_count = len(transitions[next_state])
        best_next_move = np.argmax(v_state[succ_first:(succ_first + succ_count)]) + succ_first
        # The random candidate is always drawn (even if unused) so the sequence
        # of random numbers consumed per step stays fixed.
        random_next_move = np.random.randint(succ_count) + succ_first
        if np.random.uniform() < explore_chance:
            next_move_one_hot = random_next_move
        else:
            next_move_one_hot = best_next_move

        # Reward-weight update for the pair just taken; the error doubles as the RPE.
        reward = rewards[current_state][next_move]
        reward_index = get_flattened_index(rewards, current_state, next_move)
        weight_delta = reward - weight[reward_index]
        weight[reward_index] += alpha * weight_delta

        # SR update, based on the current state AND the chosen move:
        # target = one-hot of the current pair + gamma * SR row of the successor pair.
        pair_index = get_flattened_index(transitions, current_state, next_move)
        one_hot = np.zeros(num_pairs)
        one_hot[pair_index] = 1
        feat_delta = one_hot + gamma * feat[next_move_one_hot] - feat[pair_index]
        feat[pair_index] += alpha * feat_delta

        # Refresh the pair values with the updated weights and SR.
        for k in range(num_pairs):
            v_state[k] = np.sum(weight * feat[k])

        # Log this step.
        state_list.append(current_state + 1)
        action_list.append(next_state + 1)
        RPE_list.append(weight_delta)
        timestep_list.append(time_step)
        for k in range(num_pairs):
            weight_list[k].append(weight[k])

        # Move to the next state
        current_state = next_state
        time_step += 1

    return weight, feat, state_list, action_list, RPE_list, timestep_list, weight_list

# function for multi episodes
def fullSR2(epi_num, gamma, alpha, explore_chance, end_states, start_states, rewards, transitions, num_pairs, init_sr, init_weight, state_list, action_list, RPE_list, weight_list, epi_num_list):
    """Run ``epi_num`` consecutive episodes of ``fullSR1``, carrying learning forward.

    The weights and SR returned by one episode are the starting point of the
    next. All other arguments are forwarded to ``fullSR1`` unchanged.

    Parameters
    ----------
    epi_num : int
        Number of episodes to run. With 0 the inputs are returned unchanged.
    epi_num_list : list
        Log appended to in place: the 1-based episode number, once per step.
    (remaining parameters)
        See ``fullSR1``.

    Returns
    -------
    tuple
        ``(weight, feat, state_list, action_list, RPE_list, weight_list,
        epi_num_list, epi_length)``. ``epi_length`` is a fresh list that holds
        the 1-based episode number once per step taken in this call, so its
        length is the total number of steps.
    """
    epi_length = []

    # Start from the inputs so that epi_num == 0 returns them as-is instead of
    # failing on names that are only bound inside the loop.
    c_weight, c_feat = init_weight, init_sr
    c_state_list, c_action_list, c_RPE_list, c_weight_list = state_list, action_list, RPE_list, weight_list

    for k in range(epi_num):
        c_weight, c_feat, c_state_list, c_action_list, c_RPE_list, timestep_list, c_weight_list = fullSR1(
            gamma, alpha, explore_chance, end_states, start_states, rewards, transitions, num_pairs,
            c_feat, c_weight, c_state_list, c_action_list, c_RPE_list, c_weight_list)

        # Tag every step of this episode with its 1-based episode number.
        episode_tags = [k + 1] * len(timestep_list)
        epi_num_list.extend(episode_tags)
        epi_length.extend(episode_tags)

    return c_weight, c_feat, c_state_list, c_action_list, c_RPE_list, c_weight_list, epi_num_list, epi_length


# function for multi simulations
def fullSR3(sim_num, epi_num, gamma, alpha, end_states, start_states, rewards, transitions, num_pairs, state_list, action_list, RPE_list, weight_list, epi_num_list, explore_chance=None):
    """Run ``sim_num`` independent simulations of ``epi_num`` episodes each.

    Every simulation starts from all-zero reward weights and an all-zero SR
    matrix; the log lists keep accumulating across simulations.

    Parameters
    ----------
    sim_num : int
        Number of simulations. With 0 nothing is run and zero arrays are returned.
    explore_chance : float, optional
        Exploration probability forwarded to ``fullSR2``. When omitted, the
        module-level ``explore_chance`` variable is used, which is what this
        function always did before the parameter existed.
    (remaining parameters)
        See ``fullSR2`` / ``fullSR1``.

    Returns
    -------
    tuple
        ``(weight, feat, state_list, action_list, RPE_list, weight_list,
        epi_num_list, sim_num_list)``. ``weight`` and ``feat`` come from the
        last simulation; ``sim_num_list`` holds the 1-based simulation number
        once per step.
    """
    if explore_chance is None:
        # Backward compatibility: fall back to the notebook-level setting.
        try:
            explore_chance = globals()['explore_chance']
        except KeyError:
            raise NameError("name 'explore_chance' is not defined") from None

    sim_num_list = []

    # Defaults returned when sim_num == 0 (the loop below never binds them then).
    c_weight = np.zeros(num_pairs)
    c_feat = np.zeros((num_pairs, num_pairs))
    c_state_list, c_action_list, c_RPE_list = state_list, action_list, RPE_list
    c_weight_list, c_epi_num_list = weight_list, epi_num_list

    # Simulation
    for t in range(sim_num):
        # Allocate fresh starting values per simulation. fullSR1 as originally
        # written updates its starting arrays in place, so a single SR matrix
        # shared across iterations would carry learning from one simulation
        # into the next while the weights were reset.
        init_sr = np.zeros((num_pairs, num_pairs))
        init_weight = np.zeros(num_pairs)

        c_weight, c_feat, c_state_list, c_action_list, c_RPE_list, c_weight_list, c_epi_num_list, epi_length = fullSR2(
            epi_num, gamma, alpha, explore_chance, end_states, start_states, rewards, transitions, num_pairs,
            init_sr, init_weight, c_state_list, c_action_list, c_RPE_list, c_weight_list, c_epi_num_list)

        # One simulation tag per step taken in this simulation.
        sim_num_list.extend([t + 1] * len(epi_length))

    return c_weight, c_feat, c_state_list, c_action_list, c_RPE_list, c_weight_list, c_epi_num_list, sim_num_list

In [36]:
# --- Learning settings ---
sim_num, epi_num = 1, 200
gamma = 0.97          # discount used in the SR update
alpha = 0.50          # step size for the weight and SR updates
explore_chance = 0.5  # probability of a random (exploratory) choice

# --- Base task (states are numbered from 1) ---
# rewards_base[s] / transitions_base[s] list, per action available in state s+1,
# the reward received and the state reached.
end_states_base = [10, 11, 12, 13]
start_states_base = [1]
rewards_base = [[0, 0], [0, 0], [0, 0], [15], [0], [30], [0], [0], [0], [0], [0], [0]]
transitions_base = [[2, 3], [4, 5], [5, 6], [7], [8], [9], [10], [11], [12], [13], [13], [13]]
num_pairs = len(list_flatten(rewards_base))

# --- Starting values: all-zero SR matrix and reward weights ---
init_sr_base = np.zeros((num_pairs, num_pairs))
init_weight_base = np.zeros(num_pairs)

# --- Empty logs, filled in place by the learning functions ---
state_list, action_list, RPE_list, epi_num_list = [], [], [], []
weight_list = [[] for _ in range(num_pairs)]

In [37]:
# Learn the base task from the all-zero starting values.
rl_base = fullSR2(
    epi_num, gamma, alpha, explore_chance,
    end_states_base, start_states_base, rewards_base, transitions_base, num_pairs,
    init_sr_base, init_weight_base,
    state_list, action_list, RPE_list, weight_list, epi_num_list,
)

In [48]:
# --- Base task for the policy revaluation condition ---
# Same transition structure as the base task; the reward of 15 sits on the
# action out of state 5 instead of state 4.
end_states_policy_base = [10, 11, 12, 13]
start_states_policy_base = [1]
rewards_policy_base = [[0, 0], [0, 0], [0, 0], [0], [15], [30], [0], [0], [0], [0], [0], [0]]
transitions_policy_base = [[2, 3], [4, 5], [5, 6], [7], [8], [9], [10], [11], [12], [13], [13], [13]]
num_pairs = len(list_flatten(rewards_policy_base))

# --- Starting values: all-zero SR matrix and reward weights ---
init_sr_policy_base = np.zeros((num_pairs, num_pairs))
init_weight_policy_base = np.zeros(num_pairs)

# --- Logs are reset here, so they no longer reference the base-task logs ---
state_list, action_list, RPE_list, epi_num_list = [], [], [], []
weight_list = [[] for _ in range(num_pairs)]

In [49]:
# Learn the policy-revaluation base task from the all-zero starting values.
rl_policy_base = fullSR2(
    epi_num, gamma, alpha, explore_chance,
    end_states_policy_base, start_states_policy_base, rewards_policy_base, transitions_policy_base, num_pairs,
    init_sr_policy_base, init_weight_policy_base,
    state_list, action_list, RPE_list, weight_list, epi_num_list,
)
# The first two entries of the result are the learned weights and the learned
# SR; they become the starting point for policy revaluation.
init_weight_policy_base, init_sr_policy_base = rl_policy_base[:2]

In [38]:
# Value of each (state, action) pair after base learning:
# its SR row (rl_base[1][k]) weighted by the learned rewards (rl_base[0]).
v_state_base = np.zeros(num_pairs)
for k in range(num_pairs):
    v_state_base[k] = (rl_base[0] * rl_base[1][k]).sum()

# Only the two actions available in the first state are shown.
print(np.around(v_state_base[:2], 3))

[13.555 22.931]


In [50]:
# Re-Learning Phase
init_weight_base = rl_base[0]
init_sr_base = rl_base[1]
relearning_episodes = 200
relearning_start_states = [2, 3]


def run_relearning(init_weight, init_sr, rewards, transitions):
    """Run the re-learning episodes for one revaluation condition.

    Every condition uses the same learning settings, end states, start states
    and the same notebook-level log lists; only the starting weights / SR and
    the task's rewards / transitions differ between conditions.
    """
    # NOTE(review): all conditions append to the same state_list, action_list,
    # RPE_list, weight_list and epi_num_list, so those logs mix the conditions.
    return fullSR2(relearning_episodes, gamma, alpha, explore_chance, end_states_base,
                   relearning_start_states, rewards, transitions, num_pairs,
                   init_sr, init_weight,
                   state_list, action_list, RPE_list, weight_list, epi_num_list)


# Reward Revaluation: the reward out of state 4 changes from 15 to 45
init_weight_reward = init_weight_base.copy()
init_sr_reward = np.copy(init_sr_base)
rewards_reward = [[0, 0], [0, 0], [0, 0], [45], [0], [30], [0], [0], [0], [0], [0], [0]]
rl_reward = run_relearning(init_weight_reward, init_sr_reward, rewards_reward, transitions_base)

# Transition Revaluation: the successors of states 2 and 3 are swapped
init_weight_transition = init_weight_base.copy()
init_sr_transition = np.copy(init_sr_base)
transitions_transition = [[2, 3], [5, 6], [4, 5], [7], [8], [9], [10], [11], [12], [13], [13], [13]]
rl_transition = run_relearning(init_weight_transition, init_sr_transition, rewards_base, transitions_transition)

# Policy Revaluation: starts from the policy base task; a reward of 45 is added out of state 4
init_weight_policy = init_weight_policy_base.copy()
init_sr_policy = np.copy(init_sr_policy_base)
rewards_policy = [[0, 0], [0, 0], [0, 0], [45], [15], [30], [0], [0], [0], [0], [0], [0]]
rl_policy = run_relearning(init_weight_policy, init_sr_policy, rewards_policy, transitions_base)

# Goal State Revaluation: a reward of 30 is added out of state 7
init_weight_goal = init_weight_base.copy()
init_sr_goal = np.copy(init_sr_base)
rewards_goal = [[0, 0], [0, 0], [0, 0], [15], [0], [30], [30], [0], [0], [0], [0], [0]]
rl_goal = run_relearning(init_weight_goal, init_sr_goal, rewards_goal, transitions_base)

# Control: a reward of 45 is added out of state 9
init_weight_control = init_weight_base.copy()
init_sr_control = np.copy(init_sr_base)
rewards_control = [[0, 0], [0, 0], [0, 0], [15], [0], [30], [0], [0], [45], [0], [0], [0]]
rl_control = run_relearning(init_weight_control, init_sr_control, rewards_control, transitions_base)

In [43]:
# The first two entries of the base result are the learned reward weights
# and the learned SR matrix.
weight_base, feat_base = rl_base[:2]

print(weight_base)
print(np.around(feat_base, 2))

[ 0.  0.  0.  0.  0.  0. 15.  0. 30.  0.  0.  0.  0.  0.  0.]
[[1.   0.   0.93 0.04 0.   0.   0.9  0.04 0.   0.88 0.04 0.   0.   0.   0.  ]
 [0.   1.   0.   0.   0.18 0.79 0.   0.18 0.76 0.   0.17 0.74 0.   0.   0.  ]
 [0.   0.   1.   0.   0.   0.   0.97 0.   0.   0.94 0.   0.   0.   0.   0.  ]
 [0.   0.   0.   1.   0.   0.   0.   0.97 0.   0.   0.94 0.   0.   0.   0.  ]
 [0.   0.   0.   0.   1.   0.   0.   0.97 0.   0.   0.94 0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   1.   0.   0.   0.97 0.   0.   0.94 0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   1.   0.   0.   0.97 0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.97 0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.97 0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.  ]
 [

In [51]:
def pair_values(result):
    """Return the value of every (state, action) pair for one fullSR2 result.

    ``result[0]`` holds the learned reward weights and ``result[1]`` the
    learned SR matrix; the value of pair k is the sum of ``weights * SR[k]``.
    """
    weights, sr = result[0], result[1]
    return np.array([np.sum(weights * sr[k]) for k in range(num_pairs)], dtype=float)


v_state_base = pair_values(rl_base)
v_state_reward = pair_values(rl_reward)
v_state_transition = pair_values(rl_transition)
v_state_policy = pair_values(rl_policy)
v_state_goal = pair_values(rl_goal)
v_state_control = pair_values(rl_control)

# For every condition, show the values of the two actions available in the first state.
for label, values in (
    ("Original Values", v_state_base),
    ("Reward Revaluation", v_state_reward),
    ("Transition Revaluation", v_state_transition),
    ("Policy Revaluation", v_state_policy),
    ("Goal State Revaluation", v_state_goal),
    ("Control", v_state_control),
):
    print(label)
    print(np.around(values[0:2], 3))

Original Values
[13.555 22.931]
Reward Revaluation
[40.665 22.931]
Transition Revaluation
[13.555 22.931]
Policy Revaluation
[17.285 20.684]
Goal State Revaluation
[39.852 22.931]
Control
[13.555 56.296]


In [42]:
# NOTE(review): `rl` is not assigned by any cell above this one (its only
# assignment is in the "Multi Simulations" cell further down), and the recorded
# output of this cell is a NameError. Point this at the intended result
# (e.g. one of the rl_* variables) or remove the cell — confirm with the author.
weight = rl[0]
feat = rl[1]

print(weight)
print(np.around(rl[1], 2))

NameError: name 'rl' is not defined

In [161]:
# Value of each (state, action) pair: its SR row (rl[1][k]) weighted by the
# learned rewards (rl[0]).
# NOTE(review): `rl` is not assigned by any cell above this one, so this cell
# fails on a fresh kernel; the recorded output below comes from an earlier
# kernel state. Confirm which result `rl` is meant to be.
v_state = np.zeros(num_pairs)
for k in range(num_pairs):
    v_state[k] = np.sum(rl[0]*rl[1][k])

print(np.around(v_state, 3))

[14.55 29.1  15.    0.    0.   30.    0.    0.    0.    0.    0.    0.    0.    0.    0.  ]


In [3]:
# Multi Simulations
import pandas as pd

sim_num = 100
epi_num = 200
gamma = 0.97
alpha = 0.50
# NOTE(review): state_n and stay_prob are not inputs of fullSR3; they are kept
# only because the output file name below embeds them — confirm they still
# describe this task.
state_n = 10
stay_prob = 0.75

# fullSR3 takes the task definition (end/start states, rewards, transitions)
# and the number of (state, action) pairs. The base task defined earlier in
# the notebook is used here; explore_chance also comes from that settings cell.
# NOTE(review): confirm the base task is the one intended for this run.
num_pairs = len(list_flatten(rewards_base))

state_list = []
action_list = []
RPE_list = []
# One weight trace per (state, action) pair: fullSR1 appends to weight_list[k]
# for every k in range(num_pairs).
weight_list = [[] for k in range(num_pairs)]
epi_num_list = []

rl = fullSR3(sim_num, epi_num, gamma, alpha, end_states_base, start_states_base, rewards_base, transitions_base, num_pairs,
             state_list, action_list, RPE_list, weight_list, epi_num_list)

# Create dataframe
# fullSR3 returns (weight, feat, state_list, action_list, RPE_list,
# weight_list, epi_num_list, sim_num_list), so the per-step logs are rl[2:].
columns = {'Simulation': rl[7], 'Episode': rl[6], 'State': rl[2], 'Action': rl[3], 'RPE': rl[4]}
for i, weight_trace in enumerate(rl[5]):
    columns['W{}'.format(i + 1)] = weight_trace

result = pd.DataFrame(columns)

# Convert dataframe to csv
result.to_csv('./Full_SR/{}sim_{}epi_g{:.0f}_s{:.0f}_{:.0f}states.csv'.format(sim_num, epi_num, 100*gamma, 100*stay_prob, state_n))