In [1]:
# Create a folder to store csv files
import pathlib
pathlib.Path('./Punctate').mkdir()

In [2]:
import numpy as np
import random as rd
import pandas as pd

In [3]:
def punc1(gamma, alpha, explore_chance, end_states, start_states, rewards, transitions, v_state, state_list, action_list, RPE_list):
    time_step = 1
    current_state = np.random.choice(start_states) - 1
    timestep_list = []
    not_end = True
    end_states_adjusted = [i-1 for i in end_states]
    
    while not_end:
        if current_state in end_states_adjusted:
            not_end = False
            break
        
        else:
            
            ## Determine the next state, either a random subsequent state or the highest-value one based on the exploration parameter
            if np.random.uniform() < explore_chance:
                next_move = np.random.randint(len(transitions[current_state]))
            else:
                next_values = v_state[current_state]
                next_move = np.argmax(next_values)

            next_state = transitions[current_state][next_move] - 1


            next_move = np.random.randint(len(transitions[current_state]))
            next_state = transitions[current_state][next_move] - 1

            # Get reward
            reward = rewards[current_state][next_move]
            
            # calculate RPE and update weights and state values
            if next_state in end_states_adjusted: # reached the goal state
                delta = reward + 0 - v_state[current_state][next_move]
            else:
                delta = reward + gamma*np.max(v_state[next_state]) - v_state[current_state][next_move]
            
            # update state value
            v_state[current_state][next_move] += alpha * delta
            
            state_list.append(current_state + 1)
            action_list.append(next_state + 1)
            RPE_list.append(delta)
            timestep_list.append(time_step)

            # Move to the next state
            current_state = next_state
            
            time_step += 1

    return v_state, state_list, action_list, RPE_list, timestep_list

In [4]:
# function for multi episodes
def punc2(epi_num, gamma, alpha, explore_chance, end_states, start_states, rewards, transitions, v_state, state_list, action_list, RPE_list, epi_num_list):
    epi_length = []
    for k in range(epi_num):
        c_v_state, c_state_list, c_action_list, c_RPE_list, timestep_list = \
        punc1(gamma, alpha, explore_chance, end_states, start_states, rewards, transitions, v_state, state_list, action_list, RPE_list)
        
        for j in range(len(timestep_list)):
            epi_num_list.append(k+1)
                
        for j in range(len(timestep_list)):
            epi_length.append(k+1)
        
        v_state = c_v_state
        state_list = c_state_list
        action_list = c_action_list
        RPE_list = c_RPE_list
        
    return c_v_state, c_state_list, c_action_list, c_RPE_list, epi_num_list, epi_length

In [5]:
# function for multi simulations
def punc3(sim_num, epi_num, gamma, alpha, explore_chance, end_states, start_states, rewards, transitions, state_list, action_list, RPE_list, epi_num_list):
    sim_num_list = []
    
    for t in range(sim_num):
        v_state = []
        
        '''
        for i in range(len(rewards)):
            v_state.append(rewards[i].copy())
        '''

        for i in range(len(rewards)):
            row = []
            for j in range(len(rewards[i])):
                row.append(0)
            v_state.append(row)
        
        c_v_state, c_state_list, c_action_list, c_RPE_list, c_epi_num_list, epi_length = \
        punc2(epi_num, gamma, alpha, explore_chance, end_states, start_states, rewards, transitions, v_state, state_list, action_list, RPE_list, epi_num_list)
        
        for u in range(len(epi_length)):
            sim_num_list.append(t+1)
        
        state_list = c_state_list
        action_list = c_action_list
        RPE_list = c_RPE_list
        epi_num_list = c_epi_num_list
    
    return c_v_state, c_state_list, c_action_list, c_RPE_list, c_epi_num_list, sim_num_list

In [6]:
# Learning Phase

sim_num = 1
epi_num = 200
alpha = 0.50
gamma = 0.95
explore_chance = 0.5
state_list = []
action_list = []
RPE_list = []
epi_num_list = []

end_states_base = [10, 11, 12]
start_states_base = [1]
rewards_base = [[0, 0], [0, 0], [0, 0], [15], [0], [30], [0], [0], [0], [0], [0], [0]]
transitions_base = [[2, 3], [4, 5], [5, 6], [7], [8], [9], [10], [11], [12], [], [], []]

v_state = []
for i in range(len(rewards_base)):
    row = []
    for j in range(len(rewards_base[i])):
        row.append(0)
    v_state.append(row)

rl_base = punc2(epi_num, gamma, alpha, explore_chance, end_states_base, start_states_base, rewards_base, transitions_base, v_state, state_list, action_list, RPE_list, epi_num_list)

In [7]:
print(rl_base)

([[13.537499999998747, 27.074999999999733], [14.249999999998735, 0.0], [0.0, 28.49999999999983], [14.999999999999973], [0.0], [29.999999999999996], [0.0], [0.0], [0.0], [0], [0], [0]], [1, 3, 6, 9, 1, 3, 5, 8, 1, 3, 5, 8, 1, 3, 5, 8, 1, 2, 4, 7, 1, 2, 4, 7, 1, 2, 5, 8, 1, 3, 6, 9, 1, 3, 5, 8, 1, 2, 4, 7, 1, 3, 6, 9, 1, 2, 5, 8, 1, 2, 4, 7, 1, 3, 5, 8, 1, 3, 5, 8, 1, 2, 4, 7, 1, 3, 5, 8, 1, 3, 6, 9, 1, 3, 6, 9, 1, 2, 5, 8, 1, 3, 6, 9, 1, 2, 5, 8, 1, 2, 4, 7, 1, 3, 6, 9, 1, 3, 5, 8, 1, 3, 6, 9, 1, 3, 5, 8, 1, 3, 6, 9, 1, 3, 6, 9, 1, 3, 6, 9, 1, 2, 4, 7, 1, 2, 5, 8, 1, 3, 5, 8, 1, 3, 5, 8, 1, 2, 5, 8, 1, 3, 6, 9, 1, 2, 4, 7, 1, 3, 5, 8, 1, 3, 5, 8, 1, 2, 5, 8, 1, 3, 5, 8, 1, 3, 6, 9, 1, 3, 5, 8, 1, 3, 6, 9, 1, 2, 4, 7, 1, 3, 5, 8, 1, 3, 5, 8, 1, 3, 5, 8, 1, 2, 5, 8, 1, 2, 4, 7, 1, 2, 4, 7, 1, 2, 5, 8, 1, 2, 4, 7, 1, 3, 6, 9, 1, 3, 5, 8, 1, 2, 4, 7, 1, 2, 4, 7, 1, 3, 6, 9, 1, 3, 5, 8, 1, 3, 5, 8, 1, 3, 5, 8, 1, 2, 4, 7, 1, 2, 4, 7, 1, 2, 4, 7, 1, 2, 4, 7, 1, 2, 4, 7, 1, 2, 4, 7, 1, 3, 6, 9

In [9]:
# Re-Learning Phase

v_state_base = rl_base[0]
relearning_episodes = 200
relearning_start_states = [2, 3]

# Reward Revaluation
v_state_reward = []
for i in range(len(v_state_base)):
    v_state_reward.append(v_state_base[i].copy())
rewards_reward = [[0, 0], [0, 0], [0, 0], [45], [0], [30], [0], [0], [0], [0], [0], [0]]
rl_reward = punc2(relearning_episodes, gamma, alpha, explore_chance, end_states_base, relearning_start_states, rewards_reward, transitions_base, v_state_reward, state_list, action_list, RPE_list, epi_num_list)

# Transition Revaluation
v_state_transition = []
for i in range(len(v_state_base)):
    v_state_transition.append(v_state_base[i].copy())
transitions_transition = [[2, 3], [5, 6], [4, 5], [7], [8], [9], [10], [11], [12], [], [], []]
rl_transition = punc2(relearning_episodes, gamma, alpha, explore_chance, end_states_base, relearning_start_states, rewards_base, transitions_transition, v_state_transition, state_list, action_list, RPE_list, epi_num_list)

# Policy Revaluation
v_state_policy = []
for i in range(len(v_state_base)):
    v_state_policy.append(v_state_base[i].copy())
rewards_policy = [[0, 0], [0, 0], [0, 0], [45], [15], [30], [0], [0], [0], [0], [0], [0]]
rl_policy = punc2(relearning_episodes, gamma, alpha, explore_chance, end_states_base, relearning_start_states, rewards_policy, transitions_base, v_state_policy, state_list, action_list, RPE_list, epi_num_list)

# Goal State Revaluation
v_state_goal = []
for i in range(len(v_state_base)):
    v_state_goal.append(v_state_base[i].copy())
rewards_goal = [[0, 0], [0, 0], [0, 0], [15], [0], [30], [30], [0], [0], [0], [0], [0]]
rl_goal = punc2(relearning_episodes, gamma, alpha, explore_chance, end_states_base, relearning_start_states, rewards_goal, transitions_base, v_state_goal, state_list, action_list, RPE_list, epi_num_list)

# Control
v_state_control = []
for i in range(len(v_state_base)):
    v_state_control.append(v_state_base[i].copy())
rewards_control = [[0, 0], [0, 0], [0, 0], [15], [0], [30], [0], [0], [45], [0], [0], [0]]
rl_control = punc2(relearning_episodes, gamma, alpha, explore_chance, end_states_base, relearning_start_states, rewards_control, transitions_base, v_state_control, state_list, action_list, RPE_list, epi_num_list)

In [10]:
print("Original Values")
print(v_state_base[0])

print("Reward Revaluation")
print(rl_reward[0][0])

print("Transition Revaluation")
print(rl_transition[0][0])

print("Policy Revaluation")
print(rl_policy[0][0])

print("Goal State Revaluation")
print(rl_goal[0][0])

print("Control")
print(rl_control[0][0])

Original Values
[13.537499999998747, 27.074999999999733]
Reward Revaluation
[13.537499999998747, 27.074999999999733]
Transition Revaluation
[13.537499999998747, 27.074999999999733]
Policy Revaluation
[13.537499999998747, 27.074999999999733]
Goal State Revaluation
[13.537499999998747, 27.074999999999733]
Control
[13.537499999998747, 27.074999999999733]


In [None]:
# Simulations with various parameters
import numpy as np
import random as rd
import pandas as pd

seed_list = [22, 76, 50, 57, 30, 55, 33, 54,  0]
index = 0

for gamma in [0.95, 0.97, 0.99]:
    for stay_prob in [0.50, 0.75, 0.90]:
        
        rd.seed(seed_list[index])
        
        # set constant variables
        sim_num = 100
        epi_num = 200
        alpha = 0.50
        state_n = 10
        state_list = []
        action_list = []
        RPE_list = []
        epi_num_list = []
        
        # simulation
        rl = punc3(sim_num, epi_num, gamma, alpha, state_n, stay_prob, state_list,
                  action_list, RPE_list, epi_num_list)
        
        # create dataframe and convert it to csv
        result = pd.DataFrame({'Simulation': rl[5], 'Episode': rl[4], 'State': rl[1],
                              'Action': rl[2], 'RPE': rl[3]})
        result.to_csv('./Punctate/g{:.0f}_s{:.0f}_{:.0f}states.csv'.format(100*gamma, 100*stay_prob, state_n))
        index += 1