# Q-Learning

In [1]:
import random
import time
from random import uniform

import numpy as np
import pandas as pd

# display output
from IPython.display import display, clear_output

In [2]:
# Moves as [row delta, column delta], listed clockwise starting from "up"
actions = [
    [-1, 0],  # up
    [0, 1],   # right
    [1, 0],   # down
    [0, -1],  # left
]
action_count = len(actions)  # how many distinct moves exist

# the world is a square of gridSize x gridSize cells
gridSize = 5
state_count = gridSize ** 2  # one state per cell

In [3]:
class Gridworld():
    """Deterministic square gridworld with two special "teleport" states.

    Positions are ``[row, column]`` pairs.  Rules implemented by
    ``transition_reward``:

    * any action taken in state A ``[0, 1]`` gives +10 and lands on ``[4, 1]``;
    * any action taken in state B ``[0, 3]`` gives +5 and lands on ``[2, 3]``;
    * a move that would leave the grid gives -1 and the agent stays put;
    * every other move gives 0.

    Attributes:
        valueMap: ``gridSize x gridSize`` array of zeros.
        states: every ``[row, column]`` position, in row-major order.
        size: side length of the grid.
        new_pos: next position computed by the latest ``transition_reward`` call.
        transition_prob: fixed at 1 (transitions are deterministic).
    """

    def __init__(self, gridSize):
        """Create a ``gridSize`` x ``gridSize`` world."""
        self.valueMap = np.zeros((gridSize, gridSize))
        self.states = [[i, j] for i in range(gridSize) for j in range(gridSize)]
        self.size = gridSize
        self.new_pos = [0, 0] # initialize new position for p_transition
        self.transition_prob = 1 # deterministic

    def initial_state(self):
        """Return the start position: the last state (bottom-right cell) as a new list."""
        # Read from this instance rather than module-level globals so the
        # method works for any Gridworld, whatever it is named or sized.
        return list(self.states[-1])

    def transition_reward(self, current_pos, action):
        """Apply ``action`` in ``current_pos`` and return the outcome.

        Args:
            current_pos: current ``[row, column]`` (list, tuple or array).
            action: ``[row delta, column delta]`` to apply.

        Returns:
            ``(new_pos, reward)`` where ``new_pos`` is always a plain list of
            Python numbers.  ``new_pos`` is also stored on ``self.new_pos``.
        """
        # Normalise to a plain list so the comparisons below behave the same
        # for lists, tuples and numpy arrays.
        current = np.asarray(current_pos).tolist()

        if current == [0, 1]:
            # state A: any action jumps to A' with reward +10
            self.new_pos, reward = [4, 1], 10
        elif current == [0, 3]:
            # state B: any action jumps to B' with reward +5
            self.new_pos, reward = [2, 3], 5
        else:
            # get next position: state: [0, 0], action: [0, 1], new_state = [0, 1]
            candidate = (np.asarray(current_pos) + np.asarray(action)).tolist()
            if all(0 <= coord < self.size for coord in candidate):
                # ordinary move inside the grid
                self.new_pos, reward = candidate, 0
            else:
                # crossing the border: stay in place and pay a penalty
                self.new_pos, reward = current, -1

        return self.new_pos, reward

In [4]:
# create a grid object; reuse gridSize so the grid always matches state_count
# and the shape of the Q-table
grid = Gridworld(gridSize)

## Q-Learning

In [5]:
# Q-table: one row per state, one column per action, every estimate starts at zero
Q_values = np.zeros(shape=(state_count, action_count))

# learning hyperparameters
epsilon = 0.2   # probability of taking an exploratory (non-greedy) action
gamma = 0.99    # discount applied to the next state's best Q value
lr = 0.1        # step size of each Q-value update

In [6]:
def choose_action(state, epsilon, q_values=None):
    """Pick an action index for ``state`` with an epsilon-greedy rule.

    With probability ``1 - epsilon`` the greedy action (highest Q value, first
    one on ties) is returned.  With probability ``epsilon`` an action is drawn
    uniformly from the *other* actions, so exploration never returns the
    greedy action unless it is the only one available.

    Args:
        state: row index of the state in the Q-table.
        epsilon: exploration probability, between 0 and 1 inclusive.
        q_values: optional Q-table indexed as ``q_values[state][action]``.
            Defaults to the module-level ``Q_values``.

    Returns:
        The chosen action index as a Python ``int``.

    Raises:
        ValueError: if ``epsilon`` is outside ``[0, 1]``.
    """
    if not 0 <= epsilon <= 1:
        raise ValueError("epsilon must be between 0 and 1")

    if q_values is None:
        q_values = Q_values

    action_values = q_values[state]

    # find best action based on Q values
    best_action = int(np.argmax(action_values))

    # explore: random.random() is in [0, 1), so epsilon=0 never explores and
    # epsilon=1 always does
    if random.random() < epsilon:
        # take the action count from the table rather than hard-coding it
        alternatives = [a for a in range(len(action_values)) if a != best_action]
        if alternatives:
            return random.choice(alternatives)
        # only one action exists: nothing to explore, fall through to exploit

    # exploit
    return best_action

In [7]:
# # randomly generate Q values (for testing purpose)
# Q_values = np.random.randint(10, size=(state_count, action_count))
# Q_values

In [8]:
# inspect the Q-table before training
Q_values

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [9]:
# training budget
n_episodes = 500   # number of episodes to run
max_steps = 200    # steps per episode; the loop has no terminal state, so every episode runs the full budget

for episode in range(n_episodes):

    # initialize state and look up its row in the Q-table once per episode
    state = grid.initial_state()
    state_index = grid.states.index(state)

    for step in range(max_steps):

        # choose an action based on epsilon-greedy
        chosen_action = choose_action(state_index, epsilon)

        # convert chosen action to its vector representation
        action_vector = actions[chosen_action]

        # get the next state and reward after taking the chosen action in the current state
        next_state, reward = grid.transition_reward(state, action_vector)
        next_state = list(next_state)

        # get next_state's index
        next_state_index = grid.states.index(next_state)

        # Q-learning update: move Q(s, a) towards reward + gamma * max_a' Q(s', a')
        td_target = reward + gamma * np.max(Q_values[next_state_index])
        td_error = td_target - Q_values[state_index, chosen_action]
        Q_values[state_index, chosen_action] += lr * td_error

        # the next state becomes the current state; carry its index over so it
        # is not looked up a second time
        state, state_index = next_state, next_state_index

In [10]:
# print floats with two decimals, then show the Q-table after training
np.set_printoptions(precision=2)
Q_values

array([[  2.31,   4.85,  33.24,   2.47],
       [ 17.92,  33.76,  35.2 , 161.41],
       [150.44, 166.66, 150.35, 116.1 ],
       [168.34, 168.34, 168.34, 168.34],
       [ 92.1 ,  89.09, 163.34, 118.94],
       [  4.76, 142.26,  11.37,  16.65],
       [138.58, 133.08, 160.09,  91.61],
       [164.99, 164.99, 161.71, 158.49],
       [166.66, 163.34, 163.34, 163.34],
       [161.71, 162.34, 161.71, 164.99],
       [ 40.21, 157.71,   0.  ,  -0.22],
       [154.86, 161.71, 154.38, 147.85],
       [163.34, 163.34, 160.09, 160.09],
       [164.99, 161.71, 161.71, 161.71],
       [163.34, 160.71, 160.09, 163.34],
       [ 12.9 ,  90.46,   0.  ,   8.11],
       [160.09,  95.61,  87.68,  30.33],
       [161.71, 155.88, 134.61, 149.26],
       [163.34, 160.09, 160.09, 160.09],
       [161.05, 159.02, 158.05, 161.71],
       [  0.  ,  77.71,   3.69,   0.  ],
       [157.88,  76.6 ,  47.52,  22.6 ],
       [158.38,  24.78,  39.68,  26.73],
       [161.71, 127.1 , 125.64,  99.38],
       [160.09, 

# Visualize 

In [11]:
# PRINT POLICY TABLE ################################################################################
# action names, in the same order as the action indices (0=up, 1=right, 2=down, 3=left)
action_names = ['up', 'right', 'down', 'left']

# best (greedy) action index for every state, arranged like the grid:
# state number s sits at row s // grid.size, column s % grid.size
greedy_actions = np.argmax(Q_values, axis=1).reshape(grid.size, grid.size)

# build the whole table in one go instead of assigning cell by cell;
# chained `.loc[row][column] = ...` assignment is not guaranteed to write
# through to the DataFrame
policy_table = pd.DataFrame(
    [[action_names[best_action] for best_action in grid_row] for grid_row in greedy_actions],
    index=range(grid.size),
    columns=range(grid.size),
)

print("Policy Table: ")
print(policy_table)
print()

Policy Table: 
       0      1      2   3     4
0   down   left  right  up  down
1  right   down  right  up  left
2  right  right     up  up  left
3  right     up     up  up  left
4  right     up     up  up    up

