In [1]:
import numpy as np
from tqdm import tqdm
import random
import pandas as pd

In [2]:
# Fixed seed for the `random` module so that Restart & Run All regenerates
# the same sequence of episodes.
randomSeed = 42
random.seed(randomSeed)

gamma = 0.85 # discounting rate
lr = 1  # learning rate; at 1 each update replaces the old estimate with the new target
rewardSize = -1  # reward recorded for every step of an episode
gridSize = 4  # the grid has gridSize rows and gridSize columns
# Episodes stop once either of these two corner cells is reached.
terminationStates = [[0,0], [gridSize-1, gridSize-1]]
# actionLabels[k] is the display name of the [row, column] displacement actions[k].
actionLabels = ['Up', 'Down', 'Right', 'Left']
actions = [[-1, 0], [1, 0], [0, 1], [0, -1]]
numIterations = 20000  # number of episodes generated during training

In [3]:
# Action-value table with one entry per (row, column, action), all starting at zero.
Value_table = np.zeros(shape=(gridSize, gridSize, len(actions)))

# Every grid cell as a [row, column] pair, enumerated row by row.
states = [[cell // gridSize, cell % gridSize] for cell in range(gridSize * gridSize)]

In [4]:
# Display the enumerated grid cells ([row, column] pairs, row by row).
states

[[0, 0],
 [0, 1],
 [0, 2],
 [0, 3],
 [1, 0],
 [1, 1],
 [1, 2],
 [1, 3],
 [2, 0],
 [2, 1],
 [2, 2],
 [2, 3],
 [3, 0],
 [3, 1],
 [3, 2],
 [3, 3]]

In [5]:
# Display the action-value table before training (it was created with np.zeros).
Value_table

array([[[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]]])

In [6]:
def generateEpisode():
    """Roll out one uniformly random walk on the grid until it terminates.

    The start cell is drawn from ``states[1:-1]``, i.e. every cell except the
    first and last entries of ``states``. At each step an action is drawn
    uniformly from ``actions``; a move that would leave the grid keeps the
    agent in its current cell.

    Returns:
        A list of ``[state, action, reward, nextState]`` records in the order
        they occurred. ``reward`` is always ``rewardSize``. The list is empty
        only if the start cell is itself listed in ``terminationStates``.
    """
    current = random.choice(states[1:-1])
    trajectory = []

    while list(current) not in terminationStates:
        move = random.choice(actions)
        candidate = np.array(current) + np.array(move)

        # A coordinate of -1 or gridSize means the move stepped off the grid.
        offGrid = -1 in list(candidate) or gridSize in list(candidate)
        landed = current if offGrid else candidate

        trajectory.append([list(current), move, rewardSize, list(landed)])
        current = landed

    return trajectory

In [7]:
# Q-learning over randomly generated episodes.
#
# Every recorded transition is used for a one-step update
#     Q(s, a) <- Q(s, a) + lr * (reward + gamma * max_a' Q(s', a') - Q(s, a)).
# An earlier version skipped a transition when its state also appeared among
# the first i steps of the episode (a mis-sliced "first visit" test). A
# one-step update does not need such a filter, and the test rebuilt a reversed
# copy of the episode for every step, so it has been dropped.
for it in tqdm(range(numIterations)):

    episode = generateEpisode()

    # Walk the episode backwards so that values next to the terminal cell are
    # refreshed before the transitions that lead into them.
    for state, action, reward, newState in reversed(episode):

        row, col = state
        newRow, newCol = newState
        idxaction = actions.index(action)

        oldValue = Value_table[row, col, idxaction]
        target = reward + gamma * np.max(Value_table[newRow, newCol, :])

        Value_table[row, col, idxaction] = oldValue + lr * (target - oldValue)

100%|██████████| 20000/20000 [00:05<00:00, 3609.96it/s]


In [8]:
# Display the action-value table after the training loop has run.
Value_table

array([[[ 0.      ,  0.      ,  0.      ,  0.      ],
        [-1.85    , -2.5725  , -2.5725  , -1.      ],
        [-2.5725  , -3.186625, -3.186625, -1.85    ],
        [-3.186625, -2.5725  , -3.186625, -2.5725  ]],

       [[-1.      , -2.5725  , -2.5725  , -1.85    ],
        [-1.85    , -3.186625, -3.186625, -1.85    ],
        [-2.5725  , -2.5725  , -2.5725  , -2.5725  ],
        [-3.186625, -1.85    , -2.5725  , -3.186625]],

       [[-1.85    , -3.186625, -3.186625, -2.5725  ],
        [-2.5725  , -2.5725  , -2.5725  , -2.5725  ],
        [-3.186625, -1.85    , -1.85    , -3.186625],
        [-2.5725  , -1.      , -1.85    , -2.5725  ]],

       [[-2.5725  , -3.186625, -2.5725  , -3.186625],
        [-3.186625, -2.5725  , -1.85    , -3.186625],
        [-2.5725  , -1.85    , -1.      , -2.5725  ],
        [ 0.      ,  0.      ,  0.      ,  0.      ]]])

In [9]:
# Flatten the (row, column, action) table to one row per grid cell. The shape
# is derived from gridSize and actions instead of a literal (16, 4) so it
# keeps working when either changes. The default C-order reshape yields rows
# in the same row-by-row order as `states`.
Qdf = pd.DataFrame(
    Value_table.reshape((gridSize * gridSize, len(actions))),
    columns=actionLabels,
)

In [10]:
# Attach the [row, column] coordinates as a 'States' column.
Qdf = Qdf.assign(States=states)

In [11]:
# Promote the 'States' column to the row index.
Qdf = Qdf.set_index(keys='States')

In [12]:
# Policy indicator matrix with one row per grid cell and one column per
# action, starting at zero.
Pi_table = np.zeros(shape=(gridSize ** 2, len(actions)))
Pi_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [13]:
# A value strictly below every entry of the table (its minimum minus one).
minValue = np.min(Value_table) - 1
minValue

-4.186624999999999

In [14]:
# Derive the greedy policy: for every non-terminal cell, flag each action
# whose value ties for the maximum among the moves that stay on the grid.
for state in range(len(states)):

    # Terminal cells keep an all-zero row.
    if list(states[state]) in terminationStates:
        continue

    row, col = states[state]

    action_candidates = []
    for i in range(len(actions)):
        finalState = np.array(states[state]) + np.array(actions[i])

        if ((finalState < 0) | (finalState >= gridSize)).any():
            # The move leaves the grid, so mask it with a value below every
            # table entry and it can never be selected.
            action_candidates.append(minValue)
        else:
            action_candidates.append(Value_table[row, col, i])

    actionList = np.array(action_candidates)
    maxvalue = actionList.max()

    # Tolerance-based tie detection instead of exact float equality. Assigning
    # the whole row also clears flags left over from an earlier run of this cell.
    Pi_table[state] = np.isclose(actionList, maxvalue)

In [15]:
# Display the policy indicator matrix (a 1 marks a selected action).
Pi_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 1., 0., 1.],
       [1., 0., 0., 0.],
       [1., 0., 0., 1.],
       [1., 1., 1., 1.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [1., 1., 1., 1.],
       [0., 1., 1., 0.],
       [0., 1., 0., 0.],
       [1., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 0.]])

In [16]:
# Wrap the policy indicator matrix in a frame with one named column per action.
Pidf = pd.DataFrame(data=Pi_table, columns=actionLabels)

In [17]:
# Attach the [row, column] coordinates as a 'States' column.
Pidf = Pidf.assign(States=states)

In [18]:
# Promote the 'States' column to the row index.
Pidf = Pidf.set_index(keys='States')

In [19]:
# Replace the numeric policy flags with display markers: 'X' where the flag
# is 1, a blank otherwise. Values that are already strings are passed through
# unchanged, so running this cell a second time does not blank the table.
for label in actionLabels:
    Pidf[label] = Pidf[label].apply(
        lambda flag: flag if isinstance(flag, str) else ('X' if flag == 1 else ' ')
    )

In [20]:
# Print a blank line and the caption, then show the frame as the cell output.
print('', '             Q-TABLE', sep='\n')
Qdf


             Q-TABLE


Unnamed: 0_level_0,Up,Down,Right,Left
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"[0, 0]",0.0,0.0,0.0,0.0
"[0, 1]",-1.85,-2.5725,-2.5725,-1.0
"[0, 2]",-2.5725,-3.186625,-3.186625,-1.85
"[0, 3]",-3.186625,-2.5725,-3.186625,-2.5725
"[1, 0]",-1.0,-2.5725,-2.5725,-1.85
"[1, 1]",-1.85,-3.186625,-3.186625,-1.85
"[1, 2]",-2.5725,-2.5725,-2.5725,-2.5725
"[1, 3]",-3.186625,-1.85,-2.5725,-3.186625
"[2, 0]",-1.85,-3.186625,-3.186625,-2.5725
"[2, 1]",-2.5725,-2.5725,-2.5725,-2.5725


In [21]:
# Print a blank line and the caption, then show the frame as the cell output.
print('', '         Optimal Policy', sep='\n')
Pidf


         Optimal Policy


Unnamed: 0_level_0,Up,Down,Right,Left
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"[0, 0]",,,,
"[0, 1]",,,,X
"[0, 2]",,,,X
"[0, 3]",,X,,X
"[1, 0]",X,,,
"[1, 1]",X,,,X
"[1, 2]",X,X,X,X
"[1, 3]",,X,,
"[2, 0]",X,,,
"[2, 1]",X,X,X,X
