###### Grid World
- Deterministic environment; equiprobable random policy (each action is chosen with equal probability).
- Reward is -1 on every transition, except for transitions out of a terminal state.

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [6]:
class Env:
    """Rectangular grid world with terminal states in two opposite corners.

    States are ``(row, col)`` tuples.  The top-left cell ``(0, 0)`` and the
    bottom-right cell ``(rowNum - 1, colNum - 1)`` are terminal.
    """

    def __init__(self, n = 4, m = 4):
        """Create a grid with ``n`` rows and ``m`` columns (4 x 4 by default)."""
        # Honour the constructor arguments instead of hard-coding a 4 x 4 grid.
        self.rowNum = n
        self.colNum = m
        self.actions = ['l', 'r', 'u', 'd']
        # (row delta, column delta) applied by each action.
        self.actionToTuple = {"l": (0, -1), "r": (0, 1), "u": (-1, 0), "d": (1, 0)}

    def isTerminalState(self, s):
        """Return True if state ``s`` (an ``(i, j)`` pair) is a terminal corner."""
        i, j = s[0], s[1]
        if i == 0 and j == 0:
            return True
        return i == self.rowNum - 1 and j == self.colNum - 1

    def isValidState(self, s):
        """Return True if state ``s`` (an ``(i, j)`` pair) lies inside the grid."""
        i, j = s[0], s[1]
        return 0 <= i < self.rowNum and 0 <= j < self.colNum

    def getNextState(self, s, a):
        """Return the ``(i, j)`` tuple reached by taking action ``a`` in ``s``.

        No bounds check is performed, so the result may be outside the grid;
        callers should test it with ``isValidState``.
        """
        di, dj = self.actionToTuple[a]
        return (s[0] + di, s[1] + dj)

    def getNextValidStateCnt(self, s):
        """Return how many actions taken from ``s`` lead to a state inside the grid."""
        return sum(
            1 for act in self.actions
            if self.isValidState(self.getNextState(s, act))
        )
        
        
class Agent:
    """Holds the state-value table estimated for an environment."""

    def __init__(self, env):
        """Allocate one value per grid cell of ``env``, all starting at zero."""
        grid_shape = (env.rowNum, env.colNum)
        self.value = np.zeros(grid_shape)
        
def infinity_norm(mat1, mat2):
    """Return the largest absolute element-wise difference between two arrays."""
    flat_diff = (mat1 - mat2).ravel()
    return np.abs(flat_diff).max()

In [43]:
def iterativePolicyEvaluation(env, agent, theta=0.001):
    """Evaluate the equiprobable random policy, updating ``agent.value`` in place.

    Sweeps every non-terminal state of ``env`` repeatedly.  Each state's value
    is replaced by the average, over all actions in ``env.actions``, of the
    reward (-1) plus the current value of the successor state.  Updates are
    applied in place, so later states in a sweep already see values refreshed
    earlier in the same sweep.

    Args:
        env: environment exposing ``rowNum``, ``colNum``, ``actions``,
            ``isTerminalState``, ``isValidState`` and ``getNextState``.
        agent: object whose ``value`` array is indexed by ``[row, col]``.
        theta: sweeping stops once the largest change in a sweep is no
            greater than this threshold.

    Returns:
        None.  The sweep number and the value table are printed after every
        sweep.
    """
    # Every action is equally likely, so average over however many actions the
    # environment defines rather than assuming exactly four.
    num_actions = len(env.actions)
    iter_i = 0
    while True:
        iter_i += 1
        delta = 0
        for i in range(env.rowNum):
            for j in range(env.colNum):
                s = (i, j)
                if env.isTerminalState(s):
                    # Terminal states are never updated.
                    continue
                r = 0
                for act in env.actions:
                    next_s = env.getNextState(s, act)
                    if not env.isValidState(next_s):
                        # Moving off the grid leaves the agent where it was.
                        next_s = s
                    if env.isTerminalState(next_s):
                        # A terminal successor contributes only the reward.
                        r += -1
                    else:
                        r += (-1 + agent.value[next_s[0], next_s[1]])

                delta = max(delta, np.abs(r - num_actions*agent.value[i, j])/num_actions)
                agent.value[i, j] = r/num_actions

        print(iter_i, agent.value)
        if delta <= theta:
            break

In [44]:
# Build the default grid world and an agent whose value table matches its size.
env = Env()
agent = Agent(env)

In [45]:
# Run policy evaluation; the sweep number and value table are printed after each sweep.
iterativePolicyEvaluation(env, agent)

1 [[ 0.        -1.        -1.25      -1.3125   ]
 [-1.        -1.5       -1.6875    -1.75     ]
 [-1.25      -1.6875    -1.84375   -1.8984375]
 [-1.3125    -1.75      -1.8984375  0.       ]]
2 [[ 0.         -1.9375     -2.546875   -2.73046875]
 [-1.9375     -2.8125     -3.23828125 -3.40429688]
 [-2.546875   -3.23828125 -3.56835938 -3.21777344]
 [-2.73046875 -3.40429688 -3.21777344  0.        ]]
3 [[ 0.         -2.82421875 -3.83496094 -4.17504883]
 [-2.82421875 -4.03125    -4.7097168  -4.87670898]
 [-3.83496094 -4.7097168  -4.96374512 -4.26455688]
 [-4.17504883 -4.87670898 -4.26455688  0.        ]]
4 [[ 0.         -3.67260742 -5.0980835  -5.58122253]
 [-3.67260742 -5.19116211 -6.03242493 -6.18872833]
 [-5.0980835  -6.03242493 -6.14849091 -5.15044403]
 [-5.58122253 -6.18872833 -5.15044403  0.        ]]
5 [[ 0.         -4.49046326 -6.30054855 -6.91293049]
 [-4.49046326 -6.26144409 -7.22480297 -7.36922646]
 [-6.30054855 -7.22480297 -7.1876235  -5.9268235 ]
 [-6.91293049 -7.36922646 -5.9268