<a href="https://colab.research.google.com/github/arnavdodiedo/RL-Algorithms/blob/main/GridWorld.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

## Policy evaluation with different solutions
#### where the policy is uniform: equal probabilities of moving to each neighbouring state

In [None]:
class gridworld():
    """Square grid world with two terminal corners, solved with DP and Monte Carlo.

    The top-left and bottom-right cells are goal (terminal) states. Every move
    costs ``penalty_per_step``; moves that would leave the grid keep the agent
    in place. Actions are indexed 0 - up, 1 - down, 2 - left, 3 - right.

    Attributes set by the constructor:
        size: side length of the square grid.
        goal_states: list of ``[row, col]`` terminal cells.
        reward: stored goal reward; no method of this class reads it.
        penalty_per_step: reward received for every move.
        epochs / episodes: number of DP sweeps / Monte Carlo episodes.
        gamma: discount factor (used by the Monte Carlo methods only).
        epsilon: current exploration probability for Monte Carlo control.
        initial_epsilon: constructor value of ``epsilon``, restored on reset.
        epsilon_decay_factor: multiplier applied to ``epsilon`` every 100 episodes.
        returngrid: ``(size, size)`` array of state-value estimates.
        policy: ``(size, size, 4)`` array of action probabilities.
        state_action_values: ``(size, size, 4)`` array of Q-value estimates.
    """

    _ACTION_NAMES = ["up", "down", "left", "right"]  # position == action index

    def __init__(self, size = 4, reward = 0, penalty_per_step = -1, epochs = 100, gamma = 0.9, epsilon = 0.1, epsilon_decay_factor = 0.1):
        self.size = size # square grid world size
        self.goal_states = [[0,0],[size-1, size-1]] # goal states are the top left and bottom right corners of the grid
        self.reward = reward # 0 reward on reaching goal state
        self.penalty_per_step = penalty_per_step # -1 reward per movement
        self.movements = [[-1,0], [1,0], [0,-1], [0,1]]
        self.episodes = self.epochs = epochs # number of epochs to run policy evaluation
        self.gamma = gamma
        self.initial_epsilon = epsilon # remembered so reset_grid() can restore it
        self.epsilon_decay_factor = epsilon_decay_factor
        # value estimates, policy, Q-values and epsilon all start from the reset state
        self.reset_grid()

    def reset_grid(self):
        """Reset value estimates, Q-values, policy and epsilon to their initial state."""
        self.returngrid = np.zeros((self.size, self.size), dtype=float)
        self.epsilon = self.initial_epsilon
        # naive initial policy: equal probabilities in all directions ...
        self.policy = np.full((self.size, self.size, 4), 0.25)
        # ... except in goal states, where all movement probabilities are 0
        for goal_row, goal_col in self.goal_states:
            self.policy[goal_row, goal_col] = 0.0
        self.state_action_values = np.zeros((self.size, self.size, 4))

    def _best_actions(self, values):
        """Return the indices of all maximal entries of ``values``, highest index first."""
        best = max(values)
        return [a for a in range(len(values) - 1, -1, -1) if values[a] == best]

    def _greedy_probabilities(self, actions):
        """Return a length-4 distribution spreading probability equally over ``actions``."""
        probs = np.zeros(4)
        probs[actions] = 1/len(actions)
        return probs

    def greedy_policy_update_dp(self, curr_state):  # update policy greedily, second step of generalized policy improvement (GPI)
        """Make the policy at ``curr_state`` greedy w.r.t. the neighbouring state values.

        Ties between equally good neighbours share the probability equally.
        Goal states are left untouched.
        """
        if list(curr_state) in self.goal_states:
            return

        neighbour_values = []
        for action in range(4):
            row, col = self.next_state(curr_state, action)
            neighbour_values.append(self.returngrid[row][col])

        best = self._best_actions(neighbour_values)
        self.policy[curr_state[0]][curr_state[1]] = self._greedy_probabilities(best)

    def next_state(self, curr_state, direction): # direction = 0 - up, 1 - down, 2 - left, 3 - right
        """Return the ``[row, col]`` reached from ``curr_state``, clamped to the grid."""
        change = self.movements[direction]
        last = self.size - 1
        return [min(max(curr_state[0] + change[0], 0), last),
                min(max(curr_state[1] + change[1], 0), last)]

    def _display_cells(self, cell_text):
        """Print the grid row by row, using ``cell_text(i, j)`` for each cell."""
        print("[", end="")
        for i in range(self.size):
            print("\n       [", end=" ")
            for j in range(self.size):
                print(cell_text(i, j), end=", ")
            print("]", end="")
        print("\n]")

    def display_grid(self):
        """Print the state-value estimates."""
        self._display_cells(lambda i, j: self.returngrid[i][j])

    def display_state_action_values(self):
        """Print the four Q-value estimates of every cell."""
        self._display_cells(lambda i, j: self.state_action_values[i][j])

    def display_policy(self):
        """Print the most probable move(s) per cell; goal cells print ``stay``."""
        def describe(i, j):
            if [i, j] in self.goal_states:
                return "stay"
            best = self._best_actions(self.policy[i][j])
            return "&".join(self._ACTION_NAMES[a] for a in best)

        self._display_cells(describe)

    def _expected_backup(self, i, j):
        """Return the one-step expected value of cell ``(i, j)`` under the current policy."""
        value = 0
        for k in range(4):
            row, col = self.next_state([i, j], k)
            value += (self.penalty_per_step + self.returngrid[row][col]) * self.policy[i][j][k]
        return value

    def evaluate_current_policy_dp(self):
        """Evaluate the uniform policy with ``epochs`` in-place DP sweeps.

        The grid is reset first, so the evaluated policy is always the initial
        uniform one. Values are updated in place during a sweep and no
        discounting is applied. The grid is printed every 10 sweeps.
        """
        self.reset_grid() # start from scratch
        for epoch in range(1, 1+self.epochs):
            for i in range(self.size):
                for j in range(self.size):
                    self.returngrid[i][j] = self._expected_backup(i, j)

            if epoch % 10 == 0:
                print("epoch #%d"%(epoch), end=" ")
                self.display_grid()

    def gpi_dp(self):
        """Run generalized policy iteration with synchronous DP sweeps.

        Each epoch evaluates the current policy for one sweep (computed from the
        previous sweep's values), then makes the policy greedy. Stops early, and
        prints the result, once a sweep leaves every value unchanged.
        """
        self.reset_grid() # start from scratch
        for epoch in range(1, 1+self.epochs):
            # synchronous update: all new values are computed from the old grid
            updated = np.zeros_like(self.returngrid)
            for i in range(self.size):
                for j in range(self.size):
                    updated[i, j] = self._expected_backup(i, j)

            if (self.returngrid == updated).all():
                print("Policy did not improve. Stopping...")
                print("epoch #%d"%(epoch), end=" ")
                self.display_grid()
                print("policy -")
                self.display_policy()
                break

            self.returngrid = updated

            for i in range(self.size):
                for j in range(self.size):
                    self.greedy_policy_update_dp([i,j])

            if epoch % 10 == 0:
                print("epoch #%d"%(epoch), end=" ")
                self.display_grid()
                print("policy -")
                self.display_policy()

    def _random_start_state(self):
        """Draw a uniformly random ``[row, col]`` cell of the grid (may be a goal)."""
        cell = int(np.random.choice(self.size * self.size))
        row, col = divmod(cell, self.size)
        return [row, col]

    def _discounted_returns(self, rewards):
        """Return the discounted return following each step of an episode."""
        returns = np.zeros(len(rewards), dtype=float)
        running = 0.0
        for t in range(len(rewards) - 1, -1, -1):
            running = rewards[t] + self.gamma * running
            returns[t] = running
        return returns

    def evaluate_current_policy_monte_carlo(self): # first visit monte carlo
        """Estimate state values of the uniform policy with first-visit Monte Carlo.

        The grid is reset first. Each of the ``episodes`` episodes starts in a
        random cell (episodes starting on a goal contribute nothing) and follows
        the policy until a goal is reached. Progress is printed every 1000
        episodes.
        """
        self.reset_grid() # start from scratch

        number_of_visits = np.zeros((self.size, self.size))
        for episode in range(1, 1+self.episodes):
            state = self._random_start_state()
            path = []
            state_reward = []

            while state not in self.goal_states:
                path.append((state[0], state[1]))
                state_reward.append(self.penalty_per_step)
                direction = np.random.choice(4, p=self.policy[state[0]][state[1]])
                state = self.next_state(state, direction)

            state_returns = self._discounted_returns(state_reward)

            # incremental mean over the first visit of each state in this episode
            seen = set()
            for (x, y), episode_return in zip(path, state_returns):
                if (x, y) in seen:
                    continue
                seen.add((x, y))
                number_of_visits[x][y] += 1
                self.returngrid[x][y] += (episode_return - self.returngrid[x][y])/number_of_visits[x][y]

            if episode % 1000 == 0:
                print("epoch #%d"%(episode), end=" ")
                self.display_grid()
                print("policy - ")
                self.display_policy()

    def epsilon_greedy_policy_update_monte_carlo(self, curr_state): # epsilon greedy update of policy, second step of monte carlo policy improvement
        """Update the policy at ``curr_state`` from the current Q-values.

        With probability ``epsilon`` the state's policy becomes uniform.
        Otherwise it becomes greedy over the Q-values, ties sharing probability
        equally. A single greedy action that bumps into a wall is replaced by
        the uniform policy, since following it would never leave the state.
        """
        i, j = curr_state[0], curr_state[1]
        uniform = np.zeros(4) + 0.25

        if np.random.uniform(0,1) <= self.epsilon:
            self.policy[i][j] = uniform
            return

        best = self._best_actions(self.state_action_values[i][j])
        stuck = len(best) == 1 and self.next_state(curr_state, best[0]) == [i, j]
        self.policy[i][j] = uniform if stuck else self._greedy_probabilities(best)

    def _generate_exploring_episode(self, state, action):
        """Follow the policy from ``state`` after a forced first ``action``.

        Returns ``(path, rewards)`` where ``path`` holds ``(row, col, action)``
        for every step taken, the action being the one actually executed from
        that cell. The episode is cut short once the same step has been taken
        four times, which bounds episodes under a policy that cycles.
        """
        path = []
        rewards = []
        occurrences = {}
        while state not in self.goal_states:
            step = (state[0], state[1], action)
            if occurrences.get(step, 0) > 3:
                break
            occurrences[step] = occurrences.get(step, 0) + 1

            path.append(step)
            rewards.append(self.penalty_per_step)
            state = self.next_state(state, action)
            if state not in self.goal_states:
                # choose the action that will be executed from the new state
                action = int(np.random.choice(4, p=self.policy[state[0]][state[1]]))
        return path, rewards

    def gpi_monte_carlo(self):
        """Run Monte Carlo control with exploring starts and epsilon-greedy improvement.

        Each episode starts from a random cell with a random first action,
        updates the first-visit Q-value estimates of the state-action pairs it
        executed, then re-derives the policy for those pairs' states. ``epsilon``
        is multiplied by ``epsilon_decay_factor`` every 100 episodes. Q-values
        and policy are printed every ``epochs / 10`` episodes.
        """
        self.reset_grid()
        number_of_visits = np.zeros((self.size, self.size, 4))
        report_every = self.epochs / 10
        for episode in range(1, 1+self.episodes):
            state = self._random_start_state()
            action = int(np.random.choice([0,1,2,3]))

            # an episode that starts on a goal yields an empty path
            path, state_reward = self._generate_exploring_episode(state, action)
            state_action_returns = self._discounted_returns(state_reward)

            # policy evaluation: incremental mean over first visits
            first_visits = []
            seen = set()
            for step, episode_return in zip(path, state_action_returns):
                if step in seen:
                    continue
                seen.add(step)
                first_visits.append(step)
                x, y, a = step
                number_of_visits[x][y][a] += 1
                self.state_action_values[x][y][a] += (episode_return-self.state_action_values[x][y][a])/number_of_visits[x][y][a]

            # policy improvement, only after all Q-values of the episode are updated
            for x, y, _a in first_visits:
                self.epsilon_greedy_policy_update_monte_carlo([x,y])

            if episode % 100 == 0:
                self.epsilon *= self.epsilon_decay_factor

            if report_every and episode % report_every == 0:
                print("epoch #%d"%(episode), end=" ")
                self.display_state_action_values()
                print("policy - ")
                self.display_policy()

## DP solution

In [None]:
# Build the default 4x4 grid world and run generalized policy iteration with
# dynamic programming for at most 100 sweeps (it stops early once a sweep
# leaves every state value unchanged).
grid = gridworld(epochs=100)
grid.gpi_dp()

Policy did not improve. Stopping...
epoch #4 [
       [ 0.0, -1.0, -2.0, -3.0, ]
       [ -1.0, -2.0, -3.0, -2.0, ]
       [ -2.0, -3.0, -2.0, -1.0, ]
       [ -3.0, -2.0, -1.0, 0.0, ]
]
policy -
[
       [ stay, left, left, left&down, ]
       [ up, left&up, right&left&down&up, down, ]
       [ up, right&left&down&up, right&down, down, ]
       [ right&up, right, right, stay, ]
]


## Monte Carlo solution

In [None]:
# Rebuild the 4x4 grid world for Monte Carlo control: 10000 episodes,
# discount factor 0.9, epsilon multiplied by 0.5 whenever it is decayed.
grid = gridworld(size = 4, epochs=10000, gamma = 0.9, epsilon_decay_factor = 0.5)
grid.gpi_monte_carlo()

epoch #1000 [
       [ [0. 0. 0. 0.], [-1.08867666 -1.34457597 -1.         -2.12588835], [-4.00954926 -4.8998378  -4.85834375 -3.7972893 ], [-5.12380581 -3.52517326 -4.9234849  -3.82246268], ]
       [ [-5.09670771 -5.13484798 -5.08211795 -4.74790233], [-2.67515035 -4.2188532  -5.2654033  -4.65032766], [-4.63809489 -5.89623428 -6.33854873 -5.32578918], [-5.59851795 -2.62816732 -2.88577891 -3.39014828], ]
       [ [-5.6675695  -6.05940803 -5.75885228 -6.06520402], [-4.07889191 -4.69615606 -5.69638959 -4.7085379 ], [-4.62571347 -5.88683874 -5.21386856 -4.71545971], [-2.30856936 -1.39999062 -2.63749504 -2.30524673], ]
       [ [-6.02581104 -6.01976381 -5.9799783  -6.29521077], [-4.77970713 -6.18227096 -5.62989818 -5.25373705], [-4.90980694 -4.13087929 -4.98083227 -5.52090472], [0. 0. 0. 0.], ]
]
policy - 
[
       [ stay, left, right, down, ]
       [ right, up, up, down, ]
       [ up, up, up, down, ]
       [ right&left&down&up, up, right&left&down&up, stay, ]
]
epoch #2000 [
       [ [

## Tester module