### Russell's Grid: Value Iteration and Policy Iteration
#### Baseline program for Practice 2
Fill in the missing parts of the program.


In [None]:
import gymnasium as gym
import numpy as np
from gymnasium import spaces
import session_info

In [None]:
# Parameters passed to policy_iteration() and value_iteration() in the example cell
gamma = 0.99  # Discount factor applied to the value of the successor state
theta = 1e-6  # Convergence threshold: sweeps stop once the largest value change is below this

In [None]:
# 
class RussellsGrid(gym.Env):
    """
    RussellsGrid is a custom Gymnasium environment representing a 3x4 grid world.
    The agent navigates the grid aiming to reach the positive goal (+1) while
    avoiding the negative goal (-1); every other move costs -0.04.

    Actions:
    - 0: Up
    - 1: Right
    - 2: Down
    - 3: Left

    States:
    - The agent's position encoded as row * ncol + col, i.e. 0 to nrow*ncol-1.

    Rewards (assigned according to the cell that is entered):
    - +1 for entering the positive goal.
    - -1 for entering the negative goal.
    - -0.04 for any other move.

    Environment Dynamics:
    - With probability 0.8 the agent moves in the intended direction.
    - The remaining 0.2 is split evenly (0.2 / 3 each) among the other three directions.
    - A move that would leave the grid or enter the invalid cell leaves the agent where it is.

    Special Positions (row, col):
    - Start position: (2, 0)
    - Positive goal: (0, 3)
    - Negative goal: (1, 3)
    - Invalid position: (1, 1)

    Transition model:
    - self.P[state][action] is a list of (probability, next_state, reward, done) tuples.
    - The invalid cell has empty transition lists; transitions are also generated
      out of the two goal cells.

    Rendering:
    - Supports 'human' rendering mode, printing the grid with the agent ('A'), positive goal ('G'),
      negative goal ('R'), invalid position ('X') and free cells ('.').

    Usage:
    - Instantiate the environment and call reset() to place the agent on the start cell;
      reset() returns (state, info).
    - Call step(action) to interact; it returns (state, reward, terminated, truncated, info).
    """
    metadata = {'render_modes': ['human'], 'render_fps': 4}

    def __init__(self, render_mode=None):
        """Build the grid, its spaces and the transition model.

        Arguments:
            render_mode: None, or 'human' to print the grid after every step.
        """
        self.ncol = 4
        self.nrow = 3
        self.size = (self.nrow, self.ncol)

        # Define action and observation space
        self.action_space = spaces.Discrete(4)  # Up, Right, Down, Left
        self.observation_space = spaces.Discrete(self.nrow * self.ncol)

        # Special cells, as (row, col)
        self.start_position = np.array([2, 0])
        self.positive_goal = np.array([0, 3])
        self.negative_goal = np.array([1, 3])
        self.invalid_position = np.array([1, 1])

        # Agent starts on the start cell so step()/render() are usable even
        # before the first reset(); state is the matching discrete index.
        self.agent_position = self.start_position.copy()
        self.state = self.cell_id(self.agent_position, self.ncol)

        self.render_mode = render_mode

        # Create transition matrix P
        self.P = self._create_transition_matrix()

    def _create_transition_matrix(self):
        """Return P[state][action] -> list of (prob, next_state, reward, done)."""
        P = {s: {a: [] for a in range(4)} for s in range(self.observation_space.n)}
        positive = tuple(self.positive_goal)
        negative = tuple(self.negative_goal)
        invalid = tuple(self.invalid_position)

        for s in range(self.observation_space.n):
            row, col = divmod(s, self.ncol)

            if (row, col) == invalid:
                continue  # The invalid cell can never be occupied: leave its lists empty

            for a in range(4):  # Up, Right, Down, Left
                outcomes = self._get_action_outcomes(row, col, a)

                for next_state, prob in outcomes.items():
                    next_cell = divmod(next_state, self.ncol)

                    # Reward and done flag depend only on the cell that is entered
                    if next_cell == positive:
                        reward, done = 1.0, True
                    elif next_cell == negative:
                        reward, done = -1.0, True
                    else:
                        reward, done = -0.04, False

                    P[s][a].append((prob, next_state, reward, done))

        return P

    def _get_action_outcomes(self, row, col, action):
        """Return {next_state: probability} for taking `action` in cell (row, col)."""
        outcomes = {self._get_next_state(row, col, action): 0.8}

        # The remaining 0.2 is shared equally by the three other directions;
        # probabilities are merged when several directions end in the same cell.
        for other_action in range(4):
            if other_action == action:
                continue
            other_next_state = self._get_next_state(row, col, other_action)
            outcomes[other_next_state] = outcomes.get(other_next_state, 0.0) + 0.2 / 3

        return outcomes

    def _get_next_state(self, row, col, action):
        """Return the state index reached by moving once in direction `action`."""
        next_row, next_col = row, col

        if action == 0:  # Up
            next_row = max(0, row - 1)
        elif action == 1:  # Right
            next_col = min(self.ncol - 1, col + 1)
        elif action == 2:  # Down
            next_row = min(self.nrow - 1, row + 1)
        elif action == 3:  # Left
            next_col = max(0, col - 1)

        # Check if next position is invalid
        if (next_row, next_col) == tuple(self.invalid_position):
            next_row, next_col = row, col  # Stay in current position

        return next_row * self.ncol + next_col

    def cell_id(self, coord, width=4):
        """Convert a (row, col) coordinate into a discrete state index.

        Arguments:
            coord: indexable pair (row, col).
            width: number of columns of the grid (defaults to 4).
        Returns:
            int state index row * width + col.
        """
        return int(coord[0] * width + coord[1])

    def reset(self, seed=None, options=None):
        """Put the agent back on the start cell.

        Returns:
            (state, info) where state is the start cell index and info is empty.
        """
        super().reset(seed=seed)

        # Reset both representations of the agent: coordinates and state index
        self.agent_position = self.start_position.copy()
        self.state = self.cell_id(self.agent_position, self.ncol)

        return self.state, {}

    def step(self, action):
        """Sample one transition from P for the current cell and `action`.

        Returns:
            (state, reward, terminated, truncated, info); truncated is always False.
        """
        current_state = self.cell_id(self.agent_position, self.ncol)

        # Randomly choose an outcome based on probabilities
        outcomes = self.P[current_state][action]
        probs = [outcome[0] for outcome in outcomes]
        chosen_outcome = self.np_random.choice(len(outcomes), p=probs)

        _, next_state, reward, done = outcomes[chosen_outcome]

        # Keep coordinates and state index in sync
        self.agent_position = np.array(divmod(next_state, self.ncol))
        self.state = next_state

        if self.render_mode == 'human':
            self.render()

        return self.state, reward, done, False, {}

    def render(self):
        """Print the grid to stdout when render_mode is 'human'."""
        if self.render_mode == 'human':
            for i in range(self.nrow):
                for j in range(self.ncol):
                    if np.array_equal([i, j], self.agent_position):
                        print('A', end=' ')
                    elif np.array_equal([i, j], self.positive_goal):
                        print('G', end=' ')
                    elif np.array_equal([i, j], self.negative_goal):
                        print('R', end=' ')
                    elif np.array_equal([i, j], self.invalid_position):
                        print('X', end=' ')
                    else:
                        print('.', end=' ')
                print()
            print()

    def close(self):
        """Nothing to release."""
        pass

In [None]:
# Build the environment and look up the number of discrete states
# (as the last expression of the cell, the value is presumably echoed by the notebook).
env = RussellsGrid(render_mode='human')
env.observation_space.n

In [None]:
# Visualizing P (the transition model of the environment, not a policy)

from pprint import pprint
print("Transitions for state 0 and action 1")
pprint(env.P[0][1])

# Structure of P
# Top level: state (an int representing the current state)

# Second level: action (an int representing the action taken)

# Third level: a list of possible transitions

# Each transition: a tuple with
#     probability: Probability of this transition (float)
#     next_state: State the agent will move to (int)
#     reward: Immediate reward received (float)
#     done: Whether the transition ends the episode, i.e. next_state is a goal cell (bool)


In [None]:
# Policy Loops

def policy_evaluation(env, policy, theta=1e-8, gamma=1.0):
    """
    Iterative policy evaluation with in-place value updates.

    Arguments:
        env:    environment exposing observation_space.n, ncol,
                invalid_position and the transition table P
        policy: array indexed [state][action] holding action probabilities
        theta:  sweeps stop once the largest change in a sweep is below theta
        gamma:  discount factor
    Returns:
        V (value vector, one entry per state; the invalid state stays at 0)
    """
    n_states = env.observation_space.n
    blocked = env.invalid_position[0] * env.ncol + env.invalid_position[1]
    V = np.zeros(n_states)

    converged = False
    while not converged:
        largest_change = 0
        for s in range(n_states):
            if s == blocked:
                continue
            old_value = V[s]
            expected = 0
            for a, action_prob in enumerate(policy[s]):
                for prob, next_state, reward, done in env.P[s][a]:
                    # No bootstrapping from the successor when the episode ends
                    backup = reward + gamma * V[next_state] * (not done)
                    expected += action_prob * prob * backup
            largest_change = max(largest_change, np.abs(expected - old_value))
            V[s] = expected
        converged = largest_change < theta
    return V

def policy_improvement(env, V, gamma=1.0):
    """
    Build the deterministic policy that is greedy with respect to V.

    Arguments:
        env:   environment exposing observation_space.n, action_space.n,
               ncol, invalid_position and the transition table P
        V:     value vector indexed by state
        gamma: discount factor
    Returns:
        policy, a one-hot [state][action] array; the row of the invalid
        state is left at all zeros. Ties go to the lowest action index.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    blocked = env.invalid_position[0] * env.ncol + env.invalid_position[1]

    greedy = np.zeros([n_states, n_actions])
    for s in range(n_states):
        if s == blocked:
            continue
        action_values = np.zeros(n_actions)
        for a in range(n_actions):
            total = 0.0
            for prob, next_state, reward, done in env.P[s][a]:
                total += prob * (reward + gamma * V[next_state] * (not done))
            action_values[a] = total
        greedy[s, np.argmax(action_values)] = 1.0
    return greedy

def policy_iteration(env, theta=1e-8, gamma=1.0):
    """
    Policy Iteration loop: alternate policy evaluation and greedy policy
    improvement until the policy no longer changes.

    Arguments:
        env:   environment exposing observation_space.n and action_space.n
               (plus whatever policy_evaluation / policy_improvement read)
        theta: convergence threshold forwarded to policy_evaluation
        gamma: discount factor
    Returns:
        policy (one-hot [state][action] array), V (value vector of that policy)
    """
    # Start from the uniform random policy
    policy = np.ones([env.observation_space.n, env.action_space.n]) / env.action_space.n
    i = 0
    while True:
        V = policy_evaluation(env, policy, theta, gamma)
        improved_policy = policy_improvement(env, V, gamma)
        i += 1

        # A policy that is greedy w.r.t. its own value function is stable
        if np.array_equal(improved_policy, policy):
            print('Policy iterations', i)
            break
        policy = improved_policy

    return policy, V



In [None]:
# Value Loop
def value_iteration(env, theta=1e-8, gamma=1.0):
    """
    Value Iteration loop

    Repeatedly applies the Bellman optimality backup
        V(s) <- max_a sum_{s'} p(s'|s,a) * (r + gamma * V(s') * (not done))
    in place until the largest change in a sweep is below theta, then
    extracts the greedy policy.

    Arguments:
        env:   environment exposing observation_space.n, action_space.n,
               ncol, invalid_position and the transition table P
        theta: convergence threshold
        gamma: discount factor
    Returns:
         policy vector, V (value vector)
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    invalid_state = env.invalid_position[0] * env.ncol + env.invalid_position[1]

    V = np.zeros(n_states)
    i = 0  # number of full sweeps over the state space
    while True:
        delta = 0
        for s in range(n_states):
            if s == invalid_state:
                continue
            v = V[s]

            # One-step lookahead over every action, keep the best value
            q_values = np.zeros(n_actions)
            for a in range(n_actions):
                for prob, next_state, reward, done in env.P[s][a]:
                    q_values[a] += prob * (reward + gamma * V[next_state] * (not done))
            V[s] = np.max(q_values)

            delta = max(delta, np.abs(v - V[s]))
        i = i + 1
        if delta < theta:
            print('Value iterations', i)
            break

    policy = policy_improvement(env, V, gamma)
    return policy, V



In [None]:
# Supporting utilities
def print_policy(policy, env):
    """
    Print the greedy action of every cell as a grid of arrows.

    Arguments:
        policy: [state][action] array; the arg-max of each row is shown
        env:    environment exposing nrow, ncol, positive_goal,
                negative_goal and invalid_position
    Special cells are printed as 'G' (positive goal), 'N' (negative goal)
    and 'X' (invalid position) instead of an arrow.
    """
    arrows = {0: '↑', 1: '→', 2: '↓', 3: '←'}

    # setdefault keeps the first label if two special cells share coordinates
    special = {}
    for coord, label in (
        (env.positive_goal, 'G'),
        (env.negative_goal, 'N'),
        (env.invalid_position, 'X'),
    ):
        special.setdefault(tuple(coord), label)

    greedy = np.argmax(policy, axis=1)
    for r in range(env.nrow):
        pieces = []
        for c in range(env.ncol):
            label = special.get((r, c))
            if label is None:
                label = arrows.get(int(greedy[r * env.ncol + c]))
            if label is not None:
                pieces.append(label + ' ')
        print(''.join(pieces))

def print_value_function(V, env):
    """
    Print the value vector laid out as the grid, two decimals per cell.

    Arguments:
        V:   value vector indexed by state (row * ncol + col)
        env: environment exposing nrow, ncol and invalid_position
    The invalid position is printed as 'X' instead of a number.
    """
    wall = tuple(env.invalid_position)
    for r in range(env.nrow):
        cells = []
        for c in range(env.ncol):
            if (r, c) == wall:
                text = '  X  '
            else:
                text = f'{V[r * env.ncol + c]:5.2f}'
            cells.append(text + ' ')
        print(''.join(cells))


In [None]:
# Example usage: solve the grid with both algorithms and print the results.
def _show_solution(policy, values, grid):
    """Print a policy and its value function under the standard headings."""
    print("Optimal Policy:")
    print_policy(policy, grid)
    print("\nOptimal Value Function:")
    print_value_function(values, grid)


env = RussellsGrid(render_mode='human')
observation, info = env.reset()

print("Policy Iteration:")
pi_policy, pi_V = policy_iteration(env, theta, gamma)
_show_solution(pi_policy, pi_V, env)

print("\nValue Iteration:")
vi_policy, vi_V = value_iteration(env, theta, gamma)
_show_solution(vi_policy, vi_V, env)

In [None]:
# Show session information in non-HTML form (html=False).
# NOTE(review): presumably lists the versions of the imported packages for reproducibility — confirm against the session_info docs.
session_info.show(html=False)

In [None]:
#[SPACE TO ANSWER 4 QUESTIONS]