# CS 5180 Spring 2025
# Adnan Amir

In [2]:
import numpy as np
import random
import pandas as pd
from collections import defaultdict

In [3]:
# FOUR ROOM ENVIRONMENT COPIED FROM EX0
class FourRooms:
    """
    Four Rooms grid world on an 11 x 11 grid.

    States are [x, y] lists with the origin at the bottom-left corner of the grid.
    The agent starts at [0, 0] and the goal is at [10, 10].
    """

    # For every commanded action: the move actually executed is the commanded one
    # (first entry) or one of the two perpendicular moves (second and third entries).
    _NOISY_MOVES = {
        "UP": ["UP", "LEFT", "RIGHT"],
        "DOWN": ["DOWN", "LEFT", "RIGHT"],
        "LEFT": ["LEFT", "UP", "DOWN"],
        "RIGHT": ["RIGHT", "UP", "DOWN"],
    }
    # Selection weights aligned with the entries of _NOISY_MOVES.
    _NOISY_WEIGHTS = [0.8, 0.1, 0.1]

    def __init__(self):
        # define the four room as a 2-D array for easy state space reference and visualization
        # 0 represents an empty cell; 1 represents a wall cell
        self.four_room_space = np.array([[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
                                         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
                                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
                                         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
                                         [1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0],
                                         [0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1],
                                         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
                                         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
                                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]])

        # find the positions for all empty cells
        # note that: the origin for a 2-D numpy array is located at top-left while the origin for the FourRooms is at
        # the bottom-left. The following code performs the re-projection.
        # Coordinates are cast to plain Python ints so numpy scalars do not leak into states derived from this list.
        empty_cells = np.where(self.four_room_space == 0.0)
        self.state_space = [[int(col), int(10 - row)] for row, col in zip(empty_cells[0], empty_cells[1])]

        # define the action space
        self.action_space = {'LEFT': np.array([-1, 0]),
                             'RIGHT': np.array([1, 0]),
                             'DOWN': np.array([0, -1]),
                             'UP': np.array([0, 1])}

        # define the start state
        self.start_state = [0, 0]

        # define the goal state
        self.goal_state = [10, 10]

    def reset(self):
        """
        Reset the agent's state to the start state [0, 0]
        Return both the start state and reward
        """
        # Return a copy so that callers mutating the returned list cannot alter self.start_state.
        state = list(self.start_state)
        reward = 0  # reward is 0
        return state, reward

    def step(self, state, act):
        """
        Args:
            state: a list variable containing x, y integer coordinates. (i.e., [1, 1]).
            act: a string variable (i.e., "UP"). All feasible values are ["UP", "DOWN", "LEFT", "RIGHT"].
        Output args:
            next_state: a list variable containing x, y integer coordinates (i.e., [1, 1])
            reward: an integer. it can be either 0 or 1.
        Raises:
            ValueError: if act is not one of the four feasible actions.

        Taking any action in the goal state returns the start state with reward 1.
        Otherwise the commanded action is executed with weight 0.8 and each of the two
        perpendicular actions with weight 0.1; the reward is 1 only when the resulting
        state is the goal state.
        """
        # Reset agent if already at goal state.
        # start_state is a list attribute (not a method); hand back a copy of it.
        if list(state) == self.goal_state:
            return list(self.start_state), 1

        # Reject unknown actions explicitly instead of failing later inside random.choices.
        if act not in self._NOISY_MOVES:
            raise ValueError(f"Unknown action {act!r}; expected one of {list(self._NOISY_MOVES)}")

        # Select first action with 80% chance and the other two with 10% chance
        action = random.choices(self._NOISY_MOVES[act], weights=self._NOISY_WEIGHTS, k=1)[0]
        next_state = self.take_action(state, action)

        # compute the reward based on the resulting state
        reward = 1 if next_state == self.goal_state else 0

        # return the next state, reward
        return next_state, reward

    """ DO NOT CHANGE BELOW """
    def take_action(self, state, act):
        """
        Input args: 
            state (list): a list variable containing x, y integer coordinates. (i.e., [1, 1]).
            act (string): a string variable (i.e., "UP"). All feasible values are ["UP", "DOWN", "LEFT", "RIGHT"].
        Output args: 
            next_state (list): a list variable containing x, y integer coordinates (i.e., [1, 1])
        """
        state = np.array(state)
        next_state = state + self.action_space[act]
        return next_state.tolist() if next_state.tolist() in self.state_space else state.tolist()

In [4]:
def generate_table(env):
    """
    Generates a transition table for the environment

    Args:
        env: an environment exposing state_space, action_space, start_state,
            goal_state and a deterministic take_action(state, act) method.
    Returns:
        A pandas DataFrame with one row per distinct (s, a, s', r) outcome and its
        probability, in the columns ["s", "a", "s'", "r", "p(s',r | s,a)"], sorted by
        s, a, s', r. States are stored as tuples of plain Python ints.
    """
    # For every commanded action: the move actually executed, aligned with sub_probs.
    sub_actions = {
        'UP':    ["UP", "LEFT", "RIGHT"],
        'DOWN':  ["DOWN", "LEFT", "RIGHT"],
        'LEFT':  ["LEFT", "UP", "DOWN"],
        'RIGHT': ["RIGHT", "UP", "DOWN"]
    }
    sub_probs = [0.8, 0.1, 0.1]

    goal = [int(coord) for coord in env.goal_state]
    start_tuple = tuple(int(coord) for coord in env.start_state)

    # Maps (s, a, s_prime, r) -> probability. Several sub-actions can lead to the
    # same outcome (e.g. bumping into walls), so their probabilities are summed.
    transition_probs = defaultdict(float)

    for s in env.state_space:
        # Normalize to plain ints so the "s" column has the same element type as "s'"
        # even when the state space holds numpy integer scalars.
        s_tuple = tuple(int(coord) for coord in s)

        # If s is the goal, then from s + any action => start state, reward=1
        if list(s_tuple) == goal:
            for a in env.action_space:
                transition_probs[(s_tuple, a, start_tuple, 1)] += 1.0
            continue

        # Otherwise, we accumulate sub-action probabilities for each chosen action
        for a in env.action_space:
            for sub_a, p_sub in zip(sub_actions[a], sub_probs):
                next_s_tuple = tuple(int(coord) for coord in env.take_action(s, sub_a))
                r = 1 if list(next_s_tuple) == goal else 0
                transition_probs[(s_tuple, a, next_s_tuple, r)] += p_sub

    # Convert to a dataframe
    rows = [[s, a, s_prime, r, p] for (s, a, s_prime, r), p in transition_probs.items()]
    df = pd.DataFrame(rows, columns=["s", "a", "s'", "r", "p(s',r | s,a)"])
    return df.sort_values(by=["s", "a", "s'", "r"], ignore_index=True)


In [5]:
if __name__ == "__main__":
    # Build the environment and enumerate its transition table.
    env = FourRooms()
    df_transitions = generate_table(env)

    # Cast both coordinates of every tuple in column "s" to built-in ints.
    df_transitions["s"] = df_transitions["s"].apply(
        lambda cell: (int(cell[0]), int(cell[1]))
    )

    # Show the first ten and the last five rows as markdown tables.
    print("Sample transitions:")
    for preview in (df_transitions.head(10), df_transitions.tail(5)):
        print(preview.to_markdown(index=False))

    # Write the complete table to disk.
    df_transitions.to_csv("transitions.csv", index=False)


Sample transitions:
| s      | a     | s'     |   r |   p(s',r | s,a) |
|:-------|:------|:-------|----:|----------------:|
| (0, 0) | DOWN  | (0, 0) |   0 |             0.9 |
| (0, 0) | DOWN  | (1, 0) |   0 |             0.1 |
| (0, 0) | LEFT  | (0, 0) |   0 |             0.9 |
| (0, 0) | LEFT  | (0, 1) |   0 |             0.1 |
| (0, 0) | RIGHT | (0, 0) |   0 |             0.1 |
| (0, 0) | RIGHT | (0, 1) |   0 |             0.1 |
| (0, 0) | RIGHT | (1, 0) |   0 |             0.8 |
| (0, 0) | UP    | (0, 0) |   0 |             0.1 |
| (0, 0) | UP    | (0, 1) |   0 |             0.8 |
| (0, 0) | UP    | (1, 0) |   0 |             0.1 |
| s        | a     | s'       |   r |   p(s',r | s,a) |
|:---------|:------|:---------|----:|----------------:|
| (10, 9)  | UP    | (10, 10) |   1 |             0.8 |
| (10, 10) | DOWN  | (0, 0)   |   1 |             1   |
| (10, 10) | LEFT  | (0, 0)   |   1 |             1   |
| (10, 10) | RIGHT | (0, 0)   |   1 |             1   |
| (10, 10) | UP    |

**For Solving Q6b**

In [6]:
from sympy import symbols, Eq, solve

# Unknowns of the linear system.
# NOTE(review): presumably the values of the "high" and "low" states in Q6b — confirm against the problem statement.
v_high, v_low = symbols('v_high v_low')

# Two linear equations, each expressing one unknown in terms of the other.
eq_h = Eq(v_high, (10 + 0.18 * v_low) / 0.28)
eq_l = Eq(v_low, (0.75 + 0.675 * v_high) / 0.775)

# Solve both equations simultaneously for the two unknowns.
solution = solve((eq_h, eq_l), (v_high, v_low))
# Bare expression so the notebook cell displays the result.
solution

{v_high: 82.5654450261780, v_low: 72.8795811518325}