In [1]:
# First ensure that we can install numpy and gym here then import them
import sys
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install gym

import numpy as np
import gym
import time
np.set_printoptions(precision=3)

Collecting numpy
  Obtaining dependency information for numpy from https://files.pythonhosted.org/packages/cc/05/ef9fc04adda45d537619ea956bc33489f50a46badc949c4280d8309185ec/numpy-1.26.0-cp310-cp310-win_amd64.whl.metadata
  Downloading numpy-1.26.0-cp310-cp310-win_amd64.whl.metadata (61 kB)
     ---------------------------------------- 0.0/61.1 kB ? eta -:--:--
     ------------ ------------------------- 20.5/61.1 kB 320.0 kB/s eta 0:00:01
     ------------------- ------------------ 30.7/61.1 kB 325.1 kB/s eta 0:00:01
     ------------------------------- ------ 51.2/61.1 kB 525.1 kB/s eta 0:00:01
     -------------------------------------- 61.1/61.1 kB 540.9 kB/s eta 0:00:00
Downloading numpy-1.26.0-cp310-cp310-win_amd64.whl (15.8 MB)
   ---------------------------------------- 0.0/15.8 MB ? eta -:--:--
   ---------------------------------------- 0.1/15.8 MB 1.7 MB/s eta 0:00:10
   ---------------------------------------- 0.2/15.8 MB 1.8 MB/s eta 0:00:09
    ---------------------------

In [None]:
# Build the FrozenLake environment on the 4x4 map with slipping turned off.
env = gym.make(
    "FrozenLake-v1",
    map_name="4x4",
    is_slippery=False,
)

"""
You can see from documentation that this environment contains three main things inside:
	P: nested dictionary 
	    (models p(s', r | s, a): the probability that, starting in state s and applying action a, the agent lands in state s' and receives reward r)
		From gym.core.Environment:
		For each state in [0, nS - 1] and each action in [0, nA - 1], P[state][action] is a
		list of tuples (one per possible outcome) of the form (probability, nextstate, reward, terminal) where
			- probability: float
				the probability of transitioning from "state" to "nextstate" with "action"
			- nextstate: int
				denotes the state we transition to (in range [0, nS - 1])
			- reward: int
				either 0 or 1, the reward for transitioning from "state" to
				"nextstate" with "action"
			- terminal: bool
			  True when "nextstate" is a terminal state (hole or goal), False otherwise
	nS: int
		number of states in the environment
	nA: int
		number of actions in the environment
		Inside, they implement it with an enum:
		LEFT = 0
        DOWN = 1
        RIGHT = 2
        UP = 3
"""

def runEpisode(env, policy, maxSteps=100, delay=0.25):
    """
    Play a single episode on `env` by following `policy`, rendering every step.

    Each transition is printed as it happens. At the end the function prints
    either the accumulated episode reward (if a terminal state was reached) or a
    notice that no terminal state was reached within `maxSteps` steps.

    Parameters
    ----------
    env: gym.core.Environment
      Environment to play on. Only reset(), step(action) and render() are used.
      Both the classic Gym API (reset() -> obs, step() -> 4-tuple) and the
      Gym >= 0.26 API (reset() -> (obs, info), step() -> 5-tuple) are accepted.
    policy: np.array of shape [env.nS]
      The action to take at a given state; it is indexed by the observation.
    maxSteps: int
      Maximum number of actions to take before giving up on the episode.
    delay: float
      Seconds to pause after each render so the episode can be followed
      visually. Values <= 0 disable the pause.

    Returns
    -------
    None
    """

    # Running sum of the rewards collected during the episode.
    total_reward = 0

    # `done` must exist even if the loop body never executes (maxSteps <= 0),
    # otherwise the check after the loop raises NameError.
    done = False

    # Reset the environment to an initial state; this yields the first observation.
    # Here the observation is the state the agent is currently positioned in,
    # a number in [0, nS-1] (a helper such as stateToRC, if available elsewhere in
    # the notebook, can convert it to a row/column pair).
    # Actions are in range [0, nA-1]; see the LEFT/DOWN/RIGHT/UP enum above.
    obs = env.reset()
    # Gym >= 0.26 returns (observation, info) instead of the bare observation.
    if isinstance(obs, tuple) and len(obs) == 2 and isinstance(obs[1], dict):
        obs = obs[0]

    for t in range(maxSteps):
        # Draw the environment on screen
        env.render()
        # Sleep a bit between decisions
        if delay > 0:
            time.sleep(delay)

        # Pick the action from our policy; it is treated as deterministic here.
        action = policy[obs]

        # Interact with the environment. We give it an action and it returns:
        # - the new observation (the state observable by the agent),
        # - the reward of the action just made,
        # - whether the simulation is over (terminal state / time limit),
        # - an "info" output that we are not interested in, so it is ignored.
        outcome = env.step(action)
        if len(outcome) == 5:
            # Gym >= 0.26 splits the old `done` flag into terminated/truncated.
            newObs, reward, terminated, truncated, _ = outcome
            done = terminated or truncated
        else:
            newObs, reward, done, _ = outcome
        print(f"Agent was in state {obs}, took action {action}, now in state {newObs}")
        obs = newObs

        total_reward += reward
        # Leave the loop before maxSteps if we are in a terminal state
        if done:
            break

    if not done:
        print(f"The agent didn't reach a terminal state in {maxSteps} steps.")
    else:
        print(f"Episode reward: {total_reward}")
    env.render() # One last rendering of the episode.