In [None]:
import gymnasium as gym
import numpy as np
from utils import JupyterRender

%matplotlib inline

In [15]:
class MC_prediction:
    def __init__(self, env, pi, gamma=0.9, max_episode=100, render=False):
        self.env = env
        
        self.nrow = env.nrow
        self.ncol = env.ncol 
        
        self.state_dim = env.observation_space.n # self.nrow * self.ncol
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.max_episode = max_episode
        self.render = render
        
        self.pi = pi
        self.v = np.zeros([self.state_dim])
        self.returns = [[] for _ in range(self.state_dim)]
        
        #check policy validity
        assert len(self.pi) == self.state_dim
        
        for i in range(self.state_dim):
            assert self.pi[i] >= 0 and self.pi[i] < self.action_dim
            
    def run(self):
        for episode in range(self.max_episode):
            observation, _ = self.env.reset()
            
            done = False
            local_step = 0
            trajectory = []
            
            while not done:
                action = self.pi[observation]
                next_observation, reward, done, _, _ = self.env.step(action)
                
                if self.render:
                    self.env.render(title=f"Episode {episode} / step {local_step}",v=self.v, policy=self.pi)
                # give penalty for staying in ground
                if reward == 0:
                    reward = -0.001
                    
                # give penalty for falling into the hole
                if done and next_observation != 15:
                    reward = -1

                if local_step == 100:
                    done = True #prevent infinite episode
                    reward = -1

                if observation == next_observation: # prevent meaningless actions
                    reward = -1

                trajectory.append({'s': observation, 'a': action, 'r': reward})
                
                observation = next_observation 
                local_step += 1
            trajectory.reverse()
            G = 0
            
            traj_states = list(map(lambda x: x['s'], trajectory))
            
            for i in range(len(trajectory)):
                G = self.gamma * G + trajectory[i]['r'] 
                
                if trajectory[i]['s'] not in traj_states[i+1:]:
                    self.returns[trajectory[i]['s']].append(G)
                    self.v[trajectory[i]['s']] = sum(self.returns[trajectory[i]['s']]) / len(self.returns[trajectory[i]['s']])
                

In [16]:
env = gym.make("FrozenLake-v1", render_mode='rgb_array', is_slippery=False)#define the environment.
env = JupyterRender(env)

policy = np.array([1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0], dtype=int) #optimal policy

NameError: name 'JupyterRender' is not defined

In [None]:
mc_config = {
    'env': env,
    'pi': policy,
    'gamma': 0.9,
    'render': True,
    'max_episode': 10
}

mc = MC_prediction(mc_config)
mc.run()

TypeError: MC_prediction.__init__() missing 1 required positional argument: 'pi'