In [2]:
import gymnasium as gym
import numpy as np
from gymnasium.envs.registration import register

In [3]:


env=gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=True)
# Custom Frozen Lake Environment
class CustomFrozenLake(gym.envs.toy_text.FrozenLakeEnv):
    def __init__(self, **kwargs):
        super(CustomFrozenLake, self).__init__(**kwargs)

    def step(self, action):
        state, reward, done, truncated, info = super(CustomFrozenLake, self).step(action)
        self.goal=env.unwrapped.desc.shape[0]*env.unwrapped.desc.shape[1]-1
        if state == self.goal:
            reward = 5  # Positive reward for reaching the goal
        elif done and state != self.goal:
            reward = -5  # Negative reward for falling into a hole
        else:
            reward =-1
        return state, reward, done, truncated, info

# Register the custom environment
register(
    id='CustomFrozenLake-v0',
    entry_point='__main__:CustomFrozenLake',
    kwargs={'is_slippery': True,
          }
)

# Create the custom environment
env = gym.make('CustomFrozenLake-v0', render_mode='human')
state = env.reset()


2024-11-22 09:25:33.232 python[52978:884143] +[IMKClient subclass]: chose IMKClient_Modern
2024-11-22 09:25:33.232 python[52978:884143] +[IMKInputSession subclass]: chose IMKInputSession_Modern


In [7]:
class LinearValueFunction:
    def __init__(self, num_features):
        self.num_features = num_features

    def __call__(self, state, weights):
        features = self.get_features(state)
        return np.dot(features, weights)

    def gradient(self, state, weights):
        return self.get_features(state)

    def get_features(self, state):
        return state // 4, state % 4

num_states = env.observation_space.n
value_function = LinearValueFunction(num_states)

In [5]:
def random_policy(state):
    return env.action_space.sample()

random_policy

<function __main__.random_policy(state)>

In [9]:
class GradientMonteCarlo:
    def __init__(self, env, policy, value_function, alpha=0.01):
        self.env = env
        self.policy = policy
        self.value_function = value_function
        self.alpha = alpha
        self.weights = np.random.rand(2)

    def generate_episode(self):
        episode = []
        state = self.env.reset()
        done = False
        while not done:
            action = self.policy(state)
            next_state, reward, done, truncated, info = self.env.step(action)
            episode.append((state, action, reward))
            state = next_state
        return episode

    def update_weights(self, episode):
        G = 0
        for state, action, reward in reversed(episode):
            print(state, action, reward,'w')
            G = reward + G
            value_estimate = self.value_function(state, self.weights)
            gradient = self.value_function.gradient(state, self.weights)
            self.weights += self.alpha * (G - value_estimate) * np.array(gradient)

    def train(self, num_episodes):
        for _ in range(num_episodes):
            episode = self.generate_episode()
            episode.remove(episode[0])
            print(episode,'e')
            self.update_weights(episode)


policy = random_policy
gmc = GradientMonteCarlo(env, policy, value_function)
gmc.train(100)

[(0, np.int64(3), -1), (0, np.int64(0), -1), (4, np.int64(3), -1), (0, np.int64(3), -1), (1, np.int64(1), -1), (2, np.int64(0), -1), (6, np.int64(0), -5)] e
6 0 -5 w
2 0 -1 w
1 1 -1 w
0 3 -1 w
4 3 -1 w
0 0 -1 w
0 3 -1 w
[(1, np.int64(2), -1), (2, np.int64(1), -1), (3, np.int64(1), -1), (2, np.int64(0), -1), (2, np.int64(0), -1), (1, np.int64(2), -5)] e
1 2 -5 w
2 0 -1 w
2 0 -1 w
3 1 -1 w
2 1 -1 w
1 2 -1 w
[(1, np.int64(2), -5)] e
1 2 -5 w
[(1, np.int64(0), -5)] e
1 0 -5 w
[(4, np.int64(1), -5)] e
4 1 -5 w
[(0, np.int64(2), -1), (1, np.int64(3), -1), (0, np.int64(3), -1), (1, np.int64(3), -1), (0, np.int64(3), -1), (0, np.int64(1), -1), (1, np.int64(3), -1), (2, np.int64(2), -1), (2, np.int64(0), -1), (2, np.int64(2), -1), (3, np.int64(3), -1), (3, np.int64(3), -1), (3, np.int64(1), -1), (2, np.int64(2), -1), (2, np.int64(2), -1), (6, np.int64(3), -5)] e
6 3 -5 w
2 2 -1 w
2 2 -1 w
3 1 -1 w
3 3 -1 w
3 3 -1 w
2 2 -1 w
2 0 -1 w
2 2 -1 w
1 3 -1 w
0 1 -1 w
0 3 -1 w
1 3 -1 w
0 3 -1 w
1 3 -1 w

In [4]:
policy = lambda state: env.action_space.sample()
policy

<function __main__.<lambda>(state)>