In [83]:
import gymnasium as gym

env = gym.make("LunarLander-v3", render_mode="human")
observation, info = env.reset()

episode_over = False
while not episode_over:
    action = env.action_space.sample()  # agent policy that uses the observation and info
    observation, reward, terminated, truncated, info = env.step(action)

    episode_over = terminated or truncated

env.close()

In [84]:
import collections
from torch.utils.tensorboard import SummaryWriter

In [85]:
writer = SummaryWriter('runs/q_learning')

In [86]:
ENV_NAME='FrozenLake-v1'
GAMMA=0.9
TEST_EPISODES=20

In [90]:
class Agent:
    def __init__(self):
        self.env = gym.make(ENV_NAME)
        self.initial_state, _ = self.env.reset()
        self.value_table = collections.defaultdict(float)
        self.rewards_table = collections.defaultdict(float)
        self.transition_table = collections.defaultdict(collections.Counter)
        
    def play_n_random_steps(self, count):
        for i in range(count):
            action = self.env.action_space.sample()
            # print(self.env.step(action))
            new_state, reward, is_done, _ , _= self.env.step(action)
            self.rewards_table[(self.initial_state, action, new_state)] = reward
            self.transition_table[(self.initial_state, action)][new_state] += 1
            if is_done:
                self.initial_state, _ = self.env.reset()
                
            else:
                self.initial_state = new_state
                
    def calculate_action_value(self, state, action):
        target_counts = self.transition_table[(state, action)]
        total = sum(target_counts.values())
        action_value = 0.0
        # print(target_counts.items())
        for target_state, count in target_counts.items():
            current_reward = self.rewards_table[(state, action, target_state)]
            transition_probability = count / total
            action_value += transition_probability * (current_reward + GAMMA * self.value_table[target_state])
        return action_value
    

    def select_best_action(self, state, env):
        
        best_action = None
        best_action_value = None
        
        for i in range(env.action_space.n):
            action = i
            current_action_value = self.calculate_action_value(state, action)
            if best_action_value is None or current_action_value > best_action_value:
                best_action_value = current_action_value
                best_action = i
        return best_action
    
    
    def v_iteration(self):
        for state in range(self.env.observation_space.n):
            q_values = [ self.calculate_action_value(state, action) for action in range(self.env.action_space.n)]
            preferred_action = max(q_values)
            self.value_table[state] = preferred_action
    
    def play_fullepisode(self, env):
        
        state, _ = env.reset()
        total_reward = 0.0
        while True:
           
            action = self.select_best_action(state, env)
            new_state, reward, is_done, _ , _= self.env.step(action)
            self.rewards_table[(state, action, new_state)] = reward
            self.transition_table[(state, action)][new_state] += 1
            total_reward += reward
            if is_done:
                break
            else:
                state = new_state
                
        return total_reward

In [91]:
agent = Agent()
# i, _ = agent.env.reset()
# i


In [93]:
iter = 0
best_reward = 0.0
reward = 0
while True:
    agent.play_n_random_steps(100)
    agent.v_iteration()
    iter += 1
    test_env = gym.make(ENV_NAME)
    for i in range(TEST_EPISODES):
        reward += agent.play_fullepisode(test_env)
    
    reward /= TEST_EPISODES
    
    writer.add_scalar("reward", reward, iter)
    
    if reward > best_reward:
            print("Best reward updated %.3f -> %.3f" % (
                best_reward, reward))
            best_reward = reward
            
    if reward >= 0.80:
        print(f"Solved in {iter} steps!")
        break
writer.close()

Best reward updated 0.000 -> 0.050
Best reward updated 0.050 -> 0.050
Best reward updated 0.050 -> 0.050
Best reward updated 0.050 -> 0.050
Best reward updated 0.050 -> 0.053
Best reward updated 0.053 -> 0.053
Best reward updated 0.053 -> 0.053
Best reward updated 0.053 -> 0.053
Best reward updated 0.053 -> 0.053
Best reward updated 0.053 -> 0.053
Best reward updated 0.053 -> 0.053


KeyboardInterrupt: 