# Zadanie 5

Celem ćwiczenia jest implementacja algorytmu Q-learning.

Następnie należy stworzyć agenta rozwiązującego problem [Taxi](https://gymnasium.farama.org/environments/toy_text/taxi/). Problem dostępny jest w pakiecie **gym**.

Punktacja (max 7 pkt):
- Implementacja algorytmu Q-learning. [3 pkt]
- Eksperymenty dla różnych wartości hiperparametrów [2 pkt]
- Jakość kodu [1 pkt]
- Wnioski [1 pkt]


In [7]:
import numpy as np
import gymnasium as gym

class QLearningSolver:
    """Class containing the Q-learning algorithm that might be used for different discrete environments."""

    def __init__(
        self,
        observation_space: int,
        action_space: int,
        learning_rate: float = 0.1,
        gamma: float = 0.9,
        epsilon: float = 0.1,
    ):
        self.observation_space = observation_space
        self.action_space = action_space
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon

        # Initialize Q-table with zeros
        self.q_table = np.zeros((observation_space, action_space))

    def __call__(self, state: int, action: int) -> float:
        """Return Q-value of given state and action."""
        return self.q_table[state, action]

    def update(self, state: int, action: int, reward: float, next_state: int) -> None:
        """Update Q-value of given state and action."""
        # Q-learning update rule
        max_next_q_value = np.max(self.q_table[next_state, :])
        delta = reward + self.gamma * max_next_q_value - self.q_table[state, action]
        self.q_table[state, action] += self.learning_rate * delta

    def get_best_action(self, state):
        # state = int(state)  # Ensure that state is an integer
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_space)
        else:
            return np.argmax(self.q_table[state, :])

    # nie wiem do końca czy tutaj nie chodzi może bardzeij o wizualną reprezentację
    def __repr__(self):
        """Elegant representation of Q-learning solver."""
        return f"QLearningSolver(observation_space={self.observation_space}, action_space={self.action_space}, learning_rate={self.learning_rate}, gamma={self.gamma}, epsilon={self.epsilon})"

    def __str__(self):
        return self.__repr__()


# Create the Taxi-v3 environment
env = gym.make('Taxi-v3')
num_states = env.observation_space.n   
num_actions = env.action_space.n

# Create a Q-learning solver
q_solver = QLearningSolver(observation_space=num_states, action_space=num_actions)

# Q-learning loop
for episode in range(5000):
    state = env.reset()
    total_reward = 0

    for step in range(200):  # Limit the number of steps per episode to avoid infinite loops
        action = q_solver.get_best_action(state)
        next_state, reward, done, _ = env.step(action)

        q_solver.update(state, action, reward, next_state)

        total_reward += reward
        state = next_state

        if done:
            break

    if episode % 100 == 0:
        print(f"Episode {episode}, Total Reward: {total_reward}")

# Test the learned policy
state = env.reset()
env.render()
while True:
    action = q_solver.get_best_action(state)
    state, _, done, _ = env.step(action)
    env.render()
    if done:
        break

env.close()


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

# Eksperymenty

# Wnioski