In [2]:
import numpy as np
import gym
from collections import deque

class QLearningAgent:
    """Tabular Q-learning agent for environments with continuous observations.

    Every observation dimension is snapped to the nearest of ``bins[i]`` evenly
    spaced grid points; the resulting index tuple addresses a dense Q-table of
    shape ``bins + (n_actions,)``. Actions are chosen epsilon-greedily and
    epsilon is decayed multiplicatively once per training episode.
    """

    # A declared bound whose magnitude exceeds this value (or that is not
    # finite) is treated as "unbounded". Some environments declare limits of
    # +/-inf or the float32 maximum (~3.4e38); a grid spanning such a range
    # puts every realistic value into the same bin.
    _UNBOUNDED_THRESHOLD = 1e6

    def __init__(self, env, learning_rate=0.1, discount_factor=0.99, initial_epsilon=1.0, min_epsilon=0.01, epsilon_decay=0.995, bins=(30, 30, 30, 30), unbounded_limit=4.0):
        """Create the agent and allocate a zero-initialised Q-table.

        Args:
            env: Environment exposing ``action_space.n``,
                ``action_space.sample()``, ``observation_space.low`` /
                ``observation_space.high``, ``reset()`` and ``step(action)``.
            learning_rate: Step size applied to the TD error.
            discount_factor: Discount applied to the next state's value.
            initial_epsilon: Starting exploration probability.
            min_epsilon: Floor below which epsilon is never decayed.
            epsilon_decay: Per-episode multiplicative decay of epsilon.
            bins: Number of grid points per observation dimension.
            unbounded_limit: Positive magnitude substituted for any declared
                bound that is non-finite or effectively infinite.

        Raises:
            ValueError: If ``unbounded_limit`` is not positive or ``bins`` does
                not have one entry per observation dimension.
        """
        if unbounded_limit <= 0:
            raise ValueError("unbounded_limit must be positive")

        self.env = env
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.initial_epsilon = initial_epsilon
        self.epsilon = initial_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay = epsilon_decay
        self.bins = tuple(bins)
        self.q_table = np.zeros(self.bins + (env.action_space.n,))

        low = np.asarray(env.observation_space.low, dtype=float).ravel()
        high = np.asarray(env.observation_space.high, dtype=float).ravel()
        if not (len(self.bins) == low.size == high.size):
            raise ValueError(
                f"bins has {len(self.bins)} entries but the observation space "
                f"has {low.size} dimensions"
            )

        # Finite (low, high) pair for each dimension of the state space.
        self.state_bounds = [
            (self._finite_bound(lo, -unbounded_limit), self._finite_bound(hi, unbounded_limit))
            for lo, hi in zip(low, high)
        ]
        # Grid points are fixed for the agent's lifetime, so build them once
        # instead of on every discretisation call.
        self._grids = [
            np.linspace(lo, hi, count)
            for (lo, hi), count in zip(self.state_bounds, self.bins)
        ]

    @staticmethod
    def _finite_bound(value, fallback):
        """Return ``value`` as a float, or ``fallback`` if it is unusable as a grid limit."""
        value = float(value)
        if not np.isfinite(value) or abs(value) > QLearningAgent._UNBOUNDED_THRESHOLD:
            return float(fallback)
        return value

    def discretize_state(self, state):
        """Map a continuous observation to a tuple of grid indices.

        Each component is clipped to its bounds and replaced by the index of
        the closest grid point (the lowest index wins on ties).
        """
        indices = []
        for value, (low, high), grid in zip(state, self.state_bounds, self._grids):
            clipped = min(max(value, low), high)  # Clip the value to be within bounds
            indices.append(int(np.abs(grid - clipped).argmin()))
        return tuple(indices)

    def get_action(self, state):
        """Return an epsilon-greedy action for ``state``."""
        discretized_state = self.discretize_state(state)
        if np.random.rand() < self.epsilon:
            return self.env.action_space.sample()  # Random action
        return int(np.argmax(self.q_table[discretized_state]))  # Greedy action

    def update_q_table(self, state, action, reward, next_state, done=False):
        """Apply one Q-learning update for the transition.

        Args:
            state: Observation before the action.
            action: Index of the action taken.
            reward: Reward received.
            next_state: Observation after the action.
            done: True when ``next_state`` is terminal. A terminal state has
                no future return, so its value is not bootstrapped.
        """
        entry = self.discretize_state(state) + (action,)
        td_target = reward
        if not done:
            next_values = self.q_table[self.discretize_state(next_state)]
            td_target = reward + self.discount_factor * np.max(next_values)
        td_error = td_target - self.q_table[entry]
        self.q_table[entry] += self.learning_rate * td_error

    def _reset_env(self):
        """Reset the environment and return only the initial observation.

        Accepts both a bare observation and an ``(observation, info)`` tuple
        as the result of ``env.reset()``.
        """
        result = self.env.reset()
        return result[0] if isinstance(result, tuple) else result

    def _step_env(self, action):
        """Step the environment; return ``(next_state, reward, terminated, done)``.

        ``done`` ends the episode; ``terminated`` is True only when the episode
        ended in a genuinely terminal state rather than being cut off by a
        time limit, so a cut-off transition can still bootstrap.
        """
        result = self.env.step(action)
        if len(result) == 5:
            next_state, reward, terminated, truncated, _ = result
            return next_state, reward, bool(terminated), bool(terminated or truncated)
        next_state, reward, done, info = result
        # NOTE(review): older gym TimeLimit wrappers flag a cut-off episode via
        # this info key -- confirm against the installed gym version.
        time_limited = isinstance(info, dict) and bool(info.get("TimeLimit.truncated", False))
        return next_state, reward, bool(done) and not time_limited, bool(done)

    def train(self, num_episodes):
        """Run up to ``num_episodes`` episodes of Q-learning, printing progress.

        Training stops early once the mean reward over the last (up to) 100
        episodes has been at least 195 for 100 consecutive episodes.
        """
        recent_rewards = deque(maxlen=100)
        consecutive_rewards = 0
        success = False
        for episode in range(num_episodes):
            state = self._reset_env()
            total_reward = 0
            done = False

            while not done:
                action = self.get_action(state)
                next_state, reward, terminated, done = self._step_env(action)
                self.update_q_table(state, action, reward, next_state, terminated)
                total_reward += reward
                state = next_state

            recent_rewards.append(total_reward)
            avg_reward = np.mean(recent_rewards)
            print(f"Episode: {episode + 1}, Total Reward: {total_reward}, Average Reward: {avg_reward}, Epsilon: {self.epsilon}")

            if avg_reward >= 195:
                consecutive_rewards += 1
                if consecutive_rewards >= 100:
                    print(f"Problem solved after {episode + 1} episodes.")
                    success = True
                    break
            else:
                consecutive_rewards = 0

            # Decay epsilon
            self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

        if not success:
            print("Failed to solve the problem.")

if __name__ == "__main__":
    # Train a tabular Q-learning agent on CartPole-v1 for up to 10000 episodes.
    env = gym.make('CartPole-v1')
    try:
        agent = QLearningAgent(env)
        agent.train(10000)
    finally:
        # Release the environment's resources even if training is interrupted.
        env.close()


Episode: 1, Total Reward: 18.0, Average Reward: 18.0, Epsilon: 1.0
Episode: 2, Total Reward: 33.0, Average Reward: 25.5, Epsilon: 0.995
Episode: 3, Total Reward: 16.0, Average Reward: 22.333333333333332, Epsilon: 0.990025
Episode: 4, Total Reward: 25.0, Average Reward: 23.0, Epsilon: 0.985074875
Episode: 5, Total Reward: 11.0, Average Reward: 20.6, Epsilon: 0.9801495006250001
Episode: 6, Total Reward: 11.0, Average Reward: 19.0, Epsilon: 0.9752487531218751
Episode: 7, Total Reward: 13.0, Average Reward: 18.142857142857142, Epsilon: 0.9703725093562657
Episode: 8, Total Reward: 25.0, Average Reward: 19.0, Epsilon: 0.9655206468094844
Episode: 9, Total Reward: 10.0, Average Reward: 18.0, Epsilon: 0.960693043575437
Episode: 10, Total Reward: 12.0, Average Reward: 17.4, Epsilon: 0.9558895783575597
Episode: 11, Total Reward: 18.0, Average Reward: 17.454545454545453, Epsilon: 0.9511101304657719
Episode: 12, Total Reward: 17.0, Average Reward: 17.416666666666668, Epsilon: 0.946354579813443
Epis