In [7]:
import numpy as np
import gym
from collections import deque

class QLearningAgent:
    """Tabular Q-learning agent over a discretized continuous observation.

    Each observation dimension is mapped to the index of the nearest point
    on an evenly spaced grid, and the resulting index tuple addresses a
    dense Q-table of shape ``bins + (number_of_actions,)``.

    Args:
        env: Environment exposing ``observation_space.low``/``.high``,
            ``action_space.n``/``.sample()``, ``reset()`` and ``step()``.
        learning_rate: Step size applied to the TD error.
        discount_factor: Weight of the bootstrapped next-state value.
        epsilon: Probability of taking a random action in ``get_action``.
        bins: Number of grid points per observation dimension.
        unbounded_limit: Magnitude substituted for any observation bound
            that is infinite or a "no limit" sentinel (e.g. float32 max).
            Without this, such dimensions collapse into one or two bins
            and the table cannot distinguish their values.

    Raises:
        ValueError: If ``bins`` does not have one entry per observation
            dimension.
    """

    # Bounds at or beyond this magnitude are treated as "unbounded"
    # sentinels rather than as meaningful limits.
    _UNBOUNDED_MAGNITUDE = 1e30

    def __init__(self, env, learning_rate=0.1, discount_factor=0.99, epsilon=0.1,
                 bins=(30, 30, 30, 30), unbounded_limit=4.0):
        self.env = env
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.bins = tuple(bins)
        self.q_table = np.zeros(self.bins + (env.action_space.n,))

        # Finite (low, high) pair for each dimension of the state space.
        self.state_bounds = [
            self._finite_bounds(low, high, unbounded_limit)
            for low, high in zip(env.observation_space.low, env.observation_space.high)
        ]
        if len(self.state_bounds) != len(self.bins):
            raise ValueError(
                f"bins has {len(self.bins)} entries but the observation has "
                f"{len(self.state_bounds)} dimensions"
            )

        # The grids never change, so build them once instead of on every
        # call to discretize_state.
        self._bin_grids = [
            np.linspace(low, high, count)
            for (low, high), count in zip(self.state_bounds, self.bins)
        ]

    @classmethod
    def _finite_bounds(cls, low, high, limit):
        """Return ``(low, high)`` with unbounded sides replaced by ``±limit``."""
        low, high = float(low), float(high)
        if not np.isfinite(low) or abs(low) >= cls._UNBOUNDED_MAGNITUDE:
            low = -abs(limit)
        if not np.isfinite(high) or abs(high) >= cls._UNBOUNDED_MAGNITUDE:
            high = abs(limit)
        return low, high

    def discretize_state(self, state):
        """Map a continuous observation to a tuple of grid indices.

        Each component is clipped to its bounds and replaced by the index
        of the closest grid point for that dimension.
        """
        indices = []
        for value, (low, high), grid in zip(state, self.state_bounds, self._bin_grids):
            clipped = min(max(value, low), high)
            indices.append(int(np.abs(grid - clipped).argmin()))
        return tuple(indices)

    def get_action(self, state):
        """Choose an action epsilon-greedily with respect to the Q-table."""
        discretized_state = self.discretize_state(state)
        if np.random.rand() < self.epsilon:
            return self.env.action_space.sample()  # Random action
        return int(np.argmax(self.q_table[discretized_state]))  # Greedy action

    def update_q_table(self, state, action, reward, next_state, done=False):
        """Apply one Q-learning update for a single transition.

        Args:
            state: Observation before the action.
            action: Index of the action taken.
            reward: Reward received for the transition.
            next_state: Observation after the action.
            done: True when ``next_state`` is terminal. A terminal state has
                no future return, so its value is not bootstrapped.
        """
        entry = self.discretize_state(state) + (action,)
        if done:
            future_value = 0.0
        else:
            future_value = np.max(self.q_table[self.discretize_state(next_state)])
        td_target = reward + self.discount_factor * future_value
        td_error = td_target - self.q_table[entry]
        self.q_table[entry] += self.learning_rate * td_error

    def _reset_env(self):
        """Reset the environment and return only the initial observation."""
        result = self.env.reset()
        # Newer Gym/Gymnasium releases return (observation, info).
        if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
            return result[0]
        return result

    def _step_env(self, action):
        """Step the environment; return ``(obs, reward, terminated, done)``.

        ``terminated`` is True only for a genuine terminal state, while
        ``done`` is also True when the episode was cut off by a time limit.
        """
        result = self.env.step(action)
        if len(result) == 5:
            obs, reward, terminated, truncated, _ = result
            return obs, reward, bool(terminated), bool(terminated or truncated)
        obs, reward, done, info = result
        # Older Gym reports time-limit cut-offs through the info dict.
        truncated = isinstance(info, dict) and bool(info.get("TimeLimit.truncated", False))
        return obs, reward, bool(done) and not truncated, bool(done)

    def train(self, num_episodes):
        """Run up to ``num_episodes`` episodes of epsilon-greedy Q-learning.

        Prints per-episode statistics and stops early once the average
        reward over the most recent (up to 100) episodes has been at least
        195 for 100 consecutive episodes.
        """
        recent_rewards = deque(maxlen=100)
        consecutive_rewards = 0
        success = False
        for episode in range(num_episodes):
            state = self._reset_env()
            total_reward = 0
            done = False

            while not done:
                action = self.get_action(state)
                next_state, reward, terminated, done = self._step_env(action)
                # Only a true terminal state stops bootstrapping; a
                # time-limit cut-off still has future value.
                self.update_q_table(state, action, reward, next_state, terminated)
                total_reward += reward
                state = next_state

            recent_rewards.append(total_reward)
            avg_reward = np.mean(recent_rewards)
            print(f"Episode: {episode + 1}, Total Reward: {total_reward}, Average Reward: {avg_reward}")

            if avg_reward >= 195:
                consecutive_rewards += 1
                if consecutive_rewards >= 100:
                    print(f"Problem solved after {episode + 1} episodes.")
                    success = True
                    break
            else:
                consecutive_rewards = 0

        if not success:
            print("Failed to solve the problem.")

if __name__ == "__main__":
    # Script entry point: build the CartPole environment, wrap it in a
    # tabular Q-learning agent, and train for a fixed episode budget.
    episode_budget = 1000
    env = gym.make('CartPole-v1')
    agent = QLearningAgent(env=env)
    agent.train(num_episodes=episode_budget)


Episode: 1, Total Reward: 10.0, Average Reward: 10.0
Episode: 2, Total Reward: 10.0, Average Reward: 10.0
Episode: 3, Total Reward: 10.0, Average Reward: 10.0
Episode: 4, Total Reward: 11.0, Average Reward: 10.25
Episode: 5, Total Reward: 10.0, Average Reward: 10.2
Episode: 6, Total Reward: 13.0, Average Reward: 10.666666666666666
Episode: 7, Total Reward: 10.0, Average Reward: 10.571428571428571
Episode: 8, Total Reward: 10.0, Average Reward: 10.5
Episode: 9, Total Reward: 9.0, Average Reward: 10.333333333333334
Episode: 10, Total Reward: 10.0, Average Reward: 10.3
Episode: 11, Total Reward: 11.0, Average Reward: 10.363636363636363
Episode: 12, Total Reward: 12.0, Average Reward: 10.5
Episode: 13, Total Reward: 10.0, Average Reward: 10.461538461538462
Episode: 14, Total Reward: 11.0, Average Reward: 10.5
Episode: 15, Total Reward: 12.0, Average Reward: 10.6
Episode: 16, Total Reward: 12.0, Average Reward: 10.6875
Episode: 17, Total Reward: 10.0, Average Reward: 10.647058823529411
Epis