In [None]:
from collections import defaultdict
import numpy as np
import random
import gym

In [None]:
# Create the CartPole-v0 environment and bind it to the module-level name ``env``.
env = gym.make('CartPole-v0')

In [None]:
class HERReplayBuffer:
    """Fixed-capacity store of goal-conditioned transitions.

    Every entry is the tuple ``(state, action, reward, next_state, done,
    goal)``. Entries are appended until ``buffer_size`` of them are held;
    after that each new entry overwrites the oldest one.
    """

    def __init__(self, buffer_size):
        """Create an empty buffer.

        Args:
            buffer_size: Maximum number of transitions kept at once.

        Raises:
            ValueError: If ``buffer_size`` is not positive.
        """
        # A non-positive capacity would otherwise only fail later, inside
        # push(), with an unrelated-looking IndexError.
        if buffer_size <= 0:
            raise ValueError(f"buffer_size must be positive, got {buffer_size}")
        self.buffer_size = buffer_size
        self.buffer = []
        # Slot that the next push overwrites once the buffer is full.
        self.idx = 0

    def push(self, state, action, reward, next_state, done, goal):
        """Store one transition, evicting the oldest entry when full."""
        experience = (state, action, reward, next_state, done, goal)
        if len(self.buffer) < self.buffer_size:
            self.buffer.append(experience)
            return
        self.buffer[self.idx] = experience
        self.idx = (self.idx + 1) % self.buffer_size

    def add(self, transition):
        """Store a ready-made transition tuple.

        Args:
            transition: ``(state, action, reward, next_state, done, goal)``.
        """
        self.push(*transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Six tuples ``(states, actions, rewards, next_states, dones,
            goals)``, each of length ``batch_size``.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        samples = random.sample(self.buffer, batch_size)
        if not samples:
            # zip(*[]) yields nothing, so unpacking into six names would fail.
            return (), (), (), (), (), ()
        states, actions, rewards, next_states, dones, goals = zip(*samples)
        return states, actions, rewards, next_states, dones, goals

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

class HER:
    """Q-learning agent that trains a Q-network from a replay buffer.

    The goal reported by the environment is stored with every transition,
    but the update in this class does not read it and no hindsight goal
    relabelling is performed here.
    """

    def __init__(
        self,
        env,
        q_network,
        buffer_size=1000000,
        batch_size=128,
        gamma=0.99,
        alpha=0.001,
        eps=0.9,
        her_k=4,
        n_epochs=50,
    ):
        """Store the environment, the network and the hyper-parameters.

        Args:
            env: Environment exposing ``reset``, ``step`` and
                ``action_space.sample``; ``get_state`` and ``get_goal`` are
                used when present.
            q_network: Object exposing ``predict(states)`` and
                ``train_on_batch(states, targets)``.
            buffer_size: Capacity of the replay buffer.
            batch_size: Number of transitions per network update.
            gamma: Discount factor applied to the bootstrapped value.
            alpha: Stored, but not read by any method of this class.
            eps: Probability of taking the greedy action in
                ``select_action``.
            her_k: Stored, but not read by any method of this class.
            n_epochs: Number of episodes run by ``train``.
        """
        self.env = env
        self.q_network = q_network
        self.replay_buffer = HERReplayBuffer(buffer_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.alpha = alpha
        self.eps = eps
        self.her_k = her_k
        self.n_epochs = n_epochs

    def _reset_env(self):
        """Reset the environment and return the initial state."""
        reset_result = self.env.reset()
        get_state = getattr(self.env, "get_state", None)
        if callable(get_state):
            return get_state()
        # Newer gym releases return ``(observation, info)`` from reset().
        if (
            isinstance(reset_result, tuple)
            and len(reset_result) == 2
            and isinstance(reset_result[1], dict)
        ):
            return reset_result[0]
        return reset_result

    def _step_env(self, action):
        """Step the environment; return (next_state, reward, terminal, episode_over)."""
        result = self.env.step(action)
        if len(result) == 5:
            # (obs, reward, terminated, truncated, info): only a true
            # termination should stop bootstrapping, but either flag ends
            # the episode loop.
            next_state, reward, terminated, truncated, _info = result
            return next_state, reward, terminated, terminated or truncated
        # 3-tuple (obs, reward, done) or 4-tuple (obs, reward, done, info).
        next_state, reward, done = result[:3]
        return next_state, reward, done, done

    def _current_goal(self):
        """Return ``env.get_goal()`` when the environment provides it, else None."""
        get_goal = getattr(self.env, "get_goal", None)
        return get_goal() if callable(get_goal) else None

    def train(self):
        """Run ``n_epochs`` episodes, storing transitions and updating the network.

        After every environment step the transition is pushed to the replay
        buffer, and the network is updated once the buffer holds at least
        ``batch_size`` transitions. The step count of each episode is printed.
        """
        for epoch in range(self.n_epochs):
            state = self._reset_env()
            episode_over = False
            steps = 0

            while not episode_over:
                steps += 1
                action = self.select_action(state)
                next_state, reward, terminal, episode_over = self._step_env(action)
                goal = self._current_goal()

                self.replay_buffer.push(state, action, reward, next_state, terminal, goal)

                if len(self.replay_buffer) >= self.batch_size:
                    self.update_q_network()

                state = next_state

            print(f"Epoch {epoch}: steps={steps}")

    def update_q_network(self):
        """Fit the network on one sampled batch of one-step Q-learning targets.

        For each sampled transition the target of the action taken is
        ``reward`` when the transition is terminal, otherwise
        ``reward + gamma * max_a Q(next_state, a)``. Targets of the other
        actions are left at the network's current predictions.
        """
        states, actions, rewards, next_states, dones, _goals = self.replay_buffer.sample(
            self.batch_size
        )

        states = np.array(states)
        actions = np.array(actions)
        rewards = np.array(rewards)
        next_states = np.array(next_states)
        dones = np.array(dones, dtype=bool)

        q_values = self.q_network.predict(states)
        q_values_next = self.q_network.predict(next_states)

        bootstrap = self.gamma * np.max(q_values_next, axis=1)
        # np.array copies, so the network's own prediction array is not mutated.
        targets = np.array(q_values)

        # Overwrite only the entry of the action actually taken in each row.
        rows = np.arange(len(actions))
        targets[rows, actions] = np.where(dones, rewards, rewards + bootstrap)

        self.q_network.train_on_batch(states, targets)

    def select_action(self, state):
        """Pick an action for ``state``.

        With probability ``eps`` the action with the highest predicted
        Q-value is returned; otherwise an action is sampled from
        ``env.action_space``.
        """
        # NOTE(review): ``eps`` is the probability of the *greedy* choice here,
        # the reverse of the usual epsilon-greedy convention — confirm intended.
        if np.random.uniform() < self.eps:
            q_values = self.q_network.predict(np.array([state]))
            action = np.argmax(q_values)
        else:
            action = self.env.action_space.sample()
        return action

In [None]:
def train_her_q_learning(num_episodes, max_steps, batch_size, alpha, gamma, beta, goal,
                         epsilon=0.9, state_decimals=1):
    """
    Train a tabular Q-learning agent with experience replay on the module-level ``env``.

    The environment is used through the classic gym interface: ``reset()``
    returns the observation and ``step()`` returns a 4-tuple. Transitions are
    stored together with the episode goal and replayed as stored; no
    hindsight goal relabelling is applied, because no goal-conditioned reward
    function is available to this function.

    Args:
        num_episodes: Number of episodes to run.
        max_steps: Maximum number of steps per episode.
        batch_size: Number of transitions replayed after each step, once the
            buffer holds at least that many.
        alpha: Learning rate of the Q-table update.
        gamma: Discount factor.
        beta: Accepted for interface compatibility; not used.
        goal: Callable invoked as ``goal(env)`` at the start of every episode;
            its result is stored with each transition.
        epsilon: Probability of taking the greedy action (otherwise a
            uniformly random action is taken).
        state_decimals: Number of decimals observations are rounded to before
            being used as Q-table keys, so that nearby continuous
            observations share an entry.

    Returns:
        The learned Q-table: a ``defaultdict`` mapping state keys to arrays
        of per-action values.
    """
    # Initialize replay buffer
    replay_buffer = HERReplayBuffer(buffer_size=1000)
    num_actions = env.action_space.n

    def state_key(state):
        """
        Convert an observation into a hashable, rounded Q-table key.
        """
        return tuple(np.round(np.ravel(state), state_decimals).tolist())

    def epsilon_greedy(Q, state, num_actions, epsilon):
        """
        Greedy action with probability ``epsilon``, random action otherwise.
        """
        if np.random.uniform() < epsilon:
            action = int(np.argmax(Q[state_key(state)]))
        else:
            action = np.random.randint(num_actions)
        return action

    # Initialize Q-table: unseen states start with all-zero action values
    Q = defaultdict(lambda: np.zeros(num_actions))

    for episode in range(num_episodes):
        # Reset environment and get initial state
        state = env.reset()
        goal_state = goal(env)
        episode_reward = 0

        for t in range(max_steps):
            # Choose action
            action = epsilon_greedy(Q, state, num_actions, epsilon)

            # Take action and observe next state and reward
            next_state, reward, done, _ = env.step(action)

            # Add transition to replay buffer
            replay_buffer.push(state, action, reward, next_state, done, goal_state)

            # Update Q-table using experience replay
            if len(replay_buffer) >= batch_size:
                columns = replay_buffer.sample(batch_size)
                # Distinct loop names keep the episode's own ``done`` flag intact.
                for s, a, r, s_next, terminal, _stored_goal in zip(*columns):
                    key = state_key(s)
                    # Do not bootstrap past the end of an episode.
                    future = 0.0 if terminal else gamma * np.max(Q[state_key(s_next)])
                    Q[key][a] += alpha * (r + future - Q[key][a])

            state = next_state
            episode_reward += reward

            if done:
                break

        # Print episode reward
        print(f"Episode {episode+1} reward: {episode_reward}")

    return Q


In [None]:
# Rebind the module-level ``env`` (read as a global by train_her_q_learning) to CartPole-v1.
env = gym.make('CartPole-v1')
# NOTE(review): get_cartpole_goal is not defined in the visible part of this file — confirm it exists before running.
train_her_q_learning(num_episodes=1000, max_steps=500, batch_size=32, alpha=0.1, gamma=0.99, beta=0.8, goal=get_cartpole_goal)