In [1]:
import gym
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Create the CartPole-v1 environment
env = gym.make('CartPole-v1')

# Define the size of the action space and observation space
num_actions = env.action_space.n
num_states = env.observation_space.shape[0]

# Define the parameters for the DQN
learning_rate = 0.001
discount_factor = 0.99
epsilon_initial = 1.0
epsilon_decay = 0.995
epsilon_min = 0.01
batch_size = 32
memory_size = 1000
target_update_frequency = 100

# Define the DQN model
def create_model():
    model = Sequential([
        Dense(24, input_shape=(num_states,), activation='relu'),
        Dense(24, activation='relu'),
        Dense(num_actions, activation='linear')
    ])
    model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate))
    return model

# Define the replay memory
class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []

    def push(self, transition):
        self.memory.append(transition)
        if len(self.memory) > self.capacity:
            del self.memory[0]

    def sample(self, batch_size):
        samples = np.random.choice(len(self.memory), batch_size, replace=False)
        states, actions, rewards, next_states, dones = [], [], [], [], []
        for idx in samples:
            state, action, reward, next_state, done = self.memory[idx]
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)
            dones.append(done)
        return np.array(states), np.array(actions), np.array(rewards), np.array(next_states), np.array(dones)

# Initialize the DQN model and target network
model = create_model()
target_model = create_model()

# Initialize the replay memory
memory = ReplayMemory(memory_size)

# Initialize variables for tracking training progress
total_rewards = []
epsilon = epsilon_initial

# Main training loop
for episode in range(1000):
    state = env.reset()
    state = np.reshape(state, [1, num_states])
    done = False
    episode_reward = 0
    
    while not done:
        # Choose action using epsilon-greedy policy
        if np.random.rand() <= epsilon:
            action = env.action_space.sample()  # Explore
        else:
            action = np.argmax(model.predict(state))  # Exploit
        
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, num_states])
        episode_reward += reward
        
        # Store transition in replay memory
        memory.push((state, action, reward, next_state, done))
        
        state = next_state  # Update state for the next step
        
       
        # Sample a random minibatch from replay memory and train the model
        if len(memory.memory) >= batch_size:
            minibatch = memory.sample(batch_size)
            states, actions, rewards, next_states, dones = minibatch

            # Reshape states and next_states to match the input shape of the model
            states = np.reshape(states, (batch_size, num_states))
            next_states = np.reshape(next_states, (batch_size, num_states))

            targets = model.predict(states)
            next_state_targets = target_model.predict(next_states)

        
        # Update epsilon
        if epsilon > epsilon_min:
            epsilon *= epsilon_decay
        
        # Render the environment
        env.render()
    
    # Update the target network periodically
    if episode % target_update_frequency == 0:
        target_model.set_weights(model.get_weights())
    
    # Record episode reward
    total_rewards.append(episode_reward)
    
    # Print episode statistics
    print(f"Episode {episode + 1}: Total Reward = {episode_reward}")

# Close the environment after training
env.close()


2024-03-24 20:05:46.477453: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-24 20:05:46.508367: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-24 20:05:47.553724: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2024-03-24 20:05:47.553750: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: Ardianto
2024-03-24 20:05:47.553753: I tensorflow/compiler/xla/stream_executor/cuda/

ValueError: cannot reshape array of size 2 into shape (1,4)