In [None]:
import gym
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

# Create the CartPole environment that both the random rollout and the agent use.
env = gym.make('CartPole-v0')

# Size of the observation vector: first (and only used) dimension of the
# observation space's shape.
states = env.observation_space.shape[0]  # 4
# Number of discrete actions the environment accepts.
actions = env.action_space.n  # 2

# Sanity-check the environment by playing a few episodes with a uniformly
# random policy and printing the total reward collected in each one.
episodes = 10
for episode in range(1, episodes + 1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        # Sample from the environment's own action count instead of a
        # hard-coded [0, 1], so the loop remains valid if the environment is
        # swapped for one with a different number of discrete actions.
        action = random.randrange(env.action_space.n)
        # step() is unpacked as the 4-tuple (observation, reward, done, info)
        # returned by older Gym versions.
        next_state, reward, done, info = env.step(action)
        score += reward
        state = next_state

    print(f'Episode: {episode} Score: {score}')

def build_model(states, actions):
    """Return an uncompiled feed-forward Keras network for the agent.

    The network flattens an input of shape ``(1, states)``, passes it through
    two 24-unit ReLU layers, and ends in a linear layer with one output per
    action.

    Args:
        states: Number of values in a single observation.
        actions: Number of outputs of the final layer.

    Returns:
        The assembled ``Sequential`` model.
    """
    layers = [
        Flatten(input_shape=(1, states)),
        Dense(24, activation='relu'),
        Dense(24, activation='relu'),
        Dense(actions, activation='linear'),
    ]
    return Sequential(layers)

# Instantiate the network for this environment and print its layer summary.
model = build_model(states=states, actions=actions)
model.summary()

def build_agent(model, actions):
    """Wrap ``model`` in a keras-rl ``DQNAgent`` (not yet compiled).

    The agent uses a Boltzmann Q policy and a sequential replay memory holding
    up to 50000 entries with a window length of 1. It is configured with 10
    warm-up steps and ``target_model_update=1e-2``.
    NOTE(review): keras-rl appears to treat a value below 1 as a soft-update
    rate for the target network; confirm against the library docs.

    Args:
        model: Keras model whose output size matches ``actions``.
        actions: Number of discrete actions available to the agent.

    Returns:
        The configured ``DQNAgent``.
    """
    return DQNAgent(
        model=model,
        nb_actions=actions,
        policy=BoltzmannQPolicy(),
        memory=SequentialMemory(limit=50000, window_length=1),
        nb_steps_warmup=10,
        target_model_update=1e-2,
    )

# Create the agent and compile it with Adam, tracking mean absolute error.
dqn = build_agent(model=model, actions=actions)
dqn.compile(optimizer=Adam(learning_rate=1e-3), metrics=['mae'])

# Run training on the environment for 50000 steps with rendering enabled.
dqn.fit(env, visualize=True, verbose=1, nb_steps=50000)

  logger.warn(
  deprecation(
  deprecation(
If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/
  deprecation(


Episode: 1 Score: 17.0
Episode: 2 Score: 12.0
Episode: 3 Score: 13.0
Episode: 4 Score: 28.0
Episode: 5 Score: 18.0
Episode: 6 Score: 24.0
Episode: 7 Score: 13.0
Episode: 8 Score: 14.0
Episode: 9 Score: 34.0
Episode: 10 Score: 30.0
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 4)                 0         
                                                                 
 dense (Dense)               (None, 24)                120       
                                                                 
 dense_1 (Dense)             (None, 24)                600       
                                                                 
 dense_2 (Dense)             (None, 2)                 50        
                                                                 
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________

  updates=self.state_updates,
See here for more information: https://www.gymlibrary.ml/content/api/
  deprecation(


   10/10000 [..............................] - ETA: 3:24 - reward: 1.0000

  batch_idxs = np.random.random_integers(low, high - 1, size=size)


   22/10000 [..............................] - ETA: 6:18 - reward: 1.0000

  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)


   31/10000 [..............................] - ETA: 5:25 - reward: 1.0000

  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)


   40/10000 [..............................] - ETA: 4:57 - reward: 1.0000

  batch_idxs = np.random.random_integers(low, high - 1, size=size)


112 episodes - episode_reward: 87.670 [9.000, 200.000] - loss: 4.055 - mae: 19.388 - mean_q: 39.202

Interval 2 (10000 steps performed)
51 episodes - episode_reward: 196.333 [136.000, 200.000] - loss: 9.618 - mae: 40.564 - mean_q: 81.884

Interval 3 (20000 steps performed)
53 episodes - episode_reward: 189.868 [151.000, 200.000] - loss: 7.908 - mae: 43.503 - mean_q: 87.549

Interval 4 (30000 steps performed)
51 episodes - episode_reward: 194.451 [160.000, 200.000] - loss: 6.074 - mae: 41.744 - mean_q: 83.712

Interval 5 (40000 steps performed)
done, took 1024.350 seconds


<keras.callbacks.History at 0x23a0528bc70>

: 