In [1]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
import matplotlib.pyplot as plt
from collections import deque
import random
import mlflow
import wandb
from wandb.integration.keras import WandbCallback
import os

2024-07-10 17:45:00.270122: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-10 17:45:00.278154: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-10 17:45:00.288333: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-10 17:45:00.288353: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-10 17:45:00.294913: I tensorflow/core/platform/cpu_feature_gua

In [2]:
# Configure the environment
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.get_logger().setLevel('ERROR')

In [3]:
def build_model():
    model = Sequential([
        Input(shape=(6,)),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dense(3, activation='linear')
    ])
    return model

In [4]:
class ReplayBuffer:
    def __init__(self, max_size=1000):
        self.buffer = deque(maxlen=max_size)
    
    def add(self, experience):
        self.buffer.append(experience)
    
    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

In [10]:
def train_model(model, env, episodes, batch_size):
    replay_buffer = ReplayBuffer(1000)
    optimizer = tf.keras.optimizers.Adam(learning_rate=wandb.config.learning_rate)
    loss_fn = tf.keras.losses.MeanSquaredError()
    rewards_history = []

    with mlflow.start_run():
        
        for episode in range(episodes):
            result = env.reset()
            if isinstance(result, tuple):
                observation, info = result
            else:
                observation = result
                info = {}

            total_reward = 0
            while True:
                action = env.action_space.sample()
                next_observation, reward, terminated, truncated, info = env.step(action)

                obs_array = np.array(observation, dtype=float).reshape(1, -1)
                next_obs_array = np.array(next_observation, dtype=float).reshape(1, -1)
                replay_buffer.add((obs_array, action, reward, next_obs_array, terminated or truncated))
                total_reward += reward

                if len(replay_buffer.buffer) > batch_size:
                    batch = replay_buffer.sample(batch_size)
                    for obs, act, rew, next_obs, done in batch:
                        target = rew + 0.99 * np.max(model.predict(next_obs))
                        target_vec = model.predict(obs)[0]
                        target_vec[act] = target

                        with tf.GradientTape() as tape:
                            preds = model(obs)
                            loss = loss_fn(target_vec[None, :], preds)
                        grads = tape.gradient(loss, model.trainable_variables)
                        optimizer.apply_gradients(zip(grads, model.trainable_variables))

                observation = next_observation
                if terminated or truncated:
                    break

            rewards_history.append(total_reward)
            mlflow.log_metric("reward", total_reward, step=episode)
            wandb.log({"reward": total_reward, "episode": episode})
            print(f"Episode {episode + 1}/{episodes}: Total Reward: {total_reward}")

    return rewards_history

In [11]:
wandb.init(project="acrobot-DQN-rl-experiment", config={
    "learning_rate": 0.001,
    "batch_size": 64,
    "episodes": 20,
    "architecture": "DenseNet",
    "optimizer": "Adam"
})

mlflow.set_experiment("Acrobot-RL-Experiment")
mlflow.tensorflow.autolog()
env = gym.make('Acrobot-v1')
model = build_model()
rewards = train_model(model, env, episodes=wandb.config.episodes, batch_size=wandb.config.batch_size)
# Log the model to W&B
model.save("Acrobot_DQN.h5")
wandb.save("Acrobot_DQN.h5")

  if not isinstance(terminated, (bool, np.bool8)):
I0000 00:00:1720655283.079400  575598 service.cc:145] XLA service 0x7fdb2c006f70 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1720655283.079430  575598 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 4060, Compute Capability 8.9
2024-07-10 17:48:03.083236: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-07-10 17:48:03.106822: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 234ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step


I0000 00:00:1720655283.260141  575598 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10

KeyboardInterrupt: 