In [1]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
import matplotlib.pyplot as plt
from collections import deque
import random
import mlflow
import mlflow.tensorflow
import wandb
from wandb.integration.keras import WandbMetricsLogger, WandbEvalCallback, WandbCallback
import os

2024-07-10 16:24:09.956010: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-10 16:24:09.964369: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-10 16:24:09.974753: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-10 16:24:09.974768: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-10 16:24:09.981787: I tensorflow/core/platform/cpu_feature_gua

In [2]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.get_logger().setLevel('ERROR')

In [3]:
def build_model():
    model = Sequential([
        Input(shape=(6,)),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dense(3, activation='linear')
    ])
    return model

In [4]:
class ReplayBuffer:
    def __init__(self, max_size=1000):
        self.buffer = deque(maxlen=max_size)
    
    def add(self, experience):
        self.buffer.append(experience)
    
    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

In [5]:
def train_model(model, env, episodes=20, batch_size=64):
    replay_buffer = ReplayBuffer(1000)
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    loss_fn = tf.keras.losses.MeanSquaredError()
    rewards_history = []

    with mlflow.start_run():
        mlflow.log_params({"episodes": episodes, "batch_size": batch_size})
        wandb.config.update({"episodes": episodes, "batch_size": batch_size})
        
        for episode in range(episodes):
            result = env.reset()
            if isinstance(result, tuple):
                observation, info = result
            else:
                observation = result
                info = {}

            total_reward = 0
            while True:
                action = env.action_space.sample()
                next_observation, reward, terminated, truncated, info = env.step(action)

                obs_array = np.array(observation, dtype=float).reshape(1, -1)
                next_obs_array = np.array(next_observation, dtype=float).reshape(1, -1)
                replay_buffer.add((obs_array, action, reward, next_obs_array, terminated or truncated))
                total_reward += reward

                if len(replay_buffer.buffer) > batch_size:
                    batch = replay_buffer.sample(batch_size)
                    for obs, act, rew, next_obs, done in batch:
                        target = rew + 0.99 * np.max(model.predict(next_obs))
                        target_vec = model.predict(obs)[0]
                        target_vec[act] = target

                        with tf.GradientTape() as tape:
                            preds = model(obs)
                            loss = loss_fn(target_vec[None, :], preds)
                        grads = tape.gradient(loss, model.trainable_variables)
                        optimizer.apply_gradients(zip(grads, model.trainable_variables))

                observation = next_observation
                if terminated or truncated:
                    break

            rewards_history.append(total_reward)
            mlflow.log_metric("reward", total_reward, step=episode)
            wandb.log({"reward": total_reward, "episode": episode})
            print(f"Episode {episode + 1}/{episodes}: Total Reward: {total_reward}")

    return rewards_history

In [6]:
wandb.init(project="acrobot-DQN-rl-experiment")
mlflow.set_experiment("Acrobot-RL-Experiment")
env = gym.make('Acrobot-v1')
model = build_model()
mlflow.tensorflow.autolog()
rewards = train_model(model, env, episodes=20)
# Log the model to W&B
model.save("Acrobot_DQN.h5")
wandb.save("Acrobot_DQN.h5")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mamoz[0m ([33marashmozhdehi[0m). Use [1m`wandb login --relogin`[0m to force relogin


2024/07/10 16:24:14 INFO mlflow.tracking.fluent: Experiment with name 'Acrobot-RL-Experiment' does not exist. Creating a new experiment.
2024-07-10 16:24:14.385447: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-07-10 16:24:14.694384: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-07-10 16:24:14.694413: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-07-10 16:24:14.697208: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 545ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


I0000 00:00:1720650255.618196   18187 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10