In [4]:
import os
import random
import numpy as np
import tensorflow as tf
import functools
import json
from datetime import datetime
import matplotlib.pyplot as plt
from tqdm import trange

from tf_agents.environments import suite_gym, ParallelPyEnvironment, tf_py_environment
from tf_agents.agents.sac.sac_agent import SacAgent
from tf_agents.networks.actor_distribution_network import ActorDistributionNetwork
from tf_agents.networks import sequential, nest_map
from tf_agents.keras_layers import inner_reshape
from tf_agents.utils import common
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.drivers import dynamic_step_driver
from tf_agents.system import multiprocessing

# Put TF-Agents multiprocessing into interactive (notebook) mode.
# The two handlers below ignore only the specific "already set" /
# "already initialized" errors (presumably raised when this cell is re-run in
# the same kernel — confirm) and re-raise anything else.
try:
    multiprocessing.enable_interactive_mode()
except RuntimeError as e:
    if "context has already been set" not in str(e):
        raise
except ValueError as e:
    if "Multiprocessing already initialized" not in str(e):
        raise

# Enable GPU use: turn on memory growth for every visible physical GPU, then
# print the logical GPU devices TensorFlow exposes.
physical_devices = tf.config.list_physical_devices('GPU')
for g in physical_devices:
    tf.config.experimental.set_memory_growth(g, True)
print("Usando GPU:", tf.config.list_logical_devices('GPU'))

# --- Helper for building the custom critic network ---
# `dense(units)` builds a Keras Dense layer preconfigured with ReLU activation
# and Glorot-uniform kernel initialization.
dense = functools.partial(tf.keras.layers.Dense, activation='relu', kernel_initializer='glorot_uniform')

def create_identity_layer():
    """Return a Keras Lambda layer that hands its input back unchanged."""
    def _passthrough(tensor):
        return tensor

    return tf.keras.layers.Lambda(_passthrough)

def create_sequential_critic_network(obs_units, act_units, joint_units):
    """Assemble a sequential critic that scores an (observation, action) pair.

    The input is indexed as ``inputs[0]`` (observation) and ``inputs[1]``
    (action). Each part goes through its own tower, the two results are
    concatenated, passed through a joint tower, and projected by a
    single-unit Dense layer whose trailing dimension of size 1 is reshaped
    away.

    Args:
        obs_units: Iterable of layer widths for the observation tower. A
            falsy value (``None`` or empty) means the observation is passed
            through unchanged.
        act_units: Same as ``obs_units``, for the action tower.
        joint_units: Same as ``obs_units``, for the tower applied after
            concatenation.

    Returns:
        A ``sequential.Sequential`` network named ``'sequential_critic'``.
    """
    def _tower(widths):
        # No widths requested -> identity layer instead of a stack of Dense layers.
        if not widths:
            return create_identity_layer()
        return sequential.Sequential([dense(width) for width in widths])

    def _as_named_nest(inputs):
        observation = inputs[0]
        action = inputs[1]
        return {'observation': observation, 'action': action}

    # Layers are created in this order (towers, value head, then the wrapping
    # layers below) so that Keras auto-generated layer names stay stable.
    observation_tower = _tower(obs_units)
    action_tower = _tower(act_units)
    joint_tower = _tower(joint_units)
    value_head = tf.keras.layers.Dense(1, kernel_initializer='glorot_uniform')

    pipeline = [
        tf.keras.layers.Lambda(_as_named_nest),
        nest_map.NestMap({'observation': observation_tower, 'action': action_tower}),
        nest_map.NestFlatten(),
        tf.keras.layers.Concatenate(),
        joint_tower,
        value_head,
        inner_reshape.InnerReshape(current_shape=[1], new_shape=[]),
    ]
    return sequential.Sequential(pipeline, name='sequential_critic')


def run_sac_seed(seed,
                 env_name="Pendulum-v1",
                 num_parallel=64*4,
                 collect_steps=128*2,
                 batch_size=256*4,
                 replay_buffer_max=200_000,
                 learning_rate=1e-4,
                 num_iterations=100_000,
                 eval_interval=5_000):
    """Train a SAC agent for one seed, save the logs as JSON, and return them.

    Each iteration runs the collect driver once, updates the per-environment
    episode statistics from the batched time step the driver returns, and
    performs one ``agent.train`` call on a batch sampled from the replay
    buffer. Every ``eval_interval`` iterations (including iteration 0) one
    episode is rolled out with ``agent.policy`` on a separate, single
    evaluation environment.

    Episode tracking only inspects the final time step returned by each
    ``driver.run()`` call, so any intermediate environment steps taken inside
    one driver call are not reflected in the episode statistics.

    Args:
        seed: Seed applied to ``random``, NumPy and TensorFlow.
        env_name: Gym environment id passed to ``suite_gym.load``.
        num_parallel: Number of environments run in parallel for collection;
            also the replay buffer's batch size.
        collect_steps: ``num_steps`` given to the ``DynamicStepDriver``.
        batch_size: Number of 2-step sequences sampled per training call.
        replay_buffer_max: ``max_length`` of the replay buffer.
        learning_rate: Learning rate shared by the actor, critic and alpha
            Adam optimizers.
        num_iterations: Index of the last training iteration; the loop runs
            ``num_iterations + 1`` times.
        eval_interval: Evaluate whenever the iteration index is a multiple of
            this value.

    Returns:
        Tuple ``(episodes, evals, agent)``:
        ``episodes`` is a list of dicts with keys ``total_timesteps``
        (cumulative number of tracked transitions over all parallel
        environments when the episode ended), ``episode_num``,
        ``episode_timesteps`` and ``reward``; ``evals`` is a list of dicts
        with keys ``at_timesteps`` (iteration index) and
        ``evaluation_over_1_episode``; ``agent`` is the trained ``SacAgent``.

    Side effects:
        Writes ``jsons/sac_<seed>_<timestamp>.json`` and prints progress.
        The training and evaluation environments are always closed before
        returning or propagating an exception.
    """
    # NOTE(review): TensorFlow is already imported when this runs, so this
    # variable may no longer influence its native logging — confirm.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    def make_env():
        return suite_gym.load(env_name)

    py_env = ParallelPyEnvironment([make_env] * num_parallel)
    train_env = None
    eval_env = None
    try:
        train_env = tf_py_environment.TFPyEnvironment(py_env)
        eval_env = tf_py_environment.TFPyEnvironment(suite_gym.load(env_name))

        with tf.device('/GPU:0'):
            train_step = tf.Variable(0)

            actor_net = ActorDistributionNetwork(
                train_env.observation_spec(),
                train_env.action_spec(),
                fc_layer_params=(256, 256)
            )
            critic_net1 = create_sequential_critic_network((256, 256), None, (256, 256))
            critic_net2 = create_sequential_critic_network((256, 256), None, (256, 256))

            agent = SacAgent(
                time_step_spec=train_env.time_step_spec(),
                action_spec=train_env.action_spec(),
                actor_network=actor_net,
                critic_network=critic_net1,
                critic_network_2=critic_net2,
                actor_optimizer=tf.keras.optimizers.Adam(learning_rate),
                critic_optimizer=tf.keras.optimizers.Adam(learning_rate),
                alpha_optimizer=tf.keras.optimizers.Adam(learning_rate),
                target_update_tau=0.005,
                target_update_period=1,
                td_errors_loss_fn=common.element_wise_squared_loss,
                gamma=0.99,
                reward_scale_factor=2.0,
                train_step_counter=train_step
            )
            agent.initialize()
            agent.train = common.function(agent.train)

        buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            data_spec=agent.collect_data_spec,
            batch_size=num_parallel,
            max_length=replay_buffer_max
        )
        # num_steps=2: each sampled item is a pair of adjacent steps.
        dataset = buffer.as_dataset(
            sample_batch_size=batch_size, num_steps=2
        ).prefetch(tf.data.AUTOTUNE)
        iterator = iter(dataset)

        driver = dynamic_step_driver.DynamicStepDriver(
            train_env, agent.collect_policy, observers=[buffer.add_batch], num_steps=collect_steps
        )
        driver.run()  # Warm-up: put some data in the buffer before sampling.

        @tf.function
        def train_step_fn():
            experience, _ = next(iterator)
            return agent.train(experience)

        episodes = []
        evals = []
        # Running statistics, one slot per parallel environment.
        ep_rewards = np.zeros(num_parallel)
        ep_steps = np.zeros(num_parallel, dtype=int)
        ep_count = np.zeros(num_parallel, dtype=int)
        # Cumulative number of tracked transitions over all environments.
        total_steps = 0

        def update_episodes(time_step):
            """Fold one batched time step into the per-environment statistics."""
            nonlocal ep_rewards, ep_steps, total_steps
            rewards = time_step.reward.numpy()
            dones = time_step.is_last().numpy()
            ep_rewards += rewards
            ep_steps += 1
            total_steps += len(dones)
            for i in np.flatnonzero(dones):
                ep_count[i] += 1
                episodes.append({
                    # Previously this duplicated "episode_timesteps".
                    "total_timesteps": int(total_steps),
                    "episode_num": int(ep_count[i]),
                    "episode_timesteps": int(ep_steps[i]),
                    "reward": float(ep_rewards[i])
                })
                ep_rewards[i] = 0.0
                ep_steps[i] = 0

        start_time = datetime.now().isoformat(timespec='seconds')

        # The context manager closes the progress bar even if training is interrupted.
        with trange(num_iterations + 1, desc=f"Seed {seed}", dynamic_ncols=True) as pbar:
            pbar.set_postfix({"eval_return": "N/A"})
            for step in pbar:
                time_step, _ = driver.run()
                update_episodes(time_step)
                train_step_fn()

                if step % eval_interval == 0:
                    # Roll out a single evaluation episode and sum its rewards.
                    ts = eval_env.reset()
                    total = 0.0
                    while not ts.is_last():
                        action_step = agent.policy.action(ts)
                        ts = eval_env.step(action_step.action)
                        total += ts.reward.numpy().item()
                    evals.append({
                        "at_timesteps": int(step),
                        "evaluation_over_1_episode": float(total)
                    })
                    print(f"[Step {step:>5}] Eval return = {total:.2f}")
                    pbar.set_postfix({"eval_return": f"{total:.2f}"})
                    pbar.update(0)
    finally:
        # Always release the worker processes of the parallel environment and
        # the evaluation environment; otherwise each call leaks them.
        try:
            if eval_env is not None:
                eval_env.close()
        finally:
            if train_env is not None:
                train_env.close()
            else:
                py_env.close()

    data = {
        "experiment": {
            "policy": "SAC",
            "environment": env_name,
            "seed": seed,
            "start_time": start_time
        },
        "episodes": episodes,
        "evaluations": evals
    }

    folder = "jsons"
    os.makedirs(folder, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%dT%H%M%S")
    fname = f"sac_{seed}_{stamp}.json"
    path = os.path.join(folder, fname)
    with open(path, 'w') as f:
        json.dump(data, f, indent=2)
    print(f"Saved JSON to {path}")

    return episodes, evals, agent



def main():
    """Train SAC for each configured seed and plot its evaluation returns.

    For every seed this calls ``run_sac_seed`` and shows one figure of
    evaluation return versus the iteration index at which it was measured.
    """
    seeds = [0]
    for s in seeds:
        # run_sac_seed returns (episodes, evals, agent); unpacking only two
        # names here raised ValueError. Only the evaluations are plotted.
        _eps, evs, _agent = run_sac_seed(seed=s)
        steps = [e['at_timesteps'] for e in evs]
        vals = [e['evaluation_over_1_episode'] for e in evs]
        plt.figure(figsize=(6, 4))
        plt.plot(steps, vals, marker='o')
        plt.xlabel('Steps')
        plt.ylabel('Eval Return')
        plt.title(f'SAC Eval (seed {s})')
        plt.grid(True)
        plt.show()

#if __name__ == '__main__':
    #main()


Usando GPU: [LogicalDevice(name='/device:GPU:0', device_type='GPU')]


In [5]:
# NOTE(review): looks like a scratch/sanity-check cell with no role in the
# experiment — confirm and consider removing.
print('a')

a


In [3]:


# Train one agent per seed, then show two figures for that seed:
# the evaluation returns, and the training rewards averaged over
# consecutive, non-overlapping 100-episode windows.
seeds = [0, 1, 2]
for s in seeds:
    # `agent` is rebound on every pass, so after the loop it refers to the
    # agent trained with the last seed.
    eps, evs, agent = run_sac_seed(seed=s)

    # Figure 1: evaluation return at each evaluation point.
    steps, vals = [], []
    for record in evs:
        steps.append(record['at_timesteps'])
        vals.append(record['evaluation_over_1_episode'])

    fig_eval, ax_eval = plt.subplots(figsize=(6, 4))
    ax_eval.plot(steps, vals, marker='o', label='Eval Return')
    ax_eval.set_xlabel('Steps')
    ax_eval.set_ylabel('Eval Return')
    ax_eval.set_title(f'SAC Eval (seed {s})')
    ax_eval.grid(True)
    ax_eval.legend()
    plt.show()

    # Figure 2: mean training reward per full window of episodes; a trailing
    # partial window is dropped.
    window = 100
    rewards = [episode['reward'] for episode in eps]
    window_starts = range(0, len(rewards) - window + 1, window)
    avg_rewards = [np.mean(rewards[start:start + window]) for start in window_starts]

    fig_train, ax_train = plt.subplots()
    ax_train.plot(avg_rewards)
    ax_train.set_xlabel("Episode window (x100)")
    ax_train.set_ylabel("Avg reward per 100 episodes")
    ax_train.set_title("Smoothed Episode Rewards")
    ax_train.grid(True)
    plt.show()



Process ForkProcess-20:
Traceback (most recent call last):
  File "/home/pinoprie/.local/share/mamba/envs/sac-tfagents/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/pinoprie/.local/share/mamba/envs/sac-tfagents/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/pinoprie/.local/share/mamba/envs/sac-tfagents/lib/python3.8/site-packages/tf_agents/system/system_multiprocessing.py", line 158, in __call__
    return target(*args, **kwargs)
  File "/home/pinoprie/.local/share/mamba/envs/sac-tfagents/lib/python3.8/site-packages/tf_agents/environments/parallel_py_environment.py", line 458, in _worker
    env = cloudpickle.loads(self._pickled_env_constructor)()
  File "/tmp/2641513/ipykernel_3829108/1698676426.py", line 73, in make_env
    def make_env(): return suite_gym.load(env_name)
  File "/home/pinoprie/.local/share/mamba/envs/sac-tfagents/lib/python3.8/site-packages/gin/c

KeyboardInterrupt: 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress

seeds = [0, 1, 2, 3]

# One list of training-episode rewards per seed. `agent` is rebound on every
# pass, so afterwards it refers to the agent trained with the last seed.
all_rewards = []
for s in seeds:
    eps, evs, agent = run_sac_seed(seed=s)
    all_rewards.append([episode['reward'] for episode in eps])

# Seeds may log different numbers of episodes; truncate every list to the
# shortest one so they stack into a rectangular array.
min_len = min(map(len, all_rewards))
all_rewards = [seed_rewards[:min_len] for seed_rewards in all_rewards]

# Rows are seeds, columns are episode indices.
reward_array = np.asarray(all_rewards)
mean_rewards = reward_array.mean(axis=0)
std_rewards = reward_array.std(axis=0)

episodes = np.arange(min_len)

# Least-squares straight line through the mean curve.
fit = linregress(episodes, mean_rewards)
slope, intercept = fit.slope, fit.intercept
trend_line = intercept + slope * episodes

# Mean curve, a band of one standard deviation around it, and the trend line.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(episodes, mean_rewards, color='blue', label='Mean Reward')
ax.fill_between(
    episodes,
    mean_rewards - std_rewards,
    mean_rewards + std_rewards,
    color='blue',
    alpha=0.2,
    label='±1 Std Dev',
)
ax.plot(episodes, trend_line, 'r--', label=f"Trend: {slope:.2f}x + {intercept:.2f}")

ax.set_title('Mean Episode Rewards Across Seeds - SAC on Pendulum-v1')
ax.set_xlabel('Episode Number')
ax.set_ylabel('Mean Total Reward')
ax.legend()
ax.grid(True)
fig.tight_layout()
# NOTE(review): the file name says "mountaincar" while the title says
# Pendulum-v1 — confirm which one is intended.
fig.savefig('figure2mountaincar')
plt.show()


2025-05-19 22:17:20.302672: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_13' with dtype resource
	 [[{{node Placeholder/_13}}]]
2025-05-19 22:17:20.303102: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_11' with dtype resource
	 [[{{node Placeholder/_11}}]]
Seed 0:   0%|             | 5/100001 [00:04<41:17, 40.37it/s, eval_return=-911.37]

[Step     0] Eval return = -911.37


Seed 0:   5%|▍       | 5003/100001 [03:49<8:29:56,  3.10it/s, eval_return=-296.50]

[Step  5000] Eval return = -296.50


Seed 0:  10%|▋      | 10004/100001 [07:35<5:56:39,  4.21it/s, eval_return=-125.36]

[Step 10000] Eval return = -125.36


Seed 0:  15%|█      | 15004/100001 [11:21<5:31:52,  4.27it/s, eval_return=-124.78]

[Step 15000] Eval return = -124.78


Seed 0:  20%|█▍     | 20005/100001 [15:06<5:18:57,  4.18it/s, eval_return=-124.43]

[Step 20000] Eval return = -124.43


Seed 0:  25%|█▊     | 25005/100001 [18:50<4:54:08,  4.25it/s, eval_return=-118.52]

[Step 25000] Eval return = -118.52


Seed 0:  30%|██     | 30003/100001 [22:34<6:13:01,  3.13it/s, eval_return=-125.39]

[Step 30000] Eval return = -125.39


Seed 0:  35%|██▍    | 35004/100001 [26:17<4:19:17,  4.18it/s, eval_return=-117.42]

[Step 35000] Eval return = -117.42


Seed 0:  40%|██▊    | 40005/100001 [30:01<3:56:26,  4.23it/s, eval_return=-121.33]

[Step 40000] Eval return = -121.33


Seed 0:  45%|███▏   | 45003/100001 [33:43<4:45:10,  3.21it/s, eval_return=-123.60]

[Step 45000] Eval return = -123.60


Seed 0:  50%|████▌    | 50004/100001 [37:25<3:13:47,  4.30it/s, eval_return=-8.97]

[Step 50000] Eval return = -8.97


Seed 0:  55%|███▊   | 55005/100001 [41:07<2:59:30,  4.18it/s, eval_return=-137.88]

[Step 55000] Eval return = -137.88


Seed 0:  60%|████▊   | 60003/100001 [44:50<3:28:25,  3.20it/s, eval_return=-12.30]

[Step 60000] Eval return = -12.30


Seed 0:  65%|████▌  | 65004/100001 [48:31<2:15:06,  4.32it/s, eval_return=-134.14]

[Step 65000] Eval return = -134.14


Seed 0:  70%|████▉  | 70005/100001 [52:13<1:56:01,  4.31it/s, eval_return=-126.19]

[Step 70000] Eval return = -126.19


Seed 0:  75%|█████▎ | 75003/100001 [55:56<2:12:36,  3.14it/s, eval_return=-130.33]

[Step 75000] Eval return = -130.33


Seed 0:  80%|█████▌ | 80003/100001 [59:42<1:44:07,  3.20it/s, eval_return=-133.15]

[Step 80000] Eval return = -133.15


Seed 0:  85%|███████▋ | 85004/100001 [1:03:24<59:13,  4.22it/s, eval_return=-5.27]

[Step 85000] Eval return = -5.27


Seed 0:  90%|██████▎| 90005/100001 [1:07:06<38:57,  4.28it/s, eval_return=-124.63]

[Step 90000] Eval return = -124.63


Seed 0:  95%|██████▋| 95003/100001 [1:10:49<27:06,  3.07it/s, eval_return=-351.95]

[Step 95000] Eval return = -351.95


Seed 0: 100%|██████| 100001/100001 [1:14:32<00:00, 22.36it/s, eval_return=-127.07]

[Step 100000] Eval return = -127.07





Saved JSON to jsons/sac_0_20250519T233153.json


In [6]:
agent

<tf_agents.agents.sac.sac_agent.SacAgent at 0x7f8685a00730>

In [6]:
from tf_agents.policies import policy_saver

# Export the policy of `agent` (the agent returned by the most recent run_sac_seed call).
saver = policy_saver.PolicySaver(agent.policy)
saver.save('saved_policypendulum1')   # writes the export into the directory 'saved_policypendulum1'




INFO:tensorflow:Assets written to: saved_policypendulum1/assets


INFO:tensorflow:Assets written to: saved_policypendulum1/assets


In [7]:
import shutil

# Zip the contents of the 'saved_policypendulum1' directory into 'my_folderpend1.zip'.
shutil.make_archive('my_folderpend1', 'zip', 'saved_policypendulum1')


'/home/pinoprie/new_rl/RL-project/SAC/my_folderpend1.zip'

In [13]:
#zip -r my_folder.zip saved_policy/


In [1]:
# Print the version of the gym package visible to this kernel.
import gym
print(gym.__version__)


0.23.0


In [22]:
import tensorflow as tf
import tf_agents
import tensorflow_probability as tfp

# Print the version of each core library, one per line.
for label, module in (
    ("TensorFlow:", tf),
    ("TF-Agents:", tf_agents),
    ("TensorFlow Probability:", tfp),
):
    print(label, module.__version__)


TensorFlow: 2.12.0
TF-Agents: 0.17.0
TensorFlow Probability: 0.20.1


In [None]:

!pip install "tensorflow==2.12.0" "tensorflow-probability==0.20.1" "tf-agents==0.17.0" "gym==0.23.0" "numpy>=1.23" "matplotlib>=3.5"
