<a href="https://colab.research.google.com/github/YasJanam/RL1/blob/main/EpisodicActorCritic_SeparateModels_6/EpisodicActorCritic_SeparateModels_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Requirement**

In [1]:
#!pip uninstall -y gym gymnasium box2d box2d-py

In [None]:
!pip install gymnasium==0.29.1
!pip install swig
!pip install box2d-py
!pip install stable_baselines3

In [3]:
import gymnasium as gym
from stable_baselines3 import A2C
#import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  return datetime.utcnow().replace(tzinfo=utc)


#### **actor and critic**

In [4]:
class Actor(nn.Module):
  def __init__(self,state_dim,action_dim,hidden_dim):
    super(Actor,self).__init__()
    self.fc = nn.Sequential(
        nn.Linear(state_dim,hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim,action_dim),
        nn.Softmax(dim=-1)
    )
  def forward(self,inp):
    return self.fc(inp)

In [5]:
class Critic(nn.Module):
  def __init__(self,state_dim,hidden_dim):
    super(Critic,self).__init__()
    self.fc = nn.Sequential(
        nn.Linear(state_dim,hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim,1)
    )
  def forward(self,inp):
    return self.fc(inp)

#### **Actor-Critic Trainer**

In [6]:
class ActorCriticTrainer:
  def __init__(self,env,hidden_dim=128,gamma=0.99,actor_lr=0.01,critic_lr=0.001,env_seed=0,
               actor_path = None, critic_path = None,
               num_episodes=1000,log_interval=100,max_rewards=float('inf')):
    super().__init__()
    self.env = env
    self.hidden_dim = hidden_dim
    self.gamma = gamma
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr
    self.env_seed = env_seed
    self.num_episodes = num_episodes
    self.log_interval = log_interval
    self.max_rewards = max_rewards
    state_dim = self.env.observation_space.shape[0]
    action_dim = self.env.action_space.n
    self.actor = Actor(state_dim,action_dim,hidden_dim).to(device)
    self.critic = Critic(state_dim,hidden_dim).to(device)
    self.actor_opt = optim.Adam(self.actor.parameters(),lr=self.actor_lr)
    self.critic_opt = optim.Adam(self.critic.parameters(),lr=self.critic_lr)


  def train(self):
    self.actor.train()
    self.critic.train()
    for episode in range(self.num_episodes):
      # ---- rollout ----
      done = False
      state, _ = self.env.reset() if self.env_seed==0 else self.env.reset(seed=self.env_seed)
      log_probs,values,rewards = [], [], []

      while not done:
        state_t = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

        probs = self.actor(state_t)
        value = self.critic(state_t)

        dist = Categorical(probs=probs)
        action = dist.sample()

        next_state, reward, terminated, truncated, _ = self.env.step(action.item())
        done = terminated or truncated
        log_probs.append(dist.log_prob(action))
        values.append(value)
        rewards.append(reward)
        state = next_state

      values = torch.cat(values).squeeze(-1)

      # ---- compute returns ----
      returns, G = [], 0
      for r in reversed(rewards):
        G = r + self.gamma * G
        returns.insert(0,G)
      returns = torch.tensor(returns).to(device)
      #returns = (returns - returns.mean()) / (returns.std() + 1e-8)

      # ---- compute advantage ----
      advantages = returns - values.detach()
      #advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)پ
      advantages = advantages.clamp(-10, 10)

      # ---- update actor ----
      self.actor_opt.zero_grad()
      actor_loss = -(torch.stack(log_probs) * advantages).mean()
      actor_loss.backward()

      #if episode % self.log_interval == 0:
      #  print("-------------")
      #  actor_mgrad = self.ComputeMeanGrad(self.actor)
      #  print(f"Actor Gard: {actor_mgrad:.4f}")

      nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5)
      self.actor_opt.step()

      # ---- update critic ----
      self.critic_opt.zero_grad()
      critic_loss = (returns - values).pow(2).mean()
      critic_loss.backward()

      #if episode % self.log_interval == 0:
      #  critic_mgrad = self.ComputeMeanGrad(self.critic)
      #  print(f"Critic Grad: {critic_mgrad:.4f}")

      nn.utils.clip_grad_norm_(self.critic.parameters(), 0.4)
      self.critic_opt.step()

      # ---- log ----
      total_reward = sum(rewards)
      if episode % self.log_interval == 0 or episode == (self.num_episodes -1) :
        print(f"Episode {episode}, Total Reward: {total_reward:.1f}")

    self.env.close()

  def ComputeMeanGrad(self,model):
      total_grad = 0
      for p in model.parameters():
        if p.grad is not None:
          total_grad += p.grad.abs().mean().item()
      return total_grad

#### **Test**

In [7]:
def test(actor,critic,env,env_seed=0):
  state, _ = env.reset() if env_seed==0 else env.reset(seed=env_seed)
  done = False
  total_reward = 0

  while not done:
      state_tensor = torch.tensor(state, dtype=torch.float32)

      with torch.no_grad():
          probs = actor(state_tensor)
          value = critic(state_tensor)
      dist = torch.distributions.Categorical(probs=probs)
      action = dist.sample().item()

      next_state, reward, terminated, truncated, _ = env.step(action)
      done = terminated or truncated

      total_reward += reward
      state = next_state

  return total_reward

In [8]:
def tests(actor,critic,env,env_seed=0,num_tests=5):
  total_rewards = []
  for _ in range(num_tests):
    total_reward = test(actor,critic,env,env_seed)
    total_rewards.append(total_reward)
  rewards = [float(x) for x in total_rewards]
  return rewards

#### Labratory

##### CartPole

In [None]:
cartpole = gym.make("CartPole-v1")

In [None]:
cp_trainer = ActorCriticTrainer(env=cartpole,hidden_dim=40,actor_lr=0.0005,critic_lr=0.0004,gamma=0.99,
                                log_interval=700,num_episodes=7000)
cp_trainer.train()

Episode 0, Total Reward: 18.0
Episode 700, Total Reward: 11.0
Episode 1400, Total Reward: 50.0
Episode 2100, Total Reward: 142.0
Episode 2800, Total Reward: 78.0
Episode 3500, Total Reward: 255.0
Episode 4200, Total Reward: 188.0
Episode 4900, Total Reward: 393.0
Episode 5600, Total Reward: 430.0
Episode 6300, Total Reward: 500.0
Episode 6999, Total Reward: 500.0


In [None]:
res = tests(actor = cp_trainer.actor,critic = cp_trainer.critic,env = cartpole,num_tests = 10)
res

[423.0, 292.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0]

##### LunarLander

In [9]:
ll = gym.make('LunarLander-v2')

  from pkg_resources import resource_stream, resource_exists
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
  return datetime.utcnow().replace(tzinfo=utc)


In [None]:
ll_trainer = ActorCriticTrainer(env=ll,hidden_dim=40,actor_lr=0.0005,critic_lr=0.0004,gamma=0.99,
                                log_interval=2000,num_episodes=7000)
ll_trainer.train()

Episode 0, Total Reward: -194.9


  return datetime.utcnow().replace(tzinfo=utc)


Episode 2000, Total Reward: -81.3
Episode 4000, Total Reward: -178.1
Episode 6000, Total Reward: -34.2
Episode 6999, Total Reward: -62.3


In [None]:
# 2
ll_trainer.train()

Episode 0, Total Reward: -45.9
Episode 2000, Total Reward: 31.5
Episode 4000, Total Reward: 39.6
Episode 6000, Total Reward: 2.8
Episode 6999, Total Reward: 34.0


In [None]:
# save
torch.save(ll_trainer.actor.state_dict(), "actor_trained.pth")
torch.save(ll_trainer.critic.state_dict(), "critic_trained.pth")

In [None]:
# result
res = tests(actor = ll_trainer.actor,critic = ll_trainer.critic,env = ll,num_tests = 10)
res

[81.03415902349798,
 30.944868637314514,
 70.45445010959597,
 70.84390214167546,
 -0.25081166664865395,
 52.35377144599157,
 3.6194775417261558,
 59.24852726197628,
 -203.7417427151157,
 29.010743791729887]

#### **stable_baselines3**

In [None]:
# validation learned a2c model
def val_A2C_model(env,learned_model,num_episodes):
  rewards = []
  for i in range(num_episodes):
      obs, info = env.reset()
      total_reward = 0
      done = False
      while not done:
        action, _state = learned_model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        done = terminated or truncated
      rewards.append(total_reward)
  env.close()
  return rewards

##### PartPole

In [None]:
model = A2C("MlpPolicy", cartpole, verbose=1)
model.learn(total_timesteps=1000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 16.6     |
|    ep_rew_mean        | 16.6     |
| time/                 |          |
|    fps                | 638      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.691   |
|    explained_variance | -0.175   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 2.09     |
|    value_loss         | 9.36     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 18.1     |
|    ep_rew_mean        | 18.1     |
| time/                 |          |
|    fps                | 578      |
|    iterations         | 200      |
|    time_elapsed 

<stable_baselines3.a2c.a2c.A2C at 0x7d57004508c0>

In [None]:
cp_rewards = val_A2C_model(cartpole,model,10)
cp_rewards

[23.0, 28.0, 20.0, 23.0, 30.0, 20.0, 30.0, 21.0, 25.0, 23.0]

##### LunarLander

In [None]:
ll_model = A2C("MlpPolicy", ll, verbose=1)
ll_model.learn(total_timesteps=5000)

In [None]:
ll_rewards = val_A2C_model(ll,ll_model,10)
ll_rewards

[np.float64(138.5631329471401),
 np.float64(211.4076834151261),
 np.float64(-142.2841124696246),
 np.float64(-838.3672558662665),
 np.float64(-836.1269178705114),
 np.float64(7.190556798827771),
 np.float64(-152.2407845465102),
 np.float64(-788.3913196737847),
 np.float64(-792.6554356381653),
 np.float64(-865.6415001411785)]