In [274]:
import gym 
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical

In [275]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cpu


In [276]:
# Policy and value model
class ActorCriticNetwork(nn.Module):
    def __init__(self, obs_space_size, action_space_size):
        super(ActorCriticNetwork, self).__init__()
        self.shared_layer = nn.Sequential(
            nn.Linear(obs_space_size, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU()
        )

        self.policy_layer = nn.Sequential(
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, action_space_size),
            nn.Softmax(dim=-1)
        )

        self.value_layer = nn.Sequential(
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def value(self, obs):
        shared_out = self.shared_layer(obs)
        value = self.value_layer(shared_out)
        return value

    def policy(self, obs):
        shared_out = self.shared_layer(obs)
        policy_logits = self.policy_layer(shared_out)
        return policy_logits

    def forward(self, obs):
        shared_out = self.shared_layer(obs)
        policy_logits = self.policy_layer(shared_out)
        value = self.value_layer(shared_out)
        return policy_logits, value



In [277]:
class PPOTrainer():
    def __init__(self, 
                actor_critic,
                ppo_clip_val=0.2,
                target_k1_div=0.01,
                max_policy_train_iters=80,
                value_train_lters=80,
                policy_lr=3e-4,
                value_lr=1e-2):
            
        self.actor_critic = actor_critic
        self.ppo_clip_val = ppo_clip_val
        self.target_k1_div = target_k1_div
        self.max_policy_train_iters = max_policy_train_iters
        self.value_train_lters = value_train_lters

        policy_params = list(self.actor_critic.shared_layer.parameters()) + list(self.actor_critic.policy_layer.parameters())
        self.policy_optim = optim.Adam(policy_params, lr=policy_lr)

        value_params = list(self.actor_critic.shared_layer.parameters()) + list(self.actor_critic.value_layer.parameters())
        self.value_optim = optim.Adam(value_params, lr=value_lr)

    def train_policy(self, obs, actions, old_log_probs, gaes):
        for _ in range(self.max_policy_train_iters):
            self.policy_optim.zero_grad()
            print(obs)
            new_logits, new_values = self.actor_critic(obs)
            new_log_probs = Categorical(logits=new_logits).log_prob(actions)
            new_log_probs = new_logits.log_prob(actions)

            ratio = torch.exp(new_log_probs - old_log_probs)
            clipped_ratio = torch.clamp(ratio, 1-self.ppo_clip_val, 1+self.ppo_clip_val)

            clipped_loss = clipped_ratio * gaes
            full_loss = ratio * gaes
            policy_loss = -torch.min(clipped_loss, full_loss).mean()

            kl_div = (old_log_probs - new_log_probs).mean()
            if kl_div > self.target_k1_div:
                break

    def train_value(self, obs, returns):
        for _ in range(self.value_train_lters):
            self.value_optim.zero_grad()
            values = self.actor_critic.value(obs)
            value_loss = nn.MSELoss()(values, returns)
            value_loss.backward()
            self.value_optim.step()




In [278]:
def discount_rewards(rewards, gamma=0.99):
    new_rewards = [float(rewards[-1])]
    for r in rewards[-2::-1]:
        new_rewards.append(r + gamma * new_rewards[-1])
    return new_rewards[::-1]

In [279]:
def compute_gae(rewards, values, gamma=0.99, lam=0.95):
    deltas = [r + gamma * v_ - v for r, v_, v in zip(rewards[:-1], values[1:], values[:-1])]
    deltas.append(rewards[-1] - values[-1])
    gaes = [deltas[-1]]
    for delta in deltas[-2::-1]:
        gaes.append(delta + gamma * lam * gaes[-1])
    return gaes[::-1]

In [280]:
def rollout(model, env, max_steps=1000):
    train_data = [[], [], [], [], []]
    obs = env.reset()
    obs = obs[0]

    ep_rewards = 0
    for _ in range(max_steps):
        print(obs)
        logits, value = model(torch.tensor([obs], dtype=torch.float32, device=DEVICE))
        action_dist = Categorical(logits=logits)
        action = action_dist.sample()
        log_prob = action_dist.log_prob(action).item()

        new_obs, reward, done, _, _ = env.step(action.item())

        # Append items with consistent shapes
        train_data[0].append(np.array([obs]))  # Ensure obs has consistent shape
        train_data[1].append(action.item())
        train_data[2].append(log_prob)
        train_data[3].append(reward)
        train_data[4].append(done)
        obs = new_obs
        ep_rewards += reward
        if done:
            break

    # Convert each sublist to a NumPy array
    train_data = [np.asarray(item) for item in train_data]

    # Ensure consistent shapes when using np.asarray
    train_data[0] = np.concatenate(train_data[0])  # Concatenate arrays in the first element

    train_data[3] = compute_gae(train_data[2], train_data[3], gamma=0.99, lam=0.95)

    return train_data, ep_rewards


In [281]:
env = gym.make("CartPole-v0")
model = ActorCriticNetwork(env.observation_space.shape[0], env.action_space.n).to(DEVICE)
train_data, ep_rewards = rollout(model, env)


[ 0.0219925  -0.03346879  0.04478336 -0.02850489]
[ 0.02132313 -0.2292034   0.04421326  0.27796456]
[ 0.01673906 -0.03473918  0.04977255 -0.00045225]
[ 0.01604427 -0.2305383   0.0497635   0.3075098 ]
[ 0.01143351 -0.4263327   0.0559137   0.6154623 ]
[ 0.00290685 -0.23203473  0.06822295  0.34090084]
[-0.00173384 -0.03794643  0.07504097  0.07048763]
[-0.00249277 -0.23405953  0.07645071  0.38587075]
[-0.00717396 -0.43017882  0.08416813  0.701645  ]
[-0.01577754 -0.23631814  0.09820103  0.43659964]
[-0.0205039  -0.43268305  0.10693302  0.7585527 ]
[-0.02915756 -0.23918438  0.12210408  0.5013409 ]
[-0.03394125 -0.43579683  0.13213089  0.82987326]
[-0.04265719 -0.6324535   0.14872836  1.1610205 ]
[-0.05530626 -0.8291662   0.17194878  1.4963973 ]
[-0.07188958 -1.0259106   0.20187671  1.837466  ]


  logger.warn(
  if not isinstance(terminated, (bool, np.bool8)):


In [282]:
print(train_data[0])

[[ 2.1992503e-02 -3.3468794e-02  4.4783358e-02 -2.8504891e-02]
 [ 2.1323126e-02 -2.2920340e-01  4.4213258e-02  2.7796456e-01]
 [ 1.6739057e-02 -3.4739178e-02  4.9772549e-02 -4.5225484e-04]
 [ 1.6044274e-02 -2.3053829e-01  4.9763504e-02  3.0750981e-01]
 [ 1.1433508e-02 -4.2633271e-01  5.5913702e-02  6.1546230e-01]
 [ 2.9068540e-03 -2.3203473e-01  6.8222947e-02  3.4090084e-01]
 [-1.7338406e-03 -3.7946429e-02  7.5040966e-02  7.0487626e-02]
 [-2.4927692e-03 -2.3405953e-01  7.6450713e-02  3.8587075e-01]
 [-7.1739596e-03 -4.3017882e-01  8.4168129e-02  7.0164502e-01]
 [-1.5777536e-02 -2.3631814e-01  9.8201029e-02  4.3659964e-01]
 [-2.0503899e-02 -4.3268305e-01  1.0693302e-01  7.5855267e-01]
 [-2.9157560e-02 -2.3918438e-01  1.2210408e-01  5.0134093e-01]
 [-3.3941247e-02 -4.3579683e-01  1.3213089e-01  8.2987326e-01]
 [-4.2657185e-02 -6.3245350e-01  1.4872836e-01  1.1610205e+00]
 [-5.5306256e-02 -8.2916617e-01  1.7194878e-01  1.4963973e+00]
 [-7.1889579e-02 -1.0259106e+00  2.0187671e-01  1.83746

In [283]:
ppo = PPOTrainer(model)

In [284]:
ep_rewards = []
obs = env.reset()
for i in range(1000):
    train_data, rewards = rollout(model, env)
    ep_rewards.append(rewards)

    obs = torch.tensor(train_data[0], dtype=torch.float32, device=DEVICE)
    actions = torch.tensor(train_data[1], dtype=torch.int64, device=DEVICE)
    old_log_probs = torch.tensor(train_data[2], dtype=torch.float32, device=DEVICE)
    gae = torch.tensor(train_data[3], dtype=torch.float32, device=DEVICE)
    returns = torch.tensor(discount_rewards(train_data[2]), dtype=torch.float32, device=DEVICE)
    act_log_probs = torch.tensor(train_data[4], dtype=torch.float32, device=DEVICE)
    
    ppo.train_policy(obs, actions, old_log_probs, gae)
    ppo.train_value(obs, returns)
    # write rewards 
    if i % 10 == 0:
        print(f"Episode {i}, rewards: {rewards}")

[0.03236194 0.03110879 0.00581208 0.04430943]
[ 0.03298412  0.22614692  0.00669827 -0.24653408]
[0.03750705 0.03092995 0.00176759 0.04825409]
[ 0.03812565 -0.16421731  0.00273267  0.34149417]
[0.03484131 0.03086566 0.00956255 0.04967423]
[ 0.03545862 -0.16439208  0.01055604  0.34535882]
[0.03217078 0.03057812 0.01746321 0.05602321]
[ 0.03278234  0.22544537  0.01858368 -0.23109919]
[ 0.03729125  0.4202969   0.01396169 -0.51786274]
[ 0.04569719  0.2249812   0.00360444 -0.22081311]
[ 0.05019681  0.02980792 -0.00081182  0.07300462]
[ 0.05079297 -0.16530238  0.00064827  0.3654313 ]
[0.04748692 0.02981034 0.0079569  0.07295285]
[ 0.04808313  0.22481732  0.00941595 -0.21720906]
[0.05257947 0.02956204 0.00507177 0.07842913]
[ 0.05317071 -0.16563225  0.00664035  0.3727079 ]
[0.04985807 0.02939475 0.01409451 0.08212611]
[ 0.05044596 -0.16592638  0.01573703  0.37922236]


[ 0.04712744 -0.36126825  0.02332148  0.67682534]
[ 0.03990207 -0.16647796  0.03685799  0.39157528]
[0.03657251 0.02810205 0.0446895  0.11073729]
[ 0.03713455  0.22255608  0.04690424 -0.1675182 ]
[ 0.04158568  0.41697636  0.04355388 -0.4450431 ]
[ 0.0499252   0.22126612  0.03465302 -0.13895534]
[0.05435053 0.02566543 0.03187391 0.1644555 ]
[ 0.05486383 -0.16989796  0.03516302  0.46702093]
[ 0.05146587 -0.3654986   0.04450344  0.77057636]
[ 0.0441559  -0.17101645  0.05991497  0.49222165]
[ 0.04073557 -0.36693007  0.0697594   0.8031692 ]
[ 0.03339697 -0.17283046  0.08582278  0.5332206 ]
[ 0.02994036 -0.36904794  0.09648719  0.8516644 ]
[ 0.0225594  -0.5653436   0.11352048  1.1730616 ]
[ 0.01125253 -0.7617432   0.13698171  1.4990681 ]
[-0.00398233 -0.5685255   0.16696307  1.2521034 ]
[-0.01535284 -0.3758889   0.19200514  1.0160261 ]
tensor([[ 3.2362e-02,  3.1109e-02,  5.8121e-03,  4.4309e-02],
        [ 3.2984e-02,  2.2615e-01,  6.6983e-03, -2.4653e-01],
        [ 3.7507e-02,  3.0930e-02,

AttributeError: 'Tensor' object has no attribute 'log_probs'