# Baseline 

In [4]:
# import gym
# import time 
# env = gym.make("BipedalWalker-v3", render_mode="human", hardcore=True)
# env.action_space.seed(42)

# observation, info = env.reset(seed=42)
# env.render()
# for _ in range(1000):
#     observation, reward, terminated, truncated, info = env.step(env.action_space.sample())

#     if terminated or truncated:
#         observation, info = env.reset()
# env.close()

In [12]:
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [38]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random

from collections import deque
from torch.autograd import Variable

# Hyperparameters
BATCH_SIZE = 128
GAMMA = 0.99
TAU = 0.001
LR_ACTOR = 1e-3
LR_CRITIC = 1e-3
BUFFER_SIZE = int(1e6)
MAX_EPISODES = 1000
MAX_STEPS = 2000
EPSILON = 1.0
EPSILON_DECAY = 1e-6
EPSILON_MIN = 0.01

# Environment
env = gym.make('BipedalWalker-v3', hardcore = True)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_high = env.action_space.high
action_low = env.action_space.low

# Actor network
class Actor(nn.Module):
    def __init__(self):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, action_dim)
        
    def forward(self, x):
        x = nn.functional.relu(self.fc1(x))
        x = nn.functional.relu(self.fc2(x))
        x = nn.functional.tanh(self.fc3(x))
        return x.detach().numpy() * action_high

# Critic network
class Critic(nn.Module):
    def __init__(self):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(state_dim + action_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 1)
        
    def forward(self, state, action):
        x = torch.cat((state, action), 1)
        x = nn.functional.relu(self.fc1(x))
        x = nn.functional.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Replay buffer
class ReplayBuffer:
    def __init__(self):
        self.buffer = deque(maxlen=BUFFER_SIZE)
        
    def add(self, state, action, reward, next_state, done):
        state = torch.from_numpy(state).float().to(device)
        action = torch.from_numpy(action).float().to(device)
        reward =  torch.tensor(reward).to(device)
        next_state =  torch.from_numpy(next_state).float().to(device)
        done =  torch.tensor(done).to(device)
        self.buffer.append((state, action, reward, next_state, done))
        
    def sample(self, batch_size):
        states, actions, rewards, next_states, dones = zip(*random.sample(self.buffer, batch_size))
        states = torch.from_numpy(states).float().to(device)
        actions = torch.from_numpy(actions).float().to(device)
        rewards =  torch.tensor(rewards).to(device)
        next_states =  torch.from_numpy(next_states).float().to(device)
        dones =  torch.tensor(dones).to(device)
        return np.array(states), np.array(actions), np.array(rewards), np.array(next_states), np.array(dones)
        
    def __len__(self):
        return len(self.buffer)

# DDPG solver
class DDPG:
    def __init__(self):
        self.actor = Actor()
        self.target_actor = Actor()
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.critic = Critic()
        self.target_critic = Critic()
        self.target_critic.load_state_dict(self.critic.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=LR_ACTOR)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=LR_CRITIC)
        
        # Move models and buffer to GPU
        self.actor = self.actor.to(device)
        self.critic = self.critic.to(device)
        self.target_actor = self.target_actor.to(device)
        self.target_critic = self.target_critic.to(device)
        
        self.replay_buffer = ReplayBuffer()
        self.epsilon = EPSILON

        
        
    def act(self, state):
        state = Variable(torch.from_numpy(state).float().unsqueeze(0)).to(device)
        self.actor.eval()
        action = self.actor(state).data
        self.actor.train()
        action += self.epsilon * np.random.normal(size=action_dim)
        return np.clip(action, action_low, action_high)[0]
        
    def learn(self):
        if len(self.replay_buffer) < BATCH_SIZE:
            return
        
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(BATCH_SIZE)
        
        states = torch.from_numpy(states).float().to(device)
        actions = torch.from_numpy(actions).float().to(device)
        rewards =  torch.tensor(rewards).to(device)
        next_states =  torch.from_numpy(next_states).float().to(device)
        dones =  torch.tensor(dones).to(device)

        states = Variable(states)
        actions = Variable(actions)
        rewards = Variable(rewards)
        next_states = Variable(next_states)
        dones = Variable(dones)
        
        # Update critic
        next_actions = self.target_actor(next_states)
        q_targets_next = self.target_critic(next_states, torch.tensor(next_actions))
        q_targets = rewards + (GAMMA * q_targets_next * (1 - dones))
        q_expected = self.critic(states, actions)
        critic_loss = nn.functional.mse_loss(q_expected, q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_optimizer.step()
        
        # Update actor
        actions_pred = self.actor(states)
        actor_loss = -self.critic(states, torch.tensor(actions_pred)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        
        # Update target networks
        self.soft_update(self.critic, self.target_critic)
        self.soft_update(self.actor, self.target_actor)
        
        # Decay exploration rate
        self.epsilon = max(self.epsilon - EPSILON_DECAY, EPSILON_MIN)
        
    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(TAU * local_param.data + (1 - TAU) * target_param.data)


self.finish_x: 542.0


In [39]:
ddpg = DDPG()
ddpg.actor.to(device)
ddpg.critic.to(device)
ddpg.target_actor.to(device)
ddpg.target_critic.to(device)
scores = []
for episode in range(1, MAX_EPISODES+1):
    state = env.reset()[0]
    score = 0
    for step in range(1, MAX_STEPS+1):
        action = ddpg.act(state)
        next_state, reward, done, _,_ = env.step(action)
        ddpg.replay_buffer.add(state, action, reward, next_state, done)
        state = next_state
        score += reward
        ddpg.learn()
        if done:
            break
    scores.append(score)
    print('Episode %d Score: %.2f' % (episode, score))
    if episode % 100 == 0:
        print('Episode %d Average Score: %.2f' % (episode, np.mean(scores[-100:])))

Episode 1 Score: -127.59
states :  (tensor([ 0.5961,  0.2319, -0.2773, -0.0436,  0.7094, -0.7375,  0.4313,  0.4022,
         0.0000,  0.9606,  0.0400,  1.4814, -1.3929, -0.6133,  0.0000,  1.0000,
         0.5814, -0.0600,  0.3247,  0.3313,  0.3465,  0.3722,  0.4127,  0.4757,
         0.5782,  0.7618,  0.9465,  1.0000]), tensor([-2.0812e-01, -2.7174e-02,  4.6508e-04, -5.3312e-01,  2.6755e-01,
         9.3706e-01,  2.2377e-01, -1.0000e+00,  0.0000e+00,  8.0576e-01,
        -4.0000e-02,  2.1033e+00,  9.0738e-01,  2.5440e-01, -1.0000e+00,
         0.0000e+00,  1.3541e+00, -6.0000e-02,  7.5813e-01,  7.7483e-01,
         8.1222e-01,  8.7531e-01,  9.7415e-01,  1.0000e+00,  1.0000e+00,
         1.0000e+00,  1.0000e+00,  1.0000e+00]), tensor([ 0.3040, -0.0102, -0.0320, -0.6601, -0.4286, -0.5815,  0.0435, -1.0000,
         0.0000, -0.0148,  0.0400,  0.5856,  1.5369,  0.2112, -1.0000,  0.0000,
         0.2788,  0.0600,  0.5382,  0.5500,  0.5763,  0.6206,  0.6900,  0.7982,
         0.9744,  1.0000

ValueError: only one element tensors can be converted to Python scalars