In [None]:
!pip install box2d==2.3.10
!pip install -U gym

Collecting box2d==2.3.10
[?25l  Downloading https://files.pythonhosted.org/packages/a9/0b/d48d42dd9e19ce83a3fb4eee074e785b6c6ea612a2244dc2ef69427d338b/Box2D-2.3.10-cp36-cp36m-manylinux1_x86_64.whl (1.3MB)
[K     |▎                               | 10kB 21.2MB/s eta 0:00:01[K     |▌                               | 20kB 4.2MB/s eta 0:00:01[K     |▊                               | 30kB 5.9MB/s eta 0:00:01[K     |█                               | 40kB 7.5MB/s eta 0:00:01[K     |█▎                              | 51kB 4.9MB/s eta 0:00:01[K     |█▌                              | 61kB 5.8MB/s eta 0:00:01[K     |█▊                              | 71kB 6.6MB/s eta 0:00:01[K     |██                              | 81kB 7.3MB/s eta 0:00:01[K     |██▎                             | 92kB 5.8MB/s eta 0:00:01[K     |██▌                             | 102kB 6.3MB/s eta 0:00:01[K     |██▊                             | 112kB 6.3MB/s eta 0:00:01[K     |███                             |

In [None]:
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image


import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from torch.distributions import Categorical


# The continuous-action LunarLander task; every class below reads this module-level `env`.
env = gym.make('LunarLanderContinuous-v2')

# Report the action space, the observation space and the reward threshold, one per line.
print(env.action_space, env.observation_space, env.spec.reward_threshold, sep="\n")

# Matplotlib: only pull in IPython's display helpers when an inline backend is active.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
  from IPython import display

plt.ion()

# Prefer the GPU when CUDA is available; the networks move themselves onto `device`.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)



Box(2,)
Box(8,)
200
cuda


In [None]:

class Critic(nn.Module):
  """Q-network: scores a (state, action) pair with a single scalar.

  The state goes through two Linear + LayerNorm layers (400 then 300 units), the
  action goes through its own Linear layer (300 units), the two 300-wide streams
  are summed, passed through ReLU and projected to one output.

  Input widths are read from the module-level `env`, and the module moves itself
  onto the module-level `device` at the end of construction.
  """

  def __init__(self):
    super(Critic, self).__init__()
    n_states = env.observation_space.shape[0]
    # Derived from the environment instead of a hard-coded 2, so the critic stays
    # consistent with Actor (which already sizes its output from the action space).
    n_actions = env.action_space.shape[0]

    self.fc1 = nn.Linear(n_states, 400)
    # NOTE(review): nn.Linear stores weight as (out_features, in_features), so size()[0]
    # is the layer's OUTPUT width and the bound below is 1/sqrt(400), not 1/sqrt(fan-in).
    # Kept as-is to preserve the existing initialisation - confirm whether that is intended.
    f1 = 1 / np.sqrt(self.fc1.weight.data.size()[0])
    torch.nn.init.uniform_(self.fc1.weight.data, -f1, f1)
    torch.nn.init.uniform_(self.fc1.bias.data, -f1, f1)
    self.bn1 = nn.LayerNorm(400)  # named bn*, but this is layer normalisation

    self.fc2 = nn.Linear(400, 300)
    f2 = 1 / np.sqrt(self.fc2.weight.data.size()[0])  # same remark as f1: uses the output width
    torch.nn.init.uniform_(self.fc2.weight.data, -f2, f2)
    torch.nn.init.uniform_(self.fc2.bias.data, -f2, f2)
    self.bn2 = nn.LayerNorm(300)

    # Action stream; left at PyTorch's default initialisation, as before.
    self.action_value = nn.Linear(n_actions, 300)

    # Small uniform range for the output layer so initial Q estimates start near zero.
    f3 = 0.003
    self.q = nn.Linear(300, 1)
    torch.nn.init.uniform_(self.q.weight.data, -f3, f3)
    torch.nn.init.uniform_(self.q.bias.data, -f3, f3)

    self.to(device)


  def forward(self, state, action):
    """Return the Q estimate for `state` and `action`.

    The last dimension of `state` / `action` must match the widths of `fc1` /
    `action_value`; the result has a last dimension of 1.
    """
    state_value = self.fc1(state)
    state_value = self.bn1(state_value)
    state_value = F.relu(state_value)

    state_value = self.fc2(state_value)
    state_value = self.bn2(state_value)  # no ReLU here: it is applied after the streams are merged

    action_value = F.relu(self.action_value(action))

    state_action_value = F.relu(torch.add(state_value, action_value))
    return self.q(state_action_value)


class Actor(nn.Module):
  """Deterministic policy network: observation in, tanh-squashed action out.

  Two hidden Linear + LayerNorm + ReLU layers (400 and 300 units) feed a final
  Linear layer whose width is the action dimension of the module-level `env`.
  The module places itself on the module-level `device` when constructed.
  """

  @staticmethod
  def _uniform_init(layer, bound):
    """Fill `layer`'s weight, then its bias, with samples from U(-bound, bound)."""
    torch.nn.init.uniform_(layer.weight.data, -bound, bound)
    torch.nn.init.uniform_(layer.bias.data, -bound, bound)

  def __init__(self):
    super(Actor, self).__init__()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Layers are created and initialised strictly one after another so the
    # sequence of random draws is the same as a straight-line construction.
    self.fc1 = nn.Linear(obs_dim, 400)
    # Bound comes from weight.size()[0], i.e. the first dimension of the weight matrix.
    self._uniform_init(self.fc1, 1 / np.sqrt(self.fc1.weight.data.size()[0]))
    self.bn1 = nn.LayerNorm(400)

    self.fc2 = nn.Linear(400, 300)
    self._uniform_init(self.fc2, 1 / np.sqrt(self.fc2.weight.data.size()[0]))
    self.bn2 = nn.LayerNorm(300)

    # Output layer gets a fixed, much tighter range than the hidden layers.
    self.mu = nn.Linear(300, act_dim)
    self._uniform_init(self.mu, 0.003)

    self.to(device)

  def forward(self, x):
    """Map observation(s) `x` to action(s) with every component in [-1, 1] (tanh output)."""
    hidden = F.relu(self.bn1(self.fc1(x)))
    hidden = F.relu(self.bn2(self.fc2(hidden)))
    return torch.tanh(self.mu(hidden))

In [None]:
# From OpenAI Baselines:
# https://github.com/openai/baselines/blob/master/baselines/ddpg/noise.py
class OrnsteinUhlenbeckActionNoise:
    """Stateful, temporally correlated noise source.

    Every call advances the process by one step of size `dt` and returns the new
    value: the previous value is pulled towards `mu` at rate `theta` and perturbed
    by Gaussian noise scaled by `sigma * sqrt(dt)`.

    Args:
        mu: array the process reverts to; its shape is the shape of each sample.
        sigma: scale of the random perturbation.
        theta: strength of the pull towards `mu`.
        dt: step size.
        x0: optional starting value; when None the process starts at zeros shaped like `mu`.
    """

    def __init__(self, mu, sigma=0.15, theta=.2, dt=1e-2, x0=None):
        self.theta, self.mu, self.sigma = theta, mu, sigma
        self.dt, self.x0 = dt, x0
        self.reset()

    def __call__(self):
        """Advance one step, remember the new value and return it."""
        previous = self.x_prev
        pull = self.theta * (self.mu - previous) * self.dt
        kick = self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = previous + pull + kick
        return self.x_prev

    def reset(self):
        """Rewind the process to `x0`, or to zeros shaped like `mu` when no `x0` was given."""
        if self.x0 is None:
            self.x_prev = np.zeros_like(self.mu)
        else:
            self.x_prev = self.x0

    def __repr__(self):
        template = 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'
        return template.format(self.mu, self.sigma)

In [None]:
# Replay transition: one stored experience, with fields in the order push() expects them.
Replay_transition = namedtuple('Replay_transition',('state', 'action', 'next_state', 'reward'))

class Experience_Replay_Memory(object):
    """Fixed-capacity ring buffer of `Replay_transition` tuples.

    The list grows until it holds `capacity` items; after that each new
    transition overwrites the oldest one.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0  # index the next transition will be written to

    def push(self, *args):
        """Saves a transition.

        `args` are the fields of `Replay_transition` in order
        (state, action, next_state, reward).

        Raises:
            TypeError: if the number of arguments does not match the tuple's fields.
                The buffer is left untouched in that case.
        """
        # Build the tuple before touching the list: if the arguments are wrong this
        # raises here, instead of after a `None` placeholder has already been
        # appended (which would later surface as a bogus sample).
        transition = Replay_transition(*args)
        if len(self.memory) < self.capacity:
            self.memory.append(transition)
        else:
            self.memory[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return `batch_size` distinct stored transitions chosen at random.

        Raises:
            ValueError: (from `random.sample`) if `batch_size` exceeds the number stored.
        """
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

class ReplayBuffer(object):
  """Circular transition store backed by preallocated NumPy arrays.

  Args:
    max_size: number of transitions kept; older entries are overwritten once full.
    input_shape: iterable giving the shape of a single observation.
    n_actions: length of a single action vector.
  """

  def __init__(self, max_size, input_shape, n_actions):
    self.mem_size = max_size
    self.mem_cntr = 0  # total number of transitions ever stored (not capped at mem_size)
    observation_dims = (self.mem_size, *input_shape)
    self.state_memory = np.zeros(observation_dims)
    self.next_state_memory = np.zeros(observation_dims)
    self.action_memory = np.zeros((self.mem_size, n_actions))
    self.reward_memory = np.zeros(self.mem_size)
    self.terminal_memory = np.zeros(self.mem_size, dtype=np.float32)

  def store_transition(self, state, action, reward, next_state, done):
    """Write one transition into the slot selected by the running counter."""
    slot = self.mem_cntr % self.mem_size
    self.state_memory[slot] = state
    self.next_state_memory[slot] = next_state
    self.action_memory[slot] = action
    self.reward_memory[slot] = reward
    # Stored inverted: 1.0 while the episode continues, 0.0 on the terminal step.
    self.terminal_memory[slot] = 1 - done
    self.mem_cntr += 1

  def sample(self, batch_size):
    """Draw `batch_size` random indices (repeats possible) from the filled region.

    Returns:
      (states, actions, rewards, next_states, terminal) arrays, where `terminal`
      carries the inverted done flag written by `store_transition`.
    """
    filled = min(self.mem_cntr, self.mem_size)
    picks = np.random.choice(filled, batch_size)
    return (
        self.state_memory[picks],
        self.action_memory[picks],
        self.reward_memory[picks],
        self.next_state_memory[picks],
        self.terminal_memory[picks],
    )

In [None]:
BATCH_SIZE = 64  # number of transitions drawn from the replay buffer per optimisation step

# Seed every random source this notebook draws from, not just NumPy:
#   - NumPy drives the exploration noise and ReplayBuffer.sample,
#   - torch drives the network weight initialisation,
#   - `random` drives Experience_Replay_Memory.sample.
# The Gym environment has its own RNG and is not seeded here.
np.random.seed(0)
random.seed(0)
torch.manual_seed(0)
class DDPG():
  """Deep Deterministic Policy Gradient agent for the module-level `env`.

  Holds an actor and a critic, a slowly tracking target copy of each, a replay
  buffer and an Ornstein-Uhlenbeck noise process used for exploration.
  Relies on the notebook-level names `env`, `device`, `BATCH_SIZE`, `Actor`,
  `Critic`, `ReplayBuffer` and `OrnsteinUhlenbeckActionNoise`.
  """

  def __init__(self):

    self.actor = Actor()
    self.actor_target = Actor()
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=0.000025)

    self.critic = Critic()
    self.critic_target = Critic()
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=0.00025)

    self.gamma = 0.99  # discount factor applied to the bootstrapped next-state value
    self.tau = 0.001   # fraction of the online weights blended into the targets per update
    # Shapes come from the environment rather than being hard-coded.
    self.memory = ReplayBuffer(50000, env.observation_space.shape, env.action_space.shape[0])
    self.noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(env.action_space.shape[0]))

    # Start the targets as exact copies of the online networks. A soft update with
    # tau=0.001 here would leave them as (almost) independent random networks.
    self.update_network_parameters(tau=1.0)


  def select_action(self, state):
    """Return the actor's action for `state` plus exploration noise, as a NumPy array.

    The result is clipped to the bounds of `env.action_space`, so what gets stored in
    the replay buffer stays within the range the tanh-bounded actor can itself produce.
    """
    self.actor.eval()
    state = torch.tensor(state, dtype=torch.float).to(device)
    # Acting needs no gradients; skip building an autograd graph on every env step.
    with torch.no_grad():
      mu = self.actor(state)
    self.actor.train()

    noise = torch.tensor(self.noise(), dtype=torch.float, device=device)
    mu_prime = (mu + noise).cpu().numpy()  # add noise for exploration
    return np.clip(mu_prime, env.action_space.low, env.action_space.high)


  def optimize_model(self):
    """Run one critic step, one actor step and a soft target update.

    Does nothing until the replay buffer has received at least `BATCH_SIZE` transitions.
    """
    if self.memory.mem_cntr < BATCH_SIZE:
      return

    state, action, reward, next_state, not_done = self.memory.sample(BATCH_SIZE)

    # reward / not_done become column vectors so they broadcast against the critic's
    # (batch, 1) output. ReplayBuffer stores `1 - done`, hence the name `not_done`:
    # it zeroes the bootstrap term on terminal transitions.
    reward = torch.tensor(reward, dtype=torch.float, device=device).unsqueeze(1)
    not_done = torch.tensor(not_done, dtype=torch.float, device=device).unsqueeze(1)
    state = torch.tensor(state, dtype=torch.float, device=device)
    next_state = torch.tensor(next_state, dtype=torch.float, device=device)
    action = torch.tensor(action, dtype=torch.float, device=device)

    self.actor_target.eval()
    self.critic_target.eval()

    # TD target for the whole batch at once; it is a constant for the critic update,
    # so no gradient is tracked through the target networks.
    with torch.no_grad():
      target_actions = self.actor_target(next_state)
      next_value = self.critic_target(next_state, target_actions)
      target = reward + self.gamma * next_value * not_done

    # Critic update: pull Q(s, a) towards the TD target (prediction first, target second).
    self.critic.train()
    self.critic_optimizer.zero_grad()
    state_value = self.critic(state, action)
    critic_loss = F.smooth_l1_loss(state_value, target)
    critic_loss.backward()
    self.critic_optimizer.step()

    # Actor update: ascend the critic's estimate of the actor's own actions.
    # backward() also deposits gradients on the critic's parameters; they are cleared
    # by critic_optimizer.zero_grad() before the next critic step and never applied.
    self.actor_optimizer.zero_grad()
    actor_loss = -self.critic(state, self.actor(state)).mean()
    actor_loss.backward()
    self.actor_optimizer.step()

    self.update_network_parameters()


  @staticmethod
  def _blend_into_target(target_net, online_net, tau):
    """Set every target parameter to tau * online + (1 - tau) * target."""
    for target_param, param in zip(target_net.parameters(), online_net.parameters()):
      target_param.data.copy_(param.data * tau + target_param.data * (1.0 - tau))


  def update_network_parameters(self, tau=None):
    """Move both target networks towards their online counterparts.

    Args:
      tau: blend factor; defaults to `self.tau`. `tau=1.0` performs a hard copy.
    """
    if tau is None:
      tau = self.tau
    self._blend_into_target(self.actor_target, self.actor, tau)
    self._blend_into_target(self.critic_target, self.critic, tau)


  def train(self, max_episodes=None):
    """Interact with `env` and learn until the task is solved.

    Training stops when the exponential moving average of episode returns exceeds
    `env.spec.reward_threshold`, or after `max_episodes` episodes when that is given.
    Progress is printed every 10 episodes.

    Args:
      max_episodes: optional cap on the number of episodes; None (default) means no cap.
    """
    running_reward = 10  # arbitrary starting point of the moving average
    total_rewards = []
    # run till it solves (or the optional episode cap is hit)
    for i_episode in count(1):
      if max_episodes is not None and i_episode > max_episodes:
        break

      state = env.reset()
      # Start each episode from a fresh noise state so the correlated noise
      # does not carry over from where the previous episode ended.
      self.noise.reset()
      r = 0
      for t in count():
        action = self.select_action(state)
        next_state, reward, done, _ = env.step(action)

        self.memory.store_transition(state, action, reward, next_state, done)

        r += reward
        self.optimize_model()
        # Move to the next state
        state = next_state

        if done:
          break

      total_rewards.append(r)
      # exponential moving average
      running_reward = 0.05 * r + (1 - 0.05) * running_reward
      avg_rewards = np.mean(total_rewards[-100:])
      if i_episode % 10 == 0:
          print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}\tAvg_reward: {:.2f}'.format(
                i_episode, r, running_reward, avg_rewards))
      if running_reward > env.spec.reward_threshold:
          print("Solved! Running reward is now {} and "
                "the last episode runs to {} time steps!".format(running_reward, t))
          break


# Build the agent (networks, optimizers, replay buffer and exploration noise) and train it.
# By default train() keeps running episodes until the running reward exceeds
# env.spec.reward_threshold, so this cell can take a long time to finish.
agent = DDPG()
agent.train()


Episode 10	Last reward: -196.70	Average reward: -102.10	Avg_reward: -269.35
Episode 20	Last reward: -350.62	Average reward: -172.71	Avg_reward: -272.06
Episode 30	Last reward: -226.02	Average reward: -203.97	Avg_reward: -265.62
Episode 40	Last reward: -236.46	Average reward: -237.26	Avg_reward: -270.07
Episode 50	Last reward: -164.61	Average reward: -222.18	Avg_reward: -256.54
Episode 60	Last reward: -236.47	Average reward: -186.53	Avg_reward: -235.78
Episode 70	Last reward: -183.28	Average reward: -197.68	Avg_reward: -232.76
Episode 80	Last reward: -397.61	Average reward: -208.97	Avg_reward: -231.20
Episode 90	Last reward: -103.06	Average reward: -195.54	Avg_reward: -225.42
Episode 100	Last reward: -306.25	Average reward: -246.27	Avg_reward: -234.63
Episode 110	Last reward: -130.54	Average reward: -214.24	Avg_reward: -225.04
Episode 120	Last reward: -82.07	Average reward: -168.78	Avg_reward: -208.04
Episode 130	Last reward: -126.88	Average reward: -170.01	Avg_reward: -199.73
Episode 1