In [11]:
import wandb
import torch
import gym

import tensorflow as tf
import torch.nn as nn
import numpy as np
import torch.optim as optim
from collections import deque
from torch.distributions import Beta
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
import torch.nn.functional as F

# Report whether each framework can see a GPU before anything heavy is built.
# NOTE(review): tf.test.is_gpu_available is reported as deprecated in recent
# TensorFlow 2.x releases in favour of tf.config.list_physical_devices('GPU');
# confirm against the installed TF version before switching.
print('TF with GPU: {}'.format(tf.test.is_gpu_available()))
print('Torch with GPU: ' + str(torch.cuda.is_available()))

# Authenticate with Weights & Biases so later wandb.init() calls can log.
wandb.login()

# Single torch device handle shared by the model and all tensors below.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

TF with GPU: True
Torch with GPU: True


In [26]:
# Hyper-parameters for the whole notebook; the dict is also uploaded to wandb.
config = {
    "num_episodes": 1000,      # number of environment episodes to run
    "num_epochs": 2000,        # max policy decisions (agent steps) per episode
    "image_batch": 4,          # number of stacked grayscale frames per state
    "lr": 1e-3,                # Adam learning rate
    "logging_interval": 5,     # print the episode reward every N episodes
    "render": False,           # render the final episode when True
    "action_count": 4,         # env steps each chosen action is repeated for
    "memory_cap": 1280,        # transitions collected between PPO updates
    "ppo_batch": 64,           # minibatch size inside a PPO update
    "ppo_epoch": 10,           # passes over the buffer per PPO update
    "loss_clip": 0.1,          # PPO ratio clipping epsilon
    "gamma_discount": 0.99     # discount factor for the one-step TD target
}

# Shape of one stored observation: a stack of 96x96 grayscale frames.
_state_shape = (config['image_batch'], 96, 96)

# Record layout of one transition in the agent's replay buffer.
#
# The two frame stacks dominate the footprint (memory_cap * 2 * 4 * 96 * 96
# values), so they are stored as float32: that halves the buffer compared to
# float64 (~377 MB instead of ~755 MB at the default settings). The frames are
# derived from 8-bit pixels, so float32 loses nothing meaningful, and the
# update step casts them to double again before feeding the network.
# The small scalar/vector fields stay float64 so the stored log-probabilities
# and rewards are kept at full precision.
action_type = np.dtype([
    ('last_state', np.float32, _state_shape),  # state the action was chosen in
    ('curr_state', np.float32, _state_shape),  # state observed after acting
    ('act', np.float64, (3,)),                 # sampled 3-component action
    ('act_prob', np.float64),                  # log-probability of `act` at sampling time
    ('reward', np.float64)                     # reward accumulated over the repeated steps
])

def rgb2gray(rgb, norm=True):
    """Convert an image with channels on the last axis to grayscale.

    Args:
        rgb: array-like of shape (..., C) with C >= 3; the first three
            channels are read as R, G, B. Any extra channel (e.g. alpha)
            is ignored.
        norm: when True, rescale the result with ``gray / 128 - 1`` so that
            0..255 pixel values land roughly in [-1, 1).

    Returns:
        Float array of shape (...,) holding the weighted channel sum
        (weights 0.299 / 0.587 / 0.114), optionally normalised.
    """
    # Slice to the first three channels so RGBA frames work too; for plain
    # RGB input this is identical to using the whole last axis.
    gray = np.dot(rgb[..., :3], [0.299, 0.587, 0.114])
    if norm:
        gray = gray / 128. - 1.
    return gray

In [14]:
class Net(nn.Module):
    """Actor-critic network with a shared convolutional trunk.

    The trunk consumes a stack of ``config['image_batch']`` frames and emits a
    256-dimensional feature vector. From it, a value head predicts a scalar
    and a policy head predicts the (alpha, beta) parameters of a
    3-dimensional Beta distribution.
    """

    def __init__(self):
        super().__init__()

        # (out_channels, kernel, stride) for each conv layer, in order.
        conv_specs = (
            (8, 4, 2),
            (16, 3, 2),
            (32, 3, 2),
            (64, 3, 2),
            (128, 3, 1),
            (256, 3, 1),
        )
        trunk = []
        channels_in = config['image_batch']
        for channels_out, kernel, stride in conv_specs:
            trunk.append(
                nn.Conv2d(
                    channels_in,
                    channels_out,
                    kernel_size=(kernel, kernel),
                    stride=(stride, stride),
                )
            )
            trunk.append(nn.ReLU())
            channels_in = channels_out
        self.cnn = nn.Sequential(*trunk)

        # Critic: state value estimate.
        self.v = nn.Sequential(nn.Linear(256, 64), nn.ReLU(), nn.Linear(64, 1))

        # Actor: shared hidden layer feeding the two Beta-parameter heads.
        self.fc = nn.Sequential(nn.Linear(256, 64), nn.ReLU())
        self.alpha_head = nn.Sequential(nn.Linear(64, 3), nn.Softplus())
        self.beta_head = nn.Sequential(nn.Linear(64, 3), nn.Softplus())

        self.apply(self._weights_init)

    @staticmethod
    def _weights_init(m):
        """Re-initialise conv layers only; other modules keep PyTorch defaults."""
        if not isinstance(m, nn.Conv2d):
            return
        relu_gain = nn.init.calculate_gain('relu')
        nn.init.xavier_uniform_(m.weight, gain=relu_gain)
        nn.init.constant_(m.bias, 0.1)

    def forward(self, x):
        """Return ``((alpha, beta), value)`` for a batch of frame stacks.

        ``alpha`` and ``beta`` have shape (N, 3) and are offset by +1 after the
        softplus, so both are strictly greater than 1; ``value`` has shape
        (N, 1).
        """
        # The flatten to 256 features assumes the conv stack collapses the
        # spatial dimensions to 1x1, which holds for 96x96 inputs.
        features = self.cnn(x).view(-1, 256)
        value = self.v(features)

        hidden = self.fc(features)
        alpha = 1 + self.alpha_head(hidden)
        beta = 1 + self.beta_head(hidden)

        return (alpha, beta), value

In [15]:
class Agent():
    """PPO agent with a Beta policy over a 3-component action in (0, 1).

    Transitions are written into a fixed-size ring buffer of
    ``config['memory_cap']`` records (dtype ``action_type``). Every time the
    write index wraps back to 0 the buffer is full and ``update_self`` should
    be run on it.
    """

    def __init__(self):
        # The network (and every tensor fed to it) uses float64.
        self.net = Net().double().to(device)
        self.optimizer = optim.Adam(self.net.parameters(), lr=config['lr'])

        self.memory = np.empty(config['memory_cap'], dtype=action_type)
        self.memory_index = 0

    def choose_action(self, curr_state):
        """Sample an action for a single state.

        Args:
            curr_state: numpy array holding one state (no batch axis); a
                leading batch axis of size 1 is added before the forward pass.

        Returns:
            ``(action, log_prob)``: the sampled action as a numpy array with
            each component in [0, 1], and the summed log-probability of that
            sample as a Python float.
        """
        state = torch.from_numpy(curr_state).double().to(device).unsqueeze(0)
        with torch.no_grad():
            (alpha, beta), _ = self.net(state)
        distribution = Beta(alpha, beta)
        act = distribution.sample()
        # Components are treated as independent, so the joint log-probability
        # is the sum over the action dimension.
        act_prob = distribution.log_prob(act).sum(dim=1)
        return act.squeeze().cpu().numpy(), act_prob.item()

    def insert_into_memory(self, memory_sample):
        """Store one transition tuple and advance the ring-buffer index.

        The index wraps to 0 once ``config['memory_cap']`` records were
        written, which is the signal ``ready_for_update`` looks for.
        """
        self.memory[self.memory_index] = memory_sample
        self.memory_index += 1
        if self.memory_index == config['memory_cap']:
            self.memory_index = 0

    def ready_for_update(self):
        """Return True when the write index sits at 0.

        Meant to be called right after ``insert_into_memory``: at that point
        index 0 means the buffer has just been filled. (It is also True on a
        freshly constructed agent that has stored nothing yet.)
        """
        return self.memory_index == 0

    def update_self(self):
        """Run ``config['ppo_epoch']`` passes of clipped-PPO updates over the buffer.

        Uses a one-step TD target ``r + gamma * V(s')`` and the corresponding
        advantage ``target - V(s)``, both computed once from the pre-update
        network and held fixed during the minibatch passes.
        """
        last_state = torch.tensor(self.memory['last_state'], dtype=torch.double).to(device)
        curr_state = torch.tensor(self.memory['curr_state'], dtype=torch.double).to(device)
        action = torch.tensor(self.memory['act'], dtype=torch.double).to(device)
        rew = torch.tensor(self.memory['reward'], dtype=torch.double).to(device).view(-1, 1)

        # Log-probabilities recorded at sampling time (the "old" policy).
        action_prob = torch.tensor(self.memory['act_prob'], dtype=torch.double).to(device).view(-1, 1)

        # NOTE(review): the buffer records carry no terminal flag, so the
        # target bootstraps from V(curr_state) for every transition, including
        # the last one of an episode.
        with torch.no_grad():
            target_v = rew + config['gamma_discount'] * self.net(curr_state)[1]
            adv = target_v - self.net(last_state)[1]

        clip_low = 1.0 - config['loss_clip']
        clip_high = 1.0 + config['loss_clip']

        for _ in range(config['ppo_epoch']):
            for index in BatchSampler(SubsetRandomSampler(range(config['memory_cap'])), config['ppo_batch'], False):

                # One forward pass yields both the policy parameters and the
                # value estimate for this minibatch.
                (alpha, beta), value = self.net(last_state[index])
                dist = Beta(alpha, beta)
                a_logp = dist.log_prob(action[index]).sum(dim=1, keepdim=True)
                # Probability ratio between the current and the old policy.
                ratio = torch.exp(a_logp - action_prob[index])

                surr1 = ratio * adv[index]
                surr2 = torch.clamp(ratio, clip_low, clip_high) * adv[index]
                action_loss = -torch.min(surr1, surr2).mean()
                value_loss = F.smooth_l1_loss(value, target_v[index])
                loss = action_loss + 2. * value_loss

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

In [27]:
agent = Agent()
env = gym.make('CarRacing-v0')

reward_threshold = env.spec.reward_threshold

run = wandb.init(project="car-racing")

wandb.config.update(config)

# Rolling window used for the "solved" check (mean over the last 100 episodes).
total_rewards = deque(maxlen=100)

# The Beta policy samples every action component in (0, 1), but CarRacing
# expects steering in [-1, 1] (gas and brake stay in [0, 1]). Without this
# affine map the car could only ever steer in one direction.
action_scale = np.array([2., 1., 1.])
action_offset = np.array([-1., 0., 0.])

try:
    for i_episode in range(config['num_episodes']):
        observation = env.reset()
        # Start with the frame stack filled with copies of the first frame.
        old_state = [rgb2gray(observation)] * config['image_batch']
        new_state = [rgb2gray(observation)] * config['image_batch']
        done = False
        total_reward = 0

        for _ in range(config['num_epochs']):
            if config['render'] and i_episode == config['num_episodes'] - 1:
                env.render()

            todo_action, todo_action_prob = agent.choose_action(np.asarray(new_state))
            # Only the environment sees the rescaled action; the buffer keeps
            # the raw Beta sample so its stored log-probability stays valid.
            env_action = todo_action * action_scale + action_offset

            # Repeat the chosen action for several env steps and sum the rewards.
            inner_reward = 0
            for _repeat in range(config['action_count']):
                observation, reward, done, _info = env.step(env_action)

                # Extra penalty when the frame is predominantly green
                # (presumably the car has left the track - TODO confirm).
                if np.mean(observation[:, :, 1]) > 185.0:
                    reward -= 0.05

                inner_reward += reward

                if done:
                    break

            # Slide the newest frame into the "after" stack.
            new_state.pop(0)
            new_state.append(rgb2gray(observation))

            agent.insert_into_memory((old_state, new_state, todo_action, todo_action_prob, inner_reward))
            if agent.ready_for_update():
                print('Updating agent')
                agent.update_self()

            # Bring the "before" stack up to date for the next transition.
            old_state.pop(0)
            old_state.append(rgb2gray(observation))

            total_reward += inner_reward
            if done:
                break

        total_rewards.append(total_reward)
        wandb.log({"episode_reward": total_reward}, commit=True)

        if i_episode % config['logging_interval'] == 0:
            print('Total reward for episode {} is {}'.format(i_episode, total_reward))
        if np.mean(total_rewards) >= reward_threshold:
            print('Env solved in {} episodes'.format(i_episode))
            break

    if np.mean(total_rewards) < reward_threshold:
        print('Env not solved in {} episodes'.format(config['num_episodes']))
        print('Mean of score for last 100 runs is {}'.format(np.mean(total_rewards)))
finally:
    # Release the simulator and close the wandb run even if training aborts
    # (e.g. on an out-of-memory error), so neither is left dangling in the kernel.
    env.close()
    run.finish()

wandb: wandb version 0.10.28 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


Track generation: 1096..1379 -> 283-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1064..1343 -> 279-tiles track
Total reward for episode 0 is -6.474820143884826
Track generation: 1035..1298 -> 263-tiles track
Track generation: 1015..1273 -> 258-tiles track
Track generation: 1111..1393 -> 282-tiles track
Track generation: 1132..1419 -> 287-tiles track
Track generation: 1169..1465 -> 296-tiles track
Updating agent
Track generation: 1130..1426 -> 296-tiles track
Track generation: 991..1249 -> 258-tiles track
Track generation: 1079..1353 -> 274-tiles track
Track generation: 1127..1413 -> 286-tiles track
Track generation: 1051..1318 -> 267-tiles track
Updating agent
Track generation: 1188..1489 -> 301-tiles track
Track generation: 1282..1607 -> 325-tiles track
Track generation: 1191..1493 -> 302-tiles track
Track generation: 985..1242 -> 257-tiles track
Track generation: 1083..1358 -> 275-tiles track
Updating agent
Track genera

MemoryError: 