In [1]:
import torch
from torch import nn
from torch import optim
from torch.utils.tensorboard import SummaryWriter

import matplotlib.pyplot as plt
import numpy as np

import gym

import yaml # for parsing hyperparameters
import time
from collections import namedtuple

# Sanity check: report whether PyTorch can see a CUDA device in this kernel.
print(torch.cuda.is_available())

True


In [2]:
# Compute device for the model: use the GPU when PyTorch can see one,
# otherwise fall back to the CPU so the constant never names a missing device.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tunable settings live under the top-level 'hyperparameters' key of the YAML
# file. safe_load is used, so the file cannot instantiate arbitrary objects.
with open('hyperparameters.yaml') as f:
    hp = yaml.safe_load(f)['hyperparameters']

In [3]:
class Net(nn.Module):
    """Fully connected network with two ReLU hidden layers.

    Maps a batch of observations of width ``obs_size`` to one raw score
    (logit) per action; no softmax is applied here.
    """

    def __init__(self, obs_size, hidden_size, actions_size):
        """Build the Linear -> ReLU -> Linear -> ReLU -> Linear stack.

        Args:
            obs_size: number of input features per observation.
            hidden_size: width of both hidden layers.
            actions_size: number of output scores.
        """
        super().__init__()
        layers = [
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, actions_size),
        ]
        # Kept under the attribute name `pred` so state_dict keys stay stable.
        self.pred = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw action scores produced by the layer stack for ``x``."""
        return self.pred(x)

In [4]:
env = gym.make('CartPole-v0')
# Override the per-episode step cap on the object returned by gym.make.
# NOTE(review): this pokes a private attribute (presumably gym's TimeLimit
# wrapper) -- confirm it is honoured by the installed gym version.
env._max_episode_steps = 4000

obs_size = env.observation_space.shape[0]  # features per observation
actions_size = env.action_space.n          # number of discrete actions

net = Net(obs_size, hp['hidden_size'], actions_size)
# Place the model on the notebook-wide DEVICE rather than hard-coding CUDA.
net = net.to(DEVICE)
# CrossEntropyLoss takes raw scores, so Net deliberately outputs no softmax.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

writer = SummaryWriter(comment='CartPole')

# ep: one finished episode (total reward + its list of ep_step records).
# ep_step: the observation the agent saw and the action it then took.
ep = namedtuple('ep', field_names=['reward', 'steps'])
ep_step = namedtuple('ep_step', field_names=['observation', 'action'])

In [5]:
def generate_batch(env, net, batch_size):
    """Play episodes with the current policy forever, yielding them in batches.

    Actions are sampled from the softmax of the network's output scores.
    Each finished episode is stored as an ``ep`` (total reward, list of
    ``ep_step``); once ``batch_size`` episodes are collected the list is
    yielded and a fresh batch is started.

    Args:
        env: environment exposing ``reset()``, ``step(action)`` returning a
            4-tuple ``(obs, reward, done, info)``, and ``render()``.
        net: policy network mapping a ``(1, obs_size)`` float tensor to
            per-action scores.
        batch_size: number of complete episodes per yielded batch.

    Yields:
        list of ``ep`` namedtuples of length ``batch_size``.
    """
    batch = []
    ep_reward = 0.
    ep_steps = []
    obs = env.reset()
    softmax = nn.Softmax(dim=1)

    # Run inference wherever the network lives instead of assuming CUDA.
    first_param = next(net.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    while True:
        # Only draw the episodes that will complete the half-way and final
        # slots of the batch, so rendering does not slow every rollout.
        if len(batch) == batch_size - 1 or len(batch) == batch_size/2 - 1:
            env.render()

        # Convert through NumPy: building a tensor from a list of ndarrays
        # goes through PyTorch's slow element-by-element path.
        obs_t = torch.as_tensor(np.asarray(obs), dtype=torch.float32).unsqueeze(0)
        obs_t = obs_t.to(device)
        # Rollouts are never back-propagated through, so skip autograd
        # bookkeeping (this also replaces the legacy `.data` accessor).
        with torch.no_grad():
            actions_t = softmax(net(obs_t))
        actions = actions_t.cpu().numpy()[0]
        action = np.random.choice(len(actions), p=actions)
        next_obs, reward, is_done, _ = env.step(action)
        ep_reward += reward
        # Record the observation the action was chosen FROM, not the result.
        step = ep_step(obs, action)
        ep_steps.append(step)

        if is_done:
            e = ep(ep_reward, ep_steps)
            batch.append(e)
            ep_reward = 0.
            ep_steps = []
            next_obs = env.reset()

            if len(batch) == batch_size:
                yield batch
                batch = []

        obs = next_obs

In [6]:
def filter_batch(batch, percentile):
    """Select the best episodes of a batch and flatten them into tensors.

    Episodes whose total reward is at or above the given percentile of the
    batch's rewards are kept; their (observation, action) pairs are
    concatenated in order to form the supervised training set.

    Args:
        batch: iterable of ``(reward, steps)`` records, where each step has
            ``observation`` and ``action`` attributes.
        percentile: percentile (0-100) of episode rewards used as the cutoff.

    Returns:
        (train_obs_t, train_actions_t, rewards_bound, rewards_mean):
        a float32 tensor of observations, an int64 tensor of actions, the
        reward cutoff, and the mean reward over the whole batch.
    """
    rewards = [episode.reward for episode in batch]
    rewards_bound = np.percentile(rewards, percentile)
    rewards_mean = float(np.mean(rewards))
    train_obs = []
    train_actions = []

    for reward, steps in batch:
        # Episodes exactly on the boundary are kept (strict `<` below).
        if reward < rewards_bound:
            continue
        train_obs.extend(step.observation for step in steps)
        train_actions.extend(step.action for step in steps)

    # Stack into contiguous NumPy arrays first: constructing a tensor
    # directly from a Python list of ndarrays is PyTorch's slow path and
    # elite sets can contain thousands of steps.
    train_obs_t = torch.from_numpy(np.asarray(train_obs, dtype=np.float32))
    train_actions_t = torch.from_numpy(np.asarray(train_actions, dtype=np.int64))

    return train_obs_t, train_actions_t, rewards_bound, rewards_mean

In [7]:
def train(reward_goal=4000):
    """Train `net` with the cross-entropy method until the reward goal is met.

    Each iteration collects a batch of episodes with the current policy,
    keeps the episodes at or above the `hp['keep_%']` reward percentile, and
    takes one optimizer step teaching the network to reproduce the actions
    taken in those episodes. Loss, mean reward and the reward cutoff are
    logged to TensorBoard and printed. On success the weights are written to
    ``./model/model.weights``.

    Args:
        reward_goal: stop once the batch's mean episode reward reaches this
            value. The default matches the 4000-step episode cap set on the
            environment; the previous strict ``> 4000`` test could never be
            satisfied under that cap (assuming one reward unit per step, as
            CartPole documents), so training never finished or saved.
    """
    import os

    # Feed batches to whatever device the network lives on.
    device = next(net.parameters()).device

    try:
        for iter_no, batch in enumerate(generate_batch(env, net, hp['batch_size'])):
            obs_t, actions_t, reward_bound, reward_mean = filter_batch(batch, hp['keep_%'])

            obs_t = obs_t.to(device)
            actions_t = actions_t.to(device)

            optimizer.zero_grad()
            action_score_t = net(obs_t)
            loss_t = criterion(action_score_t, actions_t)
            loss_t.backward()
            optimizer.step()

            print(f'Epoch: {iter_no:} \t Reward: {reward_mean:.2f} \t Boundary: {reward_bound:.2f}')
            writer.add_scalar('Loss', loss_t.item(), iter_no)
            writer.add_scalar('Reward', reward_mean, iter_no)
            writer.add_scalar('Boundary', reward_bound, iter_no)

            if reward_mean >= reward_goal:
                print('Model reached reward limit..')
                break
    finally:
        # Release the log writer and the render window even when the run is
        # interrupted from the notebook or an iteration raises.
        writer.close()
        env.close()

    # Make sure the target directory exists so a long run is not lost to a
    # missing folder at save time.
    os.makedirs('./model', exist_ok=True)
    torch.save(net.state_dict(), './model/model.weights')
    print('Model Saved.')

In [None]:
# Run the training loop; train() prints one progress line per iteration.
train()

Epoch: 0 	 Reward: 19.78 	 Boundary: 34.20
Epoch: 1 	 Reward: 22.38 	 Boundary: 36.40
Epoch: 2 	 Reward: 21.28 	 Boundary: 31.00
Epoch: 3 	 Reward: 21.59 	 Boundary: 40.80
Epoch: 4 	 Reward: 24.47 	 Boundary: 39.70
Epoch: 5 	 Reward: 29.94 	 Boundary: 50.60
Epoch: 6 	 Reward: 27.84 	 Boundary: 44.00
Epoch: 7 	 Reward: 27.09 	 Boundary: 49.60
Epoch: 8 	 Reward: 32.47 	 Boundary: 61.30
Epoch: 9 	 Reward: 29.88 	 Boundary: 51.40
Epoch: 10 	 Reward: 33.28 	 Boundary: 55.80
Epoch: 11 	 Reward: 31.72 	 Boundary: 45.00
Epoch: 12 	 Reward: 34.00 	 Boundary: 63.00
Epoch: 13 	 Reward: 32.66 	 Boundary: 46.50
Epoch: 14 	 Reward: 36.72 	 Boundary: 63.90
Epoch: 15 	 Reward: 35.12 	 Boundary: 59.40
Epoch: 16 	 Reward: 37.53 	 Boundary: 58.60
Epoch: 17 	 Reward: 36.28 	 Boundary: 52.60
Epoch: 18 	 Reward: 37.66 	 Boundary: 77.30
Epoch: 19 	 Reward: 40.94 	 Boundary: 64.00
Epoch: 20 	 Reward: 35.91 	 Boundary: 60.70
Epoch: 21 	 Reward: 42.91 	 Boundary: 64.90
Epoch: 22 	 Reward: 44.78 	 Boundary: 82.7