 <a href="https://colab.research.google.com/github/wingated/cs474_labs_f2019/blob/master/DL_Lab9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from itertools import chain
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
from torchvision import transforms, utils, datasets
from tqdm import tqdm
from torch.nn.parameter import Parameter
import pdb
import torchvision
from functools import reduce
import os
import gzip
import tarfile
import gc
from PIL import Image
import io
from IPython.core.ultratb import AutoFormattedTB
__ITB__ = AutoFormattedTB(mode='Verbose', color_scheme='LightBg', tb_offset=1)



In [0]:
class EmbeddingNetwork(nn.Module):
  def __init__(self, state_dim, hidden_dim, output_dim):
    super(EmbeddingNetwork, self).__init__()
    self.net = nn.Sequential(
      nn.Linear(input_dim, output_dim/2),
      nn.ReLU(),
      nn.Linear(output_dim/2, output_dim),
      nn.ReLU(),
    )

  def forward(self, x):
    return self.net(x)



In [0]:
class ValueNetwork(nn.Module):
  def __init__(self, state_dim, hidden_dim):
    super(ValueNetwork, self).__init__()
    self.net = nn.Sequential(
      nn.Linear(state_dim, hidden_dim),
      nn.ReLU(),
      nn.Linear(hidden_dim, hidden_dim),
      nn.ReLU(),
      nn.Linear(hidden_dim, 1)
    )

  def forward(self, x):
    return self.net(x)



In [0]:
class PolicyNetwork(nn.Module):
  def __init__(self, state_dim, hidden_dim, action_dim):
    super(PolicyNetwork, self).__init__()
    self.net = nn.Sequential(
      nn.Linear(state_dim, hidden_dim),
      nn.ReLU(),
      nn.Linear(hidden_dim, hidden_dim),
      nn.ReLU(),
      nn.Linear(hidden_dim, action_dim),
      nn.Softmax(dim=1)
    )

  def forward(self, x):
    return self.net(x)



In [0]:

class AdvantageDataset(Dataset):
  def __init__(self, experience):
    super(AdvantageDataset, self).__init__()
    self._exp = experience
    self._num_runs = len(experience)
    self._length = reduce(lambda acc, x: acc + len(x[0]), experience, 0)

  def __getitem__(self, index):
    idx = 0
    seen_data = 0
    current_exp = self._exp[0][0]
    while seen_data + len(current_exp) - 1 < index:
      seen_data += len(current_exp)
      idx += 1
      current_exp = self._exp[idx][0]
    chosen_exp = current_exp[index - seen_data]
    # What should be returned is the action taken and the advantage for that?
    return chosen_exp

  def __len__(self):
    return self._length

  def averageLength(self):
    return self._length/self._num_runs



In [0]:
def select_action(probabilities):
  batch_size = probabilities.shape[0]
  actions = np.empty((batch_size, 1), dtype=np.uint8)
  probs_np = probabilities.cpu().detach().numpy()
  for i in range(batch_size):
      action_one_hot = np.random.multinomial(1, probs_np[i])
      action_idx = np.argmax(action_one_hot)
      actions[i, 0] = action_idx
  return actions

def likelihood_of_getting(indicies, distribution):
  try:
    return distribution[range(distribution.shape[0]), indicies.long()[:, 0]].unsqueeze(1)
  except Exception as e:
    print("Indicies {}".format(indicies.size()))
    print("Distribution {}".format(distribution.size()))
    print("Here")
    raise Exception(e)
    pass
    # print(indicies.size())
    # print(distribution.size())

def mean(values):
  return sum(values)/len(values)

def discrete_entropy(array):
    log_prob = torch.log(array)
    return -torch.sum(log_prob * array, dim=1, keepdim=True)

def main(device):
  env = gym.make('CartPole-v0')
  policy = PolicyNetwork(4, 256, 2).to(device)
  value = ValueNetwork(4, 256).to(device)

  policy_optim = optim.Adam(chain(policy.parameters(), value.parameters()), lr=3e-4, weight_decay=0.01, betas=(0.9, 0.999))

  # ... more stuff here...
  value_criterion = nn.MSELoss()

  # Hyperparameters
  epochs = 1000  # 1000
  env_samples = 100
  steps_to_take = 200
  gamma = 0.99
  value_epochs = 4
  batch_size = 32
  epsilon = 0.2
  entropy_coef = 1e-3
  ppo_pos = 1 + epsilon
  ppo_neg = 1 - epsilon

  loop = tqdm(total=epochs, position=0, leave=False)

  for _ in range(epochs):
    rewards = []
    value_losses = []
    policy_losses = []
    entropy_losses = []
    # generate rollouts
    rollouts = []
    for sample_num in range(env_samples):
      
      state = env.reset()
      rollout_reward = 0
      rollout_experience = []
      for step in range(steps_to_take):
        # if sample_num == 0:
        #   env.render()
        # don't forget to reset the environment at the beginning of each episode!
        # rollout for a certain number of steps!
        state_in = torch.as_tensor(state).float().unsqueeze(0)
        # print(state_in.size())
        action_distribution = policy(state_in)
        # your agent here (this takes random actions)
        action = select_action(action_distribution)[0]
        # print(action_distribution.size())
        # print(action[0])
        # print(env.action_space)
        # print(env)
        new_state, reward, done, info = env.step(int(action))
        if done:
          reward = 0
          discounted_reward = 0
          rollout_experience.append(
              (state, reward, done, action, action_distribution.detach().numpy()[0], discounted_reward))
          state = new_state
          rollout_reward += 0
          break
        discounted_reward = (gamma**(steps_to_take-step))*reward
        
        rollout_experience.append(
            (state, reward, done, action, action_distribution.detach().numpy()[0], discounted_reward))
        state = new_state
        rollout_reward += (gamma**(steps_to_take-step))*reward

      rollouts.append((rollout_experience, rollout_reward))

    # print('avg standing time:', standing_len / env_samples)

    # Approximate the value function
    value_dataset = AdvantageDataset(rollouts)
    value_loader = DataLoader(
        value_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
    for _ in range(value_epochs):
      for value_data in value_loader:
        policy_optim.zero_grad()
        state, reward, done, action, action_distribution, discounted_reward = value_data
        state_tensor = torch.as_tensor(state).detach().float()
        reward_float = torch.as_tensor(discounted_reward).unsqueeze(1).float()
        expected_ret = value(state_tensor)

        value_loss = value_criterion(expected_ret, reward_float)
        advantage = reward_float - expected_ret.detach()
        current_action_distribution = policy(state_tensor)
        likelihood_new = likelihood_of_getting(action, current_action_distribution)
        likelihood_old = likelihood_of_getting(action, action_distribution)
        if likelihood_new is None or likelihood_old is None:
          print("Problem!")
          continue
        ratio = likelihood_new/likelihood_old
        policy_calc = torch.min(advantage*ratio, torch.clamp(ratio, ppo_neg, ppo_pos)*advantage)
        policy_loss = -torch.mean(policy_calc)
        entropy_loss = -entropy_coef * torch.mean(discrete_entropy(current_action_distribution))
        rewards.append(torch.mean(reward_float).item())
        policy_losses.append(policy_loss.item())
        value_losses.append(value_loss.item())
        entropy_losses.append(entropy_loss.item())
        total_loss = policy_loss + .1*value_loss + entropy_loss
        
        total_loss.backward()
        policy_optim.step()

    loop.set_description('Reward: {}, Value: {}, Policy: {}, Entropy: {}, Average Length: {}'.format(mean(rewards), mean(value_losses), mean(policy_losses), mean(entropy_losses), value_dataset.averageLength()))
    loop.update()
  env.close()
main('cpu')


Reward: 0.14853906407952308, Value: 0.0015484575650979032, Policy: -0.007003044583793131, Entropy: -0.0006931119596369431, Average Length: 22.19:  42%|████▏     | 420/1000 [17:50<16:06,  1.67s/it]

.3257

