In [1]:
import gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import random

In [2]:
# Create the CartPole environment used by every cell below.
# NOTE(review): taking `.env` on the object returned by gym.make presumably
# strips gym's outer wrapper (e.g. the built-in episode step limit) so that
# episode length is controlled by this notebook's own t_max -- confirm for the
# installed gym version.
env = gym.make("CartPole-v0").env
# Reset once and print what reset() returns as a quick sanity check
# (the recorded cell output shows a 4-element array).
print(env.reset())

# Build a simple neural network that predicts policy logits and a state value.
# Keep it simple: CartPole isn't worth deep architectures.
class Net(nn.Module):
    """
    Small actor-critic network: a shared two-layer ReLU trunk with two linear heads.

    :param n_states: size of one observation vector (default 4)
    :param n_actions: number of logits produced by the actor head (default 2)
    :param hidden_sizes: widths of the two shared hidden layers (default (200, 100))

    The defaults reproduce the previously hard-coded 4 -> 200 -> 100 -> {2, 1} layout.
    """

    def __init__(self, n_states=4, n_actions=2, hidden_sizes=(200, 100)):
        super(Net, self).__init__()
        first_hidden, second_hidden = hidden_sizes
        # Attribute names and creation order are kept unchanged so existing
        # code that touches fc1/fc2/actor/critic keeps working.
        self.fc1 = nn.Linear(n_states, first_hidden)
        self.fc2 = nn.Linear(first_hidden, second_hidden)
        self.actor = nn.Linear(second_hidden, n_actions)
        self.critic = nn.Linear(second_hidden, 1)

    def forward(self, x):
        """
        :param x: float tensor of shape [batch, n_states]
        :returns: tuple (logits, value) with shapes [batch, n_actions] and [batch, 1];
                  logits are unnormalised (no softmax is applied here)
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        action = self.actor(x)
        value = self.critic(x)
        return action, value

[-0.00048518  0.04345835 -0.00324085 -0.02986179]


In [3]:
def predict_probs(states):
    """
    Turn a batch of states into action probabilities.

    Feeds the states through the module-level `network`, keeps only the policy
    logits (the value output is ignored) and normalises them with a softmax.
    :param states: array-like of shape [batch, state_shape]
    :returns: numpy array of shape [batch, n_actions]
    """
    batch = torch.tensor(states, dtype=torch.float32)
    policy_logits, _unused_value = network(batch)
    probabilities = F.softmax(policy_logits, dim=1)
    # cut the result out of the autograd graph before handing it to numpy
    return probabilities.detach().numpy()


def generate_session(t_max=50):
    """
    Play a single episode in `env` with the current policy and record it.

    At every step an action is sampled from the probabilities returned by
    `predict_probs`. The episode ends when the environment reports `done`
    or after `t_max` steps, whichever comes first. No training happens here.
    :param t_max: maximum number of environment steps to take
    :returns: tuple (states, actions, rewards) of lists with one entry per step taken
    """
    state_log, action_log, reward_log = [], [], []
    current_state = env.reset()

    steps_taken = 0
    finished = False
    while steps_taken < t_max and not finished:
        # pi(a|s) for the current state; the batch dimension is added and removed again
        policy = predict_probs(np.array([current_state]))[0]
        chosen = np.random.choice(2, p=policy)
        next_state, reward, finished, _info = env.step(chosen)

        # store the transition under the state the action was taken in
        state_log.append(current_state)
        action_log.append(chosen)
        reward_log.append(reward)

        current_state = next_state
        steps_taken += 1

    return state_log, action_log, reward_log


In [4]:
def get_cumulative_rewards(rewards,  # rewards at each step
                           gamma=0.99  # discount for reward
                           ):
    """
    Compute discounted cumulative returns for one session.

    G_t = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...

    The returns are built in a single backward pass using the recurrence
    G_t = r_t + gamma*G_{t+1}, with the return after the final step taken as 0.

    :param rewards: sequence of immediate rewards r(s,a), one per step
    :param gamma: discount factor
    :returns: numpy float array with as many elements as `rewards`
              (an empty array for an empty session)
    """
    num = len(rewards)
    returns = np.zeros(num)

    running = 0.0  # G_{t+1}; zero beyond the end of the session
    for t in reversed(range(num)):
        running = rewards[t] + gamma * running
        returns[t] = running

    return returns

def to_one_hot(y_tensor, ndims):
    """ helper: expand a vector of integer class indices into a float 1-hot matrix of shape [len, ndims]. """
    # scatter_ needs int64 indices laid out as a single column
    column_index = y_tensor.type(torch.LongTensor).view(-1, 1)
    n_rows = column_index.size()[0]
    one_hot = torch.zeros(n_rows, ndims)
    # write a 1 into the column named by each row's index
    one_hot.scatter_(1, column_index, 1)
    return one_hot


In [5]:
def train_on_session(states, actions, rewards, gamma=0.99, entropy_coef=1e-2):
    """
    Run one actor-critic update on a session produced by generate_session.

    The loss is the sum of three terms:
      * policy term:  -mean(log pi(a_t|s_t) * (G_t - V(s_t))), with the advantage
        treated as a constant;
      * entropy term: entropy_coef * mean(p * log p), which penalises low-entropy policies;
      * value term:   smooth L1 loss between V(s_t) and the discounted return G_t.

    Uses the module-level `network` and `optimizer`.

    :param states: list of per-step states
    :param actions: list of per-step integer actions
    :param rewards: list of per-step rewards
    :param gamma: discount factor passed to get_cumulative_rewards
    :param entropy_coef: weight of the entropy regularisation term
    :returns: undiscounted sum of the session rewards (for logging)
    """

    # cast everything into torch tensors (states go through one stacked numpy array)
    states = torch.tensor(np.array(states), dtype=torch.float32)
    actions = torch.tensor(actions, dtype=torch.int32)
    cumulative_returns = np.array(get_cumulative_rewards(rewards, gamma))
    cumulative_returns = torch.tensor(cumulative_returns, dtype=torch.float32)

    # predict logits, probas and log-probas using an agent.
    logits, value = network(states)
    # The critic head returns [T, 1]. Drop only the last axis: leaving it in place
    # would broadcast against the [T] returns into a [T, T] matrix, and a bare
    # squeeze() would collapse a one-step session to a 0-d tensor.
    value = value.squeeze(-1)
    probs = F.softmax(logits, -1)
    log_probs = F.log_softmax(logits, -1)

    assert all(isinstance(v, torch.Tensor) for v in [logits, probs, log_probs]), \
        "please use compute using torch tensors and don't use predict_probs function"

    # select log-probabilities for chosen actions, log pi(a_i|s_i)
    log_probs_for_actions = torch.sum(
        log_probs * to_one_hot(actions, env.action_space.n), dim=1)

    # Advantage = return minus critic baseline. It is detached so the policy term
    # only updates the actor; the critic learns from the value loss alone.
    advantage = (cumulative_returns - value).detach()
    policy_loss = -torch.mean(log_probs_for_actions * advantage)

    # mean(p * log p) over all entries is the *negative* entropy (scaled by
    # 1/n_actions), so adding it to the loss rewards more exploratory policies.
    neg_entropy = torch.mean(probs * log_probs)

    # smooth_l1_loss already averages over the batch
    value_loss = F.smooth_l1_loss(value, cumulative_returns)

    loss = policy_loss + entropy_coef * neg_entropy + value_loss

    # Gradient descent step
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # technical: return session rewards to print them later
    return np.sum(rewards)

In [None]:

# Training setup: per-session step cap, a fresh network and its Adam optimizer.
t_max = 500
network = Net()
optimizer = torch.optim.Adam(network.parameters(), 1e-3)

# Up to 100 epochs; every epoch plays 100 sessions and trains after each one.
for i in range(100):
    rewards = []
    for episode in range(100):
        session = generate_session(t_max)  # generate new sessions
        rewards.append(train_on_session(*session))

    mean_reward = np.mean(rewards)
    print("mean reward:%.3f" % mean_reward)
    # stop early once the average session lasts for 95% of the step cap
    if mean_reward >= 0.95*t_max:
        print("You Win!")  # but you can train even further
        break

In [24]:
# Roll out one episode with the trained policy, rendering every step,
# and print the index of the last step that was played.
s = env.reset()

try:
    for t in range(1000):
        action_probs = predict_probs(np.array([s]))[0]
        a = np.random.choice(2, p = action_probs)
        s, r, done, info = env.step(a)
        env.render()
        if done:
            break
    print(t)
finally:
    # always release the render window, even if the rollout raises or is interrupted
    env.close()


999
