# Портирование обучения агента Pong by Andrej Karpathy на агента GrandPrix.
 Отличия:
 sigmoid -> softmax
 разные признаки окончания эпизода.


In [45]:
""" Trains an agent with (stochastic) Policy Gradients on GrandPrix. Uses OpenAI Gym. """
import numpy as np
import pickle
import gym
import gym_grand_prix

# hyperparameters
H = 200 # number of hidden layer neurons
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-4
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
resume = False # resume from previous checkpoint?
render = False

# model initialization
n_rays = 5
n_actions = 5

D = 2 + n_rays # input dimensionality

if resume:
    model = pickle.load(open('model.p', 'rb'))
else:
    model = {}
    model['W1'] = np.random.randn(H, D) / np.sqrt(D) # "Xavier" initialization
    model['W2'] = np.random.randn(n_actions, H) / np.sqrt(H)

grad_buffer = { k : np.zeros_like(v) for k, v in model.items() } # update buffers that add up gradients over a batch
rmsprop_cache = { k : np.zeros_like(v) for k, v in model.items() } # rmsprop memory

def softmax(x):
    s = np.exp(x).sum()
    return np.exp(x) / s

def discount_rewards(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    discounted_r[-1] = r[-1]
    for t in reversed(range(1, len(r))):
        discounted_r[t - 1] = r[t - 1] + gamma*discounted_r[t]
    return discounted_r

def policy_forward(x):
    h = np.dot(model['W1'], x)
    h[h < 0] = 0 # ReLU nonlinearity
    logp = np.dot(model['W2'], h)
    p = softmax(logp)
    return p, h # return probabilies of taking actions 0-4, and hidden state

def policy_backward(epx, eph, epdlogp):
    """ backward pass. (eph is array of intermediate hidden states) """
    dW2 = np.dot(eph.T, epdlogp)
    dh = np.outer(epdlogp, model['W2'])
    dh[eph <= 0] = 0 # backpro prelu
    dW1 = np.dot(dh.T, epx)
    return {'W1':dW1, 'W2':dW2}

In [46]:
env = gym.make("GrandPrix-v0")
track = 3
options = {
    'nrays': n_rays,
    'seed': track, # задает номер трека
}
if render:
    options['display'] = 1
    env.setOptions(options)
    
xs, hs, dlogps, drs, acs = [], [], [], [], []
running_reward = None
n_steps = 200 # число шагов в эпизоде
n_episodes = 200
possible_actions = ((0, 0), (1, .75), (-1, .75), (0, .75), (0, -.75))

for ep in range(1, n_episodes + 1):
    observation = env.reset()
    reward_sum = 0.

    for step in range(n_steps):
        if render: 
            env.render()
        x = observation

        # forward the policy network and sample an action from the returned probability
        aprob, h = policy_forward(x)
        action = np.random.choice([i for i in range(n_actions)], p=aprob) # roll the dice!

        # record various intermediates (needed later for backprop)
        xs.append(x) # observation
        hs.append(h) # hidden state
        acs.append(action)
        dlogps.append(aprob[action]) # grad that encourages the action that was taken to be taken 
                    #(see http://cs231n.github.io/neural-networks-2/#losses if confused)

        # step the environment and get new measurements
        observation, reward, done, info = env.step(possible_actions[action])
        reward_sum += reward

        drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

    # stack together all inputs, hidden states, action gradients, and rewards for this episode
    epx = np.vstack(xs)
    eph = np.vstack(hs)
    epdlogp = np.vstack(dlogps)
    epr = np.vstack(drs)
    xs, hs, dlogps, drs, acs = [], [], [], [], [] # reset array memory

    # compute the discounted reward backwards through time
    discounted_epr = discount_rewards(epr)
    # standardize the rewards to be unit normal (helps control the gradient estimator variance)
    discounted_epr -= np.mean(discounted_epr)
    discounted_epr /= np.std(discounted_epr)

    epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
    grad = policy_backward(epx, eph, epdlogp)
    for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

    # perform rmsprop parameter update every batch_size episodes
    if ep % batch_size == 0:
        for k,v in model.items():
            g = grad_buffer[k] # gradient
            rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
            model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
            grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

    # boring book-keeping
    reward_sum /= n_steps
    running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
    print('ep %d: total %.4f. running mean: %.4f' % (ep, reward_sum, running_reward))
    if ep % 100 == 0: pickle.dump(model, open('model.p', 'wb'))

IndexError: boolean index did not match indexed array along dimension 1; dimension is 1000 but corresponding boolean dimension is 200