In [1]:
import gym
import numpy as np

from gym.wrappers import AtariPreprocessing
# NOTE(review): this only assigns an attribute named `new_step_api` on the imported
# gym module object. The functions in this cell still unpack four values from
# env.step() and a single observation from env.reset(), so confirm whether this
# line has any effect on the environment created below.
gym.new_step_api=True
# Pong environment instance; it is passed to train_model() in the next cell.
env = gym.make('Pong-v0')

# --- network dimensions ---
H = 1600  # width of the hidden layer (number of neurons)
D = 80 * 80  # length of one flattened, preprocessed 80x80 frame

# Policy weights, each scaled by 1/sqrt(fan_in) ("Xavier" initialization).
# W1 is drawn before W2, so the random stream is consumed in that order.
model = {
  'W1': np.random.randn(H, D) / np.sqrt(D),
  'W2': np.random.randn(H) / np.sqrt(H),
}

# --- hyperparameters ---
batch_size = 10  # number of episodes between parameter updates
learning_rate = 1e-4
gamma = 0.99  # discount factor applied to rewards
decay_rate = 0.99  # decay factor for RMSProp's leaky sum of grad^2

# Zero-filled accumulators, one per weight array and with the same shape.
grad_buffer = {name: np.zeros_like(weights) for name, weights in model.items()}  # gradients summed over a batch
rmsprop_cache = {name: np.zeros_like(weights) for name, weights in model.items()}  # RMSProp memory

def sigmoid(x):
  """Logistic "squashing" function: map ``x`` into the interval [0, 1]."""
  denominator = 1.0 + np.exp(-x)
  return 1.0 / denominator

def prepro(I):
  """Turn a raw frame into a flat binary float vector.

  The frame is cropped to rows 35..194, every second row and column is kept,
  and only channel 0 is used. Pixels equal to 144 or 109 (the two background
  shades) or 0 become 0.0; every other pixel (paddles, ball) becomes 1.0.

  The result is built from a boolean mask instead of assigning into slices of
  the input: those slices are views, so in-place writes would alter the
  caller's frame and would fail on read-only arrays.

  Args:
    I: array-like frame indexable as [row, column, channel].

  Returns:
    1-D float array of zeros and ones (length 6400 for a 210x160x3 frame).
  """
  frame = np.asarray(I)[35:195:2, ::2, 0]  # crop + downsample by 2 + channel 0
  foreground = (frame != 144) & (frame != 109) & (frame != 0)
  return foreground.astype(float).ravel()

def discount_rewards(r, discount=None):
  """Compute discounted returns for a sequence of rewards.

  Walks the rewards backwards in time, accumulating
  ``running = running * discount + reward``. The running sum is reset whenever
  a non-zero reward is met, because in Pong a non-zero reward marks a game
  boundary.

  The output always has a floating dtype. Allocating it with the input's dtype
  would silently truncate the discounted values when rewards are integers.

  Args:
    r: array-like of rewards, ordered in time. Any shape is accepted; elements
      are processed in flattened order and the input shape is restored.
    discount: discount factor; when None the module-level ``gamma`` is used.

  Returns:
    Float array of discounted returns with the same shape as ``r``.
  """
  if discount is None:
    discount = gamma
  rewards = np.asarray(r)
  flat = rewards.ravel()
  # Keep an existing float dtype; promote anything else to float64.
  out_dtype = rewards.dtype if np.issubdtype(rewards.dtype, np.floating) else np.float64
  discounted = np.zeros(flat.shape, dtype=out_dtype)
  running_add = 0.0
  for t in range(flat.size - 1, -1, -1):
    if flat[t] != 0:
      running_add = 0.0  # game boundary (pong specific!): restart the sum
    running_add = running_add * discount + flat[t]
    discounted[t] = running_add
  return discounted.reshape(rewards.shape)

def policy_forward(x):
  """Forward pass of the two-layer policy network for one input vector.

  The weights are read from the module-level ``model`` dict ('W1' and 'W2').

  Returns:
    (p, h): the probability of taking action 2, and the hidden-layer
    activations after the ReLU.
  """
  hidden = model['W1'].dot(x)
  negative = hidden < 0
  hidden[negative] = 0  # ReLU nonlinearity
  logit = model['W2'].dot(hidden)
  return sigmoid(logit), hidden

def policy_backward(epx, eph, epdlogp):
  """Backward pass: gradients of both weight arrays for one episode.

  Args:
    epx: inputs of the episode, one row per step.
    eph: hidden states recorded during the forward passes, one row per step.
    epdlogp: per-step gradient signal on the network output (train_model
      passes it already multiplied by the advantage).

  Returns:
    Dict with keys 'W1' and 'W2' holding the corresponding gradients. 'W2' is
    read from the module-level ``model`` to backpropagate into the hidden layer.
  """
  grad_w2 = eph.T.dot(epdlogp).ravel()
  grad_hidden = np.outer(epdlogp, model['W2'])
  inactive = eph <= 0
  grad_hidden[inactive] = 0  # backprop through the ReLU: no gradient where the unit was off
  grad_w1 = grad_hidden.T.dot(epx)
  return {'W1': grad_w1, 'W2': grad_w2}

def model_step(model, observation, prev_x):
  """Pick the next action for one observation.

  The network input is the difference between the current preprocessed frame
  and the previous one, or all zeros when there is no previous frame.

  Note: ``model`` is accepted but not forwarded; policy_forward() is called
  with the input only. The action choice is deterministic: 2 when the returned
  probability is at least 0.5, otherwise 3.

  Returns:
    (action, cur_x): the chosen action and the current preprocessed frame, to
    be passed back as ``prev_x`` on the next call.
  """
  cur_x = prepro(observation)
  if prev_x is None:
    x = np.zeros(D)
  else:
    x = cur_x - prev_x

  # forward the policy network and take the more likely action
  aprob, _ = policy_forward(x)
  if aprob >= 0.5:
    action = 2
  else:
    action = 3

  return action, cur_x

def play_game(env, model, max_steps=1000):
  """Play one episode with the current policy and show it as an animation.

  Each step renders a frame, asks model_step() for an action and advances the
  environment. Exactly one summary line is printed: the "finished after"
  message when the environment reports ``done``, or the "without success"
  message when ``max_steps`` steps elapse first. Previously the second message
  was printed in both cases.

  Args:
    env: environment exposing reset(), render(mode=...), step() and close();
      step() must return (observation, reward, done, info).
    model: passed through to model_step().
    max_steps: maximum number of steps to play (default 1000).
  """
  observation = env.reset()

  frames = []
  cumulated_reward = 0

  prev_x = None # used in computing the difference frame

  for t in range(max_steps):
    frames.append(env.render(mode = 'rgb_array'))
    action, prev_x = model_step(model, observation, prev_x)
    observation, reward, done, info = env.step(action)
    cumulated_reward += reward
    if done:
      print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
      break
  else:
    # Runs only when the loop was not left through `break`, i.e. the step
    # budget ran out before the episode ended.
    print("Episode finished without success, accumulated reward = {}".format(cumulated_reward))
  env.close()
  # NOTE(review): display_frames_as_gif is not defined in the visible part of
  # this notebook; it must be defined elsewhere before this is called.
  display_frames_as_gif(frames)

def train_model(env, model, total_episodes = 100):
  """Train the policy network with policy gradients and RMSProp.

  Plays episodes in ``env``, sampling actions 2 or 3 from the policy. After
  each episode the per-step gradients are weighted by the standardized
  discounted rewards and accumulated; every ``batch_size`` episodes the
  accumulated gradient is applied to ``model`` with an RMSProp step.

  Besides ``model``, this relies on module-level state: ``grad_buffer``,
  ``rmsprop_cache``, ``batch_size``, ``learning_rate``, ``decay_rate`` and
  ``D``. policy_forward() and policy_backward() are called without ``model``,
  so the dict passed here should be the module-level ``model`` for the
  forward/backward passes and the update to refer to the same weights.

  Args:
    env: environment exposing reset() and step(); step() must return
      (observation, reward, done, info).
    model: dict of weight arrays, updated in place.
    total_episodes: number of episodes to play. A non-positive value returns
      immediately with an empty history instead of looping forever.

  Returns:
    List of (episode_number, reward_sum, running_reward) tuples, one per
    finished episode.
  """
  hist = []
  if total_episodes <= 0:
    # Nothing to train; the loop below would otherwise never reach its exit.
    return hist
  observation = env.reset()

  prev_x = None # used in computing the difference frame
  xs,hs,dlogps,drs = [],[],[],[]
  running_reward = None
  reward_sum = 0
  episode_number = 0

  while True:

    # network input is the difference between consecutive preprocessed frames
    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    prev_x = cur_x

    # forward the policy network and sample an action from the returned probability
    aprob, h = policy_forward(x)
    action = 2 if np.random.uniform() < aprob else 3 # roll the dice!

    # record various intermediates (needed later for backprop)
    xs.append(x) # observation
    hs.append(h) # hidden state
    y = 1 if action == 2 else 0 # a "fake label"
    dlogps.append(y - aprob) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

    if done: # an episode finished
      episode_number += 1

      # stack together all inputs, hidden states, action gradients, and rewards for this episode
      epx = np.vstack(xs)
      eph = np.vstack(hs)
      epdlogp = np.vstack(dlogps)
      epr = np.vstack(drs)
      xs,hs,dlogps,drs = [],[],[],[] # reset array memory

      # compute the discounted reward backwards through time
      discounted_epr = discount_rewards(epr)
      # standardize the rewards to be unit normal (helps control the gradient estimator variance)
      discounted_epr -= np.mean(discounted_epr)
      reward_std = np.std(discounted_epr)
      if reward_std > 0:
        # With zero spread the centered rewards are all zero; dividing would
        # produce NaNs that propagate into the weights.
        discounted_epr /= reward_std

      epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
      grad = policy_backward(epx, eph, epdlogp)
      for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

      # perform rmsprop parameter update every batch_size episodes
      if episode_number % batch_size == 0:
        for k,v in model.items():
          g = grad_buffer[k] # gradient
          rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
          model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
          grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

      # boring book-keeping
      running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
      hist.append((episode_number, reward_sum, running_reward))
      print ('episode %f, reward total was %f. running mean: %f' % (episode_number, reward_sum, running_reward))
      reward_sum = 0
      observation = env.reset() # reset env
      prev_x = None
      # `>=` rather than `==` so a non-integer target still terminates
      if episode_number >= total_episodes:
        return hist

   
    

  logger.warn(
  deprecation(
  deprecation(


In [None]:
# Train for 5500 episodes; hist1 receives the per-episode history list returned
# by train_model, and %time reports how long the call took.
%time hist1 = train_model(env, model, total_episodes=5500)

  logger.deprecation(


episode 1.000000, reward total was -21.000000. running mean: -21.000000
episode 2.000000, reward total was -20.000000. running mean: -20.990000
episode 3.000000, reward total was -21.000000. running mean: -20.990100
episode 4.000000, reward total was -20.000000. running mean: -20.980199
episode 5.000000, reward total was -21.000000. running mean: -20.980397
episode 6.000000, reward total was -21.000000. running mean: -20.980593
episode 7.000000, reward total was -19.000000. running mean: -20.960787
episode 8.000000, reward total was -21.000000. running mean: -20.961179
episode 9.000000, reward total was -18.000000. running mean: -20.931567
episode 10.000000, reward total was -21.000000. running mean: -20.932252
episode 11.000000, reward total was -21.000000. running mean: -20.932929
episode 12.000000, reward total was -20.000000. running mean: -20.923600
episode 13.000000, reward total was -19.000000. running mean: -20.904364
episode 14.000000, reward total was -17.000000. running mean

episode 114.000000, reward total was -21.000000. running mean: -20.558624
episode 115.000000, reward total was -20.000000. running mean: -20.553037
episode 116.000000, reward total was -20.000000. running mean: -20.547507
episode 117.000000, reward total was -19.000000. running mean: -20.532032
episode 118.000000, reward total was -21.000000. running mean: -20.536712
episode 119.000000, reward total was -20.000000. running mean: -20.531345
episode 120.000000, reward total was -21.000000. running mean: -20.536031
episode 121.000000, reward total was -20.000000. running mean: -20.530671
episode 122.000000, reward total was -21.000000. running mean: -20.535364
episode 123.000000, reward total was -21.000000. running mean: -20.540010
episode 124.000000, reward total was -20.000000. running mean: -20.534610
episode 125.000000, reward total was -20.000000. running mean: -20.529264
episode 126.000000, reward total was -21.000000. running mean: -20.533972
episode 127.000000, reward total was -

episode 225.000000, reward total was -20.000000. running mean: -20.301828
episode 226.000000, reward total was -19.000000. running mean: -20.288810
episode 227.000000, reward total was -20.000000. running mean: -20.285922
episode 228.000000, reward total was -21.000000. running mean: -20.293063
episode 229.000000, reward total was -20.000000. running mean: -20.290132
episode 230.000000, reward total was -21.000000. running mean: -20.297231
episode 231.000000, reward total was -20.000000. running mean: -20.294259
episode 232.000000, reward total was -20.000000. running mean: -20.291316
episode 233.000000, reward total was -21.000000. running mean: -20.298403
episode 234.000000, reward total was -21.000000. running mean: -20.305419
episode 235.000000, reward total was -21.000000. running mean: -20.312365
episode 236.000000, reward total was -19.000000. running mean: -20.299241
episode 237.000000, reward total was -19.000000. running mean: -20.286249
episode 238.000000, reward total was -

episode 336.000000, reward total was -20.000000. running mean: -20.263320
episode 337.000000, reward total was -21.000000. running mean: -20.270686
episode 338.000000, reward total was -19.000000. running mean: -20.257980
episode 339.000000, reward total was -20.000000. running mean: -20.255400
episode 340.000000, reward total was -18.000000. running mean: -20.232846
episode 341.000000, reward total was -20.000000. running mean: -20.230517
episode 342.000000, reward total was -18.000000. running mean: -20.208212
episode 343.000000, reward total was -20.000000. running mean: -20.206130
episode 344.000000, reward total was -19.000000. running mean: -20.194069
episode 345.000000, reward total was -21.000000. running mean: -20.202128
episode 346.000000, reward total was -21.000000. running mean: -20.210107
episode 347.000000, reward total was -19.000000. running mean: -20.198006
episode 348.000000, reward total was -21.000000. running mean: -20.206026
episode 349.000000, reward total was -

episode 447.000000, reward total was -21.000000. running mean: -20.293390
episode 448.000000, reward total was -21.000000. running mean: -20.300456
episode 449.000000, reward total was -20.000000. running mean: -20.297451
episode 450.000000, reward total was -21.000000. running mean: -20.304477
episode 451.000000, reward total was -21.000000. running mean: -20.311432
episode 452.000000, reward total was -21.000000. running mean: -20.318318
episode 453.000000, reward total was -21.000000. running mean: -20.325135
episode 454.000000, reward total was -20.000000. running mean: -20.321883
episode 455.000000, reward total was -20.000000. running mean: -20.318664
episode 456.000000, reward total was -21.000000. running mean: -20.325478
episode 457.000000, reward total was -20.000000. running mean: -20.322223
episode 458.000000, reward total was -21.000000. running mean: -20.329001
episode 459.000000, reward total was -21.000000. running mean: -20.335711
episode 460.000000, reward total was -

episode 558.000000, reward total was -20.000000. running mean: -20.275110
episode 559.000000, reward total was -21.000000. running mean: -20.282359
