In [None]:
"""
Given an episode :

1. Get first state --> Forward pass through nn and get an action 
2. Remember state, action, probability and reward for the action
3. Get next state and so on ...
4. At end --> you have a list of all states, actions and corresponding rewards + gradients
Note : gradient = one_hot(action) - probs, e.g. [1 0 0 0 0 0] - [0.8 0.1 0.1 0 0 0] = [0.2 -0.1 -0.1 0 0 0]

END OF EPISODE
# Calculate the discounted rewards (a list with one value per step)
# gradients = gradients * discounted_rewards
# gradients.shape = [episode length, action_size]
  for each step, 6 gradients are calculated, corresponding to the 6 actions. The policy network is updated to favor
  the actions with the higher (reward-weighted) gradients
# Y = agent.probs + agent.learning_rate * np.squeeze(np.vstack([gradients])) 



"""

In [1]:
import gym
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Reshape, Flatten
from keras.optimizers import Adam
from keras.layers.convolutional import Convolution2D


class PGAgent:
    """Policy-gradient agent that drives a small Keras CNN policy.

    During an episode the agent records, per step, the input state, the
    network's action probabilities, the reward, and the "gradient"
    ``one_hot(action) - prob``. ``train`` then weights those gradients by the
    normalised discounted rewards and fits the network once on the whole
    episode.

    Attributes:
        state_size: length of the flat state vector fed to the network.
        action_size: number of discrete actions (size of the softmax output).
        gamma: reward discount factor.
        learning_rate: used both by the Adam optimiser and as the step size
            when building the training targets in ``train``.
        states, gradients, rewards, probs: per-step episode buffers, cleared
            after every successful ``train`` call.
        model: the compiled Keras model.
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.99
        self.learning_rate = 0.001
        self.states = []
        self.gradients = []
        self.rewards = []
        self.probs = []
        self.model = self._build_model()
        # Prints the layer table to stdout as a construction-time side effect.
        self.model.summary()

    def _build_model(self):
        """Build and compile the policy network.

        The flat input is reshaped to an 80x80 single-channel image, so
        ``state_size`` must equal 6400 for the Reshape layer to be valid.
        Keras 2 keyword names are used (``strides``/``padding``/
        ``kernel_initializer``) instead of the deprecated Keras 1 spellings.

        Returns:
            The compiled ``Sequential`` model with a softmax output of
            ``action_size`` units.
        """
        model = Sequential()
        model.add(Reshape((80, 80, 1), input_shape=(self.state_size,)))
        model.add(Convolution2D(32, (6, 6), strides=(3, 3), padding='same',
                                activation='relu',
                                kernel_initializer='he_uniform'))
        model.add(Flatten())
        model.add(Dense(64, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='softmax'))
        opt = Adam(lr=self.learning_rate)
        model.compile(loss='categorical_crossentropy', optimizer=opt)
        return model

    def remember(self, state, action, prob, reward):
        """Store one step of experience.

        For a softmax policy, ``one_hot(action) - prob`` is the gradient of
        ``log pi(action)`` with respect to the pre-softmax logits, which is
        why the difference is stored under the name "gradient".

        Args:
            state: the network input used for this step.
            action: index of the action that was taken.
            prob: action probabilities the action was sampled from.
            reward: reward received after taking ``action``.
        """
        y = np.zeros([self.action_size])
        y[action] = 1
        self.gradients.append(y.astype('float32') - prob)
        self.states.append(state)
        self.rewards.append(reward)

    def act(self, state):
        """Sample an action from the current policy.

        Also appends the raw network output to ``self.probs`` so that
        ``train`` can build its targets from it.

        Args:
            state: 1-D state vector.

        Returns:
            ``(action, prob)``: the sampled action index and the
            (renormalised) probability vector it was drawn from.
        """
        state = state.reshape([1, state.shape[0]])  # 1-D vector -> batch of one
        aprob = self.model.predict(state, batch_size=1).flatten()
        self.probs.append(aprob)
        # Defensive renormalisation: the softmax output should already sum
        # to ~1, this only guards against rounding drift.
        prob = aprob / np.sum(aprob)
        action = np.random.choice(self.action_size, 1, p=prob)[0]
        return action, prob

    def discount_rewards(self, rewards):
        """Compute discounted returns, walking the episode backwards.

        A non-zero reward resets the running sum, i.e. it is treated as a
        boundary (presumably a point being scored in Pong - confirm if the
        agent is reused for another environment).

        Args:
            rewards: array-like of per-step rewards, 1-D or a column vector.

        Returns:
            Float array with the same shape as ``rewards``.
        """
        # Always work in floating point: with integer rewards the original
        # ``zeros_like`` buffer silently truncated every discounted value.
        rewards = np.asarray(rewards, dtype=np.float64)
        flat = rewards.ravel()
        discounted = np.zeros_like(flat)
        running_add = 0.0
        for t in reversed(range(flat.size)):
            if flat[t] != 0:
                running_add = 0.0
            running_add = running_add * self.gamma + flat[t]
            discounted[t] = running_add
        return discounted.reshape(rewards.shape)

    def train(self):
        """Fit the policy on the recorded episode and clear the buffers.

        Does nothing when no steps have been recorded.
        """
        if not self.states:
            return
        gradients = np.vstack(self.gradients)
        rewards = self.discount_rewards(np.vstack(self.rewards))
        # Standardise the returns. The previous expression,
        # ``rewards / np.std(rewards - np.mean(rewards))``, never centred them
        # (the subtraction inside ``std`` has no effect) and produced NaN/inf
        # whenever the standard deviation was zero.
        rewards = rewards - np.mean(rewards)
        std = np.std(rewards)
        if std > 0:
            rewards = rewards / std
        gradients = gradients * rewards  # (T, action_size) * (T, 1)
        # No np.squeeze here: it would collapse a one-step episode to 1-D.
        X = np.vstack(self.states)
        Y = np.vstack(self.probs) + self.learning_rate * gradients
        self.model.train_on_batch(X, Y)
        self.states, self.probs, self.gradients, self.rewards = [], [], [], []

    def load(self, name):
        """Load model weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save model weights to the file ``name``."""
        self.model.save_weights(name)

def preprocess(I):
    """Turn a raw frame into a flat binary float vector.

    Rows 35..194 are kept, rows and columns are downsampled by a factor of
    two and only channel 0 is used. Pixels equal to 144 or 109 are zeroed
    (presumably the two background colours - confirm against the
    environment); every other non-zero pixel becomes 1.

    The caller's array is left untouched: the masks are applied to a copy,
    whereas writing through the slice views used to modify the frame in place.

    Args:
        I: image array indexable as ``I[row, col, channel]``; a 210x160x3
            frame yields 80 * 80 = 6400 values.

    Returns:
        1-D ``float64`` array of zeros and ones.
    """
    frame = I[35:195:2, ::2, 0].copy()
    frame[frame == 144] = 0
    frame[frame == 109] = 0
    frame[frame != 0] = 1
    # ``np.float`` was an alias of the builtin float (float64) and has been
    # removed from NumPy; name the dtype explicitly.
    return frame.astype(np.float64).ravel()



  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Create the Pong environment and fetch the first raw frame.
env = gym.make("Pong-v0")
state = env.reset()

# Per-episode bookkeeping: previous preprocessed frame, running score,
# and episode counter.
prev_x = None
score, episode = 0, 0

# Network dimensions: a flattened 80x80 frame in, one output per action.
action_size = env.action_space.n
state_size = 80 * 80
agent = PGAgent(state_size, action_size)
# Optionally resume from saved weights:
# agent.load('pong.h5')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_1 (Reshape)          (None, 80, 80, 1)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 27, 27, 32)        1184      
_________________________________________________________________
flatten_1 (Flatten)          (None, 23328)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                1493056   
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 198       
Total params: 1,496,518
Trainable params: 1,496,518
Non-trainable params: 0
_________________________________________________________________


  def __init__(self):
  self.logger.info('Logger initialized for {}'.format(self.__class__.__name__))
  


In [3]:
# Play a single episode, recording every step in the agent's buffers.
done = False
while not done:
    # Uncomment to watch the game: env.render()

    # The network input is the difference between consecutive preprocessed
    # frames; the very first step has no predecessor and uses zeros.
    cur_x = preprocess(state)
    if prev_x is None:
        x = np.zeros(state_size)
    else:
        x = cur_x - prev_x
    prev_x = cur_x

    action, prob = agent.act(x)
    state, reward, done, info = env.step(action)
    score += reward
    agent.remember(x, action, prob, reward)
        