<a href="https://colab.research.google.com/github/arbi11/YCBS-277/blob/master/YCBS_277_PG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
"""
Simple policy gradient in Keras

"""
import gym
import numpy as np

from keras import layers
from keras.models import Model
from keras import backend as K
from keras import utils as np_utils
from keras import optimizers


class Agent(object):
    """Policy-gradient (REINFORCE-style) agent for discrete-action Gym tasks.

    The agent owns a small fully connected softmax policy network
    (`self.model`) and a backend training function (`self.train_fn`) that
    applies one Adam step on the policy-gradient loss.
    """

    def __init__(self, input_dim, output_dim, hidden_dims=(32, 32)):
        """Gym Playing Agent

        Args:
            input_dim (int): the dimension of state.
                Same as `env.observation_space.shape[0]`

            output_dim (int): the number of discrete actions
                Same as `env.action_space.n`

            hidden_dims (sequence of int): width of each hidden layer,
                in order from input to output

        Methods:

            private:
                __build_train_fn -> None
                    It creates a train function
                    It's similar to defining `train_op` in Tensorflow
                __build_network -> None
                    It creates a base model
                    Its output is each action probability

            public:
                get_action(state) -> action
                fit(state, action, reward) -> None
        """

        self.input_dim = input_dim
        self.output_dim = output_dim

        self.__build_network(input_dim, output_dim, hidden_dims)
        self.__build_train_fn()

    def __build_network(self, input_dim, output_dim, hidden_dims=(32, 32)):
        """Create the policy network and store it in `self.model`.

        The network is a stack of `Dense` + ReLU layers (one per entry of
        `hidden_dims`) followed by a `Dense(output_dim)` + softmax head, so
        the model output is one probability per action.
        """
        self.X = layers.Input(shape=(input_dim,))
        net = self.X

        for h_dim in hidden_dims:
            net = layers.Dense(h_dim)(net)
            net = layers.Activation("relu")(net)

        net = layers.Dense(output_dim)(net)
        net = layers.Activation("softmax")(net)

        self.model = Model(inputs=self.X, outputs=net)

    def __build_train_fn(self):
        """Create a train function

        It replaces `model.fit(X, y)` because we use the output of model and use it for training.

        For example, we need action placeholder
        called `action_one_hot` that stores, which action we took at state `s`.
        Hence, we can update the same action.

        This function will create
        `self.train_fn([state, action_one_hot, discount_reward])`
        which would train the model.

        The loss is `mean(-log(pi(a|s)) * discounted_reward)`.
        """
        action_prob_placeholder = self.model.output
        action_onehot_placeholder = K.placeholder(shape=(None, self.output_dim),
                                                  name="action_onehot")
        discount_reward_placeholder = K.placeholder(shape=(None,),
                                                    name="discount_reward")

        # Select the probability of the action that was actually taken.
        action_prob = K.sum(action_prob_placeholder * action_onehot_placeholder, axis=1)
        # Clamp away from zero: a softmax output that underflows to 0 would
        # otherwise produce log(0) = -inf and turn the weights into NaN.
        log_action_prob = K.log(K.clip(action_prob, K.epsilon(), 1.0))

        loss = - log_action_prob * discount_reward_placeholder
        loss = K.mean(loss)

        adam = optimizers.Adam()

        updates = adam.get_updates(params=self.model.trainable_weights,
                                   loss=loss)

        # No outputs are requested; the function is called only for the
        # side effect of applying the optimizer updates.
        self.train_fn = K.function(inputs=[self.model.input,
                                           action_onehot_placeholder,
                                           discount_reward_placeholder],
                                   outputs=[],
                                   updates=updates)

    def get_action(self, state):
        """Sample an action for a single `state` from the current policy.

        Args:
            state (1-D or 2-D Array): either a 1-D array of shape
                (state_dimension,) or a 2-D array of shape
                (1, state_dimension). Only one state is supported per call:
                a 2-D array with more than one row does not yield a single
                probability vector and is rejected by the length check
                below (or by `np.random.choice`).

        Returns:
            action: an integer action value ranging from 0 to (n_actions - 1)

        Raises:
            TypeError: if `state` is neither 1-D nor 2-D.
            AssertionError: if the state dimension or the number of
                predicted probabilities does not match the agent's
                configuration.
        """
        shape = state.shape

        if len(shape) == 1:
            assert shape == (self.input_dim,), "{} != {}".format(shape, self.input_dim)
            # The model expects a leading batch axis.
            state = np.expand_dims(state, axis=0)

        elif len(shape) == 2:
            assert shape[1] == self.input_dim, "{} != {}".format(shape, self.input_dim)

        else:
            raise TypeError("Wrong state shape is given: {}".format(state.shape))

        # Drop the batch axis so that we are left with one probability per action.
        action_prob = np.squeeze(self.model.predict(state))
        assert len(action_prob) == self.output_dim, "{} != {}".format(len(action_prob), self.output_dim)
        return np.random.choice(np.arange(self.output_dim), p=action_prob)

    def fit(self, S, A, R):
        """Run one policy-gradient update from a recorded trajectory.

        Args:
            S (2-D Array): `state` array of shape (n_samples, state_dimension)
            A (1-D Array): `action` array of shape (n_samples,)
                It's simply a list of int that stores which actions the agent chose
            R (1-D Array): `reward` array of shape (n_samples,)
                A reward is given after each action.

        The rewards are converted with `compute_discounted_R` before being
        fed to the train function.
        """
        action_onehot = np_utils.to_categorical(A, num_classes=self.output_dim)
        discount_reward = compute_discounted_R(R)

        assert S.shape[1] == self.input_dim, "{} != {}".format(S.shape[1], self.input_dim)
        assert action_onehot.shape[0] == S.shape[0], "{} != {}".format(action_onehot.shape[0], S.shape[0])
        assert action_onehot.shape[1] == self.output_dim, "{} != {}".format(action_onehot.shape[1], self.output_dim)
        assert len(discount_reward.shape) == 1, "{} != 1".format(len(discount_reward.shape))

        self.train_fn([S, action_onehot, discount_reward])


def compute_discounted_R(R, discount_rate=.99):
    """Returns normalized discounted rewards

    For every time step `t` the discounted return
    `R[t] + discount_rate * R[t+1] + discount_rate**2 * R[t+2] + ...`
    is computed; the returns are then standardized to zero mean and unit
    standard deviation.

    Args:
        R (1-D array): a list of `reward` at each time step
        discount_rate (float): Will discount the future value by this rate

    Returns:
        discounted_r (1-D float32 array): same shape as input `R`
            but the values are discounted and normalized. If all returns
            are equal (e.g. a one-step episode) the result is all zeros
            rather than NaN. An empty input yields an empty array.

    Examples:
        >>> R = [1, 1, 1]
        >>> # discounted returns before normalization:
        >>> # [1 + 0.99 + 0.99**2, 1 + 0.99, 1]
        >>> compute_discounted_R(R, .99).round(2).tolist()
        [1.22, 0.01, -1.23]
    """
    discounted_r = np.zeros_like(R, dtype=np.float32)
    if discounted_r.size == 0:
        # Nothing to discount; avoids mean()/std() of an empty array.
        return discounted_r

    # Accumulate from the last step backwards so each return reuses the next one.
    running_add = 0
    for t in reversed(range(len(R))):
        running_add = running_add * discount_rate + R[t]
        discounted_r[t] = running_add

    # Standardize: (x - mean) / std. The two steps are kept separate so the
    # division applies to the centered values, not to the mean alone.
    discounted_r -= discounted_r.mean()
    std = discounted_r.std()
    if std > np.finfo(np.float32).eps:
        discounted_r /= std

    return discounted_r


def run_episode(env, agent):
    """Play one full episode, then train the agent on it.

    The agent picks every action via `agent.get_action`; each visited
    state, chosen action and received reward is recorded. Once the
    environment reports the episode as done, the recorded trajectory is
    passed to `agent.fit` as three NumPy arrays.

    Args:
        env (gym.env): Gym environment
        agent (Agent): Game Playing Agent

    Returns:
        total_reward (int): total reward earned during the whole episode
    """
    states, actions, rewards = [], [], []
    episode_return = 0

    observation = env.reset()
    finished = False

    while not finished:
        chosen = agent.get_action(observation)
        next_observation, reward, finished, _ = env.step(chosen)

        # Record the transition from the state the action was taken in.
        states.append(observation)
        actions.append(chosen)
        rewards.append(reward)
        episode_return += reward

        observation = next_observation

    # The loop only exits once the episode is done, so train exactly once here.
    agent.fit(np.array(states), np.array(actions), np.array(rewards))

    return episode_return


def main(env_name="CartPole-v0", n_episodes=2000, hidden_dims=(16, 16)):
    """Train an `Agent` on a Gym environment and print each episode's reward.

    Args:
        env_name (str): id passed to `gym.make`.
        n_episodes (int): number of episodes to play (one training step each).
        hidden_dims (sequence of int): hidden layer sizes of the policy network.
    """
    # Create the environment before entering `try`: if `gym.make` fails there
    # is nothing to close, and the original error must not be masked by a
    # NameError on an unbound `env` in the `finally` clause.
    env = gym.make(env_name)
    try:
        input_dim = env.observation_space.shape[0]
        output_dim = env.action_space.n
        agent = Agent(input_dim, output_dim, list(hidden_dims))

        for episode in range(n_episodes):
            reward = run_episode(env, agent)
            print(episode, reward)

    finally:
        env.close()


# Run the training loop only when executed as a script / notebook cell.
if __name__ == '__main__':
    main()













0 14.0
1 21.0
2 10.0
3 36.0
4 12.0
5 10.0
6 21.0
7 14.0
8 35.0
9 35.0
10 26.0
11 41.0
12 48.0
13 51.0
14 41.0
15 13.0
16 20.0
17 14.0
18 19.0
19 22.0
20 10.0
21 73.0
22 11.0
23 80.0
24 16.0
25 19.0
26 36.0
27 58.0
28 13.0
29 10.0
30 34.0
31 21.0
32 20.0
33 11.0
34 18.0
35 18.0
36 16.0
37 33.0
38 15.0
39 23.0
40 57.0
41 16.0
42 30.0
43 34.0
44 23.0
45 29.0
46 12.0
47 43.0
48 32.0
49 34.0
50 30.0
51 68.0
52 40.0
53 23.0
54 42.0
55 18.0
56 13.0
57 20.0
58 10.0
59 21.0
60 27.0
61 24.0
62 11.0
63 16.0
64 19.0
65 18.0
66 16.0
67 15.0
68 15.0
69 26.0
70 33.0
71 16.0
72 29.0
73 31.0
74 15.0
75 39.0
76 34.0
77 51.0
78 17.0
79 23.0
80 9.0
81 25.0
82 23.0
83 25.0
84 22.0
85 49.0
86 17.0
87 46.0
88 15.0
89 32.0
90 28.0
91 27.0
92 21.0
93 12.0
94 22.0
95 12.0
96 14.0
97 14.0
98 15.0
99 15.0
100 13.0
101 29.0
102 18.0
103 41.0
104 21.0
105 31.0
106 20.0
107 19.0
108 43.0
109 109.0
110 21.0
111 43.0
112 32.0
113 37.0
114 47.0
115 29.0
116 11.0
117 24.0
118 64.0
119 94.0
120 36.0
121 19.0
122 3

In [0]:
import keras

In [3]:
keras.__version__

'2.2.5'