In [None]:
import keras
import numpy as np
from keras.models import Model
from keras.layers import Dense, Input
import keras.backend as Backend
from keras.optimizers import Adam

class ActorCriticNetwork:
    """One-step actor-critic agent for a discrete action space, built on Keras.

    A single two-layer trunk feeds two heads: a softmax head that outputs
    action probabilities and a linear head that outputs one scalar value
    estimate (called ``qvalue`` in the code).  Three Keras models are built
    on top of that shared graph:

    * ``actor_model``      -- inputs ``[state, qdiff]``, output ``probs``;
      compiled with the TD-error-weighted log-likelihood loss.
    * ``critic_model``     -- input ``state``, output ``qvalue``; compiled
      with mean squared error.
    * ``prediction_model`` -- input ``state``, output ``probs``; not compiled,
      used only for action selection.

    Args:
        alpha: learning rate passed to the actor's Adam optimizer.
        beta: learning rate passed to the critic's Adam optimizer.
        gamma: discount factor applied to the next state's value estimate.
        input_dims: length of the flat observation vector.
        num_actions: number of discrete actions.
    """

    def __init__(self, alpha, beta, gamma, input_dims, num_actions):
        self.alpha = alpha
        self.gamma = gamma
        self.beta = beta
        self.input_dims = input_dims
        self.action_space = list(range(num_actions))
        self.num_actions = num_actions
        self.actor_model, self.critic_model, self.prediction_model = self.build_net()

    @staticmethod
    def _as_batch(state):
        """Return ``state`` as an array with a leading batch axis of size 1.

        Going through ``np.asarray`` lets callers pass plain lists or tuples
        as well as NumPy arrays.
        """
        return np.asarray(state)[np.newaxis, :]

    def build_net(self):
        """Build the shared graph and return ``(actor, critic, predictor)``.

        All three models reuse the same layer objects, so fitting the actor
        or the critic also updates the weights used by the other models.
        """
        state = Input(shape=(self.input_dims,))
        layer = Dense(1024, activation='relu')(state)
        layer = Dense(512, activation='relu')(layer)
        probs = Dense(self.num_actions, activation='softmax')(layer)
        qvalue = Dense(1, activation='linear')(layer)

        # Temporal-difference error, qtarget - qstate, where
        # qtarget = reward + gamma * qstate_.  Exactly one scalar per sample,
        # so the shape is declared explicitly instead of being left unknown.
        qdiff = Input(shape=(1,))

        def policy_loss(y_true, y_pred):
            # y_true is a one-hot vector of length num_actions, so the product
            # below keeps only the log-probability of the action taken.
            # The lower bound keeps log() finite; the upper bound rounds to
            # exactly 1.0 in floating point, which is harmless as log(1) == 0.
            y_pred = Backend.clip(y_pred, 1e-18, 1-1e-18)
            logprobs = y_true * Backend.log(y_pred)
            # qdiff is captured from the enclosing scope; it broadcasts over
            # the action axis.
            return Backend.sum(-logprobs * qdiff)

        actor_model = Model(inputs=[state, qdiff], outputs=[probs])
        actor_model.compile(optimizer=Adam(lr=self.alpha), loss=policy_loss)

        critic_model = Model(inputs=[state], outputs=[qvalue])
        critic_model.compile(optimizer=Adam(lr=self.beta), loss='mean_squared_error')

        # Never trained directly, hence no compile() call.
        predict_model = Model(inputs=[state], outputs=[probs])

        return actor_model, critic_model, predict_model

    def choose_action(self, state):
        """Sample an action index from the current policy for one observation.

        Args:
            state: a single flat observation (array, list or tuple).

        Returns:
            An element of ``self.action_space`` drawn according to the
            probabilities predicted for ``state``.
        """
        batch = self._as_batch(state)
        probs = self.prediction_model.predict(batch)[0]
        return np.random.choice(self.action_space, p=probs)

    def learn(self, state, state_, action, reward, done):
        """Run one actor update and one critic update from a single transition.

        Args:
            state: observation before the action (array, list or tuple).
            state_: observation after the action.
            action: index of the action that was taken.
            reward: reward received for the transition.
            done: truthy if ``state_`` is terminal; the bootstrap term is
                then dropped from the target.
        """
        state = self._as_batch(state)
        state_ = self._as_batch(state_)

        qstate = self.critic_model.predict(state)
        qstate_ = self.critic_model.predict(state_)
        # (1 - int(done)) zeroes the bootstrapped value on terminal steps.
        qtarget = reward + self.gamma * qstate_ * (1 - int(done))
        qdiff = qtarget - qstate

        # One-hot encoding of the action taken, consumed by policy_loss.
        label = np.zeros((1, self.num_actions))
        label[0, action] = 1

        # The actor is fitted first, using the TD error computed above from
        # the critic's pre-update predictions; then the critic is fitted.
        self.actor_model.fit([state, qdiff], label, verbose=0)
        self.critic_model.fit(state, qtarget, verbose=0)


In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt

if __name__ == '__main__':
    # Train an ActorCriticNetwork agent on CartPole, learning online from
    # every single transition, and record the undiscounted return of each
    # episode in score_history.
    #
    # NOTE: this uses the classic Gym call shapes -- reset() yielding the
    # observation and step() yielding (state, reward, done, info).

    env = gym.make('CartPole-v1')
    num_epsiodes = 2000
    state_dims = env.observation_space.shape[0]
    num_actions = env.action_space.n

    agent = ActorCriticNetwork(alpha=0.00001, beta=0.00005, gamma=0.99, input_dims=state_dims, num_actions=num_actions)

    score_history = []

    try:
        for i in range(num_epsiodes):
            state = env.reset()
            done = False
            score = 0
            while not done:
                action = agent.choose_action(state)
                state_, reward, done, info = env.step(action)
                agent.learn(state, state_, action, reward, done)
                state = state_
                score += reward
            score_history.append(score)
            # Report after the episode finishes so the printed score belongs
            # to the printed episode index, and the last episode is reported.
            print('episode: ', i, 'score: ', score)
    finally:
        # Release the environment's resources even if training is interrupted.
        env.close()

episode:  0 score:  0
episode:  1 score:  15.0
episode:  2 score:  26.0
episode:  3 score:  23.0
episode:  4 score:  15.0
episode:  5 score:  14.0
episode:  6 score:  24.0
episode:  7 score:  11.0
episode:  8 score:  17.0
episode:  9 score:  53.0
episode:  10 score:  24.0
episode:  11 score:  14.0
episode:  12 score:  16.0
episode:  13 score:  15.0
episode:  14 score:  19.0
episode:  15 score:  21.0
episode:  16 score:  26.0
episode:  17 score:  14.0
episode:  18 score:  14.0
episode:  19 score:  16.0
episode:  20 score:  12.0
episode:  21 score:  11.0
episode:  22 score:  20.0
episode:  23 score:  19.0
episode:  24 score:  13.0
episode:  25 score:  17.0
episode:  26 score:  21.0
episode:  27 score:  13.0
episode:  28 score:  14.0
episode:  29 score:  11.0
episode:  30 score:  10.0
episode:  31 score:  11.0
episode:  32 score:  16.0
episode:  33 score:  18.0
episode:  34 score:  10.0
episode:  35 score:  10.0
episode:  36 score:  22.0
episode:  37 score:  14.0
episode:  38 score:  13.0