In [1]:
#First we import some libraries
#Json for loading and saving the model (optional)
import json
#matplotlib for rendering
import matplotlib.pyplot as plt
#numpy for handeling matrix operations
import numpy as np
#time, to, well... keep track of time
import time
#Python image libarary for rendering
from PIL import Image
#iPython display for making sure we can render the frames
from IPython import display
#seaborn for rendering
import seaborn
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD


#Setup matplotlib so that it runs nicely in iPython
%matplotlib inline
#setting up seaborn
seaborn.set()

class Catch(object):
    """
    Class catch is the actual game.
    In the game, fruits, represented by white tiles, fall from the top.
    The goal is to catch the fruits with a basked (represented by white tiles, this is deep learning, not game design).
    """
    def __init__(self, grid_size=10):
        self.grid_size = grid_size
        self.reset()

    def _update_state(self, action):
        """
        Input: action and states
        Ouput: new states and reward
        """
        state = self.state
        if action == 0:  # left
            action = -1
        elif action == 1:  # stay
            action = 0
        else:
            action = 1  # right
        f0, f1, basket = state[0]
        new_basket = min(max(1, basket + action), self.grid_size-1)
        f0 += 1
        out = np.asarray([f0, f1, new_basket])
        out = out[np.newaxis]

        assert len(out.shape) == 2
        self.state = out

    def _draw_state(self):
        im_size = (self.grid_size,)*2
        state = self.state[0]
        canvas = np.zeros(im_size)
        canvas[state[0], state[1]] = 1  # draw fruit
        canvas[-1, state[2]-1:state[2] + 2] = 1  # draw basket
        return canvas
        
    def _get_reward(self):
        fruit_row, fruit_col, basket = self.state[0]
        if fruit_row == self.grid_size-1:
            if abs(fruit_col - basket) <= 1:
                return 1
            else:
                return -1
        else:
            return 0

    def _is_over(self):
        if self.state[0, 0] == self.grid_size-1:
            return True
        else:
            return False

    def observe(self):
        canvas = self._draw_state()
        return canvas.reshape((1, -1))

    def act(self, action):
        self._update_state(action)
        reward = self._get_reward()
        game_over = self._is_over()
        return self.observe(), reward, game_over

    def reset(self):
        n = np.random.randint(0, self.grid_size-1, size=1)
        m = np.random.randint(1, self.grid_size-2, size=1)
        self.state = np.asarray([0, n, m])[np.newaxis]

        
"""
Here we define some variables used for the game and rendering later
"""
#last frame time keeps track of which frame we are at
last_frame_time = 0
#translate the actions to human readable words
translate_action = ["Left","Stay","Right","Create Ball","End Test"]
#size of the game field
grid_size = 10

def display_screen(action,points,input_t):
    #Function used to render the game screen
    #Get the last rendered frame
    global last_frame_time
    print("Action %s, Points: %d" % (translate_action[action],points))
    #Only display the game screen if the game is not over
    if("End" not in translate_action[action]):
        #Render the game with matplotlib
        plt.imshow(input_t.reshape((grid_size,)*2),
               interpolation='none', cmap='gray')
        #Clear whatever we rendered before
        display.clear_output(wait=True)
        #And display the rendering
        display.display(plt.gcf())
    #Update the last frame time
    last_frame_time = set_max_fps(last_frame_time)
    
    
def set_max_fps(last_frame_time,FPS = 16):
    current_milli_time = lambda: int(round(time.time() * 1000))
    sleep_time = 1./FPS - (current_milli_time() - last_frame_time)
    if sleep_time > 0:
        time.sleep(sleep_time)
    return current_milli_time()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [35]:
import numpy as np
import scipy
import random

import os
import keras
import tensorflow as tf
from keras.models import Sequential,load_model, Model
from keras.layers import Dense, Dropout, Flatten, Multiply
from keras.layers import Conv2D, MaxPooling2D, Input, Lambda
from keras.optimizers import Adam, Adamax, RMSprop
from keras import backend as K

MAX_TIMESTEP = 1000
MAX_EP = 100000
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
#config.gpu_options.per_process_gpu_memory_fraction = 0.1
set_session(tf.Session(config=config))

random.seed(2)
np.random.seed(2)
tf.set_random_seed(2)

# ref : https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/

def categorical_crossentropy(target, output):
    _epsilon =  tf.convert_to_tensor(10e-8, dtype=output.dtype.base_dtype)
    output = tf.clip_by_value(output, _epsilon, 1. - _epsilon)
    return (- target * tf.log(output))


class Agent_ActorCritic():
    def __init__(self, env):
#         super(Agent_ActorCritic,self).__init__(env)

        self.log_path = './actor_critic.log'

        self.env = env
        self.actions_avialbe = 3
        self.feature_dim = 100
        self.t = 0
        self.prev_x = None
        self.actor_learning_rate  = 1e-3
        self.critic_learning_rate = 1e-3
        self.gamma = 0.9

        self.dummy_act_picked = np.zeros((1,self.actions_avialbe))

        # Actor
        input_frame  = Input(shape=(self.feature_dim,))
        act_picked = Input(shape=(self.actions_avialbe,))
        hidden_f = Dense(20,activation='relu')(input_frame)

        act_prob = Dense(self.actions_avialbe,activation='softmax')(hidden_f)
        selected_act_prob = Multiply()([act_prob,act_picked])
        selected_act_prob = Lambda(lambda x:K.sum(x, axis=-1, keepdims=True),output_shape=(1,))(selected_act_prob)

        model = Model(inputs=[input_frame,act_picked], outputs=[act_prob, selected_act_prob])

        opt = Adam(lr=self.actor_learning_rate)
        model.compile(loss=['mse',categorical_crossentropy], loss_weights=[0.0,1.0],optimizer=opt)
        self.actor = model

        # Critic
        model = Sequential()
        model.add(Dense(20,activation='relu',input_shape=(self.feature_dim,)))
        model.add(Dense(1))

        opt = Adam(lr=self.critic_learning_rate)
        model.compile(loss='mse', optimizer=opt)
        self.critic = model

    def init_game_setting(self):
        self.prev_x = None


    def train(self):
        # Init
        log = open(self.log_path,'w')
        log.write('reward,avg_reward\n')
        batch_size = 1 
        frames, prob_actions, dlogps, drs =[], [], [], []
        tr_x, tr_y = [],[]
        reward_record = []
        avg_reward = []
        reward_sum = 0
        ep_number = 0
        ep_step = 0 
        #explore_rate = 0
        self.env.reset()
        observation = self.env.observe()
        maxs = 0
        # Training progress
        #print(np.shape(observation))
        #print(type(observation))
        while True:
            
#             temp_p = self.actor.predict([observation,self.dummy_act_picked])[0].flatten()
            temp_p = self.actor.predict([observation,self.dummy_act_picked])
            
#             print(temp_p)
#             print(temp_p[0])
#             break
#             act = np.random.choice(np.arange(self.actions_avialbe), 
#                     p=self.actor.predict([np.expand_dims(observation,axis=0),self.dummy_act_picked])[0].flatten())


            act = np.random.choice(np.arange(self.actions_avialbe), 
                    p=temp_p[0][0])

            act_one_hot = np.zeros((1,self.actions_avialbe))
            act_one_hot[0,act]=1.0
            next_observation, reward, done = self.env.act(act)
            #if done: reward = -20
            
            reward_sum += reward
            predict_reward = self.critic.predict(observation)
            predict_next_reward = self.critic.predict(next_observation)

            td_target = np.expand_dims(reward,axis=0) + self.gamma*predict_next_reward
            td_error = td_target - predict_reward

            self.critic.train_on_batch(observation,td_target)
            self.actor.train_on_batch([observation,act_one_hot],[self.dummy_act_picked,td_error])

            observation = next_observation

            self.t += 1
            ep_step += 1

            if done or ep_step>MAX_TIMESTEP:
                ep_number += 1
                
                avg_reward.append(float(reward_sum))
                if len(avg_reward)>300: avg_reward.pop(0)

                print('EPISODE: {0:6d} / TIMESTEP: {1:8d} / REWARD: {2:5d} / AVG_REWARD: {3:2.3f} '.format(
                            ep_number, self.t, int(reward_sum), np.mean(avg_reward)))
                print('{:.4f},{:.4f}'.format(reward_sum,np.mean(avg_reward)),end='\n',file=log,flush=True)
                self.env.reset()
                observation = self.env.observe()
                reward_sum = 0.0
                ep_step = 0
            if (maxs < np.mean(avg_reward) or np.mean(avg_reward)==1)    and (ep_number > 300):
                maxs = np.mean(avg_reward)
                print("saved model maxs:", maxs, "caught fruits (300): ",(300*maxs+300)/2)
                self.actor.save('actor.h5')
                self.critic.save('critictor.h5')



    def make_action(self, observation, test=True):
        """
        Input:
            observation: np.array
                current RGB screen of game, shape: (210, 160, 3)

        Return:
            action: int
                the predicted action from trained model
        """
        pass

In [None]:
agent = Agent_ActorCritic(Catch(10))
agent.train()

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


EPISODE:      1 / TIMESTEP:        9 / REWARD:     1 / AVG_REWARD: 1.000 
EPISODE:      2 / TIMESTEP:       18 / REWARD:    -1 / AVG_REWARD: 0.000 
EPISODE:      3 / TIMESTEP:       27 / REWARD:     1 / AVG_REWARD: 0.333 
EPISODE:      4 / TIMESTEP:       36 / REWARD:    -1 / AVG_REWARD: 0.000 
EPISODE:      5 / TIMESTEP:       45 / REWARD:    -1 / AVG_REWARD: -0.200 
EPISODE:      6 / TIMESTEP:       54 / REWARD:     1 / AVG_REWARD: 0.000 
EPISODE:      7 / TIMESTEP:       63 / REWARD:     1 / AVG_REWARD: 0.143 
EPISODE:      8 / TIMESTEP:       72 / REWARD:    -1 / AVG_REWARD: 0.000 
EPISODE:      9 / TIMESTEP:       81 / REWARD:     1 / AVG_REWARD: 0.111 
EPISODE:     10 / TIMESTEP:       90 / REWARD:    -1 / AVG_REWARD: 0.000 
EPISODE:     11 / TIMESTEP:       99 / REWARD:    -1 / AVG_REWARD: -0.091 
EPISODE:     12 / TIMESTEP:      108 / REWARD:    -1 / AVG_REWARD: -0.167 
EPISODE:     13 / TIMESTEP:      117 / REWARD:     1 / AVG_REWARD: -0.077 
EPISODE:     14 / TIMESTEP:      1

EPISODE:    111 / TIMESTEP:      999 / REWARD:    -1 / AVG_REWARD: -0.441 
EPISODE:    112 / TIMESTEP:     1008 / REWARD:     1 / AVG_REWARD: -0.429 
EPISODE:    113 / TIMESTEP:     1017 / REWARD:     1 / AVG_REWARD: -0.416 
EPISODE:    114 / TIMESTEP:     1026 / REWARD:     1 / AVG_REWARD: -0.404 
EPISODE:    115 / TIMESTEP:     1035 / REWARD:    -1 / AVG_REWARD: -0.409 
EPISODE:    116 / TIMESTEP:     1044 / REWARD:     1 / AVG_REWARD: -0.397 
EPISODE:    117 / TIMESTEP:     1053 / REWARD:     1 / AVG_REWARD: -0.385 
EPISODE:    118 / TIMESTEP:     1062 / REWARD:    -1 / AVG_REWARD: -0.390 
EPISODE:    119 / TIMESTEP:     1071 / REWARD:    -1 / AVG_REWARD: -0.395 
EPISODE:    120 / TIMESTEP:     1080 / REWARD:     1 / AVG_REWARD: -0.383 
EPISODE:    121 / TIMESTEP:     1089 / REWARD:     1 / AVG_REWARD: -0.372 
EPISODE:    122 / TIMESTEP:     1098 / REWARD:    -1 / AVG_REWARD: -0.377 
EPISODE:    123 / TIMESTEP:     1107 / REWARD:    -1 / AVG_REWARD: -0.382 
EPISODE:    124 / TIMESTE

EPISODE:    223 / TIMESTEP:     2007 / REWARD:    -1 / AVG_REWARD: -0.435 
EPISODE:    224 / TIMESTEP:     2016 / REWARD:    -1 / AVG_REWARD: -0.438 
EPISODE:    225 / TIMESTEP:     2025 / REWARD:    -1 / AVG_REWARD: -0.440 
EPISODE:    226 / TIMESTEP:     2034 / REWARD:     1 / AVG_REWARD: -0.434 
EPISODE:    227 / TIMESTEP:     2043 / REWARD:     1 / AVG_REWARD: -0.427 
EPISODE:    228 / TIMESTEP:     2052 / REWARD:     1 / AVG_REWARD: -0.421 
EPISODE:    229 / TIMESTEP:     2061 / REWARD:    -1 / AVG_REWARD: -0.424 
EPISODE:    230 / TIMESTEP:     2070 / REWARD:     1 / AVG_REWARD: -0.417 
EPISODE:    231 / TIMESTEP:     2079 / REWARD:    -1 / AVG_REWARD: -0.420 
EPISODE:    232 / TIMESTEP:     2088 / REWARD:    -1 / AVG_REWARD: -0.422 
EPISODE:    233 / TIMESTEP:     2097 / REWARD:    -1 / AVG_REWARD: -0.425 
EPISODE:    234 / TIMESTEP:     2106 / REWARD:    -1 / AVG_REWARD: -0.427 
EPISODE:    235 / TIMESTEP:     2115 / REWARD:    -1 / AVG_REWARD: -0.430 
EPISODE:    236 / TIMESTE

EPISODE:    333 / TIMESTEP:     2997 / REWARD:     1 / AVG_REWARD: -0.380 
EPISODE:    334 / TIMESTEP:     3006 / REWARD:    -1 / AVG_REWARD: -0.387 
EPISODE:    335 / TIMESTEP:     3015 / REWARD:     1 / AVG_REWARD: -0.380 
EPISODE:    336 / TIMESTEP:     3024 / REWARD:    -1 / AVG_REWARD: -0.380 
EPISODE:    337 / TIMESTEP:     3033 / REWARD:    -1 / AVG_REWARD: -0.380 
EPISODE:    338 / TIMESTEP:     3042 / REWARD:    -1 / AVG_REWARD: -0.387 
EPISODE:    339 / TIMESTEP:     3051 / REWARD:    -1 / AVG_REWARD: -0.393 
EPISODE:    340 / TIMESTEP:     3060 / REWARD:     1 / AVG_REWARD: -0.387 
EPISODE:    341 / TIMESTEP:     3069 / REWARD:    -1 / AVG_REWARD: -0.387 
EPISODE:    342 / TIMESTEP:     3078 / REWARD:    -1 / AVG_REWARD: -0.387 
EPISODE:    343 / TIMESTEP:     3087 / REWARD:     1 / AVG_REWARD: -0.380 
EPISODE:    344 / TIMESTEP:     3096 / REWARD:    -1 / AVG_REWARD: -0.380 
EPISODE:    345 / TIMESTEP:     3105 / REWARD:     1 / AVG_REWARD: -0.373 
EPISODE:    346 / TIMESTE

EPISODE:    444 / TIMESTEP:     3996 / REWARD:    -1 / AVG_REWARD: -0.353 
EPISODE:    445 / TIMESTEP:     4005 / REWARD:     1 / AVG_REWARD: -0.347 
EPISODE:    446 / TIMESTEP:     4014 / REWARD:     1 / AVG_REWARD: -0.340 
EPISODE:    447 / TIMESTEP:     4023 / REWARD:    -1 / AVG_REWARD: -0.347 
EPISODE:    448 / TIMESTEP:     4032 / REWARD:     1 / AVG_REWARD: -0.340 
EPISODE:    449 / TIMESTEP:     4041 / REWARD:    -1 / AVG_REWARD: -0.340 
EPISODE:    450 / TIMESTEP:     4050 / REWARD:     1 / AVG_REWARD: -0.340 
EPISODE:    451 / TIMESTEP:     4059 / REWARD:    -1 / AVG_REWARD: -0.340 
EPISODE:    452 / TIMESTEP:     4068 / REWARD:    -1 / AVG_REWARD: -0.347 
EPISODE:    453 / TIMESTEP:     4077 / REWARD:    -1 / AVG_REWARD: -0.353 
EPISODE:    454 / TIMESTEP:     4086 / REWARD:     1 / AVG_REWARD: -0.347 
EPISODE:    455 / TIMESTEP:     4095 / REWARD:    -1 / AVG_REWARD: -0.347 
EPISODE:    456 / TIMESTEP:     4104 / REWARD:    -1 / AVG_REWARD: -0.347 
EPISODE:    457 / TIMESTE

EPISODE:    555 / TIMESTEP:     4995 / REWARD:    -1 / AVG_REWARD: -0.273 
EPISODE:    556 / TIMESTEP:     5004 / REWARD:     1 / AVG_REWARD: -0.273 
EPISODE:    557 / TIMESTEP:     5013 / REWARD:     1 / AVG_REWARD: -0.273 
EPISODE:    558 / TIMESTEP:     5022 / REWARD:    -1 / AVG_REWARD: -0.273 
EPISODE:    559 / TIMESTEP:     5031 / REWARD:    -1 / AVG_REWARD: -0.280 
EPISODE:    560 / TIMESTEP:     5040 / REWARD:    -1 / AVG_REWARD: -0.287 
EPISODE:    561 / TIMESTEP:     5049 / REWARD:     1 / AVG_REWARD: -0.280 
EPISODE:    562 / TIMESTEP:     5058 / REWARD:    -1 / AVG_REWARD: -0.287 
EPISODE:    563 / TIMESTEP:     5067 / REWARD:     1 / AVG_REWARD: -0.287 
EPISODE:    564 / TIMESTEP:     5076 / REWARD:    -1 / AVG_REWARD: -0.287 
EPISODE:    565 / TIMESTEP:     5085 / REWARD:     1 / AVG_REWARD: -0.280 
EPISODE:    566 / TIMESTEP:     5094 / REWARD:     1 / AVG_REWARD: -0.273 
EPISODE:    567 / TIMESTEP:     5103 / REWARD:    -1 / AVG_REWARD: -0.273 
EPISODE:    568 / TIMESTE

EPISODE:    666 / TIMESTEP:     5994 / REWARD:    -1 / AVG_REWARD: -0.300 
EPISODE:    667 / TIMESTEP:     6003 / REWARD:     1 / AVG_REWARD: -0.300 
EPISODE:    668 / TIMESTEP:     6012 / REWARD:    -1 / AVG_REWARD: -0.300 
EPISODE:    669 / TIMESTEP:     6021 / REWARD:    -1 / AVG_REWARD: -0.300 
EPISODE:    670 / TIMESTEP:     6030 / REWARD:    -1 / AVG_REWARD: -0.307 
EPISODE:    671 / TIMESTEP:     6039 / REWARD:     1 / AVG_REWARD: -0.300 
EPISODE:    672 / TIMESTEP:     6048 / REWARD:     1 / AVG_REWARD: -0.300 
EPISODE:    673 / TIMESTEP:     6057 / REWARD:    -1 / AVG_REWARD: -0.300 
EPISODE:    674 / TIMESTEP:     6066 / REWARD:     1 / AVG_REWARD: -0.300 
EPISODE:    675 / TIMESTEP:     6075 / REWARD:    -1 / AVG_REWARD: -0.300 
EPISODE:    676 / TIMESTEP:     6084 / REWARD:    -1 / AVG_REWARD: -0.300 
EPISODE:    677 / TIMESTEP:     6093 / REWARD:     1 / AVG_REWARD: -0.293 
EPISODE:    678 / TIMESTEP:     6102 / REWARD:    -1 / AVG_REWARD: -0.293 
EPISODE:    679 / TIMESTE

EPISODE:    777 / TIMESTEP:     6993 / REWARD:    -1 / AVG_REWARD: -0.313 
EPISODE:    778 / TIMESTEP:     7002 / REWARD:     1 / AVG_REWARD: -0.307 
EPISODE:    779 / TIMESTEP:     7011 / REWARD:     1 / AVG_REWARD: -0.300 
EPISODE:    780 / TIMESTEP:     7020 / REWARD:    -1 / AVG_REWARD: -0.300 
EPISODE:    781 / TIMESTEP:     7029 / REWARD:    -1 / AVG_REWARD: -0.300 
EPISODE:    782 / TIMESTEP:     7038 / REWARD:     1 / AVG_REWARD: -0.300 
EPISODE:    783 / TIMESTEP:     7047 / REWARD:    -1 / AVG_REWARD: -0.300 
EPISODE:    784 / TIMESTEP:     7056 / REWARD:     1 / AVG_REWARD: -0.300 
EPISODE:    785 / TIMESTEP:     7065 / REWARD:    -1 / AVG_REWARD: -0.300 
EPISODE:    786 / TIMESTEP:     7074 / REWARD:    -1 / AVG_REWARD: -0.307 
EPISODE:    787 / TIMESTEP:     7083 / REWARD:    -1 / AVG_REWARD: -0.307 
EPISODE:    788 / TIMESTEP:     7092 / REWARD:     1 / AVG_REWARD: -0.300 
EPISODE:    789 / TIMESTEP:     7101 / REWARD:     1 / AVG_REWARD: -0.293 
EPISODE:    790 / TIMESTE

In [22]:
from keras.models import load_model

env = Catch(10)
# # Critic
# model = Sequential()
# model.add(Dense(20,activation='relu',input_shape=(self.feature_dim,)))
# model.add(Dense(1))
# opt = Adam(lr=self.critic_learning_rate)
# model.compile(loss='mse', optimizer=opt)
# critic = mode.load('actor.h5')    
actor = load_model('actor.h5')
score = 0
for e in range(500):
    done = False
    env.reset()
    state = env.observe()
    state = np.reshape(state, [1, 100])
    print(str(e)+" "+str(score))
    dummy_act_picked=np.zeros((1,3))
    while not done:
#             if agent.render:
        temp_p = actor.predict([state,dummy_act_picked])
#         act = np.random.choice(np.arange(3), 
#                     p=temp_p[0][0])
#         print(act)
        act = np.argmax(temp_p[0][0])
#         print(act_one_hot)
        act_one_hot = np.zeros((1,3))
        act_one_hot[0,act]=1.0
        dummy_act_picked = act_one_hot
#         policy = actor.predict([state,act_one_hot])
#         action = np.argmax(policy)
# #             print(action)   
#             next_state, reward, done, info = env.step(action)
        next_state, reward, done = env.act(act)
        next_state = np.reshape(next_state, [1, 100])
        score = score + reward
#         display_screen(act,score,next_state)
        state = next_state
        

0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
11 11
12 12
13 13
14 14
15 15
16 16
17 17
18 18
19 19
20 20
21 21
22 22
23 23
24 24
25 25
26 26
27 27
28 28
29 29
30 30
31 31
32 32
33 33
34 34
35 35
36 36
37 37
38 38
39 39
40 40
41 41
42 42
43 43
44 44
45 45
46 46
47 47
48 48
49 49
50 50
51 51
52 52
53 53
54 54
55 55
56 56
57 57
58 58
59 59
60 60
61 61
62 62
63 63
64 64
65 65
66 66
67 67
68 68
69 69
70 70
71 71
72 72
73 73
74 74
75 75
76 76
77 77
78 78
79 79
80 80
81 81
82 82
83 83
84 84
85 85
86 86
87 87
88 88
89 89
90 90
91 91
92 92
93 93
94 94
95 95
96 96
97 97
98 98
99 99
100 100
101 101
102 102
103 103
104 104
105 105
106 106
107 107
108 108
109 109
110 110
111 111
112 112
113 113
114 114
115 115
116 116
117 117
118 118
119 119
120 120
121 121
122 122
123 123
124 124
125 125
126 126
127 127
128 128
129 129
130 130
131 131
132 132
133 133
134 134
135 135
136 136
137 137
138 138
139 139
140 140
141 141
142 142
143 143
144 144
145 145
146 146
147 147
148 148
149 149
150 150
151 151
152 