In [1]:
import time
import json
import random
import numpy as np
from Game import Game
from Agent import Agent
from GameState import GameState
from DataLoader import DataLoader
from keras.optimizers import Adam
from keras.models import Sequential
from IPython.display import clear_output
from keras.layers.core import Dense, Activation, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D

Using TensorFlow backend.


In [2]:
# ---------------------------------------------------------------------------
# Game / training hyper-parameters
# ---------------------------------------------------------------------------

# --- action space and reward discounting ---
ACTIONS = 2                  # number of possible actions: do nothing, jump
GAMMA = 0.99                 # discount factor applied to the best next-state Q value

# --- exploration schedule ---
OBSERVATION = 100.0          # timesteps to only observe before training starts
EXPLORE = 100000             # number of frames over which epsilon is annealed
INITIAL_EPSILON = 0.1        # epsilon at the start of annealing
FINAL_EPSILON = 0.0001       # epsilon at the end of annealing

# --- experience replay ---
REPLAY_MEMORY = 50000        # maximum number of past transitions kept
BATCH = 16                   # minibatch size sampled from the replay memory

# --- optimisation and action cadence ---
FRAME_PER_ACTION = 1         # an action is chosen when t % FRAME_PER_ACTION == 0
LEARNING_RATE = 1e-4         # learning rate passed to the Adam optimizer

# --- network input: a stack of 4 frames of 80x80 pixels ---
img_rows = 80
img_cols = 80
img_channels = 4

In [3]:
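# One-time setup: run this once, before the first training session, to initialise the cache file structure.
# It needs a DataLoader instance bound to `data_loader`; the cells shown here only create one inside playGame().
# data_loader.init_cache(INITIAL_EPSILON)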

In [3]:
def buildmodel(data_loader: DataLoader):
    """Build and compile the convolutional Q-network.

    The network maps a stack of ``img_channels`` frames of size
    ``img_cols`` x ``img_rows`` to one linear Q value per action.

    Parameters:
        data_loader: DataLoader used to check whether a previous run has
            already produced a loss file.

    Returns:
        The compiled Keras ``Sequential`` model (MSE loss, Adam optimizer).

    Side effects:
        Writes freshly initialised weights to ``model.h5`` when no loss file
        exists yet, or when ``model.h5`` itself is missing, so that the
        unconditional ``model.load_weights("model.h5")`` in trainNetwork
        cannot fail on a missing file. Also prints the model summary.
    """
    # local import: only needed for the weight-file existence check below
    import os

    print("Now we build the model")
    model = Sequential()

    # conv block 1: 80x80x4 input -> 20x20x32 after the strided conv -> 10x10x32 after pooling
    model.add(Conv2D(32, (8, 8), padding='same', strides=(4, 4), input_shape=(img_cols,img_rows,img_channels)))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(Activation('relu'))

    # conv block 2: -> 5x5x64 after the strided conv -> 2x2x64 after pooling
    model.add(Conv2D(64, (4, 4),strides=(2, 2), padding='same'))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(Activation('relu'))

    # conv block 3: -> 2x2x64 after the conv -> 1x1x64 after pooling
    model.add(Conv2D(64, (3, 3),strides=(1, 1), padding='same'))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(Activation('relu'))

    # fully connected head
    model.add(Flatten())
    model.add(Dense(512))
    model.add(Activation('relu'))

    # one linear output per action (raw Q values)
    model.add(Dense(ACTIONS))
    model.add(Activation('linear'))

    adam = Adam(lr=LEARNING_RATE)
    model.compile(loss='mse', optimizer=adam)

    # Create the weight file on a first run (no loss file yet) and also when
    # the weight file itself is missing; the latter previously made the later
    # load_weights("model.h5") call crash even though a loss file existed.
    if not data_loader.is_loss_file_present() or not os.path.isfile('model.h5'):
        model.save_weights('model.h5')

    print("We finish building the model")
    model.summary()
    return model

In [6]:
# main training module
def trainNetwork(model, game_state: GameState, data_loader: DataLoader, observe=False):
    """Run the endless play / train loop of the DQN agent.

    Parameters:
        model: compiled Keras model to play with and, when training, to update.
        game_state: GameState giving access to the game environment and the
            dino; ``get_state(action)`` is used as returning
            ``(frame, reward, terminal)``.
        data_loader: DataLoader used to restore and persist the replay memory
            ("D"), the time step ("time"), epsilon ("epsilon") and the
            loss / Q-value logs.
        observe: when True the agent only plays with the saved weights and
            never trains; when False (default) weights are updated once
            ``t`` exceeds OBSERVATION.

    Returns:
        Never returns normally: the loop runs until an exception propagates
        out of it (the caller, playGame, handles StopIteration).

    Notes:
        * The replay memory returned by ``load_obj("D")`` must support
          ``append``, ``popleft`` and ``len`` (presumably a
          ``collections.deque`` - confirm against DataLoader).
        * Frames are assumed to be 2-D arrays matching the model's input
          size - confirm against GameState.
    """
    last_time = time.time()

    # replay memory of previous transitions, restored from the cache
    D = data_loader.load_obj("D")

    # get the first frame by doing nothing (index 0 => do nothing, 1 => jump)
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, _, _ = game_state.get_state(do_nothing)

    # stack the first frame 4 times as a placeholder input, then add a batch axis
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
    s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])

    # state the agent is put back into after every terminal transition
    initial_state = s_t

    if observe:
        # play-only mode: the training threshold is never reached
        OBSERVE = 999999999
        epsilon = FINAL_EPSILON
        print ("Now we load weight")
    else:
        # training mode: resume the cached exploration rate
        OBSERVE = OBSERVATION
        epsilon = data_loader.load_obj("epsilon")

    model.load_weights("model.h5")
    model.compile(loss='mse', optimizer=Adam(lr=LEARNING_RATE))
    if observe:
        print ("Weight load successfully")

    # resume from the previous time step stored in file system
    t = data_loader.load_obj("time")

    # endless running
    while True:
        loss = 0
        Q_sa = 0
        action_index = 0
        a_t = np.zeros([ACTIONS])  # one-hot action at t

        # choose an action epsilon-greedily; FRAME_PER_ACTION allows skipping
        # frames between decisions (a_t stays all-zero on skipped frames)
        if t % FRAME_PER_ACTION == 0:
            if random.random() <= epsilon:
                # randomly explore an action
                print("----------Random Action----------")
                action_index = random.randrange(ACTIONS)
            else:
                # exploit: feed the stack of 4 frames, take the best Q value
                q = model.predict(s_t)
                action_index = np.argmax(q)
            a_t[action_index] = 1

        # anneal epsilon linearly, never letting it fall below FINAL_EPSILON
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon = max(FINAL_EPSILON,
                          epsilon - (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE)

        # run the selected action and observe the next frame and reward
        x_t1, r_t, terminal = game_state.get_state(a_t)

        # helpful for measuring frame rate (guarded against a zero interval)
        now = time.time()
        elapsed = now - last_time
        print('fps: {0}'.format(1 / elapsed if elapsed > 0 else float('inf')))
        last_time = now

        # add batch and channel axes to the new frame
        x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1], 1)

        # push the new frame onto the front of the stack, dropping the oldest one
        s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)

        # store the transition in D, evicting the oldest one when full
        D.append((s_t, action_index, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # only train once done observing and once a full minibatch can be drawn
        if t > OBSERVE and len(D) >= BATCH:
            minibatch = random.sample(D, BATCH)

            # batch all sampled states so the network is queried twice per
            # step instead of twice per sample
            inputs = np.concatenate([sample[0] for sample in minibatch], axis=0)
            next_states = np.concatenate([sample[3] for sample in minibatch], axis=0)

            # start from the current predictions so only the taken action's
            # Q value contributes to the loss; copy to guarantee writability
            targets = np.array(model.predict(inputs))
            next_q = model.predict(next_states)

            # NOTE: the per-sample terminal flag deliberately uses its own
            # name; rebinding `terminal` here used to corrupt the episode
            # reset performed after the training step.
            for i, (_, action_t, reward_t, _, terminal_t) in enumerate(minibatch):
                print('Agent State::', terminal_t)

                if terminal_t:
                    # the agent died: no future reward to bootstrap from
                    targets[i, action_t] = reward_t
                else:
                    targets[i, action_t] = reward_t + GAMMA * np.max(next_q[i])

            # keep logging the next-state Q values of the last sampled transition
            Q_sa = next_q[-1]

            loss += model.train_on_batch(inputs, targets)
            data_loader.store_loss(loss)
            data_loader.store_q_value(np.max(Q_sa))

        # reset to the initial stack if the game just terminated
        s_t = initial_state if terminal else s_t1
        t = t + 1

        # save progress every 10 iterations
        # NOTE(review): the original comment said "every 1000 iterations" while
        # the code checkpoints every 10 - confirm which cadence is intended.
        if t % 10 == 0:
            print("Now we save model")

            # pause game while saving to filesystem; always resume afterwards
            game_state._game.pause()
            try:
                model.save_weights("model.h5", overwrite=True)

                # replay memory, time step and epsilon, so a later run can resume
                data_loader.save_obj(D, "D")
                data_loader.save_obj(t, "time")
                data_loader.save_obj(epsilon, "epsilon")

                data_loader.store_values_to_file()
                # to_json() already returns a JSON string, so the file holds a
                # JSON-encoded string; kept as-is for existing readers
                with open("model.json", "w") as outfile:
                    json.dump(model.to_json(), outfile)

                clear_output()
            finally:
                game_state._game.resume()

        # print info
        if t <= OBSERVE:
            state = "observe"
        elif t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        print("TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon,
              "/ ACTION", action_index, "/ REWARD", r_t,
              "/ Q_MAX " , np.max(Q_sa), "/ Loss ", loss)

In [7]:
#main function
def playGame(observe=False):
    """Set up the game, agent, data loader and model, then run trainNetwork.

    Parameters:
        observe: forwarded to trainNetwork; True plays with the saved weights
            without training, False (default) trains.

    The game is always shut down via ``game.end()`` when this function
    exits, whether setup or the loop raised or the loop signalled
    StopIteration. Previously only StopIteration triggered the cleanup, so
    any other error left the game session open. StopIteration is still
    swallowed; every other exception propagates after the cleanup.
    """
    game = Game()
    try:
        dino = Agent(game)
        data_loader = DataLoader()
        game_state = GameState(dino, game, data_loader)
        model = buildmodel(data_loader)
        trainNetwork(model, game_state, data_loader, observe=observe)
    except StopIteration:
        # normal end-of-run signal; cleanup happens in the finally clause
        pass
    finally:
        game.end()

In [8]:
# Start a run in training mode (observe=False); the trailing ';' suppresses the cell's output repr.
playGame(observe=False);

TIMESTEP 4860 / STATE explore / EPSILON 0.09524575899999274 / ACTION 1 / REWARD 0.1 / Q_MAX  31.831926 / Loss  0.47312474250793457
Jump
fps: 2.5193299812775813
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
TIMESTEP 4861 / STATE explore / EPSILON 0.09524475999999274 / ACTION 1 / REWARD 0.1 / Q_MAX  30.410334 / Loss  4.748752117156982
fps: 6.276051431760743
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
Agent State:: False
TIMESTEP 4862 / STATE explore / EPSILON 0.095243

ElementNotInteractableException: Message: element not interactable
  (Session info: chrome=89.0.4389.114)
