In [22]:
import numpy as np
from PIL import Image
import cv2 #opencv
import io
import time
import pandas as pd
import numpy as np
from IPython.display import clear_output
from random import randint
import os
from PIL import ImageGrab
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys

#keras imports
from keras.models import model_from_json
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.optimizers import SGD , Adam
from keras.callbacks import TensorBoard
from collections import deque
import random
import pickle
from io import BytesIO
import base64
import json

In [23]:
#game parameters
GAMMA = 0.99 # decay rate of past observations original 0.99
OBSERVATION = 100. # timesteps to observe before training
EXPLORE = 100  # frames over which to anneal epsilon
FINAL_EPSILON = 0.0001 # final value of epsilon
INITIAL_EPSILON = 0.1 # starting value of epsilon
REPLAY_MEMORY = 50000 # number of previous transitions to remember
BATCH = 32 # size of minibatch
FRAME_PER_ACTION = 1


In [24]:
#path variables
game_url = "chrome://dino"
loss_file_path = "./objects/loss_df.csv"
actions_file_path = "./objects/actions_df.csv"
q_value_file_path = "./objects/q_values.csv"
scores_file_path = "./objects/scores_df.csv"

#scripts
#create id for canvas for faster selection from DOM
init_script = "document.getElementsByClassName('runner-canvas')[0].id = 'runner-canvas'"

#get image from canvas
getbase64Script = "canvasRunner = document.getElementById('runner-canvas'); \
return canvasRunner.toDataURL().substring(22)"

In [25]:
'''
* Game class: Selenium interfacing between the python and browser
* __init__():  Launch the broswer window using the attributes in chrome_options
* get_crashed() : return true if the agent as crashed on an obstacles. Gets javascript variable from game decribing the state
* get_playing(): true if game in progress, false is crashed or paused
* restart() : sends a signal to browser-javascript to restart the game
* press_up(): sends a single to press up get to the browser
* get_score(): gets current game score from javascript variables.
* pause(): pause the game
* resume(): resume a paused game if not crashed
* end(): close the browser and end the game
'''
class Game:
    def __init__(self,custom_config=True):
        chrome_options = Options()
        chrome_options.add_argument("disable-infobars")
        self._driver = webdriver.Chrome(chrome_options=chrome_options)
        self._driver.set_window_position(x=-10,y=0)
        self._driver.set_window_size(200, 300)
        try:
            self._driver.get(game_url)
        except Exception as e:
            print('Exception', e)
        #modifying game before training
        if custom_config:
            self._driver.execute_script("Runner.config.ACCELERATION=0")
    def get_crashed(self):
        return self._driver.execute_script("return Runner.instance_.crashed")
    def get_playing(self):
        return self._driver.execute_script("return Runner.instance_.playing")
    def restart(self):
        self._driver.execute_script("Runner.instance_.restart()")
        
        time.sleep(0.25)# no actions are possible 
                        # for 0.25 sec after game starts, 
                        # skip learning at this time and make the model wait
    def press_up(self):
        self._driver.find_element_by_tag_name("body").send_keys(Keys.ARROW_UP)
    def get_score(self):
        score_array = self._driver.execute_script("return Runner.instance_.distanceMeter.digits")
        score = ''.join(score_array) # the javascript object is of type array with score in the formate[1,0,0] which is 100.
        return int(score)
    def pause(self):
        return self._driver.execute_script("return Runner.instance_.stop()")
    def resume(self):
        return self._driver.execute_script("return Runner.instance_.play()")
    def end(self):
        self._driver.close()


In [26]:
class DinoAgent:
    def __init__(self,game): #takes game as input for taking actions
        self._game = game; 
        self.jump(); #to start the game, we need to jump once
        time.sleep(.5) # no action can be performed for the first time when game starts
    def is_running(self):
        return self._game.get_playing()
    def is_crashed(self):
        return self._game.get_crashed()
    def jump(self):
        self._game.press_up()
    def duck(self):
        self._game.press_down()


In [27]:
'''
get_state(): accepts an array of actions, 
             performs the action on the agent 
returns :  new state, reward and if the game ended.
'''
class Game_sate:
    def __init__(self,agent,game):
        self._agent = agent
        self._game = game
    def get_state(self,actions):
        score = self._game.get_score() 
        reward = 0.1*score/10 # dynamic reward calculation
        is_over = False #game over
        if actions[1] == 1: #else do nothing
            self._agent.jump()
            reward = 0.1*score/11
        image = grab_screen() 
        
        if self._agent.is_crashed():
            self._game.restart()
            reward = -11/score
            is_over = True
        return image, reward, is_over #return the Experience tuple


In [28]:
def grab_screen(_driver = None):
    #bbox = region of interest on the entire screen
    screen =  np.array(ImageGrab.grab(bbox=(40,180,440,400))) 
    image = process_img(screen)#processing image as required
    return image
    
def process_img(image):
    #game is already in grey scale canvas, canny to get only edges and reduce unwanted objects(clouds)
    # resale image dimensions
    image = cv2.resize(image, (0,0), fx = 0.15, fy = 0.10) 
    #crop out the dino agent from the frame
    image = image[2:38,10:50] #img[y:y+h, x:x+w] 
    image = cv2.Canny(image, threshold1 = 100, threshold2 = 200) #apply the canny edge detection
    return  image    


In [29]:
#model hyper parameters
LEARNING_RATE = 1e-4
img_rows , img_cols = 40,20
img_channels = 4 #We stack 4 frames
ACTIONS = 2
def buildmodel():
    print("Now we build the model")
    model = Sequential()
    model.add(Conv2D(32, (8, 8), strides=(4, 4), padding='same',input_shape=(img_cols,img_rows,img_channels)))  #20*40*4
    model.add(Activation('relu'))
    model.add(Conv2D(64, (4, 4), strides=(2, 2), padding='same'))
    model.add(Activation('relu'))
    model.add(Conv2D(64, (3, 3), strides=(1, 1), padding='same'))
    model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dense(ACTIONS))
    adam = Adam(lr=LEARNING_RATE)
    model.compile(loss='mse',optimizer=adam)
    print("We finish building the model")
    return model


In [30]:
''' 
Parameters:
* model => Keras Model to be trained
* game_state => Game State module with access to game environment and dino
* observe => flag to indicate wherther the model is to be trained(weight updates), else just play
'''
def trainNetwork(model,game_state):
    # store the previous observations in replay memory
    D = deque() #load from file system
    # get the first state by doing nothing
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] =1 #0 => do nothing,
                     #1=> jump
    
    x_t, r_0, terminal = game_state.get_state(do_nothing) # get next step after performing the action
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2).reshape(1,20,40,4) # stack 4 images to create placeholder input reshaped 1*20*40*4 
    
    OBSERVE = OBSERVATION
    epsilon = INITIAL_EPSILON
    t = 0
    while (True): #endless running
        
        loss = 0
        Q_sa = 0
        action_index = 0
        r_t = 0 #reward at t
        a_t = np.zeros([ACTIONS]) # action at t
        
        #choose an action epsilon greedy
        if  random.random() <= epsilon: #randomly explore an action
            print("----------Random Action----------")
            action_index = random.randrange(ACTIONS)
            a_t[action_index] = 1
        else: # predict the output
            q = model.predict(s_t)       #input a stack of 4 images, get the prediction
            max_Q = np.argmax(q)         # chosing index with maximum q value
            action_index = max_Q 
            a_t[action_index] = 1        # o=> do nothing, 1=> jump
                
        #We reduced the epsilon (exploration parameter) gradually
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE 

        #run the selected action and observed next state and reward
        x_t1, r_t, terminal = game_state.get_state(a_t)
        last_time = time.time()
        x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1], 1) #1x20x40x1
        s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3) # append the new image to input stack and remove the first one
        
        # store the transition in D
        D.append((s_t, action_index, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft() 
        
        #only train if done observing; sample a minibatch to train on
        if t > OBSERVE:
            minibatch= random.sample(D, BATCH)
            inputs = np.zeros((BATCH, s_t.shape[1], s_t.shape[2], s_t.shape[3]))   #32, 20, 40, 4
            targets = np.zeros((inputs.shape[0], ACTIONS))                         #32, 2
            loss = 0

            for i in range(0, len(minibatch)):                
                state_t = minibatch[i][0]    # 4D stack of images
                action_t = minibatch[i][1]   #This is action index
                reward_t = minibatch[i][2]   #reward at state_t due to action_t
                state_t1 = minibatch[i][3]   #next state
                terminal = minibatch[i][4]   #wheather the agent died or survided due the action
                inputs[i:i + 1] = state_t    
                targets[i] = model.predict(state_t)  # predicted q values
                print('before')
                Q_sa = model.predict(state_t1)      #predict q values for next step
                print(f'after, {Q_sa}')
                if terminal:
                    targets[i, action_t] = reward_t # if terminated, only equals reward
                else:
                    targets[i, action_t] = reward_t + GAMMA * np.max(Q_sa)

                loss += model.train_on_batch(inputs, targets)
        s_t = s_t1 
        t = t + 1
        print("TIMESTEP", t, "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t,"/ Q_MAX " , np.max(Q_sa), "/ Loss ", loss)


In [31]:
def trainBatch(model, minibatch, s_t):
  inputs = np.zeros((BATCH, s_t.shape[1], s_t.shape[2], s_t.shape[3]))   #32, 20, 40, 4
  targets = np.zeros((inputs.shape[0], ACTIONS))                         #32, 2
  loss = 0
  
  for i in range(0, len(minibatch)):                
                state_t = minibatch[i][0]    # 4D stack of images
                action_t = minibatch[i][1]   #This is action index
                reward_t = minibatch[i][2]   #reward at state_t due to action_t
                state_t1 = minibatch[i][3]   #next state
                terminal = minibatch[i][4]   #wheather the agent died or survided due the action
                inputs[i:i + 1] = state_t    
                targets[i] = model.predict(state_t)  # predicted q values
                print('before')
                Q_sa = model.predict(state_t1)      #predict q values for next step
                print(f'after, {Q_sa}')
                if terminal:
                    targets[i, action_t] = reward_t # if terminated, only equals reward
                else:
                    targets[i, action_t] = reward_t + GAMMA * np.max(Q_sa)

                loss += model.train_on_batch(inputs, targets)


In [32]:
#argument: observe, only plays if true, else trains
def playGame(observe=False):
    game = Game()
    dino = DinoAgent(game)
    game_state = Game_sate(dino,game)
    model = buildmodel()
    trainNetwork(model,game_state)


In [None]:
playGame(observe=False)

  self._driver = webdriver.Chrome(chrome_options=chrome_options)


Exception Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=87.0.4280.141)

Now we build the model
We finish building the model
TIMESTEP 1 / EPSILON 0.1 / ACTION 0 / REWARD 0.02 / Q_MAX  0 / Loss  0
TIMESTEP 2 / EPSILON 0.1 / ACTION 0 / REWARD 0.030000000000000006 / Q_MAX  0 / Loss  0
TIMESTEP 3 / EPSILON 0.1 / ACTION 0 / REWARD 0.04 / Q_MAX  0 / Loss  0
TIMESTEP 4 / EPSILON 0.1 / ACTION 0 / REWARD 0.05 / Q_MAX  0 / Loss  0
TIMESTEP 5 / EPSILON 0.1 / ACTION 0 / REWARD 0.06000000000000001 / Q_MAX  0 / Loss  0
TIMESTEP 6 / EPSILON 0.1 / ACTION 0 / REWARD 0.07 / Q_MAX  0 / Loss  0
TIMESTEP 7 / EPSILON 0.1 / ACTION 0 / REWARD 0.08 / Q_MAX  0 / Loss  0
TIMESTEP 8 / EPSILON 0.1 / ACTION 0 / REWARD 0.09 / Q_MAX  0 / Loss  0
TIMESTEP 9 / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
TIMESTEP 10 / EPSILON 0.1 / ACTION 0 / REWARD 0.11000000000000001 / Q_MAX  0 / Loss  0
TIMESTEP 11 / EPSILON 0.1 / ACTION 0 / REWARD 0.12000000000000002 / Q_MAX  0 / Loss  0


before
after, [[9.9922225e-05 0.0000000e+00]]
before
after, [[8.36062e-05 0.00000e+00]]
before
after, [[0.00011216 0.        ]]
before
after, [[0.00017404 0.        ]]
before
after, [[0.00024959 0.        ]]
before
after, [[-0.16129029 -0.2754026 ]]
before
after, [[ 8.4963453e-04 -3.0815212e-05]]
before
after, [[ 0.41985878 -0.23679335]]
before
after, [[ 0.24536122 -0.33099884]]
before
after, [[ 4.0662563e-03 -7.2413386e-06]]
before
after, [[ 0.46239975 -0.4319093 ]]
before
after, [[ 0.37802893 -0.29137233]]
before
after, [[ 0.5725067 -0.254966 ]]
before
after, [[ 0.53857464 -0.3850529 ]]
before
after, [[1.1308606e-02 2.8446004e-05]]
before
after, [[ 0.60654783 -0.35451752]]
before
after, [[ 0.50809914 -0.33519292]]
before
after, [[1.6568828e-02 7.1171322e-05]]
before
after, [[ 0.54915583 -0.36667648]]
before
after, [[ 2.0303501e-02 -4.8828962e-05]]
before
after, [[ 0.66288006 -0.29229027]]
before
after, [[ 0.5961643 -0.3618058]]
before
after, [[ 0.02685984 -0.00010622]]
before
after, 

before
after, [[0.05833036 0.0022403 ]]
before
after, [[1.4820821 0.412204 ]]
before
after, [[1.3958795  0.19638671]]
before
after, [[0.06288363 0.00191629]]
TIMESTEP 107 / EPSILON 0.094006 / ACTION 0 / REWARD -0.3235294117647059 / Q_MAX  0.06288363 / Loss  1.2867609793320298
