In [2]:
import numpy as np
from PIL import Image

import cv2
import io
import time
import random
import pickle
import os
from io import BytesIO
import base64
import json
import pandas as pd

from collections import deque
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys

from webdriver_manager.chrome import ChromeDriverManager

In [3]:
# https://github.com/Paperspace/DinoRunTutorial/blob/master/Reinforcement%20Learning%20Dino%20Run.ipynb

In [4]:
game_url = "chrome://dino"
chrome_driver_path = ChromeDriverManager().install()

loss_file_path = "./objects/loss.csv"
actions_file_path = "./objects/actions.csv"
q_value_file_path = "./objects/q_values.csv"
scores_file_path = "./objects/scores.csv"

init_script = "document.getElementsByClassName('runner-canvas')[0].id = 'runner-canvas'"
getbase64Script = "canvasRunner = document.getElementById('runner-canvas'); return canvasRunner.toDataURL().substring(22)"

In [5]:
def save_obj(obj, name):
    with open('objects/' + name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open('objects/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

def grab_screen(_driver):
    image_b64 = _driver.execute_script(getbase64Script)
    screen = np.array(Image.open(BytesIO(base64.b64decode(image_b64))))
    image = process_img(screen)
    return image

def process_img(image):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    image = image[:300, :500] # Crop Region of Interest (ROI)
    image = cv2.resize(image, (80, 80))
    return image

def show_img(graphs = False):
    while True:
        screen = (yield)
        window_title = "logs" if graphs else "game_play"
        cv2.namedWindow(window_title, cv2.WINDOW_NORMAL)
        imS = cv2.resize(screen, (800, 400))
        cv2.imshow(window_title, screen)
        if (cv2.waitKey(1) & 0xFF == ord('q')):
            cv2.destroyAllWindows()
            break

def obj_init(D, t, epsilon):    
    if not os.path.isdir('objects'):
        os.makedirs('objects')
    
    if not os.path.isfile('./objects/D.pkl'):
        save_obj(D, "D")
    if not os.path.isfile('./objects/time.pkl'):
        save_obj(t, "time")
    if not os.path.isfile('./objects/epsilon.pkl'):
        save_obj(epsilon, "epsilon")

In [6]:
class Game:
    def __init__(self, custom_config=True):
        chrome_options = Options()
        chrome_options.add_argument("disable-infobars")
        chrome_options.add_argument("--mute-audio")
        service = Service(chrome_driver_path)
        self._driver = webdriver.Chrome(service=service, options=chrome_options)
        self._driver.set_window_position(x=300,y=300)
        self._driver.set_window_size(900, 600)
        
        try : 
            self._driver.get(game_url)
        except:
            pass
        self._driver.execute_script("Runner.config.ACCELERATION=0")
        self._driver.execute_script(init_script)
    def get_crashed(self):
        return self._driver.execute_script("return Runner.instance_.crashed")
    def get_playing(self):
        return self._driver.execute_script("return Runner.instance_.playing")
    def restart(self):
        self._driver.execute_script("Runner.instance_.restart()")
    def press_up(self):
        self._driver.find_element("tag name", "body").send_keys(Keys.ARROW_UP)
    def press_down(self):
        self._driver.find_element("tag name", "body").send_keys(Keys.ARROW_DOWN)
    def get_score(self):
        score_array = self._driver.execute_script("return Runner.instance_.distanceMeter.digits")
        score = ''.join(score_array)
        return int(score)
    def pause(self):
        return self._driver.execute_script("return Runner.instance_.stop()")
    def resume(self):
        return self._driver.execute_script("return Runner.instance_.play()")
    def end(self):
        self._driver.close()

In [7]:
class DinoAgent:
    def __init__(self, game):
        self._game = game
        self.jump()
    def is_running(self):
        return self._game.get_playing()
    def is_crashed(self):
        return self._game.get_crashed()
    def jump(self):
        self._game.press_up()
    def duck(self):
        self._game.press_down()

In [8]:
class Game_state:
    def __init__(self, agent, game):
        self._agent = agent
        self._game = game
        self._display = show_img()
        self._display.__next__()
        
    def get_state(self, actions):
        actions_df.loc[len(actions_df)] = [actions]
        score = self._game.get_score()
        reward = 0.1
        is_over = False
        
        if actions[1] == 1:
            self._agent.jump()
            reward = -0.01
        
        image = grab_screen(self._game._driver)
        self._display.send(image)
        
        if self._agent.is_crashed():
            scores_df.loc[len(loss_df)] = score
            self._game.restart()
            reward = -10
            is_over = True
        
        return image, reward, is_over

In [9]:
loss_df = pd.read_csv(loss_file_path) if os.path.isfile(loss_file_path) else pd.DataFrame(columns = ['loss'])
scores_df = pd.read_csv(scores_file_path) if os.path.isfile(scores_file_path) else pd.DataFrame(columns = ['scores'])
actions_df = pd.read_csv(actions_file_path) if os.path.isfile(actions_file_path) else pd.DataFrame(columns = ['actions'])
q_values_df = pd.read_csv(q_value_file_path) if os.path.isfile(q_value_file_path) else pd.DataFrame(columns = ['qvalues'])

In [10]:
# Parameters
ACTIONS = 2
GAMMA = 0.99
OBSERBATION = 100.  # timesteps to observe before training
EXPLORE = 100000.  # frames over which to anneal epsilon
FINAL_EPSILON = 0.0001  # final value of epsilon
INITIAL_EPSILON = 0.01  # starting value of epsilon
REPLAY_MEMORY = 50000  # number of previous transitions to remember
BATCH_SIZE = 16  # size of minibatch
FRAME_PER_ACTION = 1
LEARNING_RATE = 1e-6
img_channels = 4  # We stack 4 frames

### Model

In [11]:

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class DinoNet(nn.Module):
    def __init__(self):
        super(DinoNet, self).__init__()
        self.conv1 = nn.Conv2d(4, 32, (8, 8), stride = 4, padding = 1)
        self.conv2 = nn.Conv2d(32, 64, (4, 4), stride = 2, padding = 1)
        self.conv3 = nn.Conv2d(64, 64, (3, 3), stride = 1, padding = 1)
        self.relu = nn.ReLU()
        self.max_pool2d = nn.MaxPool2d((2, 2))
        self.fc1 = nn.Linear(64, 32)
        self.fc2 = nn.Linear(32, ACTIONS)
        
    def forward(self, x):
        x = x.permute(0, 3, 1, 2)
        x = self.max_pool2d(self.relu(self.conv1(x)))
        x = self.max_pool2d(self.relu(self.conv2(x)))
        x = self.max_pool2d(self.relu(self.conv3(x)))
        x = x.reshape(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [12]:
model = DinoNet()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.MSELoss()

# create a new model if not exist
if not os.path.isdir("./model"):
    os.makedirs("./model")

In [13]:
def load_model():
    model.load_state_dict(torch.load(f"./latest.pth"))
    
load_model()

In [14]:
def trainNetwork(model, game_state, observe = False):
    last_time = time.time()
    D = load_obj("D")       # Load from file
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1       # 0 => do nothing, 1 => jump
    
    x_t, r_0, terminal = game_state.get_state(do_nothing)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2) # stack 4 images to create placeholder input
    s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2]) # 1*20*40*4
    initial_state = s_t
    
    if observe:
        OBSERVE = 999999999    # We keep observe, never train
        epsilon = FINAL_EPSILON
    
    else:
        OBSERVE = OBSERBATION
        epsilon = load_obj("epsilon")
    
    t = load_obj("time")        # resume from the previous time step
    while(True):
        loss_sum = 0
        Q_sa = 0
        action_index = 0
        random_action = 0
        r_t = 0      # reward at 4
        a_t = np.zeros([ACTIONS])   # action at t
        
        # choose an action epsilon greedy
        if t % FRAME_PER_ACTION == 0:
            if random.random() <= epsilon:
                random_action = 1
                action_index = random.randrange(ACTIONS)
                a_t[action_index] = 1
            else:
                q = model(torch.tensor(s_t).float())
                max_Q, action_index = torch.max(q, 1)
                a_t[action_index] = 1
                
        # reduce epsilon gradually
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
            
            
        # run the selected action and observed next state and reward
        x_t1, r_t, terminal = game_state.get_state(a_t)
        last_time = time.time()
        x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1], 1)
        s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)
        
        # store the transition in D
        if len(D) > REPLAY_MEMORY:
            D.pop()
        D.append((s_t, action_index, r_t, s_t1, terminal))
        
        # only train if done observing
        if t > OBSERBATION:
            minibatch = random.sample(D, BATCH_SIZE)
            inputs = np.zeros((BATCH_SIZE, s_t.shape[1], s_t.shape[2], s_t.shape[3]))
            targets = np.zeros((inputs.shape[0], ACTIONS))
            
            for i in range(BATCH_SIZE):
                state_t = minibatch[i][0]   # 4D stack of images
                action_t = minibatch[i][1]  # Action index
                reward_t = minibatch[i][2]  # reward at state_t due to action_t
                state_t1 = minibatch[i][3]  # next state
                terminal = minibatch[i][4]  # wheather the agent died or survided due the action
                
                inputs[i:i+1] = state_t
                targets[i] = model(torch.tensor(state_t).float()).detach().numpy()
                Q_sa = model(torch.tensor(state_t1).float()).detach().numpy()
                
                if terminal:
                    targets[i, action_t] = reward_t
                else:
                    targets[i, action_t] = reward_t + GAMMA * np.max(Q_sa)

            # train
            outputs = model(torch.tensor(inputs).float())
            loss = loss_fn(outputs, torch.tensor(targets).float())
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            loss_sum += loss.item()
            loss_df.loc[len(loss_df)] = loss_sum
            q_values_df.loc[len(q_values_df)] = np.max(Q_sa)
    
        s_t = initial_state if terminal else s_t1   # reset game to initial frame if terminate
        t = t + 1
        
        # save progress every 1000 iterations
        if t % 1000 == 0:
            game_state._game.pause() #pause game while saving to filesystem
            torch.save(model.state_dict(), f"./model/episode_{t}.pth")
            torch.save(model.state_dict(), f"./latest.pth")
            save_obj(D,"D") #saving episodes
            save_obj(t,"time") #caching time steps
            save_obj(epsilon,"epsilon") #cache epsilon to avoid repeated randomness in actions
            loss_df.to_csv("./objects/loss_df.csv",index=False)
            scores_df.to_csv("./objects/scores_df.csv",index=False)
            actions_df.to_csv("./objects/actions_df.csv",index=False)
            q_values_df.to_csv(q_value_file_path,index=False)
            game_state._game.resume()
            
        print(f'timestep: {t}, random: {random_action}, epsilon: {round(epsilon, 3)}, action: {action_index}, reward: {r_t}, Q_max: {round(np.max(Q_sa),3)}, loss: {round(loss_sum, 3)}')
        

In [15]:
def playGame(observe=False):
    # obj_init([], 0, INITIAL_EPSILON)
    game = Game()
    dino = DinoAgent(game)
    game_state = Game_state(dino, game)
    try :
        trainNetwork(model, game_state, observe)
    except StopIteration:
        game.end()

In [16]:
playGame(observe=False)

timestep: 1, random: 0, epsilon: 0.01, action: tensor([0]), reward: 0.1, Q_max: 0, loss: 0
timestep: 2, random: 0, epsilon: 0.01, action: tensor([0]), reward: 0.1, Q_max: 0, loss: 0
timestep: 3, random: 0, epsilon: 0.01, action: tensor([0]), reward: 0.1, Q_max: 0, loss: 0
timestep: 4, random: 0, epsilon: 0.01, action: tensor([0]), reward: 0.1, Q_max: 0, loss: 0
timestep: 5, random: 0, epsilon: 0.01, action: tensor([0]), reward: 0.1, Q_max: 0, loss: 0
timestep: 6, random: 0, epsilon: 0.01, action: tensor([0]), reward: 0.1, Q_max: 0, loss: 0
timestep: 7, random: 0, epsilon: 0.01, action: tensor([0]), reward: 0.1, Q_max: 0, loss: 0
timestep: 8, random: 0, epsilon: 0.01, action: tensor([0]), reward: 0.1, Q_max: 0, loss: 0
timestep: 9, random: 0, epsilon: 0.01, action: tensor([0]), reward: 0.1, Q_max: 0, loss: 0
timestep: 10, random: 0, epsilon: 0.01, action: tensor([0]), reward: 0.1, Q_max: 0, loss: 0
timestep: 11, random: 0, epsilon: 0.01, action: tensor([0]), reward: 0.1, Q_max: 0, loss:

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=123.0.6312.106)
Stacktrace:
	GetHandleVerifier [0x01084CE3+225091]
	(No symbol) [0x00FB4E31]
	(No symbol) [0x00E59A7A]
	(No symbol) [0x00E3E312]
	(No symbol) [0x00EB517B]
	(No symbol) [0x00EC55A6]
	(No symbol) [0x00EAF2F6]
	(No symbol) [0x00E879B9]
	(No symbol) [0x00E8879D]
	sqlite3_dbdata_init [0x014F9A83+4064547]
	sqlite3_dbdata_init [0x0150108A+4094762]
	sqlite3_dbdata_init [0x014FB988+4072488]
	sqlite3_dbdata_init [0x011FC9E9+930953]
	(No symbol) [0x00FC0804]
	(No symbol) [0x00FBAD28]
	(No symbol) [0x00FBAE51]
	(No symbol) [0x00FACAC0]
	BaseThreadInitThunk [0x7596FCC9+25]
	RtlGetAppContainerNamedObjectPath [0x77217C5E+286]
	RtlGetAppContainerNamedObjectPath [0x77217C2E+238]
