In [1]:
import numpy as np
import matplotlib.pyplot as plt
import io
import cv2
import time
from random import randint
import random
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import math
from collections import deque
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Input
from keras.optimizers import Adam
%matplotlib inline

In [2]:
class Agent:
    """Deep Q-learning agent with experience replay and a lagged second network.

    Two identically shaped Keras networks are kept:

    * ``model`` is trained on every call to :meth:`replay` and is also used to
      pick greedy actions in :meth:`act`.
    * ``select_model`` is only ever changed by :meth:`update_model`, which
      copies ``model``'s weights into it. In :meth:`replay` it chooses the
      next-state action whose value is then read from ``model``
      (a Double-DQN-style split between action selection and evaluation).
    """

    def __init__(self, state_size, action_size, env):
        """Create the agent.

        Args:
            state_size: Length of the observation vector fed to the networks.
            action_size: Number of discrete actions (width of the output layer).
            env: Environment object; only ``env.action_space.sample()`` is used
                by the agent itself (for exploratory actions).
        """
        self.env = env
        self.state_size = state_size
        self.action_size = action_size
        self.maxScore = 0

        # Replay buffer; the oldest transitions are dropped once it is full.
        self.memory = deque(maxlen=50000)

        self.gam = 0.99  # discount
        self.eps = 1.0  # exploration rate
        self.eps_decay = .993
        self.eps_min = 0.01
        self.learning_rate = 0.001

        self.action_space = [*range(self.action_size)]

        self.model = self.build_model()
        self.select_model = self.build_model()

        self.batch_size = 32

        # select_model is refreshed every `target_update_freq` training steps.
        # A dedicated counter is used because the buffer length stops changing
        # once the deque is full, so it cannot serve as a step counter.
        self.target_update_freq = 200
        self.train_steps = 0

    def build_model(self):
        """Build and compile a one-hidden-layer MLP mapping a state to one value per action."""
        model = Sequential()
        model.add(Dense(128, input_dim=self.state_size, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # NOTE(review): recent Keras releases name this argument `learning_rate`;
        # `lr` is the legacy spelling -- confirm against the installed version.
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))

        return model

    def remember(self, state, action, reward, next_state, done, score=None):
        """Append one transition to the replay buffer.

        ``score`` is stored as a sixth element but is never read by
        :meth:`replay`, so it is optional.
        """
        self.memory.append((state, action, reward, next_state, done, score))

    def rememberList(self, memoList):
        """Append an iterable of already-built transition tuples to the replay buffer."""
        self.memory.extend(memoList)

    def act(self, state):
        """Choose an action epsilon-greedily.

        With probability ``self.eps`` a random action is drawn from the agent's
        own environment; otherwise the action with the highest value predicted
        by ``self.model`` is returned.
        """
        if np.random.rand() <= self.eps:
            # Use the environment handed to the constructor rather than a
            # module-level `env`, which may not exist or may be another env.
            return self.env.action_space.sample()
        state = np.array([state])
        action_val = self.model.predict(state)
        return np.argmax(action_val[0])

    def replay(self):
        """Run one training step on a random minibatch from the replay buffer.

        Does nothing until the buffer holds at least ``self.batch_size``
        transitions. Every ``self.target_update_freq`` training steps the
        weights of ``self.model`` are copied into ``self.select_model``.
        """
        if len(self.memory) < self.batch_size:
            return
        minibatch = random.sample(self.memory, self.batch_size)

        # Only the first five fields of each stored tuple are used.
        states = np.array([t[0] for t in minibatch])
        actions = np.array([t[1] for t in minibatch])
        rewards = np.array([t[2] for t in minibatch])
        next_states = np.array([t[3] for t in minibatch])
        # 1.0 for non-terminal transitions, 0.0 for terminal ones.
        not_done = 1.0 - np.array([t[4] for t in minibatch], dtype=float)

        rows = np.arange(self.batch_size)

        # select_model picks the next action, model supplies that action's value.
        next_state_select = self.select_model.predict_on_batch(next_states)
        action_select = np.argmax(next_state_select, axis=1)
        next_state_eval = np.asarray(self.model.predict_on_batch(next_states))
        action_eval = next_state_eval[rows, action_select]

        # Terminal transitions get no bootstrapped future value.
        targets = rewards + self.gam * action_eval * not_done

        # Copy so the in-place write below never touches a backend-owned buffer.
        values_current = np.array(self.model.predict_on_batch(states))
        # Only the taken action's value is moved towards its target; the other
        # outputs keep their current predictions and contribute zero error.
        values_current[rows, actions] = targets

        # Pass batch_size explicitly so the whole minibatch is a single
        # gradient step regardless of Keras's default mini-batch size.
        self.model.fit(states, values_current, batch_size=self.batch_size,
                       epochs=1, verbose=0)

        self.train_steps += 1
        if self.train_steps % self.target_update_freq == 0:
            self.update_model()

    def decay_eps(self):
        """Multiply the exploration rate by ``eps_decay``, never going below ``eps_min``."""
        self.eps = max(self.eps_min, self.eps * self.eps_decay)

    def update_model(self):
        """Copy the weights of ``self.model`` into ``self.select_model``."""
        self.select_model.set_weights(self.model.get_weights())

In [3]:
import gym
import random
# Train the agent on LunarLander-v2, printing the per-episode score and the
# running average over the last 100 episodes.
env = gym.make('LunarLander-v2')
# Read the network sizes from the environment instead of hard-coding them
# (8 observations and 4 actions for LunarLander-v2).
agent = Agent(env.observation_space.shape[0], int(env.action_space.n), env)
score_history = []
num_epis = 6000

# NOTE(review): nothing in the visible code reads this value; the agent trains
# with its own `agent.batch_size`. Kept in case another cell uses it.
batch_size = 64

try:
    for i in range(1, num_epis + 1):
        done = False
        score = 0
        state = env.reset()

        while not done:
            env.render()
            action = agent.act(state)

            next_state, reward, done, info = env.step(action)
            agent.memory.append((state, action, reward, next_state, done))
            state = next_state
            score += reward
            # One training step per environment step.
            agent.replay()

        # Exploration rate is decayed once per episode.
        agent.decay_eps()

        score_history.append(score)
        avg_score = np.mean(score_history[-100:])
        print('episode %d, score %.2f, avg score %.2f, eps %.3f' % (i, score, avg_score, agent.eps))
finally:
    # Release the render window and simulator even if training is interrupted.
    env.close()