In [1]:
import gym
import numpy as np
import random
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

from collections import deque

Using TensorFlow backend.


This notebook follows the tutorial at https://towardsdatascience.com/reinforcement-learning-w-keras-openai-dqns-1eed3a5338c

For a description of the environment's observation and action spaces, see https://github.com/openai/gym/wiki/MountainCar-v0

In [2]:
class DQN:
    """Deep Q-learning agent with experience replay and a soft-updated target network.

    Two networks built by ``create_model`` are kept: ``model`` is fitted on
    replayed transitions and chooses actions, while ``target_model`` supplies
    the Q-values used to build the training targets and is blended towards
    ``model`` by ``target_train``.
    """

    def __init__(self, env):
        """Store the environment and set up hyper-parameters and both networks.

        Args:
            env: environment exposing ``observation_space.shape`` and an
                ``action_space`` with ``n`` and ``sample()``.
        """
        self.env = env
        # Replay buffer; with maxlen set, the oldest transitions are dropped
        # once 2000 are stored.
        self.memory = deque(maxlen=2000)

        self.gamma = 0.85           # discount applied to the estimated future value
        self.epsilon = 1.0          # current probability of taking a random action
        self.epsilon_min = 0.01     # lower bound for epsilon
        self.epsilon_decay = 0.995  # epsilon is multiplied by this on every act() call
        self.learning_rate = 0.005  # step size handed to the Adam optimizer
        self.tau = .125             # share of the online weights mixed in per target_train()

        self.model = self.create_model()
        self.target_model = self.create_model()

    def create_model(self):
        """Build and compile the Q-network.

        Returns:
            A compiled ``Sequential`` model mapping an observation to one
            output per action (the last layer has no activation).
        """
        model = Sequential()
        state_shape = self.env.observation_space.shape
        model.add(Dense(24, input_dim=state_shape[0], activation="relu"))
        model.add(Dense(48, activation="relu"))
        model.add(Dense(24, activation="relu"))
        model.add(Dense(self.env.action_space.n))
        model.compile(loss="mean_squared_error",
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def act(self, state):
        """Pick an action epsilon-greedily, decaying epsilon first.

        Args:
            state: observation in the form ``self.model.predict`` accepts
                (the training loop passes a ``(1, obs_dim)`` array).

        Returns:
            A random action from ``env.action_space.sample()`` with
            probability ``epsilon``; otherwise the index of the largest
            predicted Q-value.
        """
        # Decay happens on every call, i.e. once per environment step.
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state)[0])

    def remember(self, state, action, reward, new_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append([state, action, reward, new_state, done])

    def replay(self, batch_size=32):
        """Fit the online model on a random minibatch of remembered transitions.

        For every sampled transition the target network's prediction is used
        as the regression target, with the entry of the taken action replaced
        by the Bellman target: ``reward`` when ``done`` is set, otherwise
        ``reward + gamma * max_a Q_target(new_state, a)``.

        Args:
            batch_size: number of transitions to sample. Nothing happens
                until the buffer holds at least this many (or if it is not
                positive).
        """
        if batch_size <= 0 or len(self.memory) < batch_size:
            return

        samples = random.sample(self.memory, batch_size)

        # The target network is not modified inside this method, so all of its
        # predictions can be computed in two batched calls instead of two
        # calls per sample.
        states = np.concatenate([sample[0] for sample in samples], axis=0)
        new_states = np.concatenate([sample[3] for sample in samples], axis=0)
        targets = self.target_model.predict(states)
        future_q = self.target_model.predict(new_states).max(axis=1)

        for i, (state, action, reward, _new_state, done) in enumerate(samples):
            if done:
                # Episode ended on this transition: no future value to bootstrap.
                targets[i][action] = reward
            else:
                targets[i][action] = reward + future_q[i] * self.gamma  # Bellman equation
            # One gradient step per sample, as before; verbose=0 keeps Keras
            # from printing a progress line for every single-sample fit.
            self.model.fit(state, targets[i:i + 1], epochs=1, verbose=0)

    def target_train(self):
        """Move the target network a fraction ``tau`` towards the online network."""
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        blended = [
            online * self.tau + target * (1 - self.tau)
            for online, target in zip(weights, target_weights)
        ]
        self.target_model.set_weights(blended)

    def save_model(self, fn):
        """Save the online model to the path ``fn`` via ``model.save``."""
        self.model.save(fn)

In [15]:
# NOTE(review): `env` is only created in the training cell further down
# (gym.make); on a fresh kernel this cell raises NameError unless that cell
# has been run first.
env.reset()

array([-0.46930122,  0.        ])

In [41]:
# Training configuration.
env = gym.make("MountainCar-v0")
gamma   = 0.9   # NOTE: not read by the agent -- DQN sets its own gamma in __init__
epsilon = .95   # NOTE: not read by the agent -- DQN sets its own epsilon in __init__
trials  = 1000
trial_len = 500
# A trial whose last step index reaches max_episode_steps - 1 counts as a failure.
# NOTE(review): 200 reproduces the original hard-coded `step >= 199` check and
# presumably mirrors gym's time limit for MountainCar-v0 -- confirm for your gym version.
max_episode_steps = 200

dqn_agent = DQN(env=env)
steps = []  # number of environment steps taken in each trial
for trial in range(trials):
    # Start a new episode; the agent expects observations shaped (1, obs_dim).
    cur_state = env.reset().reshape(1, -1)
    for step in range(trial_len):
        # Choose an action (epsilon-greedy) and apply it to the environment.
        action = dqn_agent.act(cur_state)
        new_state, reward, done, _ = env.step(action)
        new_state = new_state.reshape(1, -1)

        # Store the transition, then learn from a random minibatch of memory.
        # NOTE(review): `done` is stored as-is, so an episode cut off by a step
        # limit is treated like a true terminal state -- confirm this is intended.
        dqn_agent.remember(cur_state, action, reward, new_state, done)
        dqn_agent.replay()        # fits the online (prediction) model
        dqn_agent.target_train()  # blends the online weights into the target model

        cur_state = new_state
        if done:
            break

    steps.append(step + 1)
    if step >= max_episode_steps - 1:
        print("Failed to complete in trial {}".format(trial))
        #if step % 10 == 0:
        #    dqn_agent.save_model("trial-{}.model".format(trial))
    else:
        # The episode ended before the step cap: stop training at the first success.
        print("Completed in {} trials".format(trial))
        #dqn_agent.save_model("success.model")
        break

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[[-0.50493326  0.00199986]] [[-0.9737966  -0.03538012 -0.08355838]]
[[-0.51461785  0.00239345]] [[ 0.03141573 -0.03601095 -0.97349936]]
[[-0.54816503  0.        ]] [[ 0.03355893 -0.03870121 -0.9715741 ]]
[[-5.04359734e-01 -2.86415473e-04]] [[-0.97363454 -0.03564698 -0.08361523]]
[[-5.46475279e-01  4.29488524e-05]] [[ 0.03345388 -0.9715808  -0.09057386]]
[[-0.50579036 -0.00143063]] [[-0.97344965 -0.03590182 -0.08392838]]
[[-0.53851507  0.00216528]] [[ 0.03288725 -0.9721679  -0.08911347]]
[[-0.54698095  0.00118408]] [[-0.9715666  -0.03845843 -0.09058188]]
[[-0.50693312  0.0021248 ]] [[ 0.0309553  -0.9737881  -0.08388155]]
[[-0.54560398  0.0007215 ]] [[ 0.03337518 -0.03842339 -0.97176623]]
[[-0.52593963  0.00348507]] [[-0.9728412  -0.03666353 -0.08694144]]
[[-0.50407332  0.00085994]] [[-0.97374535 -0.03547265 -0.08349159]]
[[-5.46260891e-01  2.14387976e-04]] [[ 0.03343436 -0.9716062

KeyboardInterrupt: 

In [21]:
env.reset() # starts a new episode and returns the initial observation; its two values correspond to the car's position and speed

array([-0.42457704,  0.        ])

In [22]:
env.action_space.n # number of discrete actions available; they correspond to pushing left, pushing right, or not pushing

3

In [22]:
env.action_space.sample() # draws one action from the action space (the same call DQN.act uses when exploring)

0