In [1]:
import gym
import numpy as np
import random
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.regularizers import l2
from collections import deque
import time

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [7]:
class QL:
    def __init__(self):
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_decay = 0.99
        self.epsilon_min = 0.1
        self.memory_len = 500
        self.env = gym.make("LunarLander-v2")
        self.nA= self.env.action_space.n
        self.nS= self.env.observation_space.shape[0]
        self.buffer = deque(maxlen=self.memory_len)
        self.model = self.create_network()
        self.target_model = self.create_network()
        self.iter=0
        
    def create_network(self):
        model = Sequential()
        model.add(Dense(128,activation='relu', activity_regularizer=l2(0.001), input_shape=(self.nS,)))
        model.add(Dense(32,activation='relu', activity_regularizer=l2(0.001)))
        model.add(Dense(self.nA))
        model.compile(loss='mse',optimizer='adam')
        return model
    
    def action(self,state):
        if np.random.rand()<=self.epsilon:
            return random.randrange(self.nA)
        else:
            return np.argmax(self.model.predict(np.array([state])[0]))
    
    def collect_data(self):
        curr_state = self.env.reset()
        for i in range(self.memory_len):
            next_action = self.action(curr_state)
            next_state, reward, done, info = self.env.step(next_action)
            if not done:
                self.buffer.append((curr_state,next_action,reward,next_state,done))
                curr_state = next_state
            else:
                self.buffer.append((curr_state,next_action,reward,next_state,done))
                curr_state = self.env.reset()
                
    def experience_replay(self,batch_size=64):
        minibatch = random.sample(self.buffer,batch_size)
        x = []
        y = []
        for curr_state,action,reward,next_state,done in self.buffer:
            td_target = reward
            if not done:
                td_target = reward + self.gamma*np.amax(self.target_model.predict(np.array([next_state]))[0])
            target = self.model.predict(np.array([curr_state]))[0]
            target[action] = td_target
            x.append(curr_state)
            y.append(target)
        x = np.array(x)
        y = np.array(y)
        self.model.fit(x,y,epochs=10,verbose=0)
        self.iter+=1
        print("Iteration {} loss: {}".format(self.iter,self.model.evaluate(x,y,verbose=0)))
        if self.epsilon>self.epsilon_min:
            self.epsilon = self.epsilon*self.epsilon_decay
    
    def target_model_update(self):
        weights = self.model.get_weights()
        self.target_model.set_weights(weights)
    
    def run(self,save_model_name):
        self.collect_data()
        while self.epsilon>self.epsilon_min:
            self.experience_replay()
            self.target_model_update()
        self.model.save(save_model_name)
            
    def render(self,saved_model=False, saved_model_name=None):
        if saved_model:
            self.model = load_model(saved_model_name)
        for i_episode in range(20):
            observation = self.env.reset()
            for t in range(1000):
#                 self.env.render()
                action = np.argmax(self.model.predict(np.array([observation]))[0])
                observation, reward, done, info = self.env.step(action)
                if done:
                    print("Episode finished after {} timesteps".format(t+1))
                    break
                time.sleep(0.01)
        self.env.close()

In [8]:
QL_NN = QL()
QL_NN.run('model3.h5')

Iteration 1 loss: 25.65554751968384
Iteration 2 loss: 23.574820447444917
Iteration 3 loss: 21.20874747610092
Iteration 4 loss: 19.504844694137574
Iteration 5 loss: 18.226036070585252
Iteration 6 loss: 17.157184839367865
Iteration 7 loss: 16.218447821497918
Iteration 8 loss: 15.497370860815048
Iteration 9 loss: 14.952485717058181
Iteration 10 loss: 14.517932628512382
Iteration 11 loss: 14.14151603460312
Iteration 12 loss: 13.819049837112427
Iteration 13 loss: 13.514335986971854
Iteration 14 loss: 13.271055732369422
Iteration 15 loss: 13.060887486934663
Iteration 16 loss: 12.774122889995574
Iteration 17 loss: 12.502358002185822
Iteration 18 loss: 12.164291334986686
Iteration 19 loss: 11.815499288797378
Iteration 20 loss: 11.593292006731033
Iteration 21 loss: 11.284758705735207
Iteration 22 loss: 10.964475357294083
Iteration 23 loss: 10.537407654762267
Iteration 24 loss: 10.185212156295776
Iteration 25 loss: 9.806725730419158
Iteration 26 loss: 9.499597683906556
Iteration 27 loss: 9.26136

Iteration 216 loss: 0.5894762399196625
Iteration 217 loss: 0.5850572726726532
Iteration 218 loss: 0.8233179244995117
Iteration 219 loss: 0.6260321853160858
Iteration 220 loss: 0.5983771182298661
Iteration 221 loss: 0.6473054629564285
Iteration 222 loss: 0.6690525476932525
Iteration 223 loss: 0.8675865244865417
Iteration 224 loss: 0.7038554041385651
Iteration 225 loss: 0.5897549324035645
Iteration 226 loss: 0.660957710146904
Iteration 227 loss: 0.5813650513887405
Iteration 228 loss: 0.6157048687934875
Iteration 229 loss: 0.8430904806852341
Iteration 230 loss: 0.6578761684894562


In [9]:
QL_NN.render(True,'model3.h5')

NotImplementedError: abstract