In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

from random import sample
from collections import deque

def init_dqn(B, kwargs):
    """Attach DQN state (hyperparameters, Q-network, replay history) to ``B``.

    Reads the module-level ``env`` to obtain the initial observation and the
    sizes of the network's input and output layers.

    Args:
        B: Agent object that receives the attributes set below.
        kwargs: Mapping with the keys ``N_arms``, ``alpha``, ``discount``,
            ``eps``, ``eps_min``, ``eps_reduction``, ``batch_size``,
            ``layer_size`` (sequence of hidden-layer widths) and ``lr``.
    """
    B.N_arms = kwargs['N_arms']

    # Keep the current observation as a single-row batch, the shape the
    # network is fed at prediction time.
    B.context = env.reset()
    B.context = B.context.reshape(1, B.context.shape[0])
    B.done = False

    # Scalar hyperparameters are copied verbatim from kwargs onto the agent.
    for name in ('alpha', 'discount', 'eps', 'eps_min', 'eps_reduction',
                 'batch_size'):
        setattr(B, name, kwargs[name])

    # Fully connected network: ReLU hidden layers, linear output with one
    # unit per environment action.
    B.model = network = Sequential()
    network.add(Dense(kwargs['layer_size'][0],
                      input_dim=env.observation_space.shape[0],
                      activation='relu'))
    for width in kwargs['layer_size'][1:]:
        network.add(Dense(width, activation='relu'))
    network.add(Dense(env.action_space.n, activation='linear'))
    network.compile(loss='mse', optimizer=Adam(lr=kwargs['lr']))

    # Bounded buffer of past transitions; the oldest entries are evicted
    # once the maximum length is reached.
    B.history = deque(maxlen=1000000)

def dqn(B):
    """Choose an action epsilon-greedily from the agent's Q-network.

    With probability ``B.eps`` an arm index is drawn via
    ``randint(0, B.N_arms-1)``; otherwise the index of the largest value
    predicted by ``B.model`` for ``B.context`` is returned.
    """
    # Exploration branch: one uniform draw decides whether to act randomly.
    if random() < B.eps:
        return randint(0, B.N_arms - 1)

    # Exploitation branch: act greedily on the predicted action values.
    q_values = B.model.predict(B.context)
    return np.argmax(q_values)

def update_dqn(B, arm, reward, observation):
    """Record one transition and run one replay training step.

    The transition ``(B.context, arm, reward, observation, terminal)`` is
    appended to ``B.history`` and ``B.context`` advances to ``observation``.
    Once at least ``B.batch_size`` transitions are stored, a random minibatch
    is sampled, Q-learning targets are built and ``B.model`` is fitted on
    them for one epoch; ``B.eps`` is then decayed towards ``B.eps_min``.

    Args:
        B: Agent object carrying ``history``, ``context``, ``done``,
            ``batch_size``, ``discount``, ``eps``, ``eps_min``,
            ``eps_reduction`` and ``model``.
        arm: Index of the action that was taken.
        reward: Reward received for that action.
        observation: 1-D observation that followed the action.

    Note:
        ``B.done`` must describe ``observation`` (True if it ended the
        episode) at the time of the call; it is reset to False after a
        training step.
    """
    observation = observation.reshape(1, observation.shape[0])

    # Store the terminal flag with the transition. Minibatches are sampled
    # at random, so "the last sampled element" says nothing about which
    # transitions actually ended an episode.
    terminal = bool(B.done)
    B.history.append((B.context, arm, reward, observation, terminal))
    B.context = observation

    # Not enough experience for a full minibatch yet.
    if len(B.history) < B.batch_size:
        return

    minibatch = sample(B.history, B.batch_size)
    states, actions, rewards, observations, terminals = zip(*minibatch)

    # Each stored state is a (1, obs_dim) row; concatenating keeps the batch
    # axis even when batch_size == 1 (np.squeeze would drop it).
    states = np.concatenate(states, axis=0)
    observations = np.concatenate(observations, axis=0)
    actions = np.asarray(actions, dtype=int)
    rewards = np.asarray(rewards, dtype=float)
    terminals = np.asarray(terminals, dtype=bool)

    # Q-learning target: r for terminal transitions,
    # r + discount * max_a' Q(s', a') otherwise.
    Q_next = np.asarray(B.model.predict_on_batch(observations))
    bootstrapped = rewards + B.discount * np.amax(Q_next, axis=1)
    Q_new = np.where(terminals, rewards, bootstrapped)

    # Only the taken action's value is moved towards its target; the other
    # outputs keep their current predictions so they contribute zero error.
    # np.array makes a writable copy of whatever the backend returned.
    Q = np.array(B.model.predict_on_batch(states))
    Q[np.arange(len(actions)), actions] = Q_new

    B.model.fit(states, Q, epochs=1, verbose=0)

    # Multiplicative exploration decay, applied once per training step.
    if B.eps > B.eps_min:
        B.eps *= B.eps_reduction
    B.done = False

In [None]:
import gym
#env = gym.make('MountainCar-v0')
env = gym.make('LunarLander-v2')
env.reset()


kwargs = { "N_arms": 4, "alpha": 0.001, "layer_size": [150,120], "discount": 0.99, "eps": 1.0, 
           "eps_min": 0.01, "eps_reduction": 0.998, "lr": 0.001, "batch_size": 64 }

B = Bandit(env, dqn, update_dqn, init_dqn, kwargs )

rewards = []        # total return of every episode, in order
all_weights = []
counter = 0         # number of episodes started so far
best_return = -200

interval = 25       # report the running average every `interval` episodes

for i in range(1000):
    counter += 1
    total_return = 0

    # Hand the fresh initial observation to the agent. Without this the
    # agent would still hold the previous episode's final observation and
    # would choose (and store) its first action against the wrong state.
    B.context = env.reset().reshape(1, -1)
    B.done = False

    for j in range(3000):
        #env.render()
        arm = B.policy(B)
        observation, reward, done, info = env.step(arm)
        # The update function reads B.done to know whether this step ended
        # the episode.
        B.done = done
        B.update(B, arm, reward, observation)
        total_return += reward
        if done:
            break

    # Record every episode, including one cut off by the step cap, so the
    # reporting window below always lines up with the episode count.
    rewards.append( total_return )
    if total_return > best_return:
        best_return = total_return

    # Report after the episode has finished so the window holds exactly the
    # most recent episodes that are being averaged.
    if counter%interval == 0:
        recent = rewards[-interval:]
        print( counter, "\t", sum(recent)/float(len(recent)) )
env.close()
print( "Done after ", counter, " episodes" )
print(best_return)
#plt.plot(rewards)