In [1]:
import sys, math
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import Box2D
from Box2D.b2 import (edgeShape, circleShape, fixtureDef, polygonShape, revoluteJointDef, contactListener)

In [3]:
import gym
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam, SGD
from keras.models import load_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
import random

In [23]:
env = gym.make('LunarLander-v2')

random.seed(4)
env.seed(4)

num_actions = env.action_space.n

model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(40))
model.add(Activation('relu'))
model.add(Dense(40))
model.add(Activation('relu'))
model.add(Dense(num_actions))
model.add(Activation('linear'))
model.compile(loss='mean_squared_error',  optimizer=Adam(lr=0.002, decay=2.25e-05))
print(model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 8)                 0         
_________________________________________________________________
dense_4 (Dense)              (None, 40)                360       
_________________________________________________________________
activation_4 (Activation)    (None, 40)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 40)                1640      
_________________________________________________________________
activation_5 (Activation)    (None, 40)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 4)                 164       
_________________________________________________________________
activation_6 (Activation)    (None, 4)                 0         
Total para

In [24]:
def forward_pass(state):
    input = np.empty([1, 1, 8])
    input[0][0] = state
    return model.predict(input)[0]

In [25]:
def get_best_action(state):
    """Returns the index of the action with the highest Q-value, i.e.
        argMax(Q(nxt_state, all_actions))
    """
    state_q_values = forward_pass(state)
    return np.argmax(state_q_values)

In [26]:
def get_targets(state, action, reward, next_state):
    """
    Returns a set of target Q-values for a particular <s, a, r, s'> tuple
    """
    current_state_q_values = forward_pass(state)
    next_state_q_values = forward_pass(next_state)
    max_q_next_state = np.max(next_state_q_values)
    targets = np.empty([1, num_actions])

    for i in range(num_actions):
        if i == action:
            targets[0][i] = reward + (gamma * max_q_next_state)
        else:
            targets[0][i] = current_state_q_values[i]
    return targets

In [27]:
def choose_action(state, epsilon):
    """
    Greedy-epsilon exploration. Chooses action with the highest Q(s,a) value.
    With probability epsilon chooses a random action.
    """
    r = np.random.uniform()
    if r < epsilon:
        action = np.floor(np.random.randint(num_actions))
    else:
        input = np.empty([1, 1, 8])
        input[0][0] = state
        q_values = model.predict(input)[0]
        action = np.argmax(q_values)
    return int(action)

In [28]:
class Memory(object):
    def __init__(self, memory_size=10000, experience_size=1):
        self.experiences = np.empty([0, experience_size], dtype=object)
        self.max_memory_size = memory_size

    def add_experience(self, experience):
        self.experiences = np.insert(self.experiences, 0,
                                     experience, axis=0)
        if len(self.experiences) > self.max_memory_size:
            self.experiences = np.delete(self.experiences,
                                         self.max_memory_size, axis=0)

    def sample_experiences(self, mini_batch_size):
        if(mini_batch_size > len(self.experiences)):
            rep_needed = True
        else:
            rep_needed = False
        s = self.experiences[np.random.choice(
                self.experiences.shape[0],
                mini_batch_size, replace=rep_needed)]
        return s


In [29]:
def pack_experience(state, action, reward, new_state):
    experience = np.empty([0])
    experience = np.append(experience, state)
    experience = np.append(experience, [action])
    experience = np.append(experience, [reward])
    experience = np.append(experience, new_state)
    return experience


In [30]:
def unpack_experience(experience):
    state = experience[0:8]
    action = experience[8]
    reward = experience[9]
    new_state = experience[10:18]
    return state, action, reward, new_state


In [31]:
def learn_from_replay_memories(memory, batch_size):
    """
    Take a uniformly distributed batch of experiences and set the corresponding
    targets. Then train the network sequentally on each individual
    (experience, target) pair.
    """
    sample_batch = memory.sample_experiences(batch_size)
    for e in sample_batch:
        state, action, reward, new_state = unpack_experience(e)
        targets = get_targets(state, action, reward, new_state)
        x = np.empty([1, 1, 8])
        x[0][0] = state
        model.train_on_batch(x, targets)

In [32]:
mini_batch_size = 5
replay_memory_size = 25
gamma = 0.1
epsilon = 0.1
max_steps_per_epoch = 1000
max_epochs = 5000

memory = Memory(replay_memory_size, 18)
total_reward = np.zeros(max_epochs)

In [31]:
for epoch in range(max_epochs):
    state = env.reset()
    current_step = 0
    epoch_done = False
    while current_step < max_steps_per_epoch and not epoch_done:
        # Choose an action using the greedy-epsilon policy
        action = choose_action(state, epsilon)
        new_state, reward, epoch_done, info = env.step(action)
        total_reward[epoch] = total_reward[epoch] + reward
        # Store the experience in memory buffer
        experience = pack_experience(state, action, reward, new_state)
        memory.add_experience(experience)

        current_step = current_step + 1
        state = new_state
        # Learn from past experiences
        learn_from_replay_memories(memory, mini_batch_size)

    if not epoch % 10 and epoch and gamma < 0.975:
        # Gradually increase gamma to improve the importance of future-rewards
        # as the NN learns and becomes more accurate
        gamma = gamma * 1.0125
        print("New gamma = {}".format(gamma))

    print("Episode {} reward = {}".format(epoch, total_reward[epoch]))
    if not epoch % 10 and epoch:
        print("---------------------------")
        print("Last 10 episode avg = {}".format(np.average(total_reward[epoch-10:epoch])))
        print("---------------------------")

    if epoch and np.average(total_reward[epoch-100:epoch]) > 150:
        break


Episode 0 reward = -243.54597836472803
Episode 1 reward = -241.69893251354287


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


Episode 2 reward = -430.9548857830631
Episode 3 reward = -23.25972402093055
Episode 4 reward = -7.939534938967597
Episode 5 reward = -151.95441142217024
Episode 6 reward = -287.32347879473457
Episode 7 reward = -126.60105099018205
Episode 8 reward = -94.6322418552663
Episode 9 reward = -198.5913024802604
New gamma = 0.10125
Episode 10 reward = -17.747454901115987
---------------------------
Last 10 episode avg = -180.65015411638456
---------------------------
Episode 11 reward = -203.71059360469386
Episode 12 reward = -103.17248345475012
Episode 13 reward = -272.6997079760948
Episode 14 reward = -50.64786280339349
Episode 15 reward = -141.21352878531246
Episode 16 reward = -87.8216034458473
Episode 17 reward = 8.747135158426303
Episode 18 reward = -30.449034707450565
Episode 19 reward = 180.42012308017377
New gamma = 0.102515625
Episode 20 reward = -22.80216537522935
---------------------------
Last 10 episode avg = -71.82950114400585
---------------------------
Episode 21 reward = -26

Episode 161 reward = 55.305403149617916
Episode 162 reward = 217.5584242153812
Episode 163 reward = -67.20658189849317
Episode 164 reward = -6.476029283523829
Episode 165 reward = -83.24033862464142
Episode 166 reward = 9.368031557647157
Episode 167 reward = -91.28971834736046
Episode 168 reward = -202.37025452127898
Episode 169 reward = -32.41203600127672
New gamma = 0.12351381670492026
Episode 170 reward = 253.57347210538848
---------------------------
Last 10 episode avg = -23.433342937483324
---------------------------
Episode 171 reward = 3.7812474423505336
Episode 172 reward = -268.8790426689193
Episode 173 reward = -76.19248380288053
Episode 174 reward = -260.0807234836609
Episode 175 reward = -340.32900035557157
Episode 176 reward = 256.70473326896274
Episode 177 reward = 196.23709379479752
Episode 178 reward = -20.136143869487654
Episode 179 reward = -29.195328351766662
New gamma = 0.12505773941373174
Episode 180 reward = -167.96887674728345
---------------------------
Last 10

Episode 317 reward = 241.74723444089125
Episode 318 reward = -8.244956449508777
Episode 319 reward = 135.36577721726573
New gamma = 0.14881305085948257
Episode 320 reward = 26.073693651404867
---------------------------
Last 10 episode avg = 33.65911669572277
---------------------------
Episode 321 reward = 34.378916035998
Episode 322 reward = -39.24862788502536
Episode 323 reward = -126.17472352881423
Episode 324 reward = 253.4678508342833
Episode 325 reward = -244.96952207523992
Episode 326 reward = -289.6802262513488
Episode 327 reward = -175.77558528284234
Episode 328 reward = -40.36406450524302
Episode 329 reward = -287.30523655564593
New gamma = 0.1506732139952261
Episode 330 reward = -66.88722579223482
---------------------------
Last 10 episode avg = -88.95975255624735
---------------------------
Episode 331 reward = 48.61697262174306
Episode 332 reward = -222.8582319917503
Episode 333 reward = 229.31479972552157
Episode 334 reward = -386.1041644749639
Episode 335 reward = -255

Episode 471 reward = -203.66367089227512
Episode 472 reward = -57.09550739453505
Episode 473 reward = -192.9647700136208
Episode 474 reward = -267.17976960017603
Episode 475 reward = -248.4962028994182
Episode 476 reward = 268.5241992968164
Episode 477 reward = -39.70784005526531
Episode 478 reward = -189.9570003104821
Episode 479 reward = -25.31460417881209
New gamma = 0.18153548530526528
Episode 480 reward = -241.29668279257322
---------------------------
Last 10 episode avg = -95.61928692085569
---------------------------
Episode 481 reward = -214.44029070700606
Episode 482 reward = 224.8242723048283
Episode 483 reward = -231.00957968525847
Episode 484 reward = -317.10291361869645
Episode 485 reward = 218.5590558235379
Episode 486 reward = -408.8508100938071
Episode 487 reward = -211.9279601420728
Episode 488 reward = 101.32899864388777
Episode 489 reward = -225.53200682346107
New gamma = 0.18380467887158108
Episode 490 reward = -61.159633776345046
---------------------------
Last 1

Episode 627 reward = -179.2605331516059
Episode 628 reward = -5.6616934741518605
Episode 629 reward = -233.19939604337176
New gamma = 0.21871925043068582
Episode 630 reward = -231.19830724032585
---------------------------
Last 10 episode avg = -84.00499023566142
---------------------------
Episode 631 reward = -47.19828949105771
Episode 632 reward = -145.32873780506233
Episode 633 reward = 91.39229508422912
Episode 634 reward = -187.1605485454245
Episode 635 reward = -23.026807629858936
Episode 636 reward = 264.3245504517221
Episode 637 reward = 70.48315577026753
Episode 638 reward = -183.51598027572348
Episode 639 reward = -71.22505858233785
New gamma = 0.2214532410610694
Episode 640 reward = -178.31173049669414
---------------------------
Last 10 episode avg = -46.24537282635718
---------------------------
Episode 641 reward = 3.4647842819342145
Episode 642 reward = -46.15711368194526
Episode 643 reward = 47.82285854334806
Episode 644 reward = -25.029623378435915
Episode 645 reward 

Episode 781 reward = -79.78062546002224
Episode 782 reward = -5.831871834363199
Episode 783 reward = -161.07770815997367
Episode 784 reward = -212.74469406289398
Episode 785 reward = 208.67721602705208
Episode 786 reward = -239.1904150475911
Episode 787 reward = -68.64161060586451
Episode 788 reward = -27.234645534374636
Episode 789 reward = -88.40468630118096
New gamma = 0.2668133274818102
Episode 790 reward = -10.93645303488475
---------------------------
Last 10 episode avg = -45.714438377285965
---------------------------
Episode 791 reward = 265.6680144776101
Episode 792 reward = -231.94701605848837
Episode 793 reward = -9.647212466241342
Episode 794 reward = 49.84196593445469
Episode 795 reward = -222.98339749106475
Episode 796 reward = -274.208375006876
Episode 797 reward = 236.03587078221628
Episode 798 reward = 3.392588434294453
Episode 799 reward = -30.717771370389315
New gamma = 0.2701484940753328
Episode 800 reward = -24.40419835671223
---------------------------
Last 10 ep

Episode 937 reward = -38.103103757039236
Episode 938 reward = 167.70746764446125
Episode 939 reward = -13.144854717978461
New gamma = 0.3214644833411313
Episode 940 reward = 61.87110380345402
---------------------------
Last 10 episode avg = -81.82196012939778
---------------------------
Episode 941 reward = -6.724957686378417
Episode 942 reward = -157.1519552734649
Episode 943 reward = 265.6794711101359
Episode 944 reward = -29.978586866842306
Episode 945 reward = -90.30647740295753
Episode 946 reward = -61.426252622822275
Episode 947 reward = -80.09431080393395
Episode 948 reward = -316.106906098681
Episode 949 reward = -41.01691500624675
New gamma = 0.32548278938289543
Episode 950 reward = 250.92013520747736
---------------------------
Last 10 episode avg = -45.52557868477372
---------------------------
Episode 951 reward = -226.0232434073566
Episode 952 reward = -215.95595506174124
Episode 953 reward = -58.410016519719655
Episode 954 reward = 262.00108220093347
Episode 955 reward =

Episode 1091 reward = -73.65455798103936
Episode 1092 reward = -110.63646345326445
Episode 1093 reward = -41.54710765818844
Episode 1094 reward = 22.40021114850316
Episode 1095 reward = -0.00705052159773345
Episode 1096 reward = -252.51776457644647
Episode 1097 reward = -115.64436715925011
Episode 1098 reward = 10.222605531778868
Episode 1099 reward = -23.8512900997886
New gamma = 0.39215116318556437
Episode 1100 reward = -46.75342176589112
---------------------------
Last 10 episode avg = -63.3781639275742
---------------------------
Episode 1101 reward = -124.47436949249669
Episode 1102 reward = -122.3751620140731
Episode 1103 reward = -173.27480240894897
Episode 1104 reward = -81.03180509439429
Episode 1105 reward = -68.13270390248785
Episode 1106 reward = -12.80163978637978
Episode 1107 reward = -189.67972548317212
Episode 1108 reward = -127.37935644263011
Episode 1109 reward = -176.59836766527118
New gamma = 0.3970530527253839
Episode 1110 reward = -212.5035249230533
-------------

Episode 1243 reward = 235.24129672334735
Episode 1244 reward = -201.9823380961177
Episode 1245 reward = -194.00578967176364
Episode 1246 reward = -3.399411713899454
Episode 1247 reward = -68.30416554892432
Episode 1248 reward = 182.48124837508934
Episode 1249 reward = -49.3881142740243
New gamma = 0.4724751655206031
Episode 1250 reward = -11.231301037875
---------------------------
Last 10 episode avg = -35.9614760854795
---------------------------
Episode 1251 reward = 37.36561400637055
Episode 1252 reward = 243.99578945015534
Episode 1253 reward = -183.80048506974214
Episode 1254 reward = -23.799633675953814
Episode 1255 reward = -30.15180251578643
Episode 1256 reward = -222.34216127784492
Episode 1257 reward = -84.78243023223177
Episode 1258 reward = -225.38728682136255
Episode 1259 reward = -45.25945559306496
New gamma = 0.4783811050896106
Episode 1260 reward = 5.946178627008919
---------------------------
Last 10 episode avg = -54.539315276733575
---------------------------
Episod

Episode 1396 reward = -213.77917419420982
Episode 1397 reward = 24.265300902498467
Episode 1398 reward = -34.86797510809791
Episode 1399 reward = -168.9720064151988
New gamma = 0.5692518676225077
Episode 1400 reward = 183.79849576292946
---------------------------
Last 10 episode avg = -99.28985447907193
---------------------------
Episode 1401 reward = -164.3649278909241
Episode 1402 reward = -29.89353390808604
Episode 1403 reward = -64.31442424821375
Episode 1404 reward = -158.89245268737733
Episode 1405 reward = -73.30609709869819
Episode 1406 reward = -195.41296395638977
Episode 1407 reward = -150.1239647627522
Episode 1408 reward = -194.63216876467283
Episode 1409 reward = -209.9145145291415
New gamma = 0.576367515967789
Episode 1410 reward = -228.1153701475549
---------------------------
Last 10 episode avg = -105.70565520833263
---------------------------
Episode 1411 reward = -40.61241730996585
Episode 1412 reward = -133.44714348700035
Episode 1413 reward = 205.89092974055754
E

Episode 1549 reward = 13.713323116019154
New gamma = 0.6858512625412954
Episode 1550 reward = 239.12572939982292
---------------------------
Last 10 episode avg = 96.86576248125145
---------------------------
Episode 1551 reward = 24.058036907938003
Episode 1552 reward = 181.69458660917599
Episode 1553 reward = 270.6053242485756
Episode 1554 reward = -22.231105505368262
Episode 1555 reward = -19.274723323883833
Episode 1556 reward = 250.51509272426983
Episode 1557 reward = -201.03837475782353
Episode 1558 reward = 232.44818460275673
Episode 1559 reward = -159.01196443214292
New gamma = 0.6944244033230615
Episode 1560 reward = -25.864348984487577
---------------------------
Last 10 episode avg = 79.68907864733207
---------------------------
Episode 1561 reward = 212.90680262221872
Episode 1562 reward = -293.0556408203171
Episode 1563 reward = 205.4117601837023
Episode 1564 reward = 32.52235578444697
Episode 1565 reward = 16.018447758521077
Episode 1566 reward = -183.22788203456068
Episo

Episode 1701 reward = -7.317134295299937
Episode 1702 reward = -3.7041420847751
Episode 1703 reward = -128.96130943773886
Episode 1704 reward = 145.67744352130717
Episode 1705 reward = 213.52727601732357
Episode 1706 reward = -133.95920142140608
Episode 1707 reward = -125.03224505182261
Episode 1708 reward = 13.260954583021288
Episode 1709 reward = -40.897893502958496
New gamma = 0.8366627864529749
Episode 1710 reward = -15.937335847579334
---------------------------
Last 10 episode avg = -16.153452943276847
---------------------------
Episode 1711 reward = -14.566337607597603
Episode 1712 reward = 13.11576596635804
Episode 1713 reward = -4.689947193210969
Episode 1714 reward = -153.6295248674682
Episode 1715 reward = -45.524715407223766
Episode 1716 reward = 15.723871012787253
Episode 1717 reward = 197.6951225322674
Episode 1718 reward = -184.03469580054033
Episode 1719 reward = 220.56629584896092
New gamma = 0.847121071283637
Episode 1720 reward = -12.874537573625176
----------------

Episode 1855 reward = -87.99429246606145
Episode 1856 reward = -138.12139680561927
Episode 1857 reward = 145.16281441816335
Episode 1858 reward = -206.84402689194337
Episode 1859 reward = -55.02687591638282
Episode 1860 reward = -102.95807085596172
---------------------------
Last 10 episode avg = -71.4525856478145
---------------------------
Episode 1861 reward = 61.47187632556805
Episode 1862 reward = -19.305565652948605
Episode 1863 reward = 155.77943022326383
Episode 1864 reward = 135.61419710313953
Episode 1865 reward = 265.4387419721156
Episode 1866 reward = 172.55521747217838
Episode 1867 reward = 42.54370790586653
Episode 1868 reward = -49.4872459005291
Episode 1869 reward = 289.2748292491417
Episode 1870 reward = -133.0974101703894
---------------------------
Last 10 episode avg = 95.09271178418342
---------------------------
Episode 1871 reward = 182.70137671821374
Episode 1872 reward = 152.63278425512522
Episode 1873 reward = 227.7128205862974
Episode 1874 reward = 200.03420

In [34]:
print("Max episode reward = {}".format(np.max(total_reward)))
model.save('dqn.h5')

Max episode reward = 313.44232144971164


In [34]:
for i_episode in range(20):
    observation = env.reset()
    for t in range(1000):
        
        action = choose_action(observation, 0.0)
        observation, reward, done, info = env.step(action)
        env.render()
        if done:
            print("Episode finished after {} timesteps\n Last Reward: {}".format(t+1, reward))
            break

env.close()

Episode finished after 242 timesteps
 Last Reward: 100
Episode finished after 472 timesteps
 Last Reward: 100
Episode finished after 353 timesteps
 Last Reward: 100
Episode finished after 286 timesteps
 Last Reward: 100
Episode finished after 354 timesteps
 Last Reward: 100
Episode finished after 351 timesteps
 Last Reward: 100
Episode finished after 256 timesteps
 Last Reward: 100
Episode finished after 143 timesteps
 Last Reward: -100
Episode finished after 293 timesteps
 Last Reward: 100
Episode finished after 381 timesteps
 Last Reward: 100
Episode finished after 297 timesteps
 Last Reward: 100
Episode finished after 411 timesteps
 Last Reward: 100
Episode finished after 370 timesteps
 Last Reward: 100
Episode finished after 342 timesteps
 Last Reward: 100
Episode finished after 397 timesteps
 Last Reward: 100
Episode finished after 387 timesteps
 Last Reward: 100
Episode finished after 391 timesteps
 Last Reward: 100
Episode finished after 1000 timesteps
 Last Reward: -0.023770310

In [33]:
model = load_model('dqn.h5')