In [1]:
import numpy as np
import matplotlib.pyplot as plt
import gym
import random

%matplotlib inline

In [2]:
env = gym.make('FrozenLake8x8-v0')

MAX_STEPS = env.spec.timestep_limit # maximum steps in an episode, 200 for Taxi-v2

NUM_EPISODES = 50000000
GAMMA = 0.99  # Discount factor from Bellman Equation
START_ALPHA = 0.1  # Learning rate, how much we update our Q table each step
ALPHA_TAPER = 0.00001 # How much our adaptive learning rate is adjusted each update
START_EPSILON = 1.0  # Probability of random action
EPSILON_TAPER = 0.00001 # How much epsilon is adjusted each step

render = False

In [3]:
def moving_average(values, n=100) :
    ret = np.cumsum(values, dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    return ret[n - 1:] / n

In [4]:
def epsilon_action(Q, s, eps=0.1):
    if random.random() < (1 - eps):
        return np.argmax(Q[s])
    else:
        return env.action_space.sample()

In [5]:
def double_q_learning():
    
    obs_dim = env.observation_space.n # size of our state
    action_dim = env.action_space.n # number of actions

    # Initialize our Q table
    Q = np.zeros((obs_dim, action_dim))
    Q_1 = np.zeros((obs_dim, action_dim))
    Q_2 = np.zeros((obs_dim, action_dim))

    state_visit_counts = {}
    update_counts = np.zeros((obs_dim, action_dim), dtype=np.dtype(int))
    
    rewards_list = []    
    deltas = []
              
    for i_episode in range(NUM_EPISODES): 
        
        eps = START_EPSILON / (1.0 + i_episode * EPSILON_TAPER)
        
        # Play episode
        biggest_change = 0
        reward_per_episode = 0 
        curr_state = env.reset()
        for step in range(MAX_STEPS):          
          
            prev_state = curr_state
            state_visit_counts[prev_state] = state_visit_counts.get(prev_state,0) + 1
            action = epsilon_action(Q, curr_state, eps)           
            curr_state, reward, done, info = env.step(action)
            reward_per_episode += reward
            old_qsa = Q[prev_state][action]
            
            # Update Q
            alpha = START_ALPHA / (1.0 + update_counts[prev_state][action] * ALPHA_TAPER)
            update_counts[prev_state][action] += 1
            Q[prev_state][action] += alpha * (reward + GAMMA * max(Q[curr_state]) - Q[prev_state][action])
            
            if np.random.uniform() < 0.5:           
                Q_1[prev_state][action] += alpha * (reward + GAMMA * max(Q_2[curr_state]) - Q_1[prev_state][action]) 
            else:
                Q_2[prev_state][action] += alpha * (reward + GAMMA * max(Q_1[curr_state]) - Q_2[prev_state][action]) 
            Q = Q_1 + Q_2
            
            biggest_change = max(biggest_change, np.abs(old_qsa - Q[prev_state][action]))
            if done:
                break
        rewards_list.append(reward_per_episode)
        deltas.append(biggest_change)               
            
        avg_reward = np.mean(rewards_list[-100:])     
        if i_episode % 10000 == 0:            
            avg_delta = np.mean(deltas[-100:])  
            print ("Episode: {}   Avg reward: {:3.3f}  Avg Delta: {:8.8f}  Epsilon: {:3.3f}  Alpha: {:6.6f}".format(
                i_episode, avg_reward, avg_delta, eps, alpha))
            
        if len(deltas) > 1000:
            deltas.pop(0)
        if len(rewards_list) > 1000:
            rewards_list.pop(0)        
          
        if avg_reward >= env.spec.reward_threshold: 
            print("########## Solved! ###########")
            break
            
  
    mean_state_visits = np.mean(list(state_visit_counts.values()))
    print("each state was visited on average: ", mean_state_visits, " times.\n")
  
   
    #plt.plot(moving_average(deltas, n=1000))
    #plt.show()
    
    #plt.plot(moving_average(rewards_list, n=1000))
    #plt.show()
    
    return Q
  
            
print("action_space={}".format(env.action_space))
print("obs_space={}".format(env.observation_space))
print("threshold={} \n".format(env.spec.reward_threshold))

Q_final = double_q_learning()
print("Final Q values: {}\n".format(Q_final)) 


action_space=Discrete(4)
obs_space=Discrete(64)
threshold=0.99 

Episode: 0   Avg reward: 0.000  Avg Delta: 0.00000000  Epsilon: 1.000  Alpha: 0.100000
Episode: 10000   Avg reward: 0.000  Avg Delta: 0.00137887  Epsilon: 0.909  Alpha: 0.098050
Episode: 20000   Avg reward: 0.010  Avg Delta: 0.01061000  Epsilon: 0.833  Alpha: 0.098285
Episode: 30000   Avg reward: 0.020  Avg Delta: 0.02504482  Epsilon: 0.769  Alpha: 0.099624
Episode: 40000   Avg reward: 0.020  Avg Delta: 0.02845442  Epsilon: 0.714  Alpha: 0.099632
Episode: 50000   Avg reward: 0.000  Avg Delta: 0.02745810  Epsilon: 0.667  Alpha: 0.089792
Episode: 60000   Avg reward: 0.060  Avg Delta: 0.02906445  Epsilon: 0.625  Alpha: 0.096081
Episode: 70000   Avg reward: 0.040  Avg Delta: 0.02803835  Epsilon: 0.588  Alpha: 0.089530
Episode: 80000   Avg reward: 0.030  Avg Delta: 0.03045357  Epsilon: 0.556  Alpha: 0.095639
Episode: 90000   Avg reward: 0.050  Avg Delta: 0.02961241  Epsilon: 0.526  Alpha: 0.082805
Episode: 100000   Avg reward:

Episode: 890000   Avg reward: 0.640  Avg Delta: 0.01407598  Epsilon: 0.101  Alpha: 0.011489
Episode: 900000   Avg reward: 0.550  Avg Delta: 0.01654169  Epsilon: 0.100  Alpha: 0.011294
Episode: 910000   Avg reward: 0.600  Avg Delta: 0.01349486  Epsilon: 0.099  Alpha: 0.068571
Episode: 920000   Avg reward: 0.600  Avg Delta: 0.01416397  Epsilon: 0.098  Alpha: 0.010915
Episode: 930000   Avg reward: 0.520  Avg Delta: 0.01399074  Epsilon: 0.097  Alpha: 0.059650
Episode: 940000   Avg reward: 0.480  Avg Delta: 0.01676392  Epsilon: 0.096  Alpha: 0.010554
Episode: 950000   Avg reward: 0.540  Avg Delta: 0.01423643  Epsilon: 0.095  Alpha: 0.010383
Episode: 960000   Avg reward: 0.500  Avg Delta: 0.01462131  Epsilon: 0.094  Alpha: 0.096816
Episode: 970000   Avg reward: 0.590  Avg Delta: 0.01446034  Epsilon: 0.093  Alpha: 0.010056
Episode: 980000   Avg reward: 0.540  Avg Delta: 0.01431223  Epsilon: 0.093  Alpha: 0.081238
Episode: 990000   Avg reward: 0.530  Avg Delta: 0.01431482  Epsilon: 0.092  Alph

Episode: 1780000   Avg reward: 0.740  Avg Delta: 0.00952169  Epsilon: 0.053  Alpha: 0.004091
Episode: 1790000   Avg reward: 0.690  Avg Delta: 0.01000868  Epsilon: 0.053  Alpha: 0.057936
Episode: 1800000   Avg reward: 0.680  Avg Delta: 0.00865217  Epsilon: 0.053  Alpha: 0.001245
Episode: 1810000   Avg reward: 0.700  Avg Delta: 0.00823172  Epsilon: 0.052  Alpha: 0.003996
Episode: 1820000   Avg reward: 0.650  Avg Delta: 0.00918984  Epsilon: 0.052  Alpha: 0.058395
Episode: 1830000   Avg reward: 0.750  Avg Delta: 0.00764301  Epsilon: 0.052  Alpha: 0.003934
Episode: 1840000   Avg reward: 0.640  Avg Delta: 0.00882730  Epsilon: 0.052  Alpha: 0.001208
Episode: 1850000   Avg reward: 0.640  Avg Delta: 0.00991056  Epsilon: 0.051  Alpha: 0.003875
Episode: 1860000   Avg reward: 0.670  Avg Delta: 0.00870390  Epsilon: 0.051  Alpha: 0.003846
Episode: 1870000   Avg reward: 0.700  Avg Delta: 0.00720433  Epsilon: 0.051  Alpha: 0.003817
Episode: 1880000   Avg reward: 0.710  Avg Delta: 0.00741571  Epsilon: 

Episode: 2670000   Avg reward: 0.780  Avg Delta: 0.00457436  Epsilon: 0.036  Alpha: 0.002346
Episode: 2680000   Avg reward: 0.780  Avg Delta: 0.00559362  Epsilon: 0.036  Alpha: 0.002334
Episode: 2690000   Avg reward: 0.730  Avg Delta: 0.00598313  Epsilon: 0.036  Alpha: 0.002323
Episode: 2700000   Avg reward: 0.710  Avg Delta: 0.00671260  Epsilon: 0.036  Alpha: 0.045957
Episode: 2710000   Avg reward: 0.700  Avg Delta: 0.00616990  Epsilon: 0.036  Alpha: 0.002300
Episode: 2720000   Avg reward: 0.770  Avg Delta: 0.00671586  Epsilon: 0.035  Alpha: 0.055758
Episode: 2730000   Avg reward: 0.660  Avg Delta: 0.00733548  Epsilon: 0.035  Alpha: 0.002278
Episode: 2740000   Avg reward: 0.700  Avg Delta: 0.00736039  Epsilon: 0.035  Alpha: 0.002267
Episode: 2750000   Avg reward: 0.720  Avg Delta: 0.00659930  Epsilon: 0.035  Alpha: 0.052647
Episode: 2760000   Avg reward: 0.790  Avg Delta: 0.00550605  Epsilon: 0.035  Alpha: 0.098141
Episode: 2770000   Avg reward: 0.710  Avg Delta: 0.00605875  Epsilon: 

Episode: 3560000   Avg reward: 0.720  Avg Delta: 0.00474946  Epsilon: 0.027  Alpha: 0.061368
Episode: 3570000   Avg reward: 0.800  Avg Delta: 0.00353562  Epsilon: 0.027  Alpha: 0.069503
Episode: 3580000   Avg reward: 0.760  Avg Delta: 0.00374428  Epsilon: 0.027  Alpha: 0.035713
Episode: 3590000   Avg reward: 0.770  Avg Delta: 0.00454686  Epsilon: 0.027  Alpha: 0.001596
Episode: 3600000   Avg reward: 0.840  Avg Delta: 0.00371900  Epsilon: 0.027  Alpha: 0.001590
Episode: 3610000   Avg reward: 0.750  Avg Delta: 0.00518057  Epsilon: 0.027  Alpha: 0.001585
Episode: 3620000   Avg reward: 0.850  Avg Delta: 0.00413538  Epsilon: 0.027  Alpha: 0.001579
Episode: 3630000   Avg reward: 0.750  Avg Delta: 0.00521281  Epsilon: 0.027  Alpha: 0.000774
Episode: 3640000   Avg reward: 0.750  Avg Delta: 0.00440796  Epsilon: 0.027  Alpha: 0.001568
Episode: 3650000   Avg reward: 0.780  Avg Delta: 0.00391789  Epsilon: 0.027  Alpha: 0.001563
Episode: 3660000   Avg reward: 0.810  Avg Delta: 0.00356381  Epsilon: 

Episode: 4450000   Avg reward: 0.790  Avg Delta: 0.00412632  Epsilon: 0.022  Alpha: 0.001219
Episode: 4460000   Avg reward: 0.770  Avg Delta: 0.00439339  Epsilon: 0.022  Alpha: 0.001215
Episode: 4470000   Avg reward: 0.870  Avg Delta: 0.00252591  Epsilon: 0.022  Alpha: 0.001212
Episode: 4480000   Avg reward: 0.720  Avg Delta: 0.00437550  Epsilon: 0.022  Alpha: 0.001208
Episode: 4490000   Avg reward: 0.780  Avg Delta: 0.00401510  Epsilon: 0.022  Alpha: 0.074301
Episode: 4500000   Avg reward: 0.710  Avg Delta: 0.00415168  Epsilon: 0.022  Alpha: 0.001202
Episode: 4510000   Avg reward: 0.790  Avg Delta: 0.00339675  Epsilon: 0.022  Alpha: 0.001198
Episode: 4520000   Avg reward: 0.770  Avg Delta: 0.00452184  Epsilon: 0.022  Alpha: 0.001195
Episode: 4530000   Avg reward: 0.780  Avg Delta: 0.00385096  Epsilon: 0.022  Alpha: 0.001192
Episode: 4540000   Avg reward: 0.840  Avg Delta: 0.00311382  Epsilon: 0.022  Alpha: 0.001189
Episode: 4550000   Avg reward: 0.750  Avg Delta: 0.00366562  Epsilon: 

Episode: 5340000   Avg reward: 0.810  Avg Delta: 0.00301775  Epsilon: 0.018  Alpha: 0.023864
Episode: 5350000   Avg reward: 0.720  Avg Delta: 0.00368421  Epsilon: 0.018  Alpha: 0.000973
Episode: 5360000   Avg reward: 0.720  Avg Delta: 0.00348085  Epsilon: 0.018  Alpha: 0.029082
Episode: 5370000   Avg reward: 0.730  Avg Delta: 0.00360973  Epsilon: 0.018  Alpha: 0.036381
Episode: 5380000   Avg reward: 0.790  Avg Delta: 0.00363826  Epsilon: 0.018  Alpha: 0.031453
Episode: 5390000   Avg reward: 0.860  Avg Delta: 0.00235830  Epsilon: 0.018  Alpha: 0.000964
Episode: 5400000   Avg reward: 0.880  Avg Delta: 0.00321773  Epsilon: 0.018  Alpha: 0.000962
Episode: 5410000   Avg reward: 0.790  Avg Delta: 0.00285248  Epsilon: 0.018  Alpha: 0.000959
Episode: 5420000   Avg reward: 0.740  Avg Delta: 0.00318638  Epsilon: 0.018  Alpha: 0.000957
Episode: 5430000   Avg reward: 0.770  Avg Delta: 0.00344634  Epsilon: 0.018  Alpha: 0.000955
Episode: 5440000   Avg reward: 0.740  Avg Delta: 0.00351719  Epsilon: 

Episode: 6230000   Avg reward: 0.790  Avg Delta: 0.00254660  Epsilon: 0.016  Alpha: 0.000810
Episode: 6240000   Avg reward: 0.800  Avg Delta: 0.00246119  Epsilon: 0.016  Alpha: 0.000809
Episode: 6250000   Avg reward: 0.830  Avg Delta: 0.00198129  Epsilon: 0.016  Alpha: 0.020965
Episode: 6260000   Avg reward: 0.810  Avg Delta: 0.00252951  Epsilon: 0.016  Alpha: 0.000806
Episode: 6270000   Avg reward: 0.850  Avg Delta: 0.00264540  Epsilon: 0.016  Alpha: 0.000804
Episode: 6280000   Avg reward: 0.840  Avg Delta: 0.00181018  Epsilon: 0.016  Alpha: 0.000802
Episode: 6290000   Avg reward: 0.840  Avg Delta: 0.00213703  Epsilon: 0.016  Alpha: 0.000801
Episode: 6300000   Avg reward: 0.780  Avg Delta: 0.00281339  Epsilon: 0.016  Alpha: 0.000799
Episode: 6310000   Avg reward: 0.820  Avg Delta: 0.00234020  Epsilon: 0.016  Alpha: 0.032386
Episode: 6320000   Avg reward: 0.850  Avg Delta: 0.00231649  Epsilon: 0.016  Alpha: 0.000796
Episode: 6330000   Avg reward: 0.820  Avg Delta: 0.00247624  Epsilon: 

Episode: 7120000   Avg reward: 0.760  Avg Delta: 0.00257777  Epsilon: 0.014  Alpha: 0.000692
Episode: 7130000   Avg reward: 0.820  Avg Delta: 0.00279092  Epsilon: 0.014  Alpha: 0.000691
Episode: 7140000   Avg reward: 0.840  Avg Delta: 0.00201364  Epsilon: 0.014  Alpha: 0.000690
Episode: 7150000   Avg reward: 0.860  Avg Delta: 0.00171077  Epsilon: 0.014  Alpha: 0.000688
Episode: 7160000   Avg reward: 0.780  Avg Delta: 0.00217840  Epsilon: 0.014  Alpha: 0.000687
Episode: 7170000   Avg reward: 0.770  Avg Delta: 0.00168243  Epsilon: 0.014  Alpha: 0.000686
Episode: 7180000   Avg reward: 0.830  Avg Delta: 0.00219790  Epsilon: 0.014  Alpha: 0.018040
Episode: 7190000   Avg reward: 0.800  Avg Delta: 0.00250764  Epsilon: 0.014  Alpha: 0.000684
Episode: 7200000   Avg reward: 0.760  Avg Delta: 0.00211302  Epsilon: 0.014  Alpha: 0.000683
Episode: 7210000   Avg reward: 0.850  Avg Delta: 0.00207843  Epsilon: 0.014  Alpha: 0.000682
Episode: 7220000   Avg reward: 0.820  Avg Delta: 0.00250187  Epsilon: 

Episode: 8010000   Avg reward: 0.800  Avg Delta: 0.00263605  Epsilon: 0.012  Alpha: 0.000603
Episode: 8020000   Avg reward: 0.780  Avg Delta: 0.00227198  Epsilon: 0.012  Alpha: 0.000602
Episode: 8030000   Avg reward: 0.830  Avg Delta: 0.00167824  Epsilon: 0.012  Alpha: 0.000158
Episode: 8040000   Avg reward: 0.870  Avg Delta: 0.00150768  Epsilon: 0.012  Alpha: 0.000600
Episode: 8050000   Avg reward: 0.810  Avg Delta: 0.00196252  Epsilon: 0.012  Alpha: 0.000600
Episode: 8060000   Avg reward: 0.800  Avg Delta: 0.00224064  Epsilon: 0.012  Alpha: 0.015890
Episode: 8070000   Avg reward: 0.790  Avg Delta: 0.00268310  Epsilon: 0.012  Alpha: 0.000598
Episode: 8080000   Avg reward: 0.730  Avg Delta: 0.00190179  Epsilon: 0.012  Alpha: 0.000597
Episode: 8090000   Avg reward: 0.890  Avg Delta: 0.00201265  Epsilon: 0.012  Alpha: 0.000596
Episode: 8100000   Avg reward: 0.750  Avg Delta: 0.00233583  Epsilon: 0.012  Alpha: 0.000595
Episode: 8110000   Avg reward: 0.840  Avg Delta: 0.00196566  Epsilon: 

Episode: 8900000   Avg reward: 0.850  Avg Delta: 0.00244188  Epsilon: 0.011  Alpha: 0.000534
Episode: 8910000   Avg reward: 0.880  Avg Delta: 0.00176057  Epsilon: 0.011  Alpha: 0.000533
Episode: 8920000   Avg reward: 0.840  Avg Delta: 0.00204546  Epsilon: 0.011  Alpha: 0.000533
Episode: 8930000   Avg reward: 0.790  Avg Delta: 0.00178222  Epsilon: 0.011  Alpha: 0.000532
Episode: 8940000   Avg reward: 0.860  Avg Delta: 0.00204774  Epsilon: 0.011  Alpha: 0.023598
Episode: 8950000   Avg reward: 0.860  Avg Delta: 0.00180498  Epsilon: 0.011  Alpha: 0.000531
Episode: 8960000   Avg reward: 0.840  Avg Delta: 0.00248740  Epsilon: 0.011  Alpha: 0.000530
Episode: 8970000   Avg reward: 0.860  Avg Delta: 0.00142848  Epsilon: 0.011  Alpha: 0.000529
Episode: 8980000   Avg reward: 0.770  Avg Delta: 0.00221352  Epsilon: 0.011  Alpha: 0.000529
Episode: 8990000   Avg reward: 0.780  Avg Delta: 0.00172249  Epsilon: 0.011  Alpha: 0.000528
Episode: 9000000   Avg reward: 0.850  Avg Delta: 0.00122977  Epsilon: 

Episode: 9790000   Avg reward: 0.810  Avg Delta: 0.00167608  Epsilon: 0.010  Alpha: 0.000125
Episode: 9800000   Avg reward: 0.830  Avg Delta: 0.00182870  Epsilon: 0.010  Alpha: 0.000478
Episode: 9810000   Avg reward: 0.830  Avg Delta: 0.00151677  Epsilon: 0.010  Alpha: 0.000478
Episode: 9820000   Avg reward: 0.850  Avg Delta: 0.00159627  Epsilon: 0.010  Alpha: 0.000477
Episode: 9830000   Avg reward: 0.770  Avg Delta: 0.00249054  Epsilon: 0.010  Alpha: 0.000477
Episode: 9840000   Avg reward: 0.810  Avg Delta: 0.00174115  Epsilon: 0.010  Alpha: 0.059183
Episode: 9850000   Avg reward: 0.850  Avg Delta: 0.00181129  Epsilon: 0.010  Alpha: 0.000476
Episode: 9860000   Avg reward: 0.720  Avg Delta: 0.00181635  Epsilon: 0.010  Alpha: 0.000126
Episode: 9870000   Avg reward: 0.830  Avg Delta: 0.00154453  Epsilon: 0.010  Alpha: 0.000475
Episode: 9880000   Avg reward: 0.780  Avg Delta: 0.00167431  Epsilon: 0.010  Alpha: 0.000474
Episode: 9890000   Avg reward: 0.890  Avg Delta: 0.00134209  Epsilon: 

Episode: 10670000   Avg reward: 0.870  Avg Delta: 0.00117156  Epsilon: 0.009  Alpha: 0.000434
Episode: 10680000   Avg reward: 0.840  Avg Delta: 0.00117705  Epsilon: 0.009  Alpha: 0.000434
Episode: 10690000   Avg reward: 0.860  Avg Delta: 0.00105220  Epsilon: 0.009  Alpha: 0.000434
Episode: 10700000   Avg reward: 0.840  Avg Delta: 0.00162719  Epsilon: 0.009  Alpha: 0.000433
Episode: 10710000   Avg reward: 0.840  Avg Delta: 0.00175157  Epsilon: 0.009  Alpha: 0.000433
Episode: 10720000   Avg reward: 0.810  Avg Delta: 0.00143419  Epsilon: 0.009  Alpha: 0.000432
Episode: 10730000   Avg reward: 0.830  Avg Delta: 0.00205327  Epsilon: 0.009  Alpha: 0.080465
Episode: 10740000   Avg reward: 0.750  Avg Delta: 0.00204873  Epsilon: 0.009  Alpha: 0.024483
Episode: 10750000   Avg reward: 0.810  Avg Delta: 0.00209456  Epsilon: 0.009  Alpha: 0.000431
Episode: 10760000   Avg reward: 0.740  Avg Delta: 0.00164532  Epsilon: 0.009  Alpha: 0.000430
Episode: 10770000   Avg reward: 0.840  Avg Delta: 0.00149734

Episode: 11550000   Avg reward: 0.830  Avg Delta: 0.00164898  Epsilon: 0.009  Alpha: 0.000397
Episode: 11560000   Avg reward: 0.780  Avg Delta: 0.00124489  Epsilon: 0.009  Alpha: 0.000397
Episode: 11570000   Avg reward: 0.810  Avg Delta: 0.00150395  Epsilon: 0.009  Alpha: 0.000397
Episode: 11580000   Avg reward: 0.860  Avg Delta: 0.00142430  Epsilon: 0.009  Alpha: 0.000396
Episode: 11590000   Avg reward: 0.780  Avg Delta: 0.00210051  Epsilon: 0.009  Alpha: 0.000396
Episode: 11600000   Avg reward: 0.810  Avg Delta: 0.00159077  Epsilon: 0.009  Alpha: 0.021947
Episode: 11610000   Avg reward: 0.910  Avg Delta: 0.00166452  Epsilon: 0.009  Alpha: 0.000395
Episode: 11620000   Avg reward: 0.860  Avg Delta: 0.00127627  Epsilon: 0.009  Alpha: 0.000395
Episode: 11630000   Avg reward: 0.850  Avg Delta: 0.00128443  Epsilon: 0.009  Alpha: 0.015756
Episode: 11640000   Avg reward: 0.890  Avg Delta: 0.00108792  Epsilon: 0.009  Alpha: 0.000394
Episode: 11650000   Avg reward: 0.820  Avg Delta: 0.00145752

Episode: 12430000   Avg reward: 0.820  Avg Delta: 0.00134476  Epsilon: 0.008  Alpha: 0.000366
Episode: 12440000   Avg reward: 0.860  Avg Delta: 0.00066870  Epsilon: 0.008  Alpha: 0.000366
Episode: 12450000   Avg reward: 0.790  Avg Delta: 0.00161314  Epsilon: 0.008  Alpha: 0.000365
Episode: 12460000   Avg reward: 0.850  Avg Delta: 0.00175763  Epsilon: 0.008  Alpha: 0.000365
Episode: 12470000   Avg reward: 0.780  Avg Delta: 0.00132017  Epsilon: 0.008  Alpha: 0.000365
Episode: 12480000   Avg reward: 0.800  Avg Delta: 0.00176836  Epsilon: 0.008  Alpha: 0.000364
Episode: 12490000   Avg reward: 0.830  Avg Delta: 0.00070671  Epsilon: 0.008  Alpha: 0.000364
Episode: 12500000   Avg reward: 0.810  Avg Delta: 0.00145398  Epsilon: 0.008  Alpha: 0.000364
Episode: 12510000   Avg reward: 0.810  Avg Delta: 0.00122696  Epsilon: 0.008  Alpha: 0.000363
Episode: 12520000   Avg reward: 0.850  Avg Delta: 0.00157637  Epsilon: 0.008  Alpha: 0.000363
Episode: 12530000   Avg reward: 0.840  Avg Delta: 0.00143266

Episode: 13310000   Avg reward: 0.890  Avg Delta: 0.00135342  Epsilon: 0.007  Alpha: 0.000339
Episode: 13320000   Avg reward: 0.810  Avg Delta: 0.00128685  Epsilon: 0.007  Alpha: 0.000339
Episode: 13330000   Avg reward: 0.930  Avg Delta: 0.00094348  Epsilon: 0.007  Alpha: 0.000339
Episode: 13340000   Avg reward: 0.860  Avg Delta: 0.00116166  Epsilon: 0.007  Alpha: 0.000338
Episode: 13350000   Avg reward: 0.880  Avg Delta: 0.00128340  Epsilon: 0.007  Alpha: 0.017655
Episode: 13360000   Avg reward: 0.870  Avg Delta: 0.00124545  Epsilon: 0.007  Alpha: 0.000338
Episode: 13370000   Avg reward: 0.810  Avg Delta: 0.00187955  Epsilon: 0.007  Alpha: 0.000338
Episode: 13380000   Avg reward: 0.850  Avg Delta: 0.00175171  Epsilon: 0.007  Alpha: 0.000337
Episode: 13390000   Avg reward: 0.820  Avg Delta: 0.00123862  Epsilon: 0.007  Alpha: 0.000337
Episode: 13400000   Avg reward: 0.880  Avg Delta: 0.00088360  Epsilon: 0.007  Alpha: 0.000337
Episode: 13410000   Avg reward: 0.850  Avg Delta: 0.00160419

Episode: 14190000   Avg reward: 0.800  Avg Delta: 0.00165772  Epsilon: 0.007  Alpha: 0.000316
Episode: 14200000   Avg reward: 0.860  Avg Delta: 0.00137944  Epsilon: 0.007  Alpha: 0.000316
Episode: 14210000   Avg reward: 0.810  Avg Delta: 0.00156297  Epsilon: 0.007  Alpha: 0.000316
Episode: 14220000   Avg reward: 0.810  Avg Delta: 0.00090375  Epsilon: 0.007  Alpha: 0.000315
Episode: 14230000   Avg reward: 0.820  Avg Delta: 0.00097043  Epsilon: 0.007  Alpha: 0.000315
Episode: 14240000   Avg reward: 0.770  Avg Delta: 0.00155734  Epsilon: 0.007  Alpha: 0.000315
Episode: 14250000   Avg reward: 0.860  Avg Delta: 0.00110567  Epsilon: 0.007  Alpha: 0.000315
Episode: 14260000   Avg reward: 0.890  Avg Delta: 0.00109328  Epsilon: 0.007  Alpha: 0.000314
Episode: 14270000   Avg reward: 0.840  Avg Delta: 0.00073538  Epsilon: 0.007  Alpha: 0.034980
Episode: 14280000   Avg reward: 0.910  Avg Delta: 0.00079721  Epsilon: 0.007  Alpha: 0.000314
Episode: 14290000   Avg reward: 0.850  Avg Delta: 0.00093342

Episode: 15070000   Avg reward: 0.850  Avg Delta: 0.00078108  Epsilon: 0.007  Alpha: 0.015886
Episode: 15080000   Avg reward: 0.790  Avg Delta: 0.00086519  Epsilon: 0.007  Alpha: 0.000147
Episode: 15090000   Avg reward: 0.810  Avg Delta: 0.00118810  Epsilon: 0.007  Alpha: 0.011463
Episode: 15100000   Avg reward: 0.810  Avg Delta: 0.00087956  Epsilon: 0.007  Alpha: 0.027904
Episode: 15110000   Avg reward: 0.840  Avg Delta: 0.00132453  Epsilon: 0.007  Alpha: 0.020484
Episode: 15120000   Avg reward: 0.820  Avg Delta: 0.00140147  Epsilon: 0.007  Alpha: 0.000295
Episode: 15130000   Avg reward: 0.830  Avg Delta: 0.00115997  Epsilon: 0.007  Alpha: 0.011400
Episode: 15140000   Avg reward: 0.820  Avg Delta: 0.00123297  Epsilon: 0.007  Alpha: 0.000294
Episode: 15150000   Avg reward: 0.850  Avg Delta: 0.00096943  Epsilon: 0.007  Alpha: 0.032947
Episode: 15160000   Avg reward: 0.900  Avg Delta: 0.00126073  Epsilon: 0.007  Alpha: 0.000294
Episode: 15170000   Avg reward: 0.790  Avg Delta: 0.00113277

Episode: 15950000   Avg reward: 0.850  Avg Delta: 0.00091680  Epsilon: 0.006  Alpha: 0.000278
Episode: 15960000   Avg reward: 0.840  Avg Delta: 0.00122404  Epsilon: 0.006  Alpha: 0.000278
Episode: 15970000   Avg reward: 0.880  Avg Delta: 0.00093276  Epsilon: 0.006  Alpha: 0.000277
Episode: 15980000   Avg reward: 0.800  Avg Delta: 0.00117792  Epsilon: 0.006  Alpha: 0.000277
Episode: 15990000   Avg reward: 0.840  Avg Delta: 0.00078492  Epsilon: 0.006  Alpha: 0.000277
Episode: 16000000   Avg reward: 0.850  Avg Delta: 0.00065263  Epsilon: 0.006  Alpha: 0.000277
Episode: 16010000   Avg reward: 0.770  Avg Delta: 0.00092649  Epsilon: 0.006  Alpha: 0.000090
Episode: 16020000   Avg reward: 0.770  Avg Delta: 0.00190171  Epsilon: 0.006  Alpha: 0.000277
Episode: 16030000   Avg reward: 0.810  Avg Delta: 0.00152870  Epsilon: 0.006  Alpha: 0.000276
Episode: 16040000   Avg reward: 0.830  Avg Delta: 0.00122362  Epsilon: 0.006  Alpha: 0.000276
Episode: 16050000   Avg reward: 0.840  Avg Delta: 0.00126073

Episode: 16830000   Avg reward: 0.880  Avg Delta: 0.00090558  Epsilon: 0.006  Alpha: 0.000262
Episode: 16840000   Avg reward: 0.780  Avg Delta: 0.00102559  Epsilon: 0.006  Alpha: 0.000262
Episode: 16850000   Avg reward: 0.840  Avg Delta: 0.00108544  Epsilon: 0.006  Alpha: 0.000262
Episode: 16860000   Avg reward: 0.890  Avg Delta: 0.00083716  Epsilon: 0.006  Alpha: 0.000262
Episode: 16870000   Avg reward: 0.850  Avg Delta: 0.00093576  Epsilon: 0.006  Alpha: 0.000261
Episode: 16880000   Avg reward: 0.890  Avg Delta: 0.00101738  Epsilon: 0.006  Alpha: 0.000070
Episode: 16890000   Avg reward: 0.870  Avg Delta: 0.00105392  Epsilon: 0.006  Alpha: 0.000261
Episode: 16900000   Avg reward: 0.810  Avg Delta: 0.00098284  Epsilon: 0.006  Alpha: 0.000261
Episode: 16910000   Avg reward: 0.840  Avg Delta: 0.00073851  Epsilon: 0.006  Alpha: 0.000261
Episode: 16920000   Avg reward: 0.840  Avg Delta: 0.00077343  Epsilon: 0.006  Alpha: 0.000261
Episode: 16930000   Avg reward: 0.860  Avg Delta: 0.00099149

Episode: 17710000   Avg reward: 0.860  Avg Delta: 0.00079482  Epsilon: 0.006  Alpha: 0.000248
Episode: 17720000   Avg reward: 0.810  Avg Delta: 0.00139494  Epsilon: 0.006  Alpha: 0.000248
Episode: 17730000   Avg reward: 0.790  Avg Delta: 0.00131811  Epsilon: 0.006  Alpha: 0.000248
Episode: 17740000   Avg reward: 0.760  Avg Delta: 0.00091749  Epsilon: 0.006  Alpha: 0.000247
Episode: 17750000   Avg reward: 0.890  Avg Delta: 0.00062958  Epsilon: 0.006  Alpha: 0.000247
Episode: 17760000   Avg reward: 0.880  Avg Delta: 0.00110120  Epsilon: 0.006  Alpha: 0.000247
Episode: 17770000   Avg reward: 0.830  Avg Delta: 0.00133533  Epsilon: 0.006  Alpha: 0.000247
Episode: 17780000   Avg reward: 0.770  Avg Delta: 0.00123476  Epsilon: 0.006  Alpha: 0.000247
Episode: 17790000   Avg reward: 0.850  Avg Delta: 0.00103947  Epsilon: 0.006  Alpha: 0.000247
Episode: 17800000   Avg reward: 0.850  Avg Delta: 0.00070866  Epsilon: 0.006  Alpha: 0.000246
Episode: 17810000   Avg reward: 0.800  Avg Delta: 0.00070363

Episode: 18590000   Avg reward: 0.870  Avg Delta: 0.00087088  Epsilon: 0.005  Alpha: 0.000235
Episode: 18600000   Avg reward: 0.820  Avg Delta: 0.00069179  Epsilon: 0.005  Alpha: 0.008948
Episode: 18610000   Avg reward: 0.900  Avg Delta: 0.00040523  Epsilon: 0.005  Alpha: 0.000235
Episode: 18620000   Avg reward: 0.850  Avg Delta: 0.00086348  Epsilon: 0.005  Alpha: 0.000235
Episode: 18630000   Avg reward: 0.850  Avg Delta: 0.00056040  Epsilon: 0.005  Alpha: 0.008929
Episode: 18640000   Avg reward: 0.810  Avg Delta: 0.00095419  Epsilon: 0.005  Alpha: 0.000234
Episode: 18650000   Avg reward: 0.830  Avg Delta: 0.00124063  Epsilon: 0.005  Alpha: 0.000234
Episode: 18660000   Avg reward: 0.880  Avg Delta: 0.00096387  Epsilon: 0.005  Alpha: 0.000234
Episode: 18670000   Avg reward: 0.880  Avg Delta: 0.00084900  Epsilon: 0.005  Alpha: 0.000234
Episode: 18680000   Avg reward: 0.820  Avg Delta: 0.00071070  Epsilon: 0.005  Alpha: 0.000234
Episode: 18690000   Avg reward: 0.850  Avg Delta: 0.00073360

Episode: 19470000   Avg reward: 0.890  Avg Delta: 0.00059062  Epsilon: 0.005  Alpha: 0.000224
Episode: 19480000   Avg reward: 0.770  Avg Delta: 0.00088576  Epsilon: 0.005  Alpha: 0.000223
Episode: 19490000   Avg reward: 0.890  Avg Delta: 0.00064096  Epsilon: 0.005  Alpha: 0.000223
Episode: 19500000   Avg reward: 0.880  Avg Delta: 0.00089807  Epsilon: 0.005  Alpha: 0.000223
Episode: 19510000   Avg reward: 0.800  Avg Delta: 0.00068139  Epsilon: 0.005  Alpha: 0.000223
Episode: 19520000   Avg reward: 0.850  Avg Delta: 0.00044971  Epsilon: 0.005  Alpha: 0.000223
Episode: 19530000   Avg reward: 0.830  Avg Delta: 0.00113724  Epsilon: 0.005  Alpha: 0.000223
Episode: 19540000   Avg reward: 0.870  Avg Delta: 0.00081511  Epsilon: 0.005  Alpha: 0.000223
Episode: 19550000   Avg reward: 0.840  Avg Delta: 0.00092353  Epsilon: 0.005  Alpha: 0.032749
Episode: 19560000   Avg reward: 0.780  Avg Delta: 0.00097302  Epsilon: 0.005  Alpha: 0.000222
Episode: 19570000   Avg reward: 0.880  Avg Delta: 0.00074879

Episode: 20350000   Avg reward: 0.840  Avg Delta: 0.00070854  Epsilon: 0.005  Alpha: 0.000213
Episode: 20360000   Avg reward: 0.880  Avg Delta: 0.00082194  Epsilon: 0.005  Alpha: 0.007768
Episode: 20370000   Avg reward: 0.890  Avg Delta: 0.00057237  Epsilon: 0.005  Alpha: 0.000213
Episode: 20380000   Avg reward: 0.850  Avg Delta: 0.00051618  Epsilon: 0.005  Alpha: 0.000213
Episode: 20390000   Avg reward: 0.900  Avg Delta: 0.00069499  Epsilon: 0.005  Alpha: 0.000213
Episode: 20400000   Avg reward: 0.860  Avg Delta: 0.00090318  Epsilon: 0.005  Alpha: 0.005175
Episode: 20410000   Avg reward: 0.870  Avg Delta: 0.00092300  Epsilon: 0.005  Alpha: 0.000212
Episode: 20420000   Avg reward: 0.880  Avg Delta: 0.00076249  Epsilon: 0.005  Alpha: 0.005169
Episode: 20430000   Avg reward: 0.860  Avg Delta: 0.00072224  Epsilon: 0.005  Alpha: 0.000212
Episode: 20440000   Avg reward: 0.880  Avg Delta: 0.00065837  Epsilon: 0.005  Alpha: 0.000212
Episode: 20450000   Avg reward: 0.820  Avg Delta: 0.00061589

Episode: 21230000   Avg reward: 0.790  Avg Delta: 0.00090038  Epsilon: 0.005  Alpha: 0.000204
Episode: 21240000   Avg reward: 0.880  Avg Delta: 0.00096461  Epsilon: 0.005  Alpha: 0.000204
Episode: 21250000   Avg reward: 0.890  Avg Delta: 0.00058709  Epsilon: 0.005  Alpha: 0.007399
Episode: 21260000   Avg reward: 0.910  Avg Delta: 0.00042039  Epsilon: 0.005  Alpha: 0.000203
Episode: 21270000   Avg reward: 0.880  Avg Delta: 0.00081292  Epsilon: 0.005  Alpha: 0.000203
Episode: 21280000   Avg reward: 0.880  Avg Delta: 0.00043243  Epsilon: 0.005  Alpha: 0.004931
Episode: 21290000   Avg reward: 0.890  Avg Delta: 0.00091878  Epsilon: 0.005  Alpha: 0.000094
Episode: 21300000   Avg reward: 0.810  Avg Delta: 0.00045715  Epsilon: 0.005  Alpha: 0.000203
Episode: 21310000   Avg reward: 0.800  Avg Delta: 0.00103252  Epsilon: 0.005  Alpha: 0.000203
Episode: 21320000   Avg reward: 0.830  Avg Delta: 0.00089265  Epsilon: 0.005  Alpha: 0.000203
Episode: 21330000   Avg reward: 0.900  Avg Delta: 0.00061879

In [6]:
def test(Q):            
    state = env.reset()
    for step in range(MAX_STEPS):        
        action = epsilon_action(Q, state, 0)          
        state, reward, done, _ = env.step(action) 
        env.render()                           
        if done:
            print("Reward: {}".format(reward))
            break

test(Q_final)
        

  (Up)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFFFFF
F[41mF[0mFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
SFFFFFFF
[41mF[0mFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
SFFFFFFF
[41mF[0mFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
SFFFFFFF
F[41mF[0mFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SF[41mF[0mFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFFFFF
FF[41mF[0mFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
SFFFFFFF
FFF[41mF[0mFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
SFFFFFFF
FF[41mF[0mFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
SFFFFFFF
FFF[41mF[0mFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
SFFFFFFF
FF[41m