In [1]:
import numpy as np
import matplotlib.pyplot as plt
import gym
import random

%matplotlib inline

In [2]:
env = gym.make('FrozenLake8x8-v0')

MAX_STEPS = env.spec.timestep_limit # maximum steps in an episode, 200 for Taxi-v2

NUM_EPISODES = 10000000
GAMMA = 0.95  # Discount factor from Bellman Equation
START_ALPHA = 0.5  # Learning rate, how much we update our Q table each step
ALPHA_TAPER = 0.00001 # How much our adaptive learning rate is adjusted each update
START_EPSILON = 1.0  # Probability of random action
EPSILON_TAPER = 0.00001 # How much epsilon is adjusted each step

render = False

In [3]:
def update_Q(Q, update_counts, alpha, prev_state, action, reward, cur_state):      
    
    return Q, update

def moving_average(values, n=100) :
    ret = np.cumsum(values, dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    return ret[n - 1:] / n

In [4]:
def epsilon_action(Q, s, eps=0.1):
    if random.random() < (1 - eps):
        return np.argmax(Q[s])
    else:
        return env.action_space.sample()

In [None]:
def double_q_learning():
    
    obs_dim = env.observation_space.n # size of our state
    action_dim = env.action_space.n # number of actions

    # Initialize our Q table
    Q = np.zeros((obs_dim, action_dim))
    Q_1 = np.zeros((obs_dim, action_dim))
    Q_2 = np.zeros((obs_dim, action_dim))

    state_visit_counts = {}
    update_counts = np.zeros((obs_dim, action_dim), dtype=np.dtype(int))
    
    rewards_list = []    
    deltas = []
              
    for i_episode in range(NUM_EPISODES): 
        
        eps = START_EPSILON / (1.0 + i_episode * EPSILON_TAPER)
        
        # Play episode
        biggest_change = 0
        reward_per_episode = 0 
        curr_state = env.reset()
        for step in range(MAX_STEPS):          
          
            prev_state = curr_state
            state_visit_counts[prev_state] = state_visit_counts.get(prev_state,0) + 1
            action = epsilon_action(Q, curr_state, eps)           
            curr_state, reward, done, info = env.step(action)
            reward_per_episode += reward
            old_qsa = Q[prev_state][action]
            
            # Update Q
            alpha = START_ALPHA / (1.0 + update_counts[prev_state][action] * ALPHA_TAPER)
            update_counts[prev_state][action] += 1
            Q[prev_state][action] += alpha * (reward + GAMMA * max(Q[curr_state]) - Q[prev_state][action])
            
            if np.random.uniform() < 0.5:           
                Q_1[prev_state][action] += alpha * (reward + GAMMA * max(Q_2[curr_state]) - Q_1[prev_state][action]) 
            else:
                Q_2[prev_state][action] += alpha * (reward + GAMMA * max(Q_1[curr_state]) - Q_2[prev_state][action]) 
            Q = Q_1 + Q_2
            
            biggest_change = max(biggest_change, np.abs(old_qsa - Q[prev_state][action]))
            if done:
                break
        rewards_list.append(reward_per_episode)
        deltas.append(biggest_change)               
            
        avg_reward = np.mean(rewards_list[-500:])     
        if i_episode % 5000 == 0:            
            avg_delta = np.mean(deltas[-500:])  
            print ("Episode: {}   Avg reward: {:3.3f}  Avg Delta: {:8.8f}  Epsilon: {:3.3f}  Alpha: {:6.6f}".format(
                i_episode, avg_reward, avg_delta, eps, alpha))
            
            
        #if len(rewards_list) > 1000:
        #    rewards_list.pop(0)
          
        if avg_reward >= env.spec.reward_threshold: 
            print("########## Solved! ###########")
            break
            
  
    mean_state_visits = np.mean(list(state_visit_counts.values()))
    print("each state was visited on average: ", mean_state_visits, " times.\n")
  

    plt.plot(moving_average(deltas, n=1000))
    plt.show()
    
    plt.plot(moving_average(rewards_list, n=1000))
    plt.show()
    
    return Q
  
            
print("action_space={}".format(env.action_space))
print("obs_space={}".format(env.observation_space))
print("threshold={} \n".format(env.spec.reward_threshold))

Q_final = double_q_learning()
print("Final Q values: {}\n".format(Q_final)) 


action_space=Discrete(4)
obs_space=Discrete(64)
threshold=0.99 

Episode: 0   Avg reward: 0.000  Avg Delta: 0.00000000  Epsilon: 1.000  Alpha: 0.500000
Episode: 5000   Avg reward: 0.000  Avg Delta: 0.03249485  Epsilon: 0.952  Alpha: 0.497423
Episode: 10000   Avg reward: 0.006  Avg Delta: 0.04917654  Epsilon: 0.909  Alpha: 0.499900
Episode: 15000   Avg reward: 0.002  Avg Delta: 0.07379496  Epsilon: 0.870  Alpha: 0.496998
Episode: 20000   Avg reward: 0.008  Avg Delta: 0.09161141  Epsilon: 0.833  Alpha: 0.480123
Episode: 25000   Avg reward: 0.002  Avg Delta: 0.07942969  Epsilon: 0.800  Alpha: 0.492446
Episode: 30000   Avg reward: 0.010  Avg Delta: 0.07823514  Epsilon: 0.769  Alpha: 0.471138
Episode: 35000   Avg reward: 0.008  Avg Delta: 0.09230280  Epsilon: 0.741  Alpha: 0.466457
Episode: 40000   Avg reward: 0.008  Avg Delta: 0.08525815  Epsilon: 0.714  Alpha: 0.484943
Episode: 45000   Avg reward: 0.012  Avg Delta: 0.09035506  Epsilon: 0.690  Alpha: 0.458775
Episode: 50000   Avg reward: 0

Episode: 445000   Avg reward: 0.312  Avg Delta: 0.08843003  Epsilon: 0.183  Alpha: 0.216877
Episode: 450000   Avg reward: 0.260  Avg Delta: 0.08198103  Epsilon: 0.182  Alpha: 0.369355
Episode: 455000   Avg reward: 0.288  Avg Delta: 0.08753006  Epsilon: 0.180  Alpha: 0.436353
Episode: 460000   Avg reward: 0.262  Avg Delta: 0.07871100  Epsilon: 0.179  Alpha: 0.451810
Episode: 465000   Avg reward: 0.276  Avg Delta: 0.07857671  Epsilon: 0.177  Alpha: 0.365625
Episode: 470000   Avg reward: 0.270  Avg Delta: 0.07843787  Epsilon: 0.175  Alpha: 0.201721
Episode: 475000   Avg reward: 0.294  Avg Delta: 0.07853785  Epsilon: 0.174  Alpha: 0.198950
Episode: 480000   Avg reward: 0.266  Avg Delta: 0.08343241  Epsilon: 0.172  Alpha: 0.196188
Episode: 485000   Avg reward: 0.322  Avg Delta: 0.09080321  Epsilon: 0.171  Alpha: 0.385003
Episode: 490000   Avg reward: 0.326  Avg Delta: 0.07961229  Epsilon: 0.169  Alpha: 0.444298
Episode: 495000   Avg reward: 0.276  Avg Delta: 0.07141358  Epsilon: 0.168  Alph

Episode: 895000   Avg reward: 0.432  Avg Delta: 0.05739504  Epsilon: 0.101  Alpha: 0.075588
Episode: 900000   Avg reward: 0.410  Avg Delta: 0.05991377  Epsilon: 0.100  Alpha: 0.074929
Episode: 905000   Avg reward: 0.434  Avg Delta: 0.06743348  Epsilon: 0.100  Alpha: 0.360581
Episode: 910000   Avg reward: 0.452  Avg Delta: 0.05419267  Epsilon: 0.099  Alpha: 0.349350
Episode: 915000   Avg reward: 0.432  Avg Delta: 0.05529320  Epsilon: 0.099  Alpha: 0.072994
Episode: 920000   Avg reward: 0.404  Avg Delta: 0.05690545  Epsilon: 0.098  Alpha: 0.416320
Episode: 925000   Avg reward: 0.482  Avg Delta: 0.05850349  Epsilon: 0.098  Alpha: 0.071720
Episode: 930000   Avg reward: 0.452  Avg Delta: 0.05004782  Epsilon: 0.097  Alpha: 0.071115
Episode: 935000   Avg reward: 0.438  Avg Delta: 0.05819213  Epsilon: 0.097  Alpha: 0.070522
Episode: 940000   Avg reward: 0.450  Avg Delta: 0.05752671  Epsilon: 0.096  Alpha: 0.277146
Episode: 945000   Avg reward: 0.434  Avg Delta: 0.05473528  Epsilon: 0.096  Alph

Episode: 1340000   Avg reward: 0.512  Avg Delta: 0.04568416  Epsilon: 0.069  Alpha: 0.040287
Episode: 1345000   Avg reward: 0.504  Avg Delta: 0.04573505  Epsilon: 0.069  Alpha: 0.262975
Episode: 1350000   Avg reward: 0.470  Avg Delta: 0.04308747  Epsilon: 0.069  Alpha: 0.341402
Episode: 1355000   Avg reward: 0.512  Avg Delta: 0.04723735  Epsilon: 0.069  Alpha: 0.039616
Episode: 1360000   Avg reward: 0.496  Avg Delta: 0.04314697  Epsilon: 0.068  Alpha: 0.226448
Episode: 1365000   Avg reward: 0.506  Avg Delta: 0.04399154  Epsilon: 0.068  Alpha: 0.039180
Episode: 1370000   Avg reward: 0.484  Avg Delta: 0.04378387  Epsilon: 0.068  Alpha: 0.340876
Episode: 1375000   Avg reward: 0.492  Avg Delta: 0.04084968  Epsilon: 0.068  Alpha: 0.038745
Episode: 1380000   Avg reward: 0.542  Avg Delta: 0.04107389  Epsilon: 0.068  Alpha: 0.038526
Episode: 1385000   Avg reward: 0.488  Avg Delta: 0.04407159  Epsilon: 0.067  Alpha: 0.038318
Episode: 1390000   Avg reward: 0.500  Avg Delta: 0.03969718  Epsilon: 

Episode: 1785000   Avg reward: 0.532  Avg Delta: 0.03690489  Epsilon: 0.053  Alpha: 0.026212
Episode: 1790000   Avg reward: 0.564  Avg Delta: 0.03458551  Epsilon: 0.053  Alpha: 0.026106
Episode: 1795000   Avg reward: 0.512  Avg Delta: 0.03510211  Epsilon: 0.053  Alpha: 0.025999
Episode: 1800000   Avg reward: 0.564  Avg Delta: 0.03478578  Epsilon: 0.053  Alpha: 0.025896
Episode: 1805000   Avg reward: 0.604  Avg Delta: 0.03347246  Epsilon: 0.052  Alpha: 0.311226
Episode: 1810000   Avg reward: 0.570  Avg Delta: 0.03554061  Epsilon: 0.052  Alpha: 0.025685
Episode: 1815000   Avg reward: 0.572  Avg Delta: 0.03367009  Epsilon: 0.052  Alpha: 0.025580
Episode: 1820000   Avg reward: 0.564  Avg Delta: 0.03569079  Epsilon: 0.052  Alpha: 0.255021
Episode: 1825000   Avg reward: 0.586  Avg Delta: 0.03308226  Epsilon: 0.052  Alpha: 0.025370
Episode: 1830000   Avg reward: 0.598  Avg Delta: 0.03291899  Epsilon: 0.052  Alpha: 0.350287
Episode: 1835000   Avg reward: 0.612  Avg Delta: 0.03290698  Epsilon: 

Episode: 2230000   Avg reward: 0.634  Avg Delta: 0.02802129  Epsilon: 0.043  Alpha: 0.379861
Episode: 2235000   Avg reward: 0.586  Avg Delta: 0.03176630  Epsilon: 0.043  Alpha: 0.203946
Episode: 2240000   Avg reward: 0.550  Avg Delta: 0.02654732  Epsilon: 0.043  Alpha: 0.158633
Episode: 2245000   Avg reward: 0.624  Avg Delta: 0.03059398  Epsilon: 0.043  Alpha: 0.018837
Episode: 2250000   Avg reward: 0.578  Avg Delta: 0.03362587  Epsilon: 0.043  Alpha: 0.018777
Episode: 2255000   Avg reward: 0.634  Avg Delta: 0.02969542  Epsilon: 0.042  Alpha: 0.293030
Episode: 2260000   Avg reward: 0.624  Avg Delta: 0.02898588  Epsilon: 0.042  Alpha: 0.018659
Episode: 2265000   Avg reward: 0.598  Avg Delta: 0.02884144  Epsilon: 0.042  Alpha: 0.207370
Episode: 2270000   Avg reward: 0.576  Avg Delta: 0.02616894  Epsilon: 0.042  Alpha: 0.168325
Episode: 2275000   Avg reward: 0.578  Avg Delta: 0.02921759  Epsilon: 0.042  Alpha: 0.018489
Episode: 2280000   Avg reward: 0.572  Avg Delta: 0.02817042  Epsilon: 

Episode: 2675000   Avg reward: 0.636  Avg Delta: 0.02392317  Epsilon: 0.036  Alpha: 0.014769
Episode: 2680000   Avg reward: 0.632  Avg Delta: 0.02343354  Epsilon: 0.036  Alpha: 0.014730
Episode: 2685000   Avg reward: 0.606  Avg Delta: 0.02636282  Epsilon: 0.036  Alpha: 0.014693
Episode: 2690000   Avg reward: 0.580  Avg Delta: 0.02456172  Epsilon: 0.036  Alpha: 0.014656
Episode: 2695000   Avg reward: 0.614  Avg Delta: 0.02286171  Epsilon: 0.036  Alpha: 0.014618
Episode: 2700000   Avg reward: 0.608  Avg Delta: 0.02465411  Epsilon: 0.036  Alpha: 0.492170
Episode: 2705000   Avg reward: 0.586  Avg Delta: 0.02663444  Epsilon: 0.036  Alpha: 0.014544
Episode: 2710000   Avg reward: 0.596  Avg Delta: 0.02097156  Epsilon: 0.036  Alpha: 0.175865
Episode: 2715000   Avg reward: 0.614  Avg Delta: 0.02592139  Epsilon: 0.036  Alpha: 0.014470
Episode: 2720000   Avg reward: 0.588  Avg Delta: 0.02549226  Epsilon: 0.035  Alpha: 0.014433
Episode: 2725000   Avg reward: 0.618  Avg Delta: 0.02373932  Epsilon: 

Episode: 3120000   Avg reward: 0.610  Avg Delta: 0.02320516  Epsilon: 0.031  Alpha: 0.011989
Episode: 3125000   Avg reward: 0.590  Avg Delta: 0.02062968  Epsilon: 0.031  Alpha: 0.011964
Episode: 3130000   Avg reward: 0.624  Avg Delta: 0.02241232  Epsilon: 0.031  Alpha: 0.011938
Episode: 3135000   Avg reward: 0.660  Avg Delta: 0.01977696  Epsilon: 0.031  Alpha: 0.011913
Episode: 3140000   Avg reward: 0.656  Avg Delta: 0.02025784  Epsilon: 0.031  Alpha: 0.011888
Episode: 3145000   Avg reward: 0.636  Avg Delta: 0.01779716  Epsilon: 0.031  Alpha: 0.011863
Episode: 3150000   Avg reward: 0.628  Avg Delta: 0.01997147  Epsilon: 0.031  Alpha: 0.330260
Episode: 3155000   Avg reward: 0.612  Avg Delta: 0.02519529  Epsilon: 0.031  Alpha: 0.309366
Episode: 3160000   Avg reward: 0.650  Avg Delta: 0.02125687  Epsilon: 0.031  Alpha: 0.152969
Episode: 3165000   Avg reward: 0.618  Avg Delta: 0.01808185  Epsilon: 0.031  Alpha: 0.011763
Episode: 3170000   Avg reward: 0.650  Avg Delta: 0.02085468  Epsilon: 

Episode: 3565000   Avg reward: 0.672  Avg Delta: 0.01866763  Epsilon: 0.027  Alpha: 0.181405
Episode: 3570000   Avg reward: 0.618  Avg Delta: 0.01913959  Epsilon: 0.027  Alpha: 0.176480
Episode: 3575000   Avg reward: 0.632  Avg Delta: 0.02258793  Epsilon: 0.027  Alpha: 0.010017
Episode: 3580000   Avg reward: 0.626  Avg Delta: 0.01834159  Epsilon: 0.027  Alpha: 0.009999
Episode: 3585000   Avg reward: 0.642  Avg Delta: 0.01939487  Epsilon: 0.027  Alpha: 0.105327
Episode: 3590000   Avg reward: 0.626  Avg Delta: 0.01988074  Epsilon: 0.027  Alpha: 0.009962
Episode: 3595000   Avg reward: 0.648  Avg Delta: 0.01756900  Epsilon: 0.027  Alpha: 0.009944
Episode: 3600000   Avg reward: 0.640  Avg Delta: 0.01855578  Epsilon: 0.027  Alpha: 0.009926
Episode: 3605000   Avg reward: 0.654  Avg Delta: 0.01707166  Epsilon: 0.027  Alpha: 0.009908
Episode: 3610000   Avg reward: 0.598  Avg Delta: 0.01541110  Epsilon: 0.027  Alpha: 0.120629
Episode: 3615000   Avg reward: 0.648  Avg Delta: 0.01835695  Epsilon: 

Episode: 4010000   Avg reward: 0.664  Avg Delta: 0.01523740  Epsilon: 0.024  Alpha: 0.119645
Episode: 4015000   Avg reward: 0.638  Avg Delta: 0.01756939  Epsilon: 0.024  Alpha: 0.008620
Episode: 4020000   Avg reward: 0.648  Avg Delta: 0.01756790  Epsilon: 0.024  Alpha: 0.008606
Episode: 4025000   Avg reward: 0.626  Avg Delta: 0.01738538  Epsilon: 0.024  Alpha: 0.008593
Episode: 4030000   Avg reward: 0.654  Avg Delta: 0.01524473  Epsilon: 0.024  Alpha: 0.392166
Episode: 4035000   Avg reward: 0.630  Avg Delta: 0.01724741  Epsilon: 0.024  Alpha: 0.008565
Episode: 4040000   Avg reward: 0.614  Avg Delta: 0.01677964  Epsilon: 0.024  Alpha: 0.142735
Episode: 4045000   Avg reward: 0.624  Avg Delta: 0.01562861  Epsilon: 0.024  Alpha: 0.392028
Episode: 4050000   Avg reward: 0.692  Avg Delta: 0.01449334  Epsilon: 0.024  Alpha: 0.008524
Episode: 4055000   Avg reward: 0.640  Avg Delta: 0.01634916  Epsilon: 0.024  Alpha: 0.008511
Episode: 4060000   Avg reward: 0.640  Avg Delta: 0.01610514  Epsilon: 

Episode: 4455000   Avg reward: 0.640  Avg Delta: 0.01646421  Epsilon: 0.022  Alpha: 0.007551
Episode: 4460000   Avg reward: 0.642  Avg Delta: 0.01503714  Epsilon: 0.022  Alpha: 0.007540
Episode: 4465000   Avg reward: 0.658  Avg Delta: 0.01345938  Epsilon: 0.022  Alpha: 0.106123
Episode: 4470000   Avg reward: 0.644  Avg Delta: 0.01497607  Epsilon: 0.022  Alpha: 0.142468
Episode: 4475000   Avg reward: 0.636  Avg Delta: 0.01507495  Epsilon: 0.022  Alpha: 0.086397
Episode: 4480000   Avg reward: 0.672  Avg Delta: 0.01313923  Epsilon: 0.022  Alpha: 0.007498
Episode: 4485000   Avg reward: 0.646  Avg Delta: 0.01443865  Epsilon: 0.022  Alpha: 0.007488
Episode: 4490000   Avg reward: 0.648  Avg Delta: 0.01352692  Epsilon: 0.022  Alpha: 0.007477
Episode: 4495000   Avg reward: 0.670  Avg Delta: 0.01279072  Epsilon: 0.022  Alpha: 0.086166
Episode: 4500000   Avg reward: 0.644  Avg Delta: 0.01465883  Epsilon: 0.022  Alpha: 0.007456
Episode: 4505000   Avg reward: 0.626  Avg Delta: 0.01496486  Epsilon: 

Episode: 4900000   Avg reward: 0.662  Avg Delta: 0.01332178  Epsilon: 0.020  Alpha: 0.006699
Episode: 4905000   Avg reward: 0.668  Avg Delta: 0.01422170  Epsilon: 0.020  Alpha: 0.006691
Episode: 4910000   Avg reward: 0.686  Avg Delta: 0.01436068  Epsilon: 0.020  Alpha: 0.006682
Episode: 4915000   Avg reward: 0.686  Avg Delta: 0.01247615  Epsilon: 0.020  Alpha: 0.006673
Episode: 4920000   Avg reward: 0.688  Avg Delta: 0.01434154  Epsilon: 0.020  Alpha: 0.006665
Episode: 4925000   Avg reward: 0.638  Avg Delta: 0.01232711  Epsilon: 0.020  Alpha: 0.093570
Episode: 4930000   Avg reward: 0.724  Avg Delta: 0.01145655  Epsilon: 0.020  Alpha: 0.078929
Episode: 4935000   Avg reward: 0.690  Avg Delta: 0.01117710  Epsilon: 0.020  Alpha: 0.099559
Episode: 4940000   Avg reward: 0.648  Avg Delta: 0.01243336  Epsilon: 0.020  Alpha: 0.006631
Episode: 4945000   Avg reward: 0.678  Avg Delta: 0.01229918  Epsilon: 0.020  Alpha: 0.168399
Episode: 4950000   Avg reward: 0.672  Avg Delta: 0.01450810  Epsilon: 

Episode: 5345000   Avg reward: 0.660  Avg Delta: 0.01439964  Epsilon: 0.018  Alpha: 0.006012
Episode: 5350000   Avg reward: 0.678  Avg Delta: 0.01344941  Epsilon: 0.018  Alpha: 0.088021
Episode: 5355000   Avg reward: 0.642  Avg Delta: 0.01222979  Epsilon: 0.018  Alpha: 0.005998
Episode: 5360000   Avg reward: 0.664  Avg Delta: 0.01181512  Epsilon: 0.018  Alpha: 0.005991
Episode: 5365000   Avg reward: 0.632  Avg Delta: 0.01227140  Epsilon: 0.018  Alpha: 0.087542
Episode: 5370000   Avg reward: 0.676  Avg Delta: 0.01289750  Epsilon: 0.018  Alpha: 0.005977
Episode: 5375000   Avg reward: 0.722  Avg Delta: 0.01225945  Epsilon: 0.018  Alpha: 0.005970
Episode: 5380000   Avg reward: 0.666  Avg Delta: 0.01316200  Epsilon: 0.018  Alpha: 0.005963
Episode: 5385000   Avg reward: 0.668  Avg Delta: 0.01101036  Epsilon: 0.018  Alpha: 0.351872
Episode: 5390000   Avg reward: 0.694  Avg Delta: 0.01230470  Epsilon: 0.018  Alpha: 0.005950
Episode: 5395000   Avg reward: 0.670  Avg Delta: 0.01050067  Epsilon: 

Episode: 5790000   Avg reward: 0.672  Avg Delta: 0.01053062  Epsilon: 0.017  Alpha: 0.005448
Episode: 5795000   Avg reward: 0.706  Avg Delta: 0.01048773  Epsilon: 0.017  Alpha: 0.005443
Episode: 5800000   Avg reward: 0.656  Avg Delta: 0.01086754  Epsilon: 0.017  Alpha: 0.005437
Episode: 5805000   Avg reward: 0.654  Avg Delta: 0.01078063  Epsilon: 0.017  Alpha: 0.194330
Episode: 5810000   Avg reward: 0.664  Avg Delta: 0.00913752  Epsilon: 0.017  Alpha: 0.194151
Episode: 5815000   Avg reward: 0.692  Avg Delta: 0.00951993  Epsilon: 0.017  Alpha: 0.005420
Episode: 5820000   Avg reward: 0.668  Avg Delta: 0.01177991  Epsilon: 0.017  Alpha: 0.079285
Episode: 5825000   Avg reward: 0.676  Avg Delta: 0.01321605  Epsilon: 0.017  Alpha: 0.005409
Episode: 5830000   Avg reward: 0.632  Avg Delta: 0.01069351  Epsilon: 0.017  Alpha: 0.005403
Episode: 5835000   Avg reward: 0.674  Avg Delta: 0.01195160  Epsilon: 0.017  Alpha: 0.114833
Episode: 5840000   Avg reward: 0.666  Avg Delta: 0.01128910  Epsilon: 

Episode: 6235000   Avg reward: 0.678  Avg Delta: 0.00800558  Epsilon: 0.016  Alpha: 0.004979
Episode: 6240000   Avg reward: 0.696  Avg Delta: 0.01071610  Epsilon: 0.016  Alpha: 0.226668
Episode: 6245000   Avg reward: 0.680  Avg Delta: 0.00913741  Epsilon: 0.016  Alpha: 0.004969
Episode: 6250000   Avg reward: 0.678  Avg Delta: 0.01125482  Epsilon: 0.016  Alpha: 0.073745
Episode: 6255000   Avg reward: 0.672  Avg Delta: 0.01081481  Epsilon: 0.016  Alpha: 0.004960
Episode: 6260000   Avg reward: 0.658  Avg Delta: 0.01112518  Epsilon: 0.016  Alpha: 0.073513
Episode: 6265000   Avg reward: 0.690  Avg Delta: 0.00880992  Epsilon: 0.016  Alpha: 0.061098
Episode: 6270000   Avg reward: 0.710  Avg Delta: 0.00917187  Epsilon: 0.016  Alpha: 0.004946
Episode: 6275000   Avg reward: 0.664  Avg Delta: 0.01024885  Epsilon: 0.016  Alpha: 0.289179
Episode: 6280000   Avg reward: 0.690  Avg Delta: 0.01184670  Epsilon: 0.016  Alpha: 0.345619
Episode: 6285000   Avg reward: 0.656  Avg Delta: 0.01055372  Epsilon: 

Episode: 6680000   Avg reward: 0.688  Avg Delta: 0.01067332  Epsilon: 0.015  Alpha: 0.004581
Episode: 6685000   Avg reward: 0.672  Avg Delta: 0.00839597  Epsilon: 0.015  Alpha: 0.076172
Episode: 6690000   Avg reward: 0.648  Avg Delta: 0.00941984  Epsilon: 0.015  Alpha: 0.102546
Episode: 6695000   Avg reward: 0.672  Avg Delta: 0.01135726  Epsilon: 0.015  Alpha: 0.004569
Episode: 6700000   Avg reward: 0.688  Avg Delta: 0.00868633  Epsilon: 0.015  Alpha: 0.076115
Episode: 6705000   Avg reward: 0.716  Avg Delta: 0.00995510  Epsilon: 0.015  Alpha: 0.004561
Episode: 6710000   Avg reward: 0.686  Avg Delta: 0.00931207  Epsilon: 0.015  Alpha: 0.004557
Episode: 6715000   Avg reward: 0.670  Avg Delta: 0.01033057  Epsilon: 0.015  Alpha: 0.082193
Episode: 6720000   Avg reward: 0.678  Avg Delta: 0.00926206  Epsilon: 0.015  Alpha: 0.004548
Episode: 6725000   Avg reward: 0.672  Avg Delta: 0.00941923  Epsilon: 0.015  Alpha: 0.081977
Episode: 6730000   Avg reward: 0.706  Avg Delta: 0.01087407  Epsilon: 

Episode: 7125000   Avg reward: 0.690  Avg Delta: 0.00846342  Epsilon: 0.014  Alpha: 0.211715
Episode: 7130000   Avg reward: 0.682  Avg Delta: 0.00893036  Epsilon: 0.014  Alpha: 0.004237
Episode: 7135000   Avg reward: 0.700  Avg Delta: 0.00854118  Epsilon: 0.014  Alpha: 0.004234
Episode: 7140000   Avg reward: 0.718  Avg Delta: 0.00802470  Epsilon: 0.014  Alpha: 0.004230
Episode: 7145000   Avg reward: 0.694  Avg Delta: 0.00856850  Epsilon: 0.014  Alpha: 0.004227
Episode: 7150000   Avg reward: 0.710  Avg Delta: 0.00891904  Epsilon: 0.014  Alpha: 0.004223
Episode: 7155000   Avg reward: 0.682  Avg Delta: 0.00881939  Epsilon: 0.014  Alpha: 0.052065
Episode: 7160000   Avg reward: 0.706  Avg Delta: 0.00899123  Epsilon: 0.014  Alpha: 0.004216
Episode: 7165000   Avg reward: 0.670  Avg Delta: 0.00784819  Epsilon: 0.014  Alpha: 0.004213
Episode: 7170000   Avg reward: 0.680  Avg Delta: 0.00867943  Epsilon: 0.014  Alpha: 0.004209
Episode: 7175000   Avg reward: 0.662  Avg Delta: 0.01059081  Epsilon: 

Episode: 7570000   Avg reward: 0.710  Avg Delta: 0.00913496  Epsilon: 0.013  Alpha: 0.003946
Episode: 7575000   Avg reward: 0.692  Avg Delta: 0.00882938  Epsilon: 0.013  Alpha: 0.069675
Episode: 7580000   Avg reward: 0.696  Avg Delta: 0.00684376  Epsilon: 0.013  Alpha: 0.003940
Episode: 7585000   Avg reward: 0.714  Avg Delta: 0.00864338  Epsilon: 0.013  Alpha: 0.003937
Episode: 7590000   Avg reward: 0.702  Avg Delta: 0.00691577  Epsilon: 0.013  Alpha: 0.003934
Episode: 7595000   Avg reward: 0.674  Avg Delta: 0.00914662  Epsilon: 0.013  Alpha: 0.003931
Episode: 7600000   Avg reward: 0.690  Avg Delta: 0.00834832  Epsilon: 0.013  Alpha: 0.003928
Episode: 7605000   Avg reward: 0.674  Avg Delta: 0.00995227  Epsilon: 0.013  Alpha: 0.003925
Episode: 7610000   Avg reward: 0.628  Avg Delta: 0.00780961  Epsilon: 0.013  Alpha: 0.003922
Episode: 7615000   Avg reward: 0.690  Avg Delta: 0.00856397  Epsilon: 0.013  Alpha: 0.003919
Episode: 7620000   Avg reward: 0.720  Avg Delta: 0.00793511  Epsilon: 

Episode: 8015000   Avg reward: 0.740  Avg Delta: 0.00670578  Epsilon: 0.012  Alpha: 0.003688
Episode: 8020000   Avg reward: 0.684  Avg Delta: 0.00858817  Epsilon: 0.012  Alpha: 0.003685
Episode: 8025000   Avg reward: 0.732  Avg Delta: 0.00726171  Epsilon: 0.012  Alpha: 0.003683
Episode: 8030000   Avg reward: 0.686  Avg Delta: 0.00717432  Epsilon: 0.012  Alpha: 0.003680
Episode: 8035000   Avg reward: 0.650  Avg Delta: 0.00894146  Epsilon: 0.012  Alpha: 0.003677
Episode: 8040000   Avg reward: 0.724  Avg Delta: 0.00691303  Epsilon: 0.012  Alpha: 0.285238
Episode: 8045000   Avg reward: 0.690  Avg Delta: 0.00730454  Epsilon: 0.012  Alpha: 0.045021
Episode: 8050000   Avg reward: 0.706  Avg Delta: 0.00798142  Epsilon: 0.012  Alpha: 0.003669
Episode: 8055000   Avg reward: 0.720  Avg Delta: 0.00786511  Epsilon: 0.012  Alpha: 0.064322
Episode: 8060000   Avg reward: 0.686  Avg Delta: 0.00872046  Epsilon: 0.012  Alpha: 0.060877
Episode: 8065000   Avg reward: 0.714  Avg Delta: 0.00770547  Epsilon: 

Episode: 8460000   Avg reward: 0.706  Avg Delta: 0.00598801  Epsilon: 0.012  Alpha: 0.003461
Episode: 8465000   Avg reward: 0.702  Avg Delta: 0.00750885  Epsilon: 0.012  Alpha: 0.053045
Episode: 8470000   Avg reward: 0.690  Avg Delta: 0.00763705  Epsilon: 0.012  Alpha: 0.003456
Episode: 8475000   Avg reward: 0.704  Avg Delta: 0.00639044  Epsilon: 0.012  Alpha: 0.058268
Episode: 8480000   Avg reward: 0.690  Avg Delta: 0.00697119  Epsilon: 0.012  Alpha: 0.003452
Episode: 8485000   Avg reward: 0.728  Avg Delta: 0.00768582  Epsilon: 0.012  Alpha: 0.003449
Episode: 8490000   Avg reward: 0.680  Avg Delta: 0.00839265  Epsilon: 0.012  Alpha: 0.003447
Episode: 8495000   Avg reward: 0.664  Avg Delta: 0.00818291  Epsilon: 0.012  Alpha: 0.003445
Episode: 8500000   Avg reward: 0.692  Avg Delta: 0.00915998  Epsilon: 0.012  Alpha: 0.003442
Episode: 8505000   Avg reward: 0.740  Avg Delta: 0.00637498  Epsilon: 0.012  Alpha: 0.003440
Episode: 8510000   Avg reward: 0.708  Avg Delta: 0.00650351  Epsilon: 

Episode: 8905000   Avg reward: 0.706  Avg Delta: 0.00707184  Epsilon: 0.011  Alpha: 0.056134
Episode: 8910000   Avg reward: 0.758  Avg Delta: 0.00569916  Epsilon: 0.011  Alpha: 0.003258
Episode: 8915000   Avg reward: 0.694  Avg Delta: 0.00877812  Epsilon: 0.011  Alpha: 0.039507
Episode: 8920000   Avg reward: 0.700  Avg Delta: 0.00852590  Epsilon: 0.011  Alpha: 0.362684
Episode: 8925000   Avg reward: 0.688  Avg Delta: 0.00609727  Epsilon: 0.011  Alpha: 0.003251
Episode: 8930000   Avg reward: 0.684  Avg Delta: 0.00751626  Epsilon: 0.011  Alpha: 0.138955
Episode: 8935000   Avg reward: 0.724  Avg Delta: 0.00575201  Epsilon: 0.011  Alpha: 0.003247
Episode: 8940000   Avg reward: 0.718  Avg Delta: 0.00665605  Epsilon: 0.011  Alpha: 0.003245
Episode: 8945000   Avg reward: 0.720  Avg Delta: 0.00766556  Epsilon: 0.011  Alpha: 0.003243
Episode: 8950000   Avg reward: 0.678  Avg Delta: 0.00704415  Epsilon: 0.011  Alpha: 0.003241
Episode: 8955000   Avg reward: 0.686  Avg Delta: 0.00830941  Epsilon: 

Episode: 9350000   Avg reward: 0.692  Avg Delta: 0.00673035  Epsilon: 0.011  Alpha: 0.046453
Episode: 9355000   Avg reward: 0.714  Avg Delta: 0.00593563  Epsilon: 0.011  Alpha: 0.003078
Episode: 9360000   Avg reward: 0.698  Avg Delta: 0.00685080  Epsilon: 0.011  Alpha: 0.003076
Episode: 9365000   Avg reward: 0.702  Avg Delta: 0.00713275  Epsilon: 0.011  Alpha: 0.272763
Episode: 9370000   Avg reward: 0.700  Avg Delta: 0.00748070  Epsilon: 0.011  Alpha: 0.054109
Episode: 9375000   Avg reward: 0.698  Avg Delta: 0.00691663  Epsilon: 0.011  Alpha: 0.003070
Episode: 9380000   Avg reward: 0.710  Avg Delta: 0.00571380  Epsilon: 0.011  Alpha: 0.003069
Episode: 9385000   Avg reward: 0.716  Avg Delta: 0.00629156  Epsilon: 0.011  Alpha: 0.003067
Episode: 9390000   Avg reward: 0.696  Avg Delta: 0.00593074  Epsilon: 0.011  Alpha: 0.046212
Episode: 9395000   Avg reward: 0.744  Avg Delta: 0.00589141  Epsilon: 0.011  Alpha: 0.003063
Episode: 9400000   Avg reward: 0.712  Avg Delta: 0.00554878  Epsilon: 

In [None]:
def test(Q):            
    state = env.reset()
    for step in range(MAX_STEPS):        
        action = epsilon_action(Q, state, 0)          
        state, reward, done, _ = env.step(action) 
        env.render()                           
        if done:
            print("Reward: {}".format(reward))
            break

test(Q_final)
        