This notebook implements Q-learning on an environment provided by OpenAI Gym. It does not define the
environment itself: the actions, rewards and states all come from Gym.

In [250]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output

In [251]:
#Load the gym from OpenAI and look at some of the information it holds on the environment

In [252]:
# Create the FrozenLake environment with is_slippery=False.
env = gym.make("FrozenLake-v1", is_slippery=False)

# Report the sizes of the discrete action and state spaces and the reward range
# that the environment advertises.
for description, value in (
    ("Size of action space: ", env.action_space.n),
    ("Size of state space: ", env.observation_space.n),
    ("Potential rewards: ", env.reward_range),
):
    print(description, value)

Size of action space:  4
Size of state space:  16
Potential rewards:  (0, 1)


In [253]:
#Now we have the environment loaded we can build the initial Q-table.  This is a 2-d grid of actions*states

In [254]:
# Dimensions of the Q-table: one row per state, one column per action.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# Every state/action pair starts with a Q-value of zero.
q_table = np.zeros(shape=(state_space_size, action_space_size))
q_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [255]:
#Now we can set up the initial parameters to use

In [256]:
# --- Training schedule ---
# Number of episodes used to train the agent.
num_episodes = 20000
# Upper bound on the number of steps in one episode, so an episode cannot go on forever.
max_steps_per_episode = 100

# --- Q-value update ---
# Weight given to the newly learned value when a Q-value is updated;
# the previous Q-value keeps a weight of (1 - learning_rate).
learning_rate = 0.1
# Discount applied to the best Q-value of the next state, so that immediate
# rewards count for more than future ones.
discount_rate = 0.99

# --- Exploration versus exploitation ---
# Current exploration rate; at 1 every action is chosen at random.
exploration_rate = 1
# Highest value the exploration rate can take (always explore).
max_exploration_rate = 1
# Lowest value the exploration rate decays towards; 0.1 means a random action
# is still taken about 10% of the time.
min_exploration_rate = 0.1
# Speed of the exponential decay of the exploration rate across episodes.
exploration_decay_rate = 0.01

In [257]:
#Create a list to store the rewards for each episode

In [258]:
# Total reward collected in each training episode, in episode order.
rewards_all_episodes = list()

In [259]:
# Train the agent with tabular Q-learning and an epsilon-greedy policy.
#
# NOTE: this cell is written against the classic gym API, in which env.reset()
# returns the observation directly and env.step() returns the 4-tuple
# (observation, reward, done, info).
#
# q_table is updated in place, so running this cell again continues training
# from the values already in the table.

# Start each run of this cell with an empty reward history. Without this, running
# the cell a second time appends to the previous run's rewards and the
# per-thousand-episode report (which assumes num_episodes entries) becomes wrong.
rewards_all_episodes = []

for episode in range(num_episodes):
    state = env.reset()                # put the agent back on the start state
    done = False                       # the new episode has not finished yet
    rewards_current_episode = 0        # reward accumulated during this episode

    for step in range(max_steps_per_episode):
        # Epsilon-greedy choice: exploit when the random draw exceeds the current
        # exploration rate, otherwise explore with a random action.
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            # np.argmax returns the first index on ties, so a row of all zeros
            # always selects action 0.
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # Q-learning update: blend the old Q-value with the learned value
        # (immediate reward plus the discounted best Q-value of the next state).
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
            learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))

        state = new_state
        rewards_current_episode += reward

        if done:                       # the environment reported the episode as finished
            break

    # Decay the exploration rate exponentially with the episode index, from
    # max_exploration_rate towards min_exploration_rate.
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate*episode)

    # Record the total reward obtained in this episode.
    rewards_all_episodes.append(rewards_current_episode)

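The equation for updating the Q-value is:

$$Q_{\text{new}}(s, a) = (1 - \alpha)\, Q(s, a) + \alpha \left( r + \gamma \max_{a'} Q(s', a') \right)$$

where $\alpha$ is the learning rate, $\gamma$ is the discount rate, $r$ is the reward received and $s'$ is the new state.

In words, the new Q-value for the current state-action pair
= (1 - learning rate) * original Q-value (old value)
+ learning rate * expected return (learned value)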

In [260]:
# Calculate and print the average reward per thousand episodes.
#
# The reward history is cut into consecutive chunks of EPISODES_PER_CHUNK
# episodes by slicing on its actual length. This avoids passing a float section
# count to np.split and keeps working when the number of recorded episodes is
# not an exact multiple of the chunk size (the last chunk is then shorter).
EPISODES_PER_CHUNK = 1000

rewards_array = np.asarray(rewards_all_episodes, dtype=float)
rewards_per_thousand_episodes = [
    rewards_array[start:start + EPISODES_PER_CHUNK]
    for start in range(0, len(rewards_array), EPISODES_PER_CHUNK)
]

print("********Average reward per thousand episodes********\n")
count = 0
for r in rewards_per_thousand_episodes:
    count += len(r)                       # number of episodes covered so far
    # Use the chunk mean rather than summing r/1000 element by element, which
    # accumulates floating-point error and assumes every chunk has 1000 entries.
    print(count, ": ", str(r.mean()))

********Average reward per thousand episodes********

1000 :  0.0
2000 :  0.0
3000 :  0.0
4000 :  0.0
5000 :  0.001
6000 :  0.0
7000 :  0.0
8000 :  0.0
9000 :  0.0
10000 :  0.6000000000000004
11000 :  0.8960000000000007
12000 :  0.8950000000000007
13000 :  0.8960000000000007
14000 :  0.9090000000000007
15000 :  0.8960000000000007
16000 :  0.9000000000000007
17000 :  0.8910000000000007
18000 :  0.8950000000000007
19000 :  0.8840000000000007
20000 :  0.8880000000000007


In [261]:
# Show the current contents of the Q-table underneath a banner line.
print("\n\n********Q-table********\n", q_table, sep="\n")



********Q-table********

[[0.94148015 0.95099005 0.93206535 0.94148015]
 [0.94148015 0.         0.13701068 0.52961563]
 [0.4844953  0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.95099005 0.96059601 0.         0.94148015]
 [0.         0.         0.         0.        ]
 [0.         0.58844143 0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.96059601 0.         0.970299   0.95099005]
 [0.96059601 0.9801     0.96059601 0.        ]
 [0.970299   0.8248959  0.         0.21412399]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.9801     0.99       0.970299  ]
 [0.9801     0.99       1.         0.96059601]
 [0.         0.         0.         0.        ]]


In [263]:
# Watch the agent play five episodes, always taking the action with the highest
# Q-value for the current state (no exploration).
for episode in range(5):
    state = env.reset()
    done = False
    print("*****EPISODE ", episode+1, "*****\n\n\n\n")
    time.sleep(1)

    for step in range(max_steps_per_episode):
        # Redraw the board in place and pause so each move is visible.
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)

        action = np.argmax(q_table[state,:])
        new_state, reward, done, info = env.step(action)

        if not done:
            # Episode still running: advance to the next state and keep going.
            state = new_state
            continue

        # Episode finished: draw the final board and announce the outcome.
        # Any finished episode that did not yield a reward of 1 is reported as a fall.
        clear_output(wait=True)
        env.render()
        reached_goal = reward == 1
        if reached_goal:
            print("****You reached the goal!****")
        else:
            print("****You fell through a hole!****")
        time.sleep(3)
        if not reached_goal:
            clear_output(wait=True)
        break

env.close()


  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
****You reached the goal!****
