In [26]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output


In [27]:
# Build the Gym environment registered under the id "FrozenLake-v0".
env = gym.make("FrozenLake-v0")

In [28]:
# Size of the environment's discrete action space; used as the number of Q-table columns.
actions_count= env.action_space.n

In [29]:
# Size of the environment's discrete observation space; used as the number of Q-table rows.
states_count = env.observation_space.n

In [30]:
# Q-table with one row per state and one column per action, initialised to zero.
q_table = np.zeros((states_count, actions_count))

In [31]:
# Display the Q-table before training (every entry is still zero).
q_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [32]:
# --- Training budget ---
total_num_episodes = 10_000      # number of episodes to train for
max_moves_per_episode = 100      # upper bound on steps taken inside one episode

# --- Q-learning update parameters ---
learning_rate = 0.1              # alpha: weight given to the newly learned value
discount_rate = 0.99             # gamma: weight given to the next state's best Q-value

# --- Epsilon-greedy exploration schedule ---
# exploration_rate is epsilon; it starts at the maximum and is decayed
# exponentially towards the minimum as training progresses.
exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001   # exponent coefficient of the per-episode decay


In [33]:
# Q-learning training loop.
# rewards_from_all_episodes collects the total reward earned in each episode.
rewards_from_all_episodes = []

for episode in range(total_num_episodes):
    # Each episode starts from the environment's reset state.
    state = env.reset()
    episode_done = False  # set by env.step() once the episode has ended
    current_episode_reward = 0

    for step in range(max_moves_per_episode):
        # Epsilon-greedy selection: draw a number in [0, 1]; explore when it
        # does not exceed the current exploration rate, otherwise exploit.
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold <= exploration_rate:
            # Exploration: pick a random action from the action space.
            action = env.action_space.sample()
        else:
            # Exploitation: pick the action with the highest Q-value for this state.
            action = np.argmax(q_table[state, :])

        # Apply the chosen action to the environment.
        new_state, reward, episode_done, info = env.step(action)

        # Q-table update: weighted sum of the old value and the learned value
        # (immediate reward plus discounted best Q-value of the next state).
        old_value = q_table[state, action]
        learned_value = reward + discount_rate * np.max(q_table[new_state, :])
        q_table[state, action] = old_value * (1 - learning_rate) + learning_rate * learned_value

        # Move on to the next state and accumulate this step's reward.
        state = new_state
        current_episode_reward += reward

        if episode_done:
            break

    # Exponentially decay the exploration rate based on the episode index.
    exploration_rate = min_exploration_rate + (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

    # Record this episode's total reward.
    rewards_from_all_episodes.append(current_episode_reward)
    
        

In [34]:
# Summarise training progress as the average reward per block of 1000 episodes.
episodes_per_block = 1000

# Use integer arithmetic for the section count (the float produced by `/` is
# not a valid count) and refuse totals that do not divide evenly, so every
# printed average really covers exactly `episodes_per_block` episodes.
num_blocks, leftover_episodes = divmod(total_num_episodes, episodes_per_block)
if leftover_episodes:
    raise ValueError(
        "total_num_episodes must be a multiple of %d to report per-block averages"
        % episodes_per_block
    )

rewards_per_thousand_episodes = np.split(np.array(rewards_from_all_episodes), num_blocks)
count = episodes_per_block

# A dedicated loop variable avoids overwriting the `reward` name used by the
# training loop.
for block_rewards in rewards_per_thousand_episodes:
    # ndarray.mean() avoids the rounding noise that summing 1000 pre-divided
    # values one by one produces (e.g. 0.05300000000000004 instead of 0.053).
    print('Average reward after', count,'episodes' ,':' ,   str(block_rewards.mean()))
    count += episodes_per_block

Average reward after 1000 episodes : 0.05300000000000004
Average reward after 2000 episodes : 0.20400000000000015
Average reward after 3000 episodes : 0.4200000000000003
Average reward after 4000 episodes : 0.5160000000000003
Average reward after 5000 episodes : 0.6260000000000004
Average reward after 6000 episodes : 0.6640000000000005
Average reward after 7000 episodes : 0.6690000000000005
Average reward after 8000 episodes : 0.6960000000000005
Average reward after 9000 episodes : 0.6820000000000005
Average reward after 10000 episodes : 0.7010000000000005


In [35]:
# Display the Q-table after training to inspect the learned Q-values.
q_table

array([[0.54263621, 0.49091474, 0.5092613 , 0.49189985],
       [0.33805722, 0.26652998, 0.23082938, 0.51051787],
       [0.39525004, 0.29258623, 0.26023072, 0.248822  ],
       [0.03694571, 0.11625917, 0.03232122, 0.04870128],
       [0.56096142, 0.42119434, 0.39961486, 0.39849727],
       [0.        , 0.        , 0.        , 0.        ],
       [0.18341823, 0.14923575, 0.29559957, 0.09857666],
       [0.        , 0.        , 0.        , 0.        ],
       [0.43553631, 0.51469593, 0.39521782, 0.59631836],
       [0.43182487, 0.64528456, 0.43290936, 0.39625022],
       [0.6022408 , 0.38926498, 0.3685924 , 0.21723127],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.56258663, 0.57275832, 0.76829091, 0.36285588],
       [0.75640054, 0.85047867, 0.79942012, 0.77080148],
       [0.        , 0.        , 0.        , 0.        ]])

In [36]:
# Visualize the trained agent playing the game


In [38]:
# Watch the trained agent play 5 episodes, always taking the greedy action
# (the one with the highest Q-value for the current state).
# The try/finally guarantees env.close() runs even if the cell is interrupted
# (e.g. KeyboardInterrupt) or a step raises part-way through.
try:
    for episode in range(5):
        # Initialize parameters for the new episode.
        state = env.reset()
        done = False
        print("Episode", episode+1,"\n\n\n\n")
        time.sleep(1)

        for step in range(max_moves_per_episode):
            # Render the current state, replacing the previous frame.
            clear_output(wait=True)
            env.render()
            time.sleep(0.7)

            # Select the action with the highest Q-value in the Q-table.
            action = np.argmax(q_table[state,:])
            # Take the action.
            new_state,reward,done,info = env.step(action)

            if done:
                # Show the final frame before announcing the outcome.
                clear_output(wait=True)
                env.render()

                # A reward of 1 is treated as reaching the goal; any other
                # finished episode is reported as falling in a hole.
                # NOTE(review): an episode ended for another reason (e.g. a
                # step limit imposed by the environment) would also be
                # reported as a hole - confirm against the environment used.
                if reward ==1:
                    print("YOU REACHED THE GOAL")
                else:
                    print("YOU FELL IN THE HOLE")
                # Pause so the outcome stays visible before the next episode.
                time.sleep(5)
                break

            # Continue from the new state.
            state = new_state
finally:
    env.close()

KeyboardInterrupt: 