In [None]:
import gym
import random
import time
import numpy as np
from IPython.display import clear_output
# Build the FrozenLake-v0 environment and keep the object exposed by its `.env`
# attribute (presumably the inner environment beneath gym's wrappers, such as
# the step-limit wrapper -- TODO confirm against the installed gym version).
env = gym.make("FrozenLake-v0").env
# Render the environment once so its current layout appears in the cell output.
env.render()

In [None]:
# Sizes of the environment's discrete action and observation spaces; the
# Q-table created later is dimensioned from these two values.
action_space_size, state_space_size = env.action_space.n, env.observation_space.n
print(action_space_size, state_space_size, sep="\n")

In [None]:
# One row per state and one column per action, every estimate starting at zero
# (16 by 4 for this environment, per the original note).
table_shape = (state_space_size, action_space_size)
q_table = np.zeros(table_shape)
print(q_table)

In [None]:
# --- Training schedule ---
epochs = 100000  # number of training episodes
max_steps = 200  # upper bound on steps taken within a single episode

# --- Q-learning update ---
learning_rate = 0.09
discount_rate = 0.99  # works best

# --- Epsilon-greedy exploration schedule ---
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001
# The current rate starts at the maximum and is decayed by the training cell.
exploration_rate = max_exploration_rate

In [None]:
# Train the Q-table with epsilon-greedy Q-learning.
#
# Reads:  env, q_table, epochs, max_steps, learning_rate, discount_rate,
#         exploration_rate, max_exploration_rate, min_exploration_rate,
#         exploration_decay_rate
# Writes: q_table (in place), exploration_rate, rewards_all, rewards_print,
#         accuracy
#
# NOTE: q_table is updated in place, so re-running this cell continues from the
# current table instead of starting from scratch.
report_every = 500  # episodes per reporting chunk; the summary cells assume 500

rewards_all = []

for episode in range(epochs):
    state = env.reset()
    done = False
    current_reward = 0
    for step in range(max_steps):
        # Epsilon-greedy choice: exploit the table when the random draw exceeds
        # the current exploration rate, otherwise sample a random action.
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # Q-learning update: blend the old estimate with the bootstrapped
        # target. The target always bootstraps from new_state, including on the
        # step that ends the episode.
        best_next_value = np.max(q_table[new_state, :])
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
            learning_rate * (reward + discount_rate * best_next_value)

        state = new_state
        current_reward += reward

        if done:
            break

    # Exponentially decay exploration from the max rate towards the min rate
    # as a function of the episode index.
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

    rewards_all.append(current_reward)

# Split the per-episode rewards into consecutive chunks of `report_every`
# episodes. Slicing (rather than np.split with epochs/500) avoids passing a
# float section count and tolerates an episode count that is not an exact
# multiple of the chunk size: the final chunk is simply shorter.
rewards_array = np.array(rewards_all)
rewards_print = [
    rewards_array[start:start + report_every]
    for start in range(0, len(rewards_array), report_every)
]

# Mean reward per chunk; the test cell treats a final reward of 1 as a win, so
# this reads as the fraction of winning episodes in each chunk.
accuracy = [chunk.mean() for chunk in rewards_print]

In [None]:
# Print one row per chunk, labelled with the cumulative episode count at the
# end of that chunk (500, 1000, 1500, ...).
print("*******Average accuracy per 500 episodes******\n")
for chunk_number, chunk_accuracy in enumerate(accuracy, start=1):
    print(chunk_number * 500, ": ", chunk_accuracy)

In [None]:
# Report the best per-chunk average and how many rows it sits above the last
# row of the accuracy table.
# The offset is derived from len(accuracy) rather than epochs/500 so that it is
# an integer and stays correct even if `epochs` no longer matches the run that
# produced `accuracy`.
best_row = int(np.argmax(accuracy))
rows_to_last = len(accuracy) - (1 + best_row)
print("max:", np.amax(accuracy), "at:", rows_to_last, "to last row")

Trials:
* 0.806 & 0.802
    * epochs = 50000
    * max_steps = 200
    * learning_rate = 0.09
    * discount_rate = 0.99 
    * exploration_rate = 1
    * max_exploration_rate = 1
    * min_exploration_rate = 0.01
    * exploration_decay_rate = 0.001   
* 0.782 & 0.784
    * epochs = 10000 (same as above, but with 1/5 the number of epochs)
    * max_steps = 200
    * learning_rate = 0.09
    * discount_rate = 0.99 
    * exploration_rate = 1
    * max_exploration_rate = 1
    * min_exploration_rate = 0.01
    * exploration_decay_rate = 0.001 
* 0.818 & 0.828 & 0.828 = 3 min
    * epochs = 100000 (10x the number of epochs of the previous trial)
    * max_steps = 300
    * learning_rate = 0.09
    * discount_rate = 0.99 
    * exploration_rate = 1
    * max_exploration_rate = 1
    * min_exploration_rate = 0.01
    * exploration_decay_rate = 0.001 
* 0.79 = 35min
    * epochs = 100000 (same number of epochs as the trial above)
    * max_steps = 300
    * learning_rate = 0.09
    * discount_rate = 0.99 
    * exploration_rate = 1
    * max_exploration_rate = 1
    * min_exploration_rate = 0.01
    * exploration_decay_rate = 0.001 



In [None]:
# Show the Q-table as it stands at this point in the notebook
# (one row per state, one column per action).
print(q_table)

Testing the model

- **To see the AI actually play, uncomment the `clear_output`, `env.render`, `print`, and `sleep` lines in the cell below.**

In [None]:
# Evaluate the learned table by always taking the highest-valued action
# (no exploration) and recording "Win" or "Loss" for each episode.
#
# Each rollout is capped at max_steps, matching the training loop. The
# environment is created through `.env`, which presumably bypasses gym's own
# step limit (TODO confirm), so an uncapped `while not done` loop could run
# indefinitely. An episode that hits the cap without finishing is a loss.
n_test_episodes = 1000

win_loss = []
from time import sleep
for episodes in range(n_test_episodes):
    state = env.reset()
    done = False
    reward = 0  # defined up front so the check below is safe even with zero steps
    #time.sleep(2)
    for step in range(max_steps):
        action = np.argmax(q_table[state,:])
        new_state, reward, done, info = env.step(action)
        #clear_output(wait = True)
        #env.render()
        #sleep(.3)
        state = new_state
        if done:
            break
    # A final reward of 1 is counted as a win; anything else is a loss.
    if reward == 1:
        win_loss.append("Win")
        #print("******YOU WIN CONGRATZ!!!******")
        #time.sleep(1)
    else:
        win_loss.append("Loss")
        #print("******YOU LOSE******")
        #time.sleep(1)
    #sleep(2)
        

In [None]:
# Tally the evaluation outcomes: every entry that is not "Win" counts as a loss.
w = win_loss.count("Win")
l = len(win_loss) - w
print("Wins:", w, "Losses:", l)