<h1>Solving Frozen Lake Environment</h1>

In [1]:
import numpy as np
import gym

In [2]:
env = gym.make("FrozenLake-v0")



[2017-07-16 17:01:01,076] Making new env: FrozenLake-v0


In [3]:
n_states = env.observation_space.n
n_actions = env.action_space.n
print(n_states)
print(n_actions)

16
4


In [4]:
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


<h1>How good does performing randomly do?</h1>


In [5]:
state = env.reset()

reward = None
done = None

g = 0
episodes = 0
rewardTracker = []

while reward != 1:
    state, reward, done, info = env.step(env.action_space.sample())
    g += reward
    if done == True:
        rewardTracker.append(g)
        state = env.reset()
        episodes += 1
print("Reached goal after {} episodes with a average return of {}".format(episodes, sum(rewardTracker)/len(rewardTracker)))


Reached goal after 68 episodes with a average return of 0.014705882352941176


<h1>How good does previously used Q Learning work?</h1>

In [6]:
episodes = 1000
rewardTracker = []

Q = np.zeros([n_states, n_states])

G = 0
alpha = 0.618

for episode in range(1,episodes+1):
    done = False
    G, reward = 0,0
    state = env.reset()
    while done != True:
        action = np.argmax(Q[state]) 
        state2, reward, done, info = env.step(action) 
        Q[state,action] += alpha * ((reward + (np.max(Q[state2]))  - Q[state,action]))
        G += reward
        state = state2
    rewardTracker.append(G)
    
print("Average return of {}".format(sum(rewardTracker)/len(rewardTracker)))

Average return of 0.0


In [7]:
np.sum(Q)

0.0

<h1>Since we do not receive a negative reward for each action, all our agent ever does is try action 0 and doesn't explore other actions</h1>


<h1>We can create a epsilon greedy policy that will choose a random action at a given epsilon percentage</h1>

In [8]:
epsilon = 0.5 
episodes = 5000
rewardTracker = []

Q = np.zeros([n_states, n_states])

G = 0
alpha = 0.618

for episode in range(1,episodes+1):
    done = False
    G, reward = 0,0
    state = env.reset()
    while done != True:
        
        if np.random.rand() > epsilon:
            action = np.argmax(Q[state])
        else:
            action = env.action_space.sample()
            
        state2, reward, done, info = env.step(action) 
        Q[state,action] += alpha * ((reward + (np.max(Q[state2]))  - Q[state,action]))
        G += reward
        state = state2
    rewardTracker.append(G)
    
print("Average return of {}".format(sum(rewardTracker)/len(rewardTracker)))

Average return of 0.0356


<h1>Still not very impressive</h1>

<h1>Let's add reward discounting and a decaying epsilon</h1>

In [10]:
epsilon = 1
gamma = 0.95

rewardTracker = []
Q = np.zeros([n_states, n_states])
episodes = 5000
G = 0
alpha = 0.618

for episode in range(1,episodes+1):
    done = False
    G, reward = 0,0
    state = env.reset()
    while done != True:
        
        if np.random.rand() > epsilon:
            action = np.argmax(Q[state])
        else:
            action = env.action_space.sample()
            epsilon -= 10**-3
            
        state2, reward, done, info = env.step(action) 
        Q[state,action] += alpha * ((reward + gamma * (np.max(Q[state2])) -  Q[state,action]))
        G += reward
        state = state2
    rewardTracker.append(G)

if (sum(rewardTracker[episode-100:episode])/100.0) > .78:
            print('-------------------------------------------------------')
            print('Solved after {} episodes with average return of {}'.format(episode-100, sum(rewardTracker[episode-100:episodeNum])/100.0))
    
print("Average return of {}".format(sum(rewardTracker)/len(rewardTracker)))

Average return of 0.4158


<h1>We are getting closer but we need to define a more appropriate epsilon greedy policy</h1>

In [11]:
def e_greedy(eps, Q, state, episode):
    
    if np.random.rand() > eps:
        action = np.argmax(Q[state,:]+np.random.randn(1, n_actions)/(episode/4))
    else:
        action = env.action_space.sample()
        eps -= 10**-5

    return action, eps

<h1>Much like other machine learnings problems you need to define optimal hyper parameters</h1>


In [12]:
def learn_Q(alpha, gamma, eps, numTrainingEpisodes, numTrainingSteps):

    global Q_star
    Q = np.zeros([env.observation_space.n, env.action_space.n])
    rewardTracker = []
    
    for episode in range(1,numTrainingEpisodes+1):  
        
        G = 0
        state = env.reset()
        
        for step in range(1,numTrainingSteps):
            
            action, eps = e_greedy(eps, Q, state, episode)
            state2, reward, done, info = env.step(action)
            Q[state,action] += alpha * (reward + gamma * np.max(Q[state2]) - Q[state,action])
            state = state2
            G += reward
        
        rewardTracker.append(G)
        
        if episode % (numTrainingEpisodes*.10) == 0 and episode != 0:
            print('Alpha {}  Gamma {}  Epsilon {:04.3f}  Episode {} of {}'.format(alpha, gamma, eps, episode, numTrainingEpisodes))
            print("Average Total Return: {}".format(sum(rewardTracker)/episode))
        
        if (sum(rewardTracker[episode-100:episode])/100.0) > .78:
            print('-------------------------------------------------------')
            print('Solved after {} episodes with average return of {}'.format(episode-100, sum(rewardTracker[episode-100:episode])/100.0))
            Q_star = Q
            break
    Q_star = Q
   



In [13]:
# Alpha, Gamma, Eps, Episodes, Steps per Episode
learn_Q(0.8, 0.95, 0.1, 5000, 300)

Alpha 0.8  Gamma 0.95  Epsilon 0.022  Episode 500 of 5000
Average Total Return: 0.236
Alpha 0.8  Gamma 0.95  Epsilon 0.005  Episode 1000 of 5000
Average Total Return: 0.318
Alpha 0.8  Gamma 0.95  Epsilon 0.001  Episode 1500 of 5000
Average Total Return: 0.40066666666666667
-------------------------------------------------------
Solved after 1409 episodes with average return of 0.79


In [14]:
def evaluate(Q, numTrainingEpisodes, numTrainingSteps, render):

    rewardTracker = []
    
    for episode in range(1,numTrainingEpisodes+1):  
        
        G = 0
        state = env.reset()
        
        for step in range(1,numTrainingSteps):
            
            action = np.argmax(Q[state])
        
            state2, reward, done, info = env.step(action)
            state = state2
            G += reward
            if render == True:
                env.render()
            
            if done == True:
                break
                
        rewardTracker.append(G)        
    
        if episode % (numTrainingEpisodes*.10) == 0 and episode != 0:
    
            print("Average Total Return After {} Episodes: {:04.3f}".format(episode, sum(rewardTracker)/episode))
            
   

   

In [15]:
# Q-Table, Number of Episodes, Number of Steps per Episode, Render
evaluate(Q_star, 1, 300, True)

  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Left)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
