## Frozen Lake

The Frozen Lake environment is a 4×4 grid made up of the following tiles:
- S: initial state
- F: frozen lake
- H: hole
- G: the goal
The agent moves around the grid until it reaches the goal or falls into a hole. If it falls into a hole, it receives a reward of 0 and has to start again from the beginning.

This environment is from OpenAI Gym, an open-source Python library for developing and comparing reinforcement learning algorithms.

#### Note: Please do not modify any pre-defined variables. Doing so can affect the autograder results.

In [None]:
import numpy as np
import random
import matplotlib.pyplot as plt
import gym
from time import sleep
import os
os.environ["SDL_VIDEODRIVER"] = "dummy"

In [None]:
!pip install pygame
import gym
# Create the Frozen Lake environment object; the next cell resets and renders it.
env = gym.make('FrozenLake-v1')

In [None]:
# Put the environment in its starting state and draw it.
env.reset()
env.render()
# NOTE(review): this second reset() is the last expression of the cell, so the
# notebook presumably echoes its return value (the initial state) — confirm.
env.reset()


Please complete the epsilon-greedy function using the epsilon-greedy formula from the lectures.

In [None]:
def estimate(OldEstimate, StepSize, Target):
    """Move an estimate a fraction of the way toward a new target.

    Applies the incremental update rule
    ``OldEstimate + StepSize * (Target - OldEstimate)``.

    Args:
        OldEstimate: current value of the estimate.
        StepSize: fraction of the error applied in this update.
        Target: newly observed value the estimate is pulled toward.

    Returns:
        The updated estimate.
    """
    # The error is how far the current estimate is from the new observation.
    error = Target - OldEstimate
    return OldEstimate + StepSize * error

def epsilon_greedy(value, e, seed = None):
    """Choose an action index from ``value`` with the epsilon-greedy rule.

    With probability ``e`` an index is drawn uniformly at random from all
    ``len(value)`` actions (exploration); otherwise the index of the largest
    entry of ``value`` is returned (exploitation). Ties in the greedy case
    go to the lowest index, as ``np.argmax`` does.

    Args:
        value: one-dimensional sequence of action-value estimates.
        e: exploration probability; ``0`` is purely greedy, ``1`` is
            purely random.
        seed: optional seed; when given, NumPy's global random state is
            re-seeded before any random number is drawn.

    Returns:
        The selected action index as a Python ``int``.
    """
    if seed is not None:
        np.random.seed(seed)

    # START CODING HERE

    # One uniform draw decides between exploring and exploiting. The draw is
    # in [0, 1), so e = 0 never explores and e = 1 always explores.
    if np.random.random() < e:
        action = int(np.random.randint(len(value)))
    else:
        action = int(np.argmax(value))

    # END CODING HERE

    return action

def action_evaluation(env, gamma, v):
    """Compute action values from state values by a one-step lookahead.

    For every state/action pair, sums ``p * (r + gamma * v[ns])`` over the
    transition tuples listed in ``env.P[state][action]``.

    Args:
        env: environment wrapper; state and action counts are read from
            ``env.env.nS`` / ``env.env.nA`` and transitions from ``env.P``.
        gamma: discount factor applied to the next state's value.
        v: state values, indexable by state number.

    Returns:
        Array of shape ``(nS, nA)`` holding the action values.
    """
    n_states = env.env.nS
    n_actions = env.env.nA
    q = np.zeros((n_states, n_actions))

    # ndindex walks (state, action) pairs in the same row-major order as two
    # nested range() loops.
    for state, action in np.ndindex(n_states, n_actions):
        for prob, next_state, reward, _unused in env.P[state][action]:
            q[state, action] += prob * (reward + gamma * v[next_state])

    return q

def action_selection(q):
    """Return, for each row of ``q``, the column index of its largest entry.

    Args:
        q: two-dimensional table of action values (one row per state).

    Returns:
        Array with one greedy action index per row; ties resolve to the
        first maximal column.
    """
    return np.argmax(q, axis=1)

def render(env, policy):
    """Play one episode by following ``policy`` and draw every step.

    Args:
        env: environment exposing ``reset()``, ``step(action)`` (returning a
            4-tuple) and ``render()``.
        policy: mapping from state to the action to take in that state.
    """
    state = env.reset()

    # At least one step is always taken; the loop ends once the environment
    # reports the episode as finished.
    while True:
        state, reward, terminal, _info = env.step(policy[state])
        env.render()
        sleep(1)  # pause so each rendered frame stays visible
        if terminal:
            break

    print('Episode ends. Reward =', reward)
    
def human_play(env):
    """Let a person drive one episode by typing action indices.

    Args:
        env: environment exposing ``reset()``, ``step(action)`` (returning a
            4-tuple) and ``render()``.
    """
    print('Action indices: LEFT = 0, DOWN = 1, RIGHT = 2, UP = 3')
    env.reset()
    env.render()

    finished = False
    while not finished:
        choice = int(input('Give the environment your action index:'))
        # Only the third element of the step result (the end-of-episode
        # flag) is needed here.
        _state, _reward, finished, _info = env.step(choice)
        env.render()

You can review the different concepts covered in the course below:

In [None]:
def policy_iteration(env, gamma, max_iteration, theta):
    """Alternate policy evaluation and policy improvement.

    Starts from the all-zeros policy and repeats evaluate/improve rounds
    until ``policy_improvement`` reports the policy as stable or
    ``max_iteration`` rounds have been run.

    Args:
        env: environment exposing ``nS`` (the helpers also read ``nA``
            and ``P``).
        gamma: discount factor.
        max_iteration: upper bound on the number of evaluate/improve rounds.
        theta: convergence threshold passed to ``policy_evaluation``.

    Returns:
        Tuple ``(V, policy, numIterations)``: the last evaluated state
        values, the final policy and the number of rounds performed.
    """
    policy = np.zeros(env.nS, dtype=np.int32)
    V = np.zeros(env.nS)
    numIterations = 0
    policy_stable = False

    while numIterations < max_iteration and not policy_stable:
        V = policy_evaluation(env, policy, gamma, theta)
        policy, policy_stable = policy_improvement(env, V, policy, gamma)
        numIterations += 1

    return V, policy, numIterations


def policy_evaluation(env, policy, gamma, theta):
    """Iteratively estimate the state values of a fixed policy.

    Sweeps over all states, replacing each value in place with the expected
    one-step return of the policy's action, until the largest change in a
    sweep drops below ``theta``.

    Args:
        env: environment exposing ``nS`` and the transition table ``P``,
            where ``P[state][action]`` lists 4-tuples whose first three
            entries are used as probability, next state and reward.
        policy: action to take in each state, indexable by state number.
        gamma: discount factor.
        theta: stop once no value moved by at least this much in a sweep.

    Returns:
        Array of length ``env.nS`` with the estimated state values.
    """
    V = np.zeros(env.nS)

    converged = False
    while not converged:
        largest_change = 0
        for state in range(env.nS):
            before = V[state]
            backup = 0
            for prob, nxt, reward, _unused in env.P[state][policy[state]]:
                backup += prob * (reward + gamma * V[nxt])
            # In-place update: later states in this sweep already see it.
            V[state] = backup
            largest_change = max(largest_change, abs(before - V[state]))
        converged = largest_change < theta

    return V


def policy_improvement(env, value_from_policy, policy, gamma):
    """Make ``policy`` greedy with respect to the given state values.

    For each state, computes the one-step lookahead value of every action
    and writes the best action back into ``policy`` (the argument is
    modified in place).

    Args:
        env: environment exposing ``nS``, ``nA`` and the transition table
            ``P``.
        value_from_policy: state values, indexable by state number.
        policy: current policy; overwritten with the greedy actions.
        gamma: discount factor.

    Returns:
        Tuple ``(policy, policy_stable)`` where ``policy_stable`` is ``True``
        only when no state's action changed.
    """
    changed = False

    for state in range(env.nS):
        previous_action = policy[state]

        action_values = np.zeros(env.nA)
        for action in range(env.nA):
            backup = 0
            for prob, nxt, reward, _unused in env.P[state][action]:
                backup += prob * (reward + gamma * value_from_policy[nxt])
            action_values[action] = backup

        policy[state] = np.argmax(action_values)
        if previous_action != policy[state]:
            changed = True

    return policy, not changed

In [None]:
def QLearning(env, num_episodes, gamma, lr, e):
    """Learn an action-value table with the Q-learning update.

    In every step the action is chosen epsilon-greedily from the current
    row of ``Q`` and the entry is moved toward
    ``r + gamma * max(Q[next_state])``.

    Args:
        env: environment exposing ``nS``, ``nA``, ``reset()`` and
            ``step(action)`` (returning a 4-tuple).
        num_episodes: number of episodes to run.
        gamma: discount factor.
        lr: learning rate (step size of each update).
        e: exploration probability handed to ``epsilon_greedy``.

    Returns:
        Array of shape ``(nS, nA)`` with the learned action values.
    """
    Q = np.zeros((env.nS, env.nA))

    for _episode in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            action = epsilon_greedy(Q[state, :], e)
            next_state, reward, done, _info = env.step(action)
            # The target bootstraps from the best next action, whatever the
            # behaviour policy picks next.
            td_target = reward + gamma * np.max(Q[next_state, :])
            Q[state, action] = Q[state, action] + lr * (td_target - Q[state, action])
            state = next_state

    return Q

In [None]:
def SARSA(env, num_episodes, gamma, lr, e):
    """Learn an action-value table with the SARSA update.

    The next action is picked epsilon-greedily before the update, and the
    entry is moved toward ``r + gamma * Q[next_state, next_action]``.

    Args:
        env: environment exposing ``nS``, ``nA``, ``reset()`` and
            ``step(action)`` (returning a 4-tuple).
        num_episodes: number of episodes to run.
        gamma: discount factor.
        lr: learning rate (step size of each update).
        e: exploration probability handed to ``epsilon_greedy``.

    Returns:
        Array of shape ``(nS, nA)`` with the learned action values.
    """
    Q = np.zeros((env.nS, env.nA))

    for _episode in range(num_episodes):
        state = env.reset()
        action = epsilon_greedy(Q[state, :], e)
        done = False
        while not done:
            next_state, reward, done, _info = env.step(action)
            # The next action is chosen even on the final step, so the
            # update always uses the action that would actually be taken.
            next_action = epsilon_greedy(Q[next_state, :], e)
            td_error = reward + gamma * Q[next_state, next_action] - Q[state, action]
            Q[state, action] = Q[state, action] + lr * td_error
            state, action = next_state, next_action

    return Q

In [None]:
def value_iteration(env, gamma, max_iteration, theta):
    """Compute state values by value iteration, then derive a policy.

    Each sweep replaces every state's value, in place, with the best
    one-step lookahead value over all actions. Sweeping stops when the
    largest change falls below ``theta`` or after ``max_iteration`` sweeps.

    Args:
        env: environment exposing ``nS``, ``nA`` and the transition table
            ``P``.
        gamma: discount factor.
        max_iteration: upper bound on the number of sweeps.
        theta: convergence threshold on the largest per-sweep change.

    Returns:
        Tuple ``(V, policy, numIterations)``: the state values, the policy
        obtained from ``extract_policy`` and the number of sweeps run.
    """
    V = np.zeros(env.nS)
    numIterations = 0

    while numIterations < max_iteration:
        numIterations += 1
        biggest_change = 0

        for state in range(env.nS):
            previous = V[state]
            action_values = np.zeros(env.nA)
            for action in range(env.nA):
                backup = 0
                for prob, nxt, reward, _unused in env.P[state][action]:
                    backup += prob * (reward + gamma * V[nxt])
                action_values[action] = backup
            V[state] = np.max(action_values)
            biggest_change = max(biggest_change, abs(previous - V[state]))

        if biggest_change < theta:
            break

    policy = extract_policy(env, V, gamma)

    return V, policy, numIterations

def extract_policy(env, v, gamma):
    """Build the policy that is greedy with respect to state values ``v``.

    Args:
        env: environment exposing ``nS``, ``nA`` and the transition table
            ``P``.
        v: state values, indexable by state number.
        gamma: discount factor.

    Returns:
        ``int32`` array of length ``env.nS`` holding, per state, the action
        with the highest one-step lookahead value (first one on ties).
    """
    policy = np.zeros(env.nS, dtype=np.int32)

    for state in range(env.nS):
        action_values = np.zeros(env.nA)
        for action in range(env.nA):
            backup = 0
            for prob, nxt, reward, _unused in env.P[state][action]:
                backup += prob * (reward + gamma * v[nxt])
            action_values[action] = backup
        policy[state] = np.argmax(action_values)

    return policy

In [None]:
# Compare two incremental estimates of a reward whose mean jumps from 1 to 3
# halfway through the run:
#   q_h uses the sample-average step size 1/step,
#   q_f uses the constant step size FixedStepSize.
random.seed(6885)
numTimeStep = 10000
q_h = np.zeros(numTimeStep + 1)  # index 0 holds the initial estimate (0)
q_f = np.zeros(numTimeStep + 1)  # index 0 holds the initial estimate (0)
FixedStepSize = 0.5
for step in range(1, numTimeStep + 1):
    if step < numTimeStep / 2:
        r = random.gauss(mu = 1, sigma = 0.1)
    else:
        r = random.gauss(mu = 3, sigma = 0.1)

    q_h[step] = estimate(q_h[step-1], 1/step , r)
    # Each estimator must update from its OWN previous estimate; feeding
    # q_h[step-1] in here would make q_f just the mean of q_h and r.
    q_f[step] = estimate(q_f[step-1], FixedStepSize , r)

# Drop the initial estimates so both arrays hold exactly numTimeStep updates.
q_h = q_h[1:]
q_f = q_f[1:]

In [None]:
# Plot both estimate traces against the step index on one set of axes so the
# two step-size choices can be compared directly.
plt.plot(range(numTimeStep),q_h, label='Q_h')
plt.plot(range(numTimeStep),q_f, label='Q_f')
plt.ylabel('Q_values')
plt.xlabel('Number of Steps')
plt.legend()
plt.show()

In [None]:
# Demo of epsilon_greedy on five action values drawn from a standard normal.
np.random.seed(6885) # Fix NumPy's global seed so the run is reproducible
q = np.random.normal(0, 1, size = 5)

# No seed is passed below, so both calls continue from the random state
# left by the lines above.
greedy_action = epsilon_greedy(q,0) # epsilon = 0: purely greedy choice
e_greedy_action = epsilon_greedy(q,0.1) # epsilon = 0.1: explore 10% of the time

print('Values:')
print(q)
print('Greedy Choice =', greedy_action)
print('Epsilon-Greedy Choice =', e_greedy_action)