In [1]:
import gym
import numpy as np

# Environment selection: set USE_8X8 to True to solve the 8x8 map instead of the 4x4 one.
# `size` is the side length of the square grid and is only used to reshape printed results.
USE_8X8 = False
if USE_8X8:
    game = gym.make('FrozenLake8x8-v0')
    size = 8
else:
    game = gym.make('FrozenLake-v0')
    size = 4

# The planners below read the transition model (env.P, env.nS) directly, so they need the
# base environment. `unwrapped` removes every wrapper gym.make() applied, whereas `.env`
# peels off exactly one layer and only works when there is a single wrapper.
env = game.unwrapped

# Action index -> one-letter label (Left, Down, Right, Up) used when printing a policy grid.
policy_to_action = {0:"L",1:"D",2:"R",3:"U"}

In [2]:
def value_iterations(env, theta = 0.00001, discount_factor = 0.9):
    """
    Solve a tabular MDP with value iteration.

    Args:
        env: the game environment; only these attributes are used
            env.P[s][a] -> iterable of (probability, next_state, reward, done) tuples
            env.nS      -> total number of states
        theta: sweeping stops once the largest value change in a sweep is below theta
        discount_factor: gamma, the weight applied to the value of the next state

    Returns:
        (policy, V): `policy` is a float array holding the greedy action index for
        every state, `V` is the array of converged state values.
    """
    def greedy_backup(state, values):
        """Return (best action, its expected return) for `state` under `values`."""
        top_action, top_value = 0, float('-inf')
        for action, transitions in env.P[state].items():
            expected = 0
            for prob, nxt, reward, _done in transitions:
                expected += prob * ( reward + discount_factor*values[nxt] )
            # Strict comparison: on ties the first action encountered is kept.
            if expected > top_value:
                top_action, top_value = action, expected
        return top_action, top_value

    # Value optimisation. V is updated in place, so states later in a sweep
    # already see the values refreshed earlier in that same sweep.
    V = np.zeros(env.nS)
    converged = False
    while not converged:
        max_delta = 0
        for state in range(env.nS):
            backed_up = greedy_backup(state, V)[1]
            max_delta = max(max_delta, abs(V[state] - backed_up))
            V[state] = backed_up
        converged = max_delta < theta

    # Policy extraction: act greedily with respect to the converged values.
    policy = np.array(
        [greedy_backup(state, V)[0] for state in range(env.nS)],
        dtype=float,
    )

    return policy, V

In [3]:
# Solve with value iteration, then show the greedy policy and the state values as size x size grids.
policy, value = value_iterations(env)

gpolicy = [policy_to_action[a] for a in policy]
print("Optimal Policy :\n {} ".format(np.array(gpolicy).reshape(size, size)))
print("Optimal Values :\n {}".format(np.asarray(value).reshape(size, size)))

Optimal Policy :
 [['L' 'U' 'L' 'U']
 ['L' 'L' 'L' 'L']
 ['U' 'D' 'L' 'L']
 ['L' 'R' 'D' 'L']] 
Optimal Values :
 [[0.06884713 0.0613827  0.0743898  0.0557857 ]
 [0.09182101 0.         0.1121994  0.        ]
 [0.14541346 0.24748435 0.2996098  0.        ]
 [0.         0.37992783 0.63901636 0.        ]]


In [4]:
def policy_iterations(env,theta=1e-10, discount_factor=0.9):
    """
    Solve a tabular MDP with policy iteration.

    Args:
        env: the game environment; only these attributes are used
            env.P[s][a] -> iterable of (probability, next_state, reward, done) tuples
            env.nS      -> total number of states
        theta: policy evaluation stops once the largest value change in a sweep is below theta
        discount_factor: gamma, the weight applied to the value of the next state

    Returns:
        (policy, V): `policy` is a float array holding one action index per state,
        `V` is the value function obtained by evaluating that policy.
    """

    def evaluate(current_policy, values):
        """Sweep until the values of `current_policy` settle; return the new value array."""
        while True:
            # Synchronous update: every state is backed up from the previous sweep's values.
            refreshed = values.copy()
            max_delta = 0
            for state in range(env.nS):
                expected = 0
                for prob, nxt, reward, _ in env.P[state][current_policy[state]]:
                    expected += prob * (reward + discount_factor * values[nxt] )
                refreshed[state] = expected
                max_delta = max(max_delta, abs(expected - values[state]))
            values = refreshed
            if max_delta < theta:
                return values

    def improve(values):
        """Return the policy that is greedy with respect to `values`."""
        greedy = np.zeros(env.nS)
        for state in range(env.nS):
            top_action, top_value = 0, float('-inf')
            for action, transitions in env.P[state].items():
                expected = 0
                for prob, nxt, reward, _ in transitions:
                    expected += prob * (reward + values[nxt]*discount_factor )
                # Strict comparison: on ties the first action encountered is kept.
                if top_value < expected:
                    top_action, top_value = action, expected
            greedy[state] = top_action
        return greedy

    # Start from the policy that always picks action 2 (move right), with all values at zero.
    policy = np.full(env.nS, 2.0)
    V = np.zeros(env.nS)

    # Alternate evaluation and greedy improvement until the policy stops changing.
    while True:
        V = evaluate(policy, V)
        candidate = improve(V)
        if np.array_equal(candidate, policy):
            return policy, V
        policy = candidate

In [5]:
# Solve with policy iteration, then show the resulting policy and the state values as size x size grids.
policy, value = policy_iterations(env)

gpolicy = [policy_to_action[a] for a in policy]
print("Optimal Policy :\n {} ".format(np.array(gpolicy).reshape(size, size)))
print("Optimal Values :\n {}".format(np.asarray(value).reshape(size, size)))

Optimal Policy :
 [['L' 'U' 'L' 'U']
 ['L' 'L' 'L' 'L']
 ['U' 'D' 'L' 'L']
 ['L' 'R' 'D' 'L']] 
Optimal Values :
 [[0.0688909  0.06141457 0.07440976 0.05580732]
 [0.09185454 0.         0.11220821 0.        ]
 [0.14543635 0.24749695 0.29961759 0.        ]
 [0.         0.3799359  0.63902015 0.        ]]


In [6]:
# Let's play a game: run one episode with the computed policy, rendering the board after every move.
state = game.reset()
game.render()
is_done = False
while not is_done:
    action = int(policy[state])  # policy stores action indices as floats
    (state,reward,is_done,_) = game.step(action)
    game.render()

# Only the reward of the final step is inspected; any episode that ends without
# a positive reward is reported as a fall.
if reward>0:
    print("You Won!!!")
else:
    print("You fell in a hole!!!")

# `game` is intentionally not closed here: the success-rate cell further down
# resets and steps this same environment object.


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Left)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Left)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Left)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Left)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0m

In [7]:
# Let's see our success rate: play many episodes with the computed policy and
# count how many end with a positive reward.
games = 1000
won = 0
for _ in range(games):
    state = game.reset()
    is_done = False
    while not is_done:
        action = int(policy[state])  # policy stores action indices as floats
        (state,reward,is_done,_) = game.step(action)
    # An episode counts as a win when its final step paid a positive reward.
    if reward>0:
        won+=1

# Close the environment once, after the last episode. Closing it inside the
# loop meant every later episode ran on an already-closed environment.
game.close()

print("Success Rate : {}".format(won/games))

Success Rate : 0.715


#### A success rate of 71.5% is good for this kind of non-deterministic environment