In [1]:
import numpy as np
from gym import Env

In [2]:
def make_map(studentNum): 
    """
    Observation Space : map of environment 
    A 6*6 Grid Graph
    Start point = (0,0)
    End point = (5,5)
    Just one safe path for each Student number
    The probability of falling in each state:
    in safe path = 0.0001 and for other states = 1

    :param 1: Student number 
    
    :return : The Created Map 
    """

    np.random.seed(studentNum)  
    move = np.zeros(10)  # Minimum moves for start to the end point in a 6*6 grid graph
    idx = np.random.choice(range(10),size=5,replace=False)
    move[idx] = 1

    point = [0,0]
    lowprobs = [tuple(point)]

    for m in move:
        if m:
            point[0] += 1
        else:
            point[1] += 1
        lowprobs.append(tuple(point))

    idx = np.array(lowprobs)
    map = np.ones((6,6))
    map[idx[:,0],idx[:,1]] = 0.0001  
    
    map[0,0] = 0.0   # Start point
    map[5,5] = 0.0   # End point

    return map

def make_map_part2(studentNum):

    np.random.seed(studentNum)  
    move = np.zeros(10)  # Minimum moves for start to the end point in a 6*6 grid graph
    idx = np.random.choice(range(10),size=5,replace=False)
    move[idx] = 1

    point = [0,0]
    lowprobs = [tuple(point)]

    for m in move:
        if m:
            point[0] += 1
        else:
            point[1] += 1
        lowprobs.append(tuple(point))

    idx = np.array(lowprobs)
    map = np.random.uniform(0,1,(6,6))
    map[idx[:,0],idx[:,1]] = 0.001  
    
    map[0,0] = 0.0   # Start point
    map[5,5] = 0.0   # End point

    return map

def make_map_part3(studentNum):
    np.random.seed(studentNum)  
    move = np.zeros(28)  # Minimum moves for start to the end point in a 15*15 grid graph
    idx = np.random.choice(range(28),size=14,replace=False)
    move[idx] = 1

    point = [0,0]
    lowprobs = [tuple(point)]

    for m in move:
        if m:
            point[0] += 1
        else:
            point[1] += 1
        lowprobs.append(tuple(point))

    idx = np.array(lowprobs)
    map = np.random.uniform(0,1,(15,15))
    map[idx[:,0],idx[:,1]] = 0.001  
    
    map[0,0] = 0.0   # Start point
    map[14,14] = 0.0   # End point

    return map

### Enviroment

In [3]:
class FrozenLake(Env):
    def __init__(self,studentNum,action_space,observation_space,p_error = 0.94,transition_cost = -1,goal_reward = 100,punishment = -10):
        
        self.studentNum = studentNum
        self.action_space = action_space
        self.observation_space = observation_space
        self.LEFT = action_space[0]
        self.DOWN = action_space[1]
        self.RIGHT = action_space[2]
        self.UP = action_space[3]

        observation_shape = self.observation_space.shape
        action_shape = self.action_space.shape
        self.nS = np.prod(observation_shape)
        self.nA = np.prod(action_shape)
        self.grid = np.arange(self.nS).reshape(observation_shape)

        self.p_error = p_error
        self.transition_cost = transition_cost
        self.goal_reward = goal_reward
        self.punishment = punishment
        
        self.MAX_X = self.observation_space.shape[0]
        self.MAX_Y = self.observation_space.shape[1]

    def transitions(self):
        neighbors = {}
        for s in range(self.nS):
            idx = np.argwhere(self.grid == s)
            neighbors[s] = {a : [] for a in range(self.nA)}
            if(s != (self.MAX_X*self.MAX_Y-1)):
                neighbors[s][self.LEFT] = s-1 if(idx[0,1] != 0) else s
                neighbors[s][self.DOWN] = s + self.MAX_X if(idx[0,0] != self.MAX_Y-1) else s
                neighbors[s][self.RIGHT] = s+1 if(idx[0,1] != self.MAX_X-1) else s
                neighbors[s][self.UP] = s-self.MAX_X if(idx[0,0] != 0) else s
            else:
                neighbors[s][self.LEFT] = s
                neighbors[s][self.DOWN] = s
                neighbors[s][self.RIGHT] = s
                neighbors[s][self.UP] = s               
        return neighbors

    def accessible_states(self,current_state):
        accessible_states = np.zeros(self.nA)
        for i in range(self.nA):
            accessible_states[i] = self.transitions()[current_state][i]
        return accessible_states

    def available_states(self):
        available_states = []
        for i in range(self.nS):
            where = np.argwhere(self.grid == i)
            [idx1,idx2] = [where[0,0],where[0,1]]
            if(self.observation_space[idx1][idx2] < 1):
                available_states = np.append(available_states,i) 
        return available_states

    def transition_probs(self,current_state,next_state,action):
        if(current_state == 0 and next_state == 0):
            if(action == 0):
                return self.p_error
            else:
                return (1-self.p_error)/3
        else:
            if(self.transitions()[current_state][action] == next_state):
                return self.p_error
            else:
                return (1-self.p_error)/3
            
    def is_alive(self,next_state):
        idx = np.argwhere(self.grid == next_state)
        falling_prob = self.observation_space[idx[0,0],idx[0,1]]
        choice = np.random.uniform(0,1)
        #return (choice > falling_prob)
        return falling_prob

    def get_reward(self,current_state,next_state):
        
        falling_prob = self.is_alive(next_state)
        if(current_state == next_state):
            reward = 0 + self.punishment * (falling_prob)
        elif(next_state == (self.MAX_X*self.MAX_Y-1)):
            reward = self.goal_reward + self.transition_cost
        else:
            reward = self.transition_cost * (1 - falling_prob) + self.punishment * (falling_prob)

        return reward

    def is_goal(self,current_state):
        return (current_state == (self.MAX_X*self.MAX_Y-1))

    

### Agent

In [43]:
class agent():
    
    def __init__(self,Enviroment,theta,discount):
        self.Enviroment = Enviroment
        self.theta = theta
        self.discount = discount
        
        self.v_change = bool

        self.V = {s: 0 for s in range(0, self.Enviroment.nS)}
        # init V
        self.policy = {s: 0 for s in range(0, self.Enviroment.nS)}

        self.state_action_val = np.zeros((self.Enviroment.nS,4))

    def policy_evaluation(self):
        v_prior =  np.array(list(self.V.items()))[:,1]
        while True:
            Delta = 0
            v_temp = self.V
            for s in self.Enviroment.available_states():
                a = self.policy[s]
                new_V = 0
                zero_state = 0
                for next_state in self.Enviroment.accessible_states(s):

                    if(zero_state==0):
                            new_V += self.Enviroment.transition_probs(s,next_state,a) * (self.Enviroment.get_reward(s,next_state) + 
                            self.discount * v_temp[next_state])
                            zero_state = zero_state + 1
                    elif(zero_state==1 and (next_state==0 or next_state==self.Enviroment.grid[0,-1] or next_state==self.Enviroment.grid[-1,0])): 
                            new_V += (1-self.Enviroment.p_error)/3 * (self.Enviroment.get_reward(s,next_state) + 
                            self.discount * v_temp[next_state])
                    else:
                            new_V += self.Enviroment.transition_probs(s,next_state,a) * (self.Enviroment.get_reward(s,next_state) + 
                            self.discount * v_temp[next_state])                    

                    self.V[s] = new_V
                Delta = max(Delta, abs(v_temp[s] - self.V[s]))
            if(Delta < self.theta): break
        v_post = np.array(list(self.V.items()))[:,1]
        self.v_change = (np.max(np.abs(v_post-v_prior)) > self.theta)
        return self.v_change

    def policy_improvement(self):
        policy_stable = True
        for s in self.Enviroment.available_states():
            old_action = self.policy[s]
            action_val = np.zeros(len(self.Enviroment.action_space))
            for a in self.Enviroment.action_space:
                new_V = 0
                zero_state = 0
                for next_state in self.Enviroment.accessible_states(s):

                    if(zero_state== 0):
                            new_V += self.Enviroment.transition_probs(s,next_state,a) * (self.Enviroment.get_reward(s,next_state) + 
                            self.discount * self.V[next_state])
                            zero_state = zero_state + 1
                    elif(zero_state==1 and (next_state==0 or next_state==self.Enviroment.grid[0,-1] or next_state==self.Enviroment.grid[-1,0])): 
                            new_V += (1-self.Enviroment.p_error)/3 * (self.Enviroment.get_reward(s,next_state) + 
                            self.discount * self.V[next_state])
                    else:
                            new_V += self.Enviroment.transition_probs(s,next_state,a) * (self.Enviroment.get_reward(s,next_state) + 
                            self.discount * self.V[next_state])                    



                action_val[a] = new_V            
            self.policy[s] = np.argmax(action_val)
            if(old_action != self.policy[s] or self.v_change):
                policy_stable = False
        return policy_stable

    def state_action_value(self):
        for s in self.Enviroment.available_states():
            for a in self.Enviroment.action_space:
                self.state_action_val[int(s)][a] = self.V[self.Enviroment.transitions()[s][a]]
     
    
    def value_iteration(self):
        while True:
            delta = 0
            v_temp = self.V.copy()
            for s in self.Enviroment.available_states():
                old_v = self.V[s]
                action_val = np.zeros(len(self.Enviroment.action_space))
                
                for a in self.Enviroment.action_space:
                    new_V = 0
                    zero_state = 0
                    for next_state in self.Enviroment.accessible_states(s):
                        if(zero_state==0):
                                new_V += self.Enviroment.transition_probs(s,next_state,a) * (self.Enviroment.get_reward(s,next_state) + 
                                self.discount * v_temp[next_state])
                                zero_state = zero_state + 1
                        elif(zero_state==1 and (next_state==0 or next_state==self.Enviroment.grid[0,-1] or next_state==self.Enviroment.grid[-1,0])): 
                                new_V += (1-self.Enviroment.p_error)/3* (self.Enviroment.get_reward(s,next_state) + 
                                self.discount * v_temp[next_state])
                        else:
                                new_V += self.Enviroment.transition_probs(s,next_state,a) * (self.Enviroment.get_reward(s,next_state) + 
                                self.discount * v_temp[next_state])                    

                    action_val[a] = new_V        
                self.V[s] = np.max(action_val)
                delta = max(delta, abs(old_v - self.V[s]))
            if(delta < self.theta): break

        for s in self.Enviroment.available_states():
            action_val = np.zeros(len(self.Enviroment.action_space))
            for a in self.Enviroment.action_space:
                new_V = 0
                zero_state = 0
                for next_state in self.Enviroment.accessible_states(s):
                    if(zero_state==0):
                            new_V += self.Enviroment.transition_probs(s,next_state,a) * (self.Enviroment.get_reward(s,next_state) + 
                            self.discount * v_temp[next_state])
                            zero_state = zero_state + 1
                    elif(zero_state==1 and (next_state==0 or next_state==self.Enviroment.grid[0,-1] or next_state==self.Enviroment.grid[-1,0])): 
                            new_V += (1-self.Enviroment.p_error)/3 * (self.Enviroment.get_reward(s,next_state) + 
                            self.discount * v_temp[next_state])
                    else:
                            new_V += self.Enviroment.transition_probs(s,next_state,a) * (self.Enviroment.get_reward(s,next_state) + 
                            self.discount * v_temp[next_state])                    

                action_val[a] = new_V        
            self.policy[s] = np.argmax(action_val)
        self.state_action_value()
    
    def policy_iteration(self):
        while True:
            self.policy_evaluation()
            policy_stable = self.policy_improvement()
            if(policy_stable):
                break
        self.state_action_value()

### functions

In [44]:
def plot(Enviroment,agent):
    op_pol = np.zeros((Enviroment.MAX_X,Enviroment.MAX_Y))
    final_val = np.zeros((Enviroment.MAX_X,Enviroment.MAX_Y))
    for i in range(Enviroment.MAX_X):
        for j in range(Enviroment.MAX_Y):
            if(np.sum((Enviroment.available_states() == i*Enviroment.MAX_X+j).astype(int)) == 1):
                op_pol[i][j] = agent.policy[i*Enviroment.MAX_X+j]
    op_pol_list = op_pol.tolist()
    for i in range(Enviroment.MAX_X):
        for j in range(Enviroment.MAX_Y):
            if(np.sum((Enviroment.available_states() == i*Enviroment.MAX_X+j).astype(int)) == 1):
                if(op_pol_list[i][j] == 0):
                    op_pol_list[i][j] = 'l'
                if(op_pol_list[i][j] == 1):
                    op_pol_list[i][j] = 'd'
                if(op_pol_list[i][j] == 2):
                    op_pol_list[i][j] = 'r'
                if(op_pol_list[i][j] == 3):
                    op_pol_list[i][j] = 'u'
            else:
                op_pol_list[i][j] = 'T'
    op_pol_list[Enviroment.MAX_X-1][Enviroment.MAX_X-1] = 'G'
    for i in range(Enviroment.MAX_X):
        print(op_pol_list[i])

    for i in range(Enviroment.MAX_X):
        for j in range(Enviroment.MAX_Y):
            if(np.sum((Enviroment.available_states() == i*Enviroment.MAX_X+j).astype(int)) == 1):
                final_val[i][j] = agent.V[i*Enviroment.MAX_X+j]
    print(final_val)

## Part 1

In [45]:
make_map(810198367)

array([[0.e+00, 1.e-04, 1.e-04, 1.e+00, 1.e+00, 1.e+00],
       [1.e+00, 1.e+00, 1.e-04, 1.e+00, 1.e+00, 1.e+00],
       [1.e+00, 1.e+00, 1.e-04, 1.e-04, 1.e-04, 1.e+00],
       [1.e+00, 1.e+00, 1.e+00, 1.e+00, 1.e-04, 1.e-04],
       [1.e+00, 1.e+00, 1.e+00, 1.e+00, 1.e+00, 1.e-04],
       [1.e+00, 1.e+00, 1.e+00, 1.e+00, 1.e+00, 0.e+00]])

In [46]:
LEFT, DOWN, RIGHT, UP = 0, 1, 2, 3
student_num = 810198367
map = make_map(student_num)
action_space = np.array([LEFT,DOWN,RIGHT,UP])

e1 = FrozenLake(student_num,action_space,map,0.94)
a1 = agent(e1,0.000001,0.9)

In [47]:
a1.policy_iteration()

In [48]:
plot(e1,a1)

['r', 'r', 'd', 'T', 'T', 'T']
['T', 'T', 'd', 'T', 'T', 'T']
['T', 'T', 'r', 'r', 'd', 'T']
['T', 'T', 'T', 'T', 'r', 'd']
['T', 'T', 'T', 'T', 'T', 'd']
['T', 'T', 'T', 'T', 'T', 'G']
[[19.30392701 23.35221698 28.0764827   0.          0.          0.        ]
 [ 0.          0.         33.47586961  0.          0.          0.        ]
 [ 0.          0.         40.58080014 48.86418261 58.50424611  0.        ]
 [ 0.          0.          0.          0.         69.72287804 82.77856455]
 [ 0.          0.          0.          0.          0.         96.00145616]
 [ 0.          0.          0.          0.          0.          0.        ]]


In [51]:
a1.state_action_val

array([[19.30392701,  0.        , 23.35221698, 19.30392701],
       [19.30392701,  0.        , 28.0764827 , 23.35221698],
       [23.35221698, 33.47586961,  0.        , 28.0764827 ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        , 40.58080014,  0.        , 28.0764827 ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        , 48.86418261, 33.47586961],
       [40.58080014,  0.        , 58.50424611,  0.        ],
       [48.86418261, 69.

In [52]:
e1 = FrozenLake(student_num,action_space,map,0.94)
a1 = agent(e1,0.000001,0.9)

In [53]:
a1.value_iteration()

In [54]:
plot(e1,a1)

['r', 'r', 'd', 'T', 'T', 'T']
['T', 'T', 'd', 'T', 'T', 'T']
['T', 'T', 'r', 'r', 'd', 'T']
['T', 'T', 'T', 'T', 'r', 'd']
['T', 'T', 'T', 'T', 'T', 'd']
['T', 'T', 'T', 'T', 'T', 'G']
[[19.34820256 23.39540585 28.11688762  0.          0.          0.        ]
 [ 0.          0.         33.51128881  0.          0.          0.        ]
 [ 0.          0.         40.62180714 48.91190064 58.55977791  0.        ]
 [ 0.          0.          0.          0.         69.78750319 82.85377208]
 [ 0.          0.          0.          0.          0.         96.06041741]
 [ 0.          0.          0.          0.          0.          0.        ]]


In [55]:
a1.state_action_val

array([[19.34820256,  0.        , 23.39540585, 19.34820256],
       [19.34820256,  0.        , 28.11688762, 23.39540585],
       [23.39540585, 33.51128881,  0.        , 28.11688762],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        , 40.62180714,  0.        , 28.11688762],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        , 48.91190064, 33.51128881],
       [40.62180714,  0.        , 58.55977791,  0.        ],
       [48.91190064, 69.

## Part 2

In [56]:
np.round_((make_map_part2(810198367)), decimals = 3)

array([[0.   , 0.001, 0.001, 0.996, 0.772, 0.171],
       [0.67 , 0.375, 0.001, 0.064, 0.976, 0.853],
       [0.335, 0.444, 0.001, 0.001, 0.001, 0.137],
       [0.454, 0.407, 0.807, 0.027, 0.001, 0.001],
       [0.208, 0.175, 0.169, 0.359, 0.463, 0.001],
       [0.775, 0.888, 0.939, 0.853, 0.995, 0.   ]])

In [57]:
LEFT, DOWN, RIGHT, UP = 0, 1, 2, 3
student_num = 810198367
map = make_map_part2(student_num)
action_space = np.array([LEFT,DOWN,RIGHT,UP])

e2 = FrozenLake(student_num,action_space,map,0.7)
a2 = agent(e2,0.000001,0.9)

In [58]:
a2.policy_iteration()

In [59]:
plot(e2,a2)

['r', 'r', 'd', 'd', 'd', 'd']
['r', 'r', 'd', 'd', 'd', 'd']
['r', 'r', 'r', 'r', 'd', 'd']
['r', 'r', 'r', 'r', 'r', 'd']
['r', 'r', 'r', 'r', 'r', 'd']
['u', 'u', 'r', 'r', 'r', 'G']
[[ 7.57250441 11.18767623 15.13268487 18.09085162 17.06240108 22.00375189]
 [ 6.96097647 14.37643218 20.71792736 26.36715707 33.09371449 38.33864416]
 [ 9.2451923  18.88845612 27.1294995  35.99545644 44.85899868 53.86404989]
 [10.31733217 18.70357511 33.56148044 44.54449507 56.82811871 69.75017725]
 [13.14222629 22.00859663 34.07528878 50.43333369 69.86419941 88.00316081]
 [ 5.37391463 12.17646357 25.44743617 46.72057938 77.7133737   0.        ]]


In [61]:
a2.state_action_val

array([[ 7.57250441,  6.96097647, 11.18767623,  7.57250441],
       [ 7.57250441, 14.37643218, 15.13268487, 11.18767623],
       [11.18767623, 20.71792736, 18.09085162, 15.13268487],
       [15.13268487, 26.36715707, 17.06240108, 18.09085162],
       [18.09085162, 33.09371449, 22.00375189, 17.06240108],
       [17.06240108, 38.33864416, 22.00375189, 22.00375189],
       [ 6.96097647,  9.2451923 , 14.37643218,  7.57250441],
       [ 6.96097647, 18.88845612, 20.71792736, 11.18767623],
       [14.37643218, 27.1294995 , 26.36715707, 15.13268487],
       [20.71792736, 35.99545644, 33.09371449, 18.09085162],
       [26.36715707, 44.85899868, 38.33864416, 17.06240108],
       [33.09371449, 53.86404989, 38.33864416, 22.00375189],
       [ 9.2451923 , 10.31733217, 18.88845612,  6.96097647],
       [ 9.2451923 , 18.70357511, 27.1294995 , 14.37643218],
       [18.88845612, 33.56148044, 35.99545644, 20.71792736],
       [27.1294995 , 44.54449507, 44.85899868, 26.36715707],
       [35.99545644, 56.

In [62]:
e2 = FrozenLake(student_num,action_space,map,0.7)
a2 = agent(e2,0.000001,0.9)

In [63]:
a2.value_iteration()

In [64]:
plot(e2,a2)

['r', 'r', 'd', 'd', 'd', 'd']
['r', 'r', 'd', 'd', 'd', 'd']
['r', 'r', 'r', 'r', 'd', 'd']
['r', 'r', 'r', 'r', 'r', 'd']
['r', 'r', 'r', 'r', 'r', 'd']
['u', 'r', 'r', 'r', 'r', 'G']
[[ 8.68670814 12.40207016 16.43107398 19.51559256 18.87407697 25.04680121]
 [ 7.98660308 15.52954425 22.03799217 27.89791596 34.97229934 41.3486654 ]
 [10.42943158 20.22034083 28.65594644 37.76474822 46.93338577 56.94201728]
 [11.94248964 20.47981599 35.48501098 46.62537777 59.15996181 72.51388207]
 [15.95645006 25.32285147 37.57898426 53.53852888 72.27236376 89.79365522]
 [ 9.00390345 19.88518493 37.4032345  58.78101963 86.50025633  0.        ]]


In [65]:
a2.state_action_val

array([[ 8.68670814,  7.98660308, 12.40207016,  8.68670814],
       [ 8.68670814, 15.52954425, 16.43107398, 12.40207016],
       [12.40207016, 22.03799217, 19.51559256, 16.43107398],
       [16.43107398, 27.89791596, 18.87407697, 19.51559256],
       [19.51559256, 34.97229934, 25.04680121, 18.87407697],
       [18.87407697, 41.3486654 , 25.04680121, 25.04680121],
       [ 7.98660308, 10.42943158, 15.52954425,  8.68670814],
       [ 7.98660308, 20.22034083, 22.03799217, 12.40207016],
       [15.52954425, 28.65594644, 27.89791596, 16.43107398],
       [22.03799217, 37.76474822, 34.97229934, 19.51559256],
       [27.89791596, 46.93338577, 41.3486654 , 18.87407697],
       [34.97229934, 56.94201728, 41.3486654 , 25.04680121],
       [10.42943158, 11.94248964, 20.22034083,  7.98660308],
       [10.42943158, 20.47981599, 28.65594644, 15.52954425],
       [20.22034083, 35.48501098, 37.76474822, 22.03799217],
       [28.65594644, 46.62537777, 46.93338577, 27.89791596],
       [37.76474822, 59.

## Part 3

In [72]:
LEFT, DOWN, RIGHT, UP = 0, 1, 2, 3
student_num = 810198367
map = make_map_part3(student_num)
action_space = np.array([LEFT,DOWN,RIGHT,UP])

e3 = FrozenLake(student_num,action_space,map,0.7)
a3 = agent(e3,0.000001,0.9)

In [73]:
a3.value_iteration()

In [74]:
plot(e3,a3)

['u', 'l', 'l', 'l', 'l', 'l', 'l', 'l', 'r', 'r', 'r', 'r', 'r', 'r', 'r']
['u', 'u', 'u', 'u', 'l', 'd', 'd', 'l', 'd', 'r', 'u', 'u', 'u', 'u', 'u']
['u', 'u', 'r', 'u', 'l', 'l', 'l', 'l', 'd', 'r', 'r', 'l', 'r', 'u', 'u']
['r', 'u', 'u', 'u', 'u', 'u', 'l', 'd', 'd', 'd', 'u', 'd', 'r', 'u', 'l']
['u', 'u', 'u', 'u', 'r', 'u', 'd', 'l', 'l', 'd', 'l', 'r', 'd', 'u', 'd']
['u', 'u', 'u', 'r', 'r', 'r', 'd', 'd', 'd', 'd', 'd', 'r', 'd', 'l', 'd']
['u', 'u', 'd', 'r', 'u', 'r', 'r', 'r', 'd', 'd', 'd', 'd', 'd', 'r', 'd']
['r', 'r', 'r', 'd', 'u', 'r', 'r', 'r', 'r', 'd', 'd', 'd', 'd', 'd', 'd']
['r', 'r', 'r', 'u', 'l', 'd', 'r', 'u', 'r', 'r', 'r', 'd', 'd', 'd', 'd']
['d', 'd', 'd', 'd', 'd', 'r', 'r', 'd', 'r', 'r', 'r', 'd', 'd', 'r', 'd']
['d', 'r', 'd', 'd', 'l', 'd', 'r', 'r', 'd', 'r', 'r', 'd', 'd', 'd', 'd']
['d', 'd', 'd', 'd', 'd', 'd', 'r', 'r', 'r', 'r', 'r', 'd', 'd', 'd', 'd']
['d', 'l', 'l', 'l', 'l', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'd', 'd', 'd']
['d', 'l', '

### Higher Reward for the goal state

In [75]:
LEFT, DOWN, RIGHT, UP = 0, 1, 2, 3
student_num = 810198367
map = make_map_part3(student_num)
action_space = np.array([LEFT,DOWN,RIGHT,UP])

e4 = FrozenLake(student_num,action_space,map,0.7,-1,10000)
a4 = agent(e4,0.1,0.9)

In [78]:
a4.value_iteration()

In [79]:
plot(e4,a4)

['r', 'r', 'r', 'r', 'r', 'd', 'r', 'r', 'r', 'r', 'd', 'd', 'd', 'd', 'd']
['r', 'd', 'r', 'd', 'd', 'd', 'd', 'r', 'd', 'r', 'd', 'd', 'd', 'd', 'd']
['r', 'r', 'r', 'r', 'r', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd']
['r', 'r', 'r', 'r', 'r', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd']
['r', 'r', 'r', 'r', 'r', 'r', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd']
['d', 'r', 'r', 'r', 'r', 'r', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd']
['d', 'd', 'd', 'r', 'r', 'r', 'r', 'r', 'd', 'd', 'd', 'd', 'd', 'd', 'd']
['d', 'r', 'r', 'd', 'r', 'r', 'r', 'r', 'r', 'd', 'd', 'd', 'd', 'd', 'd']
['d', 'r', 'r', 'r', 'd', 'd', 'd', 'r', 'r', 'r', 'd', 'd', 'd', 'd', 'd']
['d', 'd', 'd', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'd', 'd', 'd', 'd']
['r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'd', 'r', 'r', 'd', 'd', 'd', 'd']
['d', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'd', 'd', 'd', 'd']
['r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'd', 'd', 'd']
['r', 'r', '