In [2]:
import numpy as np
import random

# define the gridworld
gridworld_length = 10
gridworld_width = 10
gridworld_size = gridworld_length * gridworld_width

# add a random source and destination to the gridworld
source = random.randint(0, gridworld_size-1)
destination = random.randint(0, gridworld_size-1)
while destination == source:
    destination = random.randint(0, gridworld_size-1)

# add some random obstacles to the gridworld, making sure that the source and destination are not obstacles
num_obstacles = 20
obstacle_positions = random.sample(range(gridworld_size), num_obstacles)
for position in obstacle_positions:
    if position != source and position != destination:
        obstacle_positions.remove(position)

# define the agent's actions
actions = ['u', 'd', 'l', 'r']

# use a dictionary for convenient computation of next state id.
ds_actions = {"u": -gridworld_length, "d": gridworld_length, "l": -1, "r": 1}  

collisionReward = -1
destinationReward = 10
defaultReward = 0 # Can be set to -0.1 to encourage faster learning
failChance = 0.1
# discount factor
gamma = 0.9

# compute next state id and corresponding reward according to 
# current state and action.
def getNextStateReward(s, a):
    # take action with probability 0.1, stay in same state with probability 0.9
    if random.random() > 1 - failChance:
        return s, defaultReward
    ns = s + ds_actions[a]
    if ns < 0 or ns >= gridworld_size or ns in obstacle_positions:
        return s, collisionReward
    if ns == destination:
        return ns, destinationReward
    return ns, defaultReward

# define a function to print the current state of the gridworld
def print_gridworld(s):
    for row in range(gridworld_length):
        for col in range(gridworld_width):
            if row*gridworld_width + col in obstacle_positions:
                print('O', end=' ')
            elif row*gridworld_width + col == destination:
                print('D', end=' ')
            elif [row, col] == [s//gridworld_width, s%gridworld_width]:
                print('A', end=' ')
            else:
                print('-', end=' ')
        print()

rewards=[]
# simulate the agent's actions
state = source
for _ in range(20):
    # choose a random action
    action = random.choice(actions)
    # take the action and update the agent's position
    state, reward = getNextStateReward(state, action)
    rewards.append(reward)
    # print the current state of the gridworld
    print_gridworld(state)
    print()


- - O - - - - - - - 
- - - - - O O - - - 
- - - - - - - - - - 
- - - - - - - - - - 
- - - O O - - - - - 
- O - - O - - - - O 
- - - - - - - A - - 
- - - - - - - - - - 
- - - - - - - O - - 
- - - - - - - O - - 

- - O - - - - - - - 
- - - - - O O - - - 
- - - - - - - - - - 
- - - - - - - - - - 
- - - O O - - - - - 
- O - - O - - - - O 
- - - - - - - - - - 
- - - - - - - A - - 
- - - - - - - O - - 
- - - - - - - O - - 

- - O - - - - - - - 
- - - - - O O - - - 
- - - - - - - - - - 
- - - - - - - - - - 
- - - O O - - - - - 
- O - - O - - - - O 
- - - - - - - A - - 
- - - - - - - - - - 
- - - - - - - O - - 
- - - - - - - O - - 

- - O - - - - - - - 
- - - - - O O - - - 
- - - - - - - - - - 
- - - - - - - - - - 
- - - O O - - - - - 
- O - - O - - A - O 
- - - - - - - - - - 
- - - - - - - - - - 
- - - - - - - O - - 
- - - - - - - O - - 

- - O - - - - - - - 
- - - - - O O - - - 
- - - - - - - - - - 
- - - - - - - - - - 
- - - O O - - A - - 
- O - - O - - - - O 
- - - - - - - - - - 
- - - - -