In [1]:
import numpy as np

#class used to represent the 1d grid.
class grid_1d:
    
    def __init__(self):
        self.dim = 6
        self.pos_goal = 5 # the goal position
        self.reward_goal = 1
        self.pos_trap = 0 # the trap position
        self.reward_trap = -1
        self.start = 2 # the starting position
        self.s = self.start #the actual position
        self.complete = False #True if we reached a terminal state - terminal state = (goal position, trap position)
        self.possible_actions = [0, 1] # 0 for left 1 for right
    
    #to display the grid
    def display_grid(self):
        print("-" * (self.dim * 5 + 5))
        row = []
        for i in range(self.dim):
            if i == self.pos_goal :
                row.append("|  G ")    
            elif i == self.pos_trap :
                row.append("|  T ")
            elif i == self.start :
                row.append("|  S ")
            else:
                row.append("|    ")
        row.append("|")
        print(' '.join(row))    
        print("-" * (self.dim * 5 + 5))
        

    def step(self, a): # a is the action
        
        #when agent reaches a terminal state self.complete becomes True 
        if self.s == self.pos_goal:
            self.complete = True
            reward = self.reward_goal
        elif self.s == self.pos_trap:
            self.complete = True
            reward = self.reward_trap
        else:  
            # move to left
            if a == 0 and self.s > 0:
                self.s -= 1
            # move to right
            elif a == 1 and self.s < self.dim - 1:
                self.s += 1
            reward = 0
        return self.s, reward, self.complete
 
    # to restart grid
    def restart(self):
        self.s = self.start
        self.complete = False
        return self.s
    
    
    # to get and display policy
    def policy(self, q_table):
        policy=[]
    
        q_max, q_max_action=max_qs(q_table)
        
        #print(q_max)
        #print(q_max_action)
        print("-" * (self.dim * 5 + 5))
        for action in q_max_action:
            if action == 0:
                policy.append("|  L ")
            elif action == 1:
                policy.append("|  R ")
        policy.append("|")
        print(' '.join(policy))
        print("-" * (self.dim * 5 + 5))
                   
    


In [2]:
def max_qs(q_table): #used to get max values of q and associated actions between left and right
    q_max=[]
    q_max_action=[]
    for q in range(len(q_table)):
        q_max.append(max(q_table[q][0],q_table[q][1]))
        if(max(q_table[q][0],q_table[q][1]) == q_table[q][0]):
            q_max_action.append(0)
        else :
            q_max_action.append(1)
        
    return q_max,q_max_action

In [3]:
mygrid = grid_1d()
mygrid.display_grid()

-----------------------------------
|  T  |     |  S  |     |     |  G  |
-----------------------------------


In [4]:
# parameters chosen 
nb_episodes = 1000
gamma = 0.9
eps = 0.5
alpha = 0.1

In [5]:
# initialize all the values of q_table at zero
q_table = np.zeros((mygrid.dim, len(mygrid.possible_actions)))

# iterate through episodes
for episode in range(nb_episodes):
    actual_step = mygrid.restart() #step 0
    done = False
    #episode terminate when terminal state reached
    while done == False:
        # exploration
        if np.random.rand() < eps:
            action = np.random.choice(mygrid.possible_actions)
        else:
            # exploitation
            action = np.argmax(q_table[actual_step])

        next_step, reward, done = mygrid.step(action)
        
        # update the q-table
        q_table[actual_step][action] += alpha*(reward + gamma*np.max(q_table[next_step])- q_table[actual_step][action])
        
        actual_step = next_step
            
#display q_table
for q in range(len(q_table)):
    print("position: "+str(q))
    for p in range(len(q_table[0])):   
        print(q_table[q][p])
    print("\n")
            


position: 0
-4.880024006256031
-4.85637585917002


position: 1
-4.122890334521776
6.54330557689164


position: 2
5.886746077579065
7.274907347657242


position: 3
6.544695173652013
8.084037972270364


position: 4
7.272547516558673
8.98320154745949


position: 5
9.97666641154442
9.98291112886439




We can see that the closer we get to the goal the larger the values are .
And the larger values of both (left and right) are the values that produce a policy.

In our case we got all q_table[i][1] larger than the q_table[i][0] and that's how we know that in order to reach the goal we should move to the right and if we move to the left we will only get farther from the goal .

In [6]:
mygrid.policy(q_table)

-----------------------------------
|  R  |  R  |  R  |  R  |  R  |  R  |
-----------------------------------
