# Import Libraries

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Define a class

In [2]:
# This is a class for creating the Robot environement

class RobotEnv:
    
    # Constructor for initialization, takes as input the parameters of the  env such as start, end, rewards, positions...
    def __init__(self,
                 dims=(6,6),
                 rewards={'r_time':-1,'r_pond':-15,'r_croissant':200,'r_cogs':200,'r_work':15},
                 start=(1,0),
                 end=(5,5),
                 positions={'pond':[(2,4),(4,3)], 'cogs':[(5,2)], 'croissant':[(1,4)]},
                 tubes=[ [(0,0), (3,5)], [(1,2), (4,1)] ],
                 walls=[ [(0,2), (0,3)], [(1,2), (1,3)] ,[(2,2), (2,3)], [(1,5), (2,5)], [(3,0), (4,0)], [(3,1), (4,1)] ,[(5,2),(5,3)]],
                 max_steps=1000,
                 max_episodes=1000,
                ):
        self._dims = dims
        self._rewards = rewards
        self._start = start
        self._end = end
        self._tubes = tubes
        self._walls = walls
        self._positions = positions
        self._max_steps = max_steps
        self._max_episodes = max_episodes
        
        # initialize the grid, R_matrix and Q_matrix
        self._initialize_grid()
        self._initialize_R_matrix()
        self._initialize_Q_matrix()
        
        self.rng = np.random.default_rng(42)
        
    
    # getters and setters
    
    # getter and setter for dims
    @property
    def dims(self):
        return self._dims
    
    @dims.setter
    def dims(self, dims):
        # When changing the dims we have to re-initialize: grid, R, Q
        self._dims = dims
        self._initialize_grid()
        self._initialize_R_matrix()
        self._initialize_Q_matrix()
        
    
    # getter and setter for start
    @property
    def start(self):
        return self._start
    
    @start.setter
    def start(self, start):
        self._start = start
    
    # getter and setter for end
    @property
    def end(self):
        return self._end
    
    @end.setter
    def end(self, end):
        self._end = end
        
    
    # getter and setter for tubes
    @property
    def tubes(self):
        return self._tubes
    
    @tubes.setter
    def tubes(self, tubes):
        self._tubes = tubes
        
    
    # getter and setter for max steps
    @property
    def max_steps(self):
        return self._max_steps
    
    @max_steps.setter
    def max_steps(self, max_steps):
        self._max_steps = max_steps
        
    
    # getter and setter for max episodes
    @property
    def max_episodes(self):
        return self._max_episodes
    
    @max_steps.setter
    def max_episodes(self, max_episodes):
        self._max_episodes= max_episodes
     
    
    # getter and setter for max walls
    @property
    def walls(self):
        return self._walls
    
    @walls.setter
    def walls(self, walls):
        self._walls = walls
     
    # some property with only getter, in other words the user can't modify those properties
    @property
    def grid(self):
        return self._grid
    
    @property
    def R(self):
        return self._R
    
    @property
    def Q(self):
        return self._Q
    
    @property
    def rewards(self):
        return self._rewards
    
    @property
    def positions(self):
        return self._positions
    

    # function to initialize the grid, the propose of the grid is for visualization
    def _initialize_grid(self):
        self._grid = np.zeros(self.dims)
        for position in self._positions:
            for pos in self._positions[position]:
                self._grid[pos[0], pos[1]] = self._rewards['r_'+position]
                #print(position, ":", pos, ":", self._grid[pos[0], pos[1]])
        
        self._grid[self._end[0], self._end[1]] = self._rewards['r_work']
        #print(self._grid)
        
    
    # initialize the rewards matrix
    def _initialize_R_matrix(self):
        d1 = self.dims[0]
        d2 = self.dims[1]
        self._R = np.empty((d1*d2,d1*d2))
        self._R.fill(np.nan) # Fastest way to initilize R matrix
        
        # call some methods instead to write all the function here, more cleaner and better for debuging
        self.__fillPossibleActions()
        self.__initializeTunnels()
        self.__initializeCrogs()
        self.__initializePonds()
        self.__initializeCroissants()
        self.__initializeGoalPoint()
        self.__initializeWalls()
        
    
    # helper function, used in initialization methods
    def move_to(self, l, cell):
            for i in [-self._dims[0], -1, 1, self._dims[0]]:
                if cell + i < self._dims[0]*self._dims[1]:
                    l.append((cell + i, cell))
            return l
        
    # function to fill all the possible moves
    def __fillPossibleActions(self):
        # All moves where reward is -1 for action. Generate programmatically cos writing by hand is tedious
        ones = []
        for i in range(self._dims[0]):
            for j in range(self._dims[1]):
                cell = i*self._dims[0] + j
                if j != 5:
                    ones.append((cell, cell+1)) # move right unless agent is on right edge
                if cell - 6 >= 0:
                    ones.append((cell, cell-6)) # move up if not in top row
                if cell + 6 < 36:
                    ones.append((cell, cell+6)) # move down if cell not in bottom row
                if j != 0:
                    ones.append((cell, cell-1)) # move left if not on left edge
                ones.append((cell, cell)) # staying still is possible, why not?
        
        ones = tuple(zip(*ones))
        self._R[ones] = self._rewards['r_time']
        
        # the propose of this loop is to remove the option if staying in the same cell
        for i in range(self._dims[0]):
            for j in range(self._dims[1]):
                cell = i*self._dims[0] + j
                self._R[(cell,cell)] = np.nan
        
    # initialize the goal rewards
    def __initializeGoalPoint(self):
        end_cell = self._end[0]*self._dims[0] + self._end[1]
        ends = self.move_to([], end_cell)
        ends.append([end_cell, end_cell])
        ends = tuple(zip(*ends))
        self._R[ends] = self._rewards['r_work']
        
    
    # initialize the Tunnels
    def __initializeTunnels(self):
        tubes_cells = []
        for tubes in self._tubes:
            tubes_cell = []
            for tube in tubes:
                cell_nb = tube[0]*self._dims[0]+tube[1]
                tubes_cell.append(cell_nb)
            #print(tubes_cell)
            tubes_cells.append(tuple(tubes_cell))
        for cell in tubes_cells.copy():
            #print(cell)
            tubes_cells.append((cell[1], cell[0]))
        
        tubes_cells = tuple(zip(*tubes_cells))
        self._R[tubes_cells] = self._rewards['r_time']
        
    
    # initialize the Crogs rewards
    def __initializeCrogs(self):
        cogs = []
        for cog in self._positions['cogs']:
            cogs = self.move_to(cogs, cog[0]*self._dims[0]+cog[1])
            
        cogs = tuple(zip(*cogs))
        self._R[cogs] = self._rewards['r_cogs']
        
    
    # initialize the Ponds rewards
    def __initializePonds(self):
        # don't fall in the pond!
        #print(self._positions['pond'])
        ponds = []
        for pond in self._positions['pond']:
            p = pond[0]*self._dims[0]+pond[1]
            ponds = self.move_to(ponds, p)
            ponds.extend([(p,p)])
        
        #print(ponds)
        
        ponds = tuple(zip(*ponds))
        self._R[ponds] = self._rewards['r_pond']
        
    
    # initialize the Croissant rewards
    def __initializeCroissants(self):
        croissants = []
        for croissant in self._positions['croissant']:
            c = croissant[0]*self._dims[0]+croissant[1]
            croissants = self.move_to(croissants, c)
        
        #print(croissants)
        
        croissants = tuple(zip(*croissants))
        self._R[croissants] = self._rewards['r_croissant']
        
    
    # finally, construct the walls
    def __initializeWalls(self):
        for wall in self._walls:
            #print(wall)
            cell0 = wall[0][0]*self._dims[0]+wall[0][1]
            cell1 = wall[1][0]*self._dims[0]+wall[1][1]
            #print(cell0, ":", cell1)
            wall_in_matrix = (cell0, cell1)
            #print(wall_in_matrix)
            self._R[wall_in_matrix] = np.nan
            
            
    
    # display the matrix as pandas dataframe
    def display_matrix(self, matrix, start=None, end=None):
        pd.set_option("display.max_columns", None)
        display(pd.DataFrame(matrix).loc[start:end, start:end])
        
    
    # initialize the Q matrix with the same shape as R matrix
    def _initialize_Q_matrix(self):
        self._Q = np.zeros(self._R.shape)
        
    
    # function to run over only one eposide, takes as input alpha, gamma and epsilon
    def run_episode(self, alpha, gamma, epsilon):
        R_tot = 0
        print(self._start)
        s = self._start[0]*self._dims[0]+self._start[1]
        goal_state = self._end[0]*self._dims[0]+self._end[1]
        Q = self._Q
        R = self._R
        print("Starting Point: ", s)
        print("End Point: ", goal_state)
        
        # some listes to keep track of visisted cogs and croissant cells
        # to prevent the agent from re-visit them in the same episode to collect resources
        cogs_visisted = []
        croissant_visisted = []
                
        cogs_cells = [cog_position[0]*self._dims[0]+cog_position[1] for cog_position in self.positions['cogs']]
        
        croissant_cells = [croissant_position[0]*self._dims[0]+croissant_position[1] for croissant_position in self.positions['croissant']]
           
        print('cogs_cells: ', cogs_cells, type(cogs_cells))
        print('croissant_cells: ', croissant_cells, type(cogs_cells))
        for i in range(self._max_steps):
            print('i', i)
            # actions selection
            available, best = self.__get_actions(R, Q, s)
            print('Available actions: ', available)
            print("Best actions: ",best)
            
            # update states:
            # loop to avoid re visit the same crogs and croissant
            move = False
            while not move:
                # chosse an action first
                a = self.__get_greedy_action(epsilon, available, best)
                print("Chosen Action: ", a)
                
                # if the next sell is cogs, and it is the first time we visit them append it to visited and move one
                if a in cogs_cells:
                    if a not in cogs_visisted:
                        #print(cogs_visisted, a)
                        cogs_visisted.append(a)
                        move = True
                    else:
                        available.remove(a)
                        best.remove(a)
                        continue
                        
                        
                # same thing here
                if a in croissant_cells:
                    if a not in croissant_visisted:
                        #print(croissant_visisted, a)
                        croissant_visisted.append(a)
                        move = True
                    else:
                        available.remove(a)
                        best.remove(a)
                        continue
                        
                else:
                    move = True
                
            
            #print('Move')        
            s_old = s
            s = a
            
            # update Q:
            print('Old Q Value: ',Q[s_old, a])
            Q[s_old, a] = Q[s_old, a] + alpha * (R[s_old, a] +
                                                gamma * Q[s, :].max() -
                                                Q[s_old, a])
            
            print('New Q Value: ',Q[s_old, a])
            # update total accumulated reward for this episode
            print('current R: ', R[s_old, a])
            R_tot += R[s_old, a]
            
            
            if s == goal_state:
                break
            
            print('\n')
    
    def __get_actions(self, R, Q, s):
        """Returns best and all available actions as lists
        """
        available = np.where(~np.isnan(R[s]))[0]
        q_vals = [Q[s,a] for a in available]
        best = available[np.where(q_vals == np.max(q_vals))[0]]
        # change the type from np array to list
        available = available.tolist()
        best = best.tolist()
        return available, best

        
    def __get_greedy_action(self, epsilon, available, best):
        """Given epsilon, and available and best actions,
        Pick an appropriate action.
        """
        if self.rng.uniform() > epsilon:
            a = self.rng.choice(best)
        else:
            a = self.rng.choice(available)
        return a

In [3]:
robotEnv = RobotEnv()

In [4]:
robotEnv.display_matrix(robotEnv.R, 25)

Unnamed: 0,25,26,27,28,29,30,31,32,33,34,35
25,,-1.0,,,,,-1.0,,,,
26,-1.0,,-15.0,,,,,200.0,,,
27,,-1.0,-15.0,-1.0,,,,,-1.0,,
28,,,-15.0,,-1.0,,,,,-1.0,
29,,,,-1.0,,,,,,,15.0
30,,,,,,,-1.0,,,,
31,-1.0,,,,,-1.0,,200.0,,,
32,,-1.0,,,,,-1.0,,,,
33,,,-15.0,,,,,200.0,,-1.0,
34,,,,-1.0,,,,,-1.0,,15.0


In [5]:
%%time
robotEnv.run_episode(1,0.8,0.9)

(1, 0)
Starting Point:  6
End Point:  35
cogs_cells:  [32] <class 'list'>
croissant_cells:  [10] <class 'list'>
i 0
Available actions:  [0, 7, 12]
Best actions:  [0, 7, 12]
Chosen Action:  7
Old Q Value:  0.0
New Q Value:  -1.0
current R:  -1.0
i 1
Available actions:  [1, 6, 8, 13]
Best actions:  [1, 6, 8, 13]
Chosen Action:  6
Old Q Value:  0.0
New Q Value:  -1.0
current R:  -1.0
i 2
Available actions:  [0, 7, 12]
Best actions:  [0, 12]
Chosen Action:  0
Old Q Value:  0.0
New Q Value:  -1.0
current R:  -1.0
i 3
Available actions:  [1, 6, 23]
Best actions:  [1, 6, 23]
Chosen Action:  1
Old Q Value:  0.0
New Q Value:  -1.0
current R:  -1.0
i 4
Available actions:  [0, 2, 7]
Best actions:  [0, 2, 7]
Chosen Action:  7
Old Q Value:  0.0
New Q Value:  -1.0
current R:  -1.0
i 5
Available actions:  [1, 6, 8, 13]
Best actions:  [1, 8, 13]
Chosen Action:  13
Old Q Value:  0.0
New Q Value:  -1.0
current R:  -1.0
i 6
Available actions:  [7, 12, 14, 19]
Best actions:  [7, 12, 14, 19]
Chosen Action:

Chosen Action:  18
Old Q Value:  -1.0
New Q Value:  -1.0
current R:  -1.0
i 62
Available actions:  [12, 19]
Best actions:  [12, 19]
Chosen Action:  12
Old Q Value:  -1.0
New Q Value:  -1.0
current R:  -1.0
i 63
Available actions:  [6, 13, 18]
Best actions:  [6, 13, 18]
Chosen Action:  6
Old Q Value:  -1.0
New Q Value:  -1.0
current R:  -1.0
i 64
Available actions:  [0, 7, 12]
Best actions:  [0, 7, 12]
Chosen Action:  12
Old Q Value:  -1.0
New Q Value:  -1.0
current R:  -1.0
i 65
Available actions:  [6, 13, 18]
Best actions:  [6, 13, 18]
Chosen Action:  6
Old Q Value:  -1.0
New Q Value:  -1.0
current R:  -1.0
i 66
Available actions:  [0, 7, 12]
Best actions:  [0, 7, 12]
Chosen Action:  7
Old Q Value:  -1.0
New Q Value:  -1.0
current R:  -1.0
i 67
Available actions:  [1, 6, 8, 13]
Best actions:  [1]
Chosen Action:  13
Old Q Value:  -1.0
New Q Value:  -1.0
current R:  -1.0
i 68
Available actions:  [7, 12, 14, 19]
Best actions:  [7, 12, 14, 19]
Chosen Action:  19
Old Q Value:  -1.0
New Q V

Old Q Value:  -1.0
New Q Value:  -1.0
current R:  -1.0
i 167
Available actions:  [8, 19, 24, 26, 31]
Best actions:  [8, 19, 31]
Chosen Action:  8
Old Q Value:  0.0
New Q Value:  -1.0
current R:  -1.0
i 168
Available actions:  [2, 7, 14, 25]
Best actions:  [2, 7, 14, 25]
Chosen Action:  14
Old Q Value:  -1.0
New Q Value:  -1.0
current R:  -1.0
i 169
Available actions:  [8, 13, 20]
Best actions:  [8, 13, 20]
Chosen Action:  13
Old Q Value:  -1.0
New Q Value:  -1.0
current R:  -1.0
i 170
Available actions:  [7, 12, 14, 19]
Best actions:  [7, 12, 14, 19]
Chosen Action:  7
Old Q Value:  -1.0
New Q Value:  -1.0
current R:  -1.0
i 171
Available actions:  [1, 6, 8, 13]
Best actions:  [1, 6, 8, 13]
Chosen Action:  13
Old Q Value:  -1.0
New Q Value:  -1.0
current R:  -1.0
i 172
Available actions:  [7, 12, 14, 19]
Best actions:  [7, 12, 14, 19]
Chosen Action:  19
Old Q Value:  -1.0
New Q Value:  -1.0
current R:  -1.0
i 173
Available actions:  [13, 18, 20]
Best actions:  [13, 18, 20]
Chosen Action

Best actions:  [2, 7, 14, 25]
Chosen Action:  7
Old Q Value:  -1.0
New Q Value:  -1.0
current R:  -1.0
i 257
Available actions:  [1, 6, 8, 13]
Best actions:  [1, 6, 8, 13]
Chosen Action:  1
Old Q Value:  -1.0
New Q Value:  -1.0
current R:  -1.0
i 258
Available actions:  [0, 2, 7]
Best actions:  [0, 2, 7]
Chosen Action:  7
Old Q Value:  -1.0
New Q Value:  -1.0
current R:  -1.0
i 259
Available actions:  [1, 6, 8, 13]
Best actions:  [1, 6, 8, 13]
Chosen Action:  6
Old Q Value:  -1.0
New Q Value:  -1.0
current R:  -1.0
i 260
Available actions:  [0, 7, 12]
Best actions:  [0, 7, 12]
Chosen Action:  12
Old Q Value:  -1.0
New Q Value:  -1.0
current R:  -1.0
i 261
Available actions:  [6, 13, 18]
Best actions:  [6, 13, 18]
Chosen Action:  13
Old Q Value:  -1.0
New Q Value:  -1.0
current R:  -1.0
i 262
Available actions:  [7, 12, 14, 19]
Best actions:  [7, 12, 14, 19]
Chosen Action:  14
Old Q Value:  -1.0
New Q Value:  -1.0
current R:  -1.0
i 263
Available actions:  [8, 13, 20]
Best actions:  [8,

In [6]:
robotEnv.grid

array([[  0.,   0.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   0.,   0., 200.,   0.],
       [  0.,   0.,   0.,   0., -15.,   0.],
       [  0.,   0.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   0., -15.,   0.,   0.],
       [  0.,   0., 200.,   0.,   0.,  15.]])

In [7]:
robotEnv.display_matrix(robotEnv.Q, 25)

Unnamed: 0,25,26,27,28,29,30,31,32,33,34,35
25,0.0,-1.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0
26,-1.0,0.0,-15.0,0.0,0.0,0.0,0.0,200.0,0.0,0.0,0.0
27,0.0,-1.0,-15.0,-1.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0
28,0.0,0.0,-15.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0
29,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0
30,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0
31,-1.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0
32,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0
33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0
34,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
robotEnv.R

array([[nan, -1., nan, ..., nan, nan, nan],
       [-1., nan, -1., ..., nan, nan, nan],
       [nan, -1., nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, -1., nan],
       [nan, nan, nan, ..., -1., nan, 15.],
       [nan, nan, nan, ..., nan, -1., 15.]])

In [9]:
robotEnv.Q

array([[ 0., -1.,  0., ...,  0.,  0.,  0.],
       [-1.,  0., -1., ...,  0.,  0.,  0.],
       [ 0., -1.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0., -1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [10]:
robotEnv.display_matrix(robotEnv.Q, 25)

Unnamed: 0,25,26,27,28,29,30,31,32,33,34,35
25,0.0,-1.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0
26,-1.0,0.0,-15.0,0.0,0.0,0.0,0.0,200.0,0.0,0.0,0.0
27,0.0,-1.0,-15.0,-1.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0
28,0.0,0.0,-15.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0
29,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0
30,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0
31,-1.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0
32,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0
33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0
34,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
