# Q-Learning

This notebook was built by Camille-Amaury JUGE in order to better understand the Q-Learning RL method. 

We will construct an agent which will try to find its path in a matrix (labyrinth) without walls, and then add complexity to it.

## Imports

In [1]:
import numpy as np
import sys
import time
import ipywidgets as widgets
from ipycanvas import MultiCanvas


## Classes

In [2]:
class Labyrinth(object):
    """Rectangular grid world with a reward per cell, a wall mask and a goal cell.

    Positions are ``(x, y)`` pairs where ``x`` indexes the first axis of the
    grid and ``y`` the second one.  Moves are integer codes:
    ``0`` -> ``y - 1``, ``1`` -> ``y + 1``, ``2`` -> ``x - 1``, ``3`` -> ``x + 1``.
    """

    # (dx, dy) applied by each move code; an unknown code leaves the position unchanged.
    _OFFSETS = {0: (0, -1), 1: (0, 1), 2: (-1, 0), 3: (1, 0)}

    def __init__(self, x, y, max_reward, punition, final_pos, walls, wall_punition):
        """Build an ``x`` by ``y`` grid.

        Args:
            x: size of the first axis of the grid.
            y: size of the second axis of the grid.
            max_reward: reward stored in the goal cell.
            punition: reward stored in every other cell.
            final_pos: ``(x, y)`` coordinates of the goal cell.
            walls: 2-D indexable of booleans (``walls[x][y]``), truthy where a
                cell is a wall.
            wall_punition: reward returned when a move is blocked by a wall or
                by the border of the grid.
        """
        super(Labyrinth, self).__init__()

        # grid: the dtype is wide enough for both reward values, so a float
        # goal reward is not truncated when the default reward is an int
        self.lab = np.full((x, y), punition, dtype=np.result_type(punition, max_reward))
        self.lab[final_pos[0]][final_pos[1]] = max_reward

        # walls
        self.walls = walls
        self.wall_punition = wall_punition

        # stored as a tuple so that it compares equal to the (x, y) tuples
        # returned by update_Pos even if the caller passed a list
        self.final_pos = tuple(final_pos)

    def update_Pos(self, x, y, direction):
        """Try to move from ``(x, y)`` in ``direction``.

        Args:
            x: current index on the first axis.
            y: current index on the second axis.
            direction: move code (see the class docstring).

        Returns:
            ``((new_x, new_y), reward)``.  When the move would leave the grid
            or enter a wall the position is unchanged and the reward is
            ``wall_punition`` (or the goal reward when bumping the border while
            standing on the goal).  Otherwise the position is the target cell
            and the reward is the value stored there.
        """
        dx, dy = self._OFFSETS.get(direction, (0, 0))
        target_x, target_y = x + dx, y + dy

        inside = 0 <= target_x < self.lab.shape[0] and 0 <= target_y < self.lab.shape[1]
        if not inside:
            # blocked by the border of the grid
            if self.final_pos == (x, y):
                return (x, y), self.lab[self.final_pos[0]][self.final_pos[1]]
            return (x, y), self.wall_punition

        if self.walls[target_x][target_y]:
            # blocked by a wall: stay in place
            return (x, y), self.wall_punition

        return (target_x, target_y), self.lab[target_x][target_y]

    def add_punition_cell(self, x, y, punition):
        """Overwrite the reward stored in cell ``(x, y)`` with ``punition``."""
        self.lab[x][y] = punition

In [3]:
class Robot(object):
    """Tabular Q-learning agent moving inside a labyrinth.

    The agent keeps one Q-value per ``(x, y, action)`` triple, explores with an
    epsilon-greedy policy and decreases epsilon after every training epoch.
    Action indices follow ``self.actions``: 0 = Left, 1 = Right, 2 = Top,
    3 = Bottom.
    """

    def __init__(self, init_pos, lab, experiencing_rate, _decrease_rate, learning_rate):
        """Create an agent.

        Args:
            init_pos: ``(x, y)`` start position, restored at the start of each epoch.
            lab: labyrinth object; this class reads ``lab.lab.shape``,
                ``lab.walls``, ``lab.final_pos`` and calls ``lab.update_Pos``.
            experiencing_rate: initial epsilon (probability of a random action).
            _decrease_rate: amount subtracted from epsilon after each epoch.
            learning_rate: step size of the Q-value update.
        """
        super(Robot, self).__init__()
        self.init_pos = init_pos
        self.pos = self.init_pos

        self.actions = ["Left", "Right", "Top", "Bottom"]
        self.labyrinth = lab
        self.Q = np.zeros((self.labyrinth.lab.shape[0], self.labyrinth.lab.shape[1], len(self.actions)))
        self.init_rates = [experiencing_rate, _decrease_rate, learning_rate]
        # independent copy: training mutates self.rates and must not alter
        # the recorded initial values
        self.rates = list(self.init_rates)
        self.history = []

    def train(self, epoch, long_term_importance):
        """Run ``epoch`` episodes, each from ``init_pos`` until the goal is reached.

        Args:
            epoch: number of episodes to run.
            long_term_importance: discount factor applied to the best Q-value
                of the next state.
        """
        for i in range(epoch):
            self.pos = self.init_pos

            sys.stdout.write("\repoch {} / {}".format(i+1, epoch))
            sys.stdout.flush()

            while self.pos != self.labyrinth.final_pos:
                action = self.choose_action()
                Q_old = (self.pos[0], self.pos[1], action)
                self.pos, reward = self.labyrinth.update_Pos(self.pos[0], self.pos[1], action)
                self.update_Q(Q_old, self.pos, long_term_importance, reward)

            # epsilon is a probability: never let the decay push it below 0
            self.rates[0] = max(0.0, self.rates[0] - self.rates[1])

    def update_Q(self, q_old, pos_new, long_term_importance, reward):
        """Apply one Q-learning update.

        Args:
            q_old: ``(x, y, action)`` index of the state/action just taken.
            pos_new: ``(x, y)`` position reached after the action.
            long_term_importance: discount factor.
            reward: reward received for the transition.
        """
        best_next = max(self.Q[pos_new[0]][pos_new[1]])
        current = self.Q[q_old[0]][q_old[1]][q_old[2]]

        # update following Bellman's equation
        self.Q[q_old[0]][q_old[1]][q_old[2]] = current + self.rates[2] * (
            reward + long_term_importance * best_next - current
        )

    def choose_action(self):
        """Return an action index using the epsilon-greedy policy."""
        # if experiencing then random decision
        if np.random.uniform(0, 1) <= self.rates[0]:
            return np.random.randint(0, len(self.actions))
        # if using memory then maximize rewards
        return np.argmax(self.Q[self.pos[0]][self.pos[1]])

    def __repr__(self):
        return self.__str__()

    def __str__(self):
        """Render the greedy policy as text, one line per grid row.

        The goal is shown as a cross, walls as squares and every other cell as
        an arrow pointing in the direction of its highest Q-value.
        """
        arrows = ("🡄", "🡆", "🡅", "🡇")
        lines = []
        for i, rows in enumerate(self.Q):
            cells = ["|"]
            for j, column in enumerate(rows):
                if self.labyrinth.final_pos == (i, j):
                    cells.append("🞬|")
                elif self.labyrinth.walls[i][j]:
                    cells.append("⯀|")
                else:
                    cells.append(arrows[np.argmax(column)] + "|")
            cells.append("\n")
            lines.append("".join(cells))
        return "".join(lines)

    def draw(self):
        """Draw the labyrinth and the greedy policy on a 3-layer canvas.

        Layer 0 holds the coloured cells (green goal, black walls, gray start),
        layer 1 the policy arrows and layer 2 the outer frame.  Grid columns
        run along the canvas x axis and grid rows along the y axis.

        Returns:
            The ``MultiCanvas`` that was drawn on.
        """
        _border = 2
        _cell_size = 10
        n_rows, n_cols = self.labyrinth.lab.shape[0], self.labyrinth.lab.shape[1]
        # width follows the number of columns and height the number of rows,
        # matching the (column, row) -> (x, y) mapping used for the cells below
        _size = (n_cols * _cell_size + 2 * _border, n_rows * _cell_size + 2 * _border)

        # Create a multi-layer canvas with 3 layers
        canvas = MultiCanvas(3, width=_size[0], height=_size[1])

        for i, rows in enumerate(self.Q):
            for j, column in enumerate(rows):
                # canvas x is the column index, canvas y the row index
                i_mod = j
                j_mod = i
                if self.labyrinth.final_pos == (i, j):
                    self.draw_cell(canvas[0], i_mod, j_mod, _cell_size, _border, "green")
                elif self.labyrinth.walls[i][j]:
                    self.draw_cell(canvas[0], i_mod, j_mod, _cell_size, _border, "black")
                else:
                    if self.init_pos == (i, j):
                        self.draw_cell(canvas[0], i_mod, j_mod, _cell_size, _border, "gray")
                    direction = np.argmax(column)
                    self.draw_arrow(canvas[1], i_mod, j_mod, _cell_size, _border, direction)

        # outer frame: top, bottom, left, right
        frame = canvas[2]
        frame.fill_style = "black"
        frame.fill_rect(0, 0, _size[0], _border)
        frame.fill_rect(0, _size[1] - _border, _size[0], _border)
        frame.fill_rect(0, 0, _border, _size[1])
        frame.fill_rect(_size[0] - _border, 0, _border, _size[1])

        return canvas

    def draw_cell(self, canvas, x, y, size, border, color):
        """Fill the cell at canvas column ``x`` / row ``y`` with ``color``."""
        canvas.fill_style = color
        canvas.fill_rect(x*size + border, y*size + border, size, size)

    def draw_arrow(self, canvas, x, y, size, border, direction):
        """Draw a brown triangle pointing in ``direction`` inside a cell.

        Args:
            canvas: canvas layer to draw on.
            x: canvas column of the cell.
            y: canvas row of the cell.
            size: side length of a cell in pixels.
            border: thickness of the outer frame in pixels.
            direction: action index (0 left, 1 right, 2 up, 3 down); any other
                value draws nothing.
        """
        left = border + x * size
        top = border + y * size
        mid = int(size / 2)
        # inset from the cell edges and half-width of the triangle base
        near, far, half = 2, size - 2, 3

        if direction == 0:
            points = [(left + near, top + mid), (left + far, top + mid + half), (left + far, top + mid - half)]
        elif direction == 1:
            points = [(left + far, top + mid), (left + near, top + mid + half), (left + near, top + mid - half)]
        elif direction == 2:
            points = [(left + mid, top + near), (left + mid + half, top + far), (left + mid - half, top + far)]
        elif direction == 3:
            points = [(left + mid, top + far), (left + mid + half, top + near), (left + mid - half, top + near)]
        else:
            points = []

        canvas.fill_style = "brown"
        canvas.begin_path()
        if points:
            canvas.move_to(*points[0])
            for point in points[1:]:
                canvas.line_to(*point)
        canvas.fill()
        

## Process

In [4]:
# Maze layout: grid dimensions, start cell and goal cell, all as (x, y) pairs.
_lab_dim = (20, 20)
_init_position = (19, 0)
_final_position = (0, 19)

# Rewards, in order: goal cell, ordinary cell, blocked move (wall or border).
_rewards = (5, 0, -20)

# Exploration: epsilon starts at _experiencing_rate and loses _decrease_rate
# after every epoch.
_experiencing_rate = 1
_decrease_rate = 0.0005

# Learning: step size of the Q update, number of episodes and discount factor.
_learning_rate = 0.1
_epochs = 2000
long_term_importance = 0.9

In [5]:
# Wall mask of the first labyrinth: walls_1[x][y] is True where the cell is a wall.
# One inner list per grid row, one entry per column.
walls_1 = np.array(
    [
        [False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False],
        [False,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False],
        [False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False],
        [False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False],
        [False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False],
        [False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False],
        [False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False],
        [False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False],
        [False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False],
        [True,True,True,True,True,True,True,True,False,False,False,False,False,False,False,False,False,False,True,False],
        [False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False],
        [False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False],
        [False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False],
        [False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False],
        [False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False],
        [False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False],
        [False,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False],
        [True,True,True,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False],
        [False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False],
        [False,True,True,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False],
    ],
    dtype=bool,
)

In [6]:
# Build the first labyrinth from the settings and wall mask defined above.
lab_1 = Labyrinth(
    x=_lab_dim[0],
    y=_lab_dim[1],
    max_reward=_rewards[0],
    punition=_rewards[1],
    final_pos=_final_position,
    walls=walls_1,
    wall_punition=_rewards[2],
)

In [7]:
# Create the untrained agent and draw its initial (all-zero Q-table) policy.
robot_1 = Robot(
    init_pos=_init_position,
    lab=lab_1,
    experiencing_rate=_experiencing_rate,
    _decrease_rate=_decrease_rate,
    learning_rate=_learning_rate,
)
canvas = robot_1.draw()

In [8]:
# Bare expression as the last statement of the cell: the notebook displays the canvas.
canvas

MultiCanvas(height=204, width=204)

In [9]:
# Time the training run. perf_counter() is a monotonic, high-resolution clock
# meant for measuring durations; time.time() follows the wall clock, which can
# be adjusted while the run is in progress.
time1 = time.perf_counter()
robot_1.train(_epochs, long_term_importance)
time2 = time.perf_counter()
print('\nfunction took {:.3f} s'.format((time2-time1)))

epoch 2000 / 2000
function took 30.280 s


## Interpretation

How to interpret:

Each arrow shows the action with the highest learned Q-value in that cell, i.e. the direction the agent would take to maximize its reward on the way to the green square.
The agent starts on the grey square.

In [10]:
# Redraw after training so the arrows reflect the learned Q-table.
canvas = robot_1.draw()
# Bare expression as the last statement of the cell: the notebook displays the canvas.
canvas

MultiCanvas(height=204, width=204)