## Q-Learning: Reinforcement Learning

![](https://i.ibb.co/c8LXj7X/Capture.png)

<center>Окружение</center>

In [49]:
import numpy as np

In [51]:
# Hyper-parameters of the Q-learning update
gamma = 0.75  # discount factor
alpha = 0.9   # learning rate

# State encoding: location labels 'L1'..'L9' map to state indices 0..8
location_to_state = {'L' + str(number): number - 1 for number in range(1, 10)}
# Reverse lookup: state index -> location label
state_to_location = {state: location for location, state in location_to_state.items()}
# Action codes
actions = list(range(9))

![](https://i.ibb.co/k4kgnQS/Capture.png)

<center>Таблица награды</center>

In [52]:
# Награда
# Reward table: entry [row, col] is 1 when a move from state `row` to state
# `col` is allowed and 0 otherwise. Each set below lists the states reachable
# from the state named in its trailing comment.
# NOTE(review): L5 -> L2 is allowed but L2 -> L5 is not (the only asymmetric
# pair in the table) - confirm against the layout figure.
rewards = np.array([
    [int(target in reachable) for target in range(9)]
    for reachable in (
        {1},        # from L1
        {0, 2},     # from L2
        {1, 5},     # from L3
        {6},        # from L4
        {1, 7},     # from L5
        {2},        # from L6
        {3, 7},     # from L7
        {4, 6, 8},  # from L8
        {7},        # from L9
    )
])

In [53]:
# Number of columns in the reward matrix.
rewards.shape[1]

9

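The function `get_optimal_route` defined below (it relies on the helper `q_learn`, which takes only the end location and trains the Q-table) takes two arguments: 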

- starting location in the warehouse and 
- end location in the warehouse respectively 

It returns the optimal route from the starting location to the end location as an ordered list of location labels (for example `['L9', 'L8', 'L5', 'L2', 'L1']`).

In [58]:
# -----------Q-Learning algorithm-----------
def q_learn(end_location, n_iterations=1000, seed=None, verbose=True):
    """Train a Q-table that steers an agent towards ``end_location``.

    Each iteration picks a random state, picks a random allowed move from it
    (an entry of the reward table that is > 0) and applies the temporal
    difference update
    ``Q[s, a] += alpha * (r + gamma * max(Q[a]) - Q[s, a])``.

    Parameters
    ----------
    end_location : str
        Label of the goal location; must be a key of ``location_to_state``.
    n_iterations : int, optional
        Number of random update steps (default 1000, the previously
        hard-coded value).
    seed : int or None, optional
        When given, a private ``np.random.RandomState(seed)`` is used so the
        result is reproducible. When ``None`` (default) the global NumPy
        generator is used, exactly as before.
    verbose : bool, optional
        Print the learned Q-table before returning (default True, the
        previous behaviour).

    Returns
    -------
    numpy.ndarray
        The learned Q-table, with the same shape as ``rewards``.

    Raises
    ------
    KeyError
        If ``end_location`` is not a known location label.
    """
    # The legacy RandomState API matches the module-level functions, so the
    # unseeded path draws exactly the same numbers as the original code.
    rng = np.random if seed is None else np.random.RandomState(seed)

    ending_state = location_to_state[end_location]
    # Work on a copy so the shared reward table is left untouched.
    rewards_new = np.copy(rewards)
    rewards_new[ending_state, ending_state] = 999  # large reward for staying at the goal

    n_states = rewards_new.shape[1]
    # Allowed moves depend only on the reward table, so compute them once
    # instead of rebuilding the list on every iteration.
    playable_actions = [np.flatnonzero(rewards_new[state] > 0) for state in range(n_states)]

    Q = np.zeros(rewards_new.shape)  # Q-values start at zero
    for _ in range(n_iterations):
        current_state = rng.randint(0, n_states)  # random state
        moves = playable_actions[current_state]
        if moves.size == 0:
            # Dead-end state: nothing to learn from it (choice() would raise).
            continue
        next_state = rng.choice(moves)  # random allowed transition
        # Temporal difference between the bootstrapped target and the current estimate.
        TD = (rewards_new[current_state, next_state]
              + gamma * Q[next_state].max()
              - Q[current_state, next_state])
        # Move the estimate towards the target by the learning rate.
        Q[current_state, next_state] += alpha * TD
    if verbose:
        print(Q)
    return Q

In [59]:
def get_optimal_route(start_location, end_location):
    """Return the greedy route from ``start_location`` to ``end_location``.

    A Q-table is trained for ``end_location`` with ``q_learn`` and then
    followed greedily: from each state the move with the highest Q-value is
    taken until the end location is reached.

    Parameters
    ----------
    start_location : str
        Label of the starting location (a key of ``location_to_state``).
    end_location : str
        Label of the goal location (a key of ``location_to_state``).

    Returns
    -------
    list of str
        Location labels visited in order, starting with ``start_location``
        and ending with ``end_location``.

    Raises
    ------
    KeyError
        If either label is unknown.
    RuntimeError
        If the trained Q-table does not lead to ``end_location`` (an
        unexplored state or a cycle); previously this produced an invalid
        hop or an endless loop.
    """
    # Validate the start label before spending time on training.
    location_to_state[start_location]

    Q = q_learn(end_location)  # Q-table of state/action qualities for this goal
    route = [start_location]
    visited = {start_location}
    current_location = start_location

    while current_location != end_location:
        current_state = location_to_state[current_location]
        next_state = int(np.argmax(Q[current_state]))  # best-valued transition
        if Q[current_state, next_state] <= 0:
            # Updated entries are always positive, so an all-zero row means the
            # state was never sampled and argmax just returned index 0.
            raise RuntimeError(
                "No learned move from %s; the Q-table has not been trained "
                "for this state" % current_location)
        next_location = state_to_location[next_state]
        if next_location in visited:
            # The greedy policy is deterministic, so a revisit would repeat forever.
            raise RuntimeError(
                "Greedy route from %s revisits %s before reaching %s; the "
                "Q-table has not converged"
                % (route[0], next_location, end_location))
        visited.add(next_location)
        route.append(next_location)
        current_location = next_location
    return route

![](https://i.ibb.co/c8LXj7X/Capture.png)

In [62]:
# Train towards 'L1' and follow the greedy policy starting at 'L9';
# the Q-table shown below is printed by q_learn during the call.
route = get_optimal_route('L9', 'L1')

[[3996. 2249.    0.    0.    0.    0.    0.    0.    0.]
 [2998.    0. 1688.    0.    0.    0.    0.    0.    0.]
 [   0. 2249.    0.    0.    0. 1267.    0.    0.    0.]
 [   0.    0.    0.    0.    0.    0.  951.    0.    0.]
 [   0. 2249.    0.    0.    0.    0.    0. 1267.    0.]
 [   0.    0. 1688.    0.    0.    0.    0.    0.    0.]
 [   0.    0.    0.  714.    0.    0.    0. 1267.    0.]
 [   0.    0.    0.    0. 1688.    0.  951.    0.  951.]
 [   0.    0.    0.    0.    0.    0.    0. 1267.    0.]]


In [63]:
# Display the route computed in the previous cell.
route

['L9', 'L8', 'L5', 'L2', 'L1']