<a href="https://colab.research.google.com/github/biancafsena/REINFORCEMENT-LEARNING/blob/main/Trabalho_Reinforcement_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Bibliotecas**

In [1]:
import numpy as np
import random

### **1. Modelagem do ambiente 2D**

In [2]:
class Environment:
    """2D grid world that tracks obstacles, delivery points and the agent's cell."""

    def __init__(self, grid_size, obstacles, delivery_points):
        """Store the grid description and place the agent at cell (0, 0)."""
        self.agent_position = (0, 0)  # fixed starting cell
        self.grid_size = grid_size
        self.obstacles = obstacles
        self.delivery_points = delivery_points

    def is_valid_move(self, action):
        """Return True if `action` leads to an in-bounds cell that is not an obstacle.

        Action names other than 'up', 'down', 'left' and 'right' leave the
        candidate cell equal to the current one.
        """
        row, col = self.agent_position
        # The first coordinate changes with 'up'/'down', the second with 'left'/'right'.
        for name, (d_row, d_col) in (
            ('up', (-1, 0)),
            ('down', (1, 0)),
            ('left', (0, -1)),
            ('right', (0, 1)),
        ):
            if action == name:
                row, col = row + d_row, col + d_col
                break

        inside_grid = 0 <= row < self.grid_size[0] and 0 <= col < self.grid_size[1]
        return inside_grid and (row, col) not in self.obstacles

In [3]:

    def take_action(self, action):
        """Move the agent one cell in the direction named by `action`, if that move is valid."""
        if not self.is_valid_move(action):
            return  # rejected moves leave the agent where it is

        row, col = self.agent_position
        for name, (d_row, d_col) in (
            ('up', (-1, 0)),
            ('down', (1, 0)),
            ('left', (0, -1)),
            ('right', (0, 1)),
        ):
            if action == name:
                row, col = row + d_row, col + d_col
                break
        self.agent_position = (row, col)

    def is_at_delivery_point(self):
        """Return True when the agent's current cell is one of the delivery points."""
        current_cell = self.agent_position
        return current_cell in self.delivery_points

### **2. Definição do MDP**

In [4]:
class MDP:
    """Holds the environment together with the discount factor and exploration probability."""

    # Action names offered to the agent, in the order used for Q-table columns.
    _ACTIONS = ('up', 'down', 'left', 'right')

    def __init__(self, environment, discount_factor, exploration_prob):
        """Keep references to the environment and the two hyper-parameters."""
        self.environment = environment
        self.discount_factor = discount_factor
        self.exploration_prob = exploration_prob

    def get_possible_actions(self):
        """Return a new list with the four action names."""
        return list(self._ACTIONS)

### **3. Implementação da Q-Table**

In [5]:
class QTable:
    """Thin wrapper around a zero-initialised NumPy array of Q-values."""

    def __init__(self, state_space, action_space):
        """Allocate a (state_space, action_space) array filled with zeros."""
        shape = (state_space, action_space)
        self.q_table = np.zeros(shape)

    def get_value(self, state, action):
        """Return the entry selected by indexing the array with (state, action)."""
        cell = (state, action)  # handed to NumPy indexing unchanged
        return self.q_table[cell]

    def set_value(self, state, action, value):
        """Write `value` to the entry selected by indexing the array with (state, action)."""
        cell = (state, action)
        self.q_table[cell] = value

### **4. Implementação do agente**

In [6]:
class QLearningAgent:
    """Epsilon-greedy Q-learning agent acting on a grid environment.

    Grid states arrive as (row, col) tuples, while the Q-table is indexed by
    a single row per state. Every lookup therefore goes through
    `_state_index`, which flattens the tuple to ``row * n_cols + col``.
    Passing the tuple straight to the table would make NumPy treat it as a
    list of row indices and return an array instead of a scalar.
    """

    def __init__(self, environment, mdp, q_table, learning_rate, exploration_prob):
        """Store the collaborators and hyper-parameters.

        Args:
            environment: object exposing `agent_position` and `grid_size`.
            mdp: object exposing `get_possible_actions()` and `discount_factor`.
            q_table: object exposing `get_value(state, action)` and
                `set_value(state, action, value)`.
            learning_rate: step size used in the Q-value update.
            exploration_prob: probability of picking a random action.
        """
        self.environment = environment
        self.mdp = mdp
        self.q_table = q_table
        self.learning_rate = learning_rate
        self.exploration_prob = exploration_prob

    def _state_index(self, state):
        """Map a (row, col) state to its flat Q-table row; integers pass through."""
        if isinstance(state, (int, np.integer)):
            return int(state)
        row, col = state
        return row * self.environment.grid_size[1] + col

    def select_action(self):
        """Return a random action with probability `exploration_prob`, else the greedy one."""
        actions = self.mdp.get_possible_actions()
        if random.uniform(0, 1) < self.exploration_prob:
            return random.choice(actions)

        row = self._state_index(self.environment.agent_position)
        q_values = [self.q_table.get_value(row, column) for column in range(len(actions))]
        # Each q_value is a scalar now, so argmax is always a valid action index.
        return actions[int(np.argmax(q_values))]

    def update_q_table(self, state, action, reward, next_state):
        """Blend the old Q(state, action) with the one-step target.

        The target is ``reward + discount_factor * max_a Q(next_state, a)``
        and the blend weight is `learning_rate`.
        """
        actions = self.mdp.get_possible_actions()
        row = self._state_index(state)
        next_row = self._state_index(next_state)
        column = actions.index(action)

        best_next_q = np.max([self.q_table.get_value(next_row, c) for c in range(len(actions))])
        current_q = self.q_table.get_value(row, column)
        target = reward + self.mdp.discount_factor * best_next_q
        new_q = (1 - self.learning_rate) * current_q + self.learning_rate * target
        self.q_table.set_value(row, column, new_q)

### **5. Living Penalty**

In [7]:
def apply_living_penalty(reward, time_penalty):
    """Return `reward` reduced by `time_penalty`, the cost charged for time spent."""
    penalised_reward = reward - time_penalty
    return penalised_reward

### **6. Treinamento e Avaliação do agente**

In [2]:
def train_and_evaluate(grid_size, obstacles, delivery_points, discount_factor, exploration_prob, learning_rate, num_episodes, time_penalty):
    """Unfinished draft: builds the test environment and implicitly returns None.

    Only `grid_size`, `obstacles` and `delivery_points` are used here; the
    remaining parameters are accepted but not read.
    """
    environment = Environment(grid_size, obstacles, delivery_points)
    return None

class Environment:
    """2D grid world that tracks obstacles, delivery points and the agent's cell."""

    # (action name, (row delta, column delta)) pairs for the four moves.
    _MOVES = (
        ('up', (-1, 0)),
        ('down', (1, 0)),
        ('left', (0, -1)),
        ('right', (0, 1)),
    )

    def __init__(self, grid_size, obstacles, delivery_points):
        """Store the grid description and place the agent at cell (0, 0)."""
        self.agent_position = (0, 0)  # fixed starting cell
        self.grid_size = grid_size
        self.obstacles = obstacles
        self.delivery_points = delivery_points

    def _target_cell(self, action):
        """Return the cell `action` points at; unknown actions give the current cell."""
        row, col = self.agent_position
        for name, (d_row, d_col) in self._MOVES:
            if action == name:
                return (row + d_row, col + d_col)
        return (row, col)

    def is_valid_move(self, action):
        """Return True if `action` leads to an in-bounds cell that is not an obstacle."""
        row, col = self._target_cell(action)
        inside_grid = 0 <= row < self.grid_size[0] and 0 <= col < self.grid_size[1]
        return inside_grid and (row, col) not in self.obstacles

    def take_action(self, action):
        """Move the agent to the target cell when the move is valid; otherwise do nothing."""
        if self.is_valid_move(action):
            self.agent_position = self._target_cell(action)

    def is_at_delivery_point(self):
        """Return True when the agent's current cell is one of the delivery points."""
        current_cell = self.agent_position
        return current_cell in self.delivery_points


class MDP:
    """Holds the environment together with the discount factor and exploration probability."""

    # Action names offered to the agent, in the order used for Q-table columns.
    _ACTIONS = ('up', 'down', 'left', 'right')

    def __init__(self, environment, discount_factor, exploration_prob):
        """Keep references to the environment and the two hyper-parameters."""
        self.environment = environment
        self.discount_factor = discount_factor
        self.exploration_prob = exploration_prob

    def get_possible_actions(self):
        """Return a new list with the four action names."""
        return list(self._ACTIONS)

class QTable:
    """Thin wrapper around a zero-initialised NumPy array of Q-values."""

    def __init__(self, state_space, action_space):
        """Allocate a (state_space, action_space) array filled with zeros."""
        shape = (state_space, action_space)
        self.q_table = np.zeros(shape)

    def get_value(self, state, action):
        """Return the entry selected by indexing the array with (state, action)."""
        cell = (state, action)  # handed to NumPy indexing unchanged
        return self.q_table[cell]

    def set_value(self, state, action, value):
        """Write `value` to the entry selected by indexing the array with (state, action)."""
        cell = (state, action)
        self.q_table[cell] = value

class QLearningAgent:
    """Epsilon-greedy Q-learning agent acting on a grid environment.

    Grid states arrive as (row, col) tuples, while the Q-table is indexed by
    a single row per state. Every lookup therefore goes through
    `_state_index`, which flattens the tuple to ``row * n_cols + col``.
    Passing the tuple straight to the table would make NumPy treat it as a
    list of row indices and return an array instead of a scalar, which is
    what previously forced a random-action fallback after `argmax`.
    """

    def __init__(self, environment, mdp, q_table, learning_rate, exploration_prob):
        """Store the collaborators and hyper-parameters.

        Args:
            environment: object exposing `agent_position` and `grid_size`.
            mdp: object exposing `get_possible_actions()` and `discount_factor`.
            q_table: object exposing `get_value(state, action)` and
                `set_value(state, action, value)`.
            learning_rate: step size used in the Q-value update.
            exploration_prob: probability of picking a random action.
        """
        self.environment = environment
        self.mdp = mdp
        self.q_table = q_table
        self.learning_rate = learning_rate
        self.exploration_prob = exploration_prob

    def _state_index(self, state):
        """Map a (row, col) state to its flat Q-table row; integers pass through."""
        if isinstance(state, (int, np.integer)):
            return int(state)
        row, col = state
        return row * self.environment.grid_size[1] + col

    def select_action(self):
        """Return a random action with probability `exploration_prob`, else the greedy one."""
        actions = self.mdp.get_possible_actions()
        if random.uniform(0, 1) < self.exploration_prob:
            return random.choice(actions)

        row = self._state_index(self.environment.agent_position)
        q_values = [self.q_table.get_value(row, column) for column in range(len(actions))]
        # One scalar per action, so argmax is always a valid action index.
        return actions[int(np.argmax(q_values))]

    def update_q_table(self, state, action, reward, next_state):
        """Blend the old Q(state, action) with the one-step target.

        The target is ``reward + discount_factor * max_a Q(next_state, a)``
        and the blend weight is `learning_rate`.
        """
        actions = self.mdp.get_possible_actions()
        row = self._state_index(state)
        next_row = self._state_index(next_state)
        column = actions.index(action)

        best_next_q = np.max([self.q_table.get_value(next_row, c) for c in range(len(actions))])
        current_q = self.q_table.get_value(row, column)
        target = reward + self.mdp.discount_factor * best_next_q
        new_q = (1 - self.learning_rate) * current_q + self.learning_rate * target
        self.q_table.set_value(row, column, new_q)

def apply_living_penalty(reward, time_penalty):
    """Return `reward` reduced by `time_penalty`, the cost charged for time spent."""
    penalised_reward = reward - time_penalty
    return penalised_reward

def train_and_evaluate(grid_size, obstacles, delivery_points, discount_factor, exploration_prob, learning_rate, num_episodes, time_penalty, max_steps_per_episode=None):
    """Train a Q-learning agent on a grid world and return its mean episode reward.

    Each episode starts from the environment's initial agent position and
    runs until the agent stands on a delivery point (or the optional step
    cap is hit). Every step earns -0.1, or 1.0 when it lands on a delivery
    point, and `time_penalty` is subtracted from every step's reward.

    Args:
        grid_size: (rows, cols) of the grid.
        obstacles: cells the agent cannot enter.
        delivery_points: cells that end an episode.
        discount_factor: discount applied to the best next-state Q-value.
        exploration_prob: probability of a random action.
        learning_rate: step size of the Q-value update.
        num_episodes: number of training episodes to run.
        time_penalty: amount subtracted from every step's reward.
        max_steps_per_episode: optional cap on steps per episode; None
            (the default) means no cap, so an unreachable delivery point
            will loop forever.

    Returns:
        The mean of the per-episode total rewards.
    """
    test_environment = Environment(grid_size, obstacles, delivery_points)
    test_mdp = MDP(test_environment, discount_factor, exploration_prob)
    state_space = np.prod(grid_size)
    action_space = len(test_mdp.get_possible_actions())
    test_q_table = QTable(state_space, action_space)
    test_agent = QLearningAgent(test_environment, test_mdp, test_q_table, learning_rate, exploration_prob)

    # Remember where the agent starts so every episode can begin from there.
    start_position = test_environment.agent_position
    total_rewards = []

    for _ in range(num_episodes):
        # Without this reset the agent would still sit on the delivery point
        # reached in the previous episode and the episode would have no steps.
        test_environment.agent_position = start_position
        state = start_position
        total_reward = 0
        steps = 0

        while not test_environment.is_at_delivery_point():
            if max_steps_per_episode is not None and steps >= max_steps_per_episode:
                break

            action = test_agent.select_action()
            # The move must happen on every step, not only in a fallback branch.
            test_environment.take_action(action)

            reward = 1.0 if test_environment.is_at_delivery_point() else -0.1
            # The living penalty is a per-step cost, so it applies to every step.
            reward = apply_living_penalty(reward, time_penalty)

            total_reward += reward
            next_state = test_environment.agent_position
            test_agent.update_q_table(state, action, reward, next_state)
            state = next_state
            steps += 1

        total_rewards.append(total_reward)

    average_reward = np.mean(total_rewards)
    return average_reward

# Test parameters
# Grid layout for the demo run: 4x4 cells, two obstacles, two delivery points.
grid_size = (4, 4)
obstacles = [(1, 1), (2, 2)]
delivery_points = [(3, 3), (0, 2)]

# Learning hyper-parameters and run length.
discount_factor = 0.9
exploration_prob = 0.2
learning_rate = 0.1
num_episodes = 1000
time_penalty = 0.1

# Train and evaluate with the parameters above.
average_reward = train_and_evaluate(
    grid_size=grid_size,
    obstacles=obstacles,
    delivery_points=delivery_points,
    discount_factor=discount_factor,
    exploration_prob=exploration_prob,
    learning_rate=learning_rate,
    num_episodes=num_episodes,
    time_penalty=time_penalty,
)

# Report the mean reward obtained over all episodes.
print("Recompensa média: ", average_reward)

NameError: ignored