In [1]:
import random
import numpy as np 
import pandas as pd 
from IPython.display import display

In [None]:
class RL:
    """Tabular Q-learning agent over states written as "(i, j)" strings.

    Attributes:
        data: Raw initial contents handed to the Q table (kept for reference).
        A: Mapping from action name to a per-coordinate delta tuple.
        S: Sequence of valid state labels, e.g. "(1, 1)".
        R: Mapping from state label to the reward collected on entering it.
        T: Collection of terminal state labels; reaching one ends an episode.
        S0: Sequence of start states; `episode` runs one episode per entry.
        alpha: Learning rate of the Q-learning update.
        gamma: Discount factor applied to the best next-state value.
        epsilon: Probability of exploring in `choose_action`.
        Q: DataFrame indexed by state with one float column per action.
            Entries for moves that leave `S` stay NaN unless `data` supplies
            a value for them.
        episodes_info: One dict per episode run by the last `episode` call,
            with keys "start", "states", "actions" and "rewards".
    """

    def __init__(self, data, A, S, R, T, S0, alpha=0.2, gamma=0.95, epsilon=0.1):
        """Build the Q table and zero every (state, action) pair that stays in S.

        Args:
            data: 2-D initial values (len(S) rows, len(A) columns) of numbers
                or None; None becomes NaN.
            A, S, R, T, S0: Environment description (see class docstring).
            alpha: Learning rate.
            gamma: Discount factor.
            epsilon: Exploration probability.
        """
        self.data = data
        self.A = A
        self.S = S
        self.R = R
        self.T = T
        self.S0 = S0
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        # Cast to float so that None placeholders become NaN. An object-dtype
        # table makes Series.idxmax() raise TypeError on pandas releases
        # without object-dtype argmax support and forces slow boxed arithmetic.
        self.Q = pd.DataFrame(data, index=S, columns=list(A)).astype(float)
        self.episodes_info = []

        for s in S:
            for a in A:
                if self.next_state(s, a) in S:
                    self.Q.loc[s, a] = 0.0

    def max_action(self, s):
        """Return the action with the highest Q value in state `s`.

        NaN entries are skipped by `idxmax`; ties resolve to the first
        column in the table's column order.
        """
        return self.Q.loc[s].idxmax()

    def next_state(self, s, a):
        """Return the label obtained by adding action `a`'s delta to state `s`.

        The result is not checked against `S`; callers decide what to do with
        labels that fall outside the state set.
        """
        coords = [int(part) for part in s.strip("()").split(", ")]
        delta = self.A[a]
        moved = [coords[i] + delta[i] for i in range(len(coords))]
        return "(" + ", ".join(map(str, moved)) + ")"

    def choose_action(self, s, use_epsilon=False):
        """Pick an action for state `s`.

        With `use_epsilon` False the choice is purely greedy. Otherwise, with
        probability `epsilon` an action is drawn uniformly among those whose
        successor is in `S`; in the remaining cases the greedy action is used.
        """
        if not use_epsilon:
            return self.max_action(s)
        if random.random() < self.epsilon:
            valid_actions = [
                a for a in self.A.keys()
                if self.next_state(s, a) in self.S
            ]
            return random.choice(valid_actions)
        return self.max_action(s)

    def episode(self, steps=8, use_epsilon=False, show_Q=True):
        """Run one episode per start state in `S0`, updating `Q` in place.

        `episodes_info` is reset at the start of the call and then receives one
        record per episode. Each episode stops after `steps` transitions or as
        soon as a terminal state is entered.

        Args:
            steps: Maximum number of transitions per episode.
            use_epsilon: Forwarded to `choose_action`.
            show_Q: When True, display the Q table after every episode.
        """
        self.episodes_info = []
        for s0 in self.S0:
            s = s0
            states = [s]
            actions = []
            rewards = []

            for _ in range(steps):
                a = self.choose_action(s, use_epsilon=use_epsilon)
                next_state = self.next_state(s, a)

                # A move that leaves the state set keeps the agent in place.
                if next_state not in self.S:
                    next_state = s

                reward = self.R[next_state]
                best_next_value = self.Q.loc[next_state].max()
                current = self.Q.loc[s, a]
                # Q-learning update: move Q(s, a) a fraction alpha towards the
                # bootstrapped target r + gamma * max_a' Q(s', a').
                target = reward + self.gamma * best_next_value
                self.Q.loc[s, a] = current + self.alpha * (target - current)

                actions.append(a)
                rewards.append(reward)
                states.append(next_state)

                if next_state in self.T:
                    break

                s = next_state

            self.episodes_info.append(
                {
                    "start": s0,
                    "states": states,
                    "actions": actions,
                    "rewards": rewards,
                }
            )

            if show_Q:
                display(self.Q)


# Action name -> (first-coordinate delta, second-coordinate delta).
# Insertion order matters: it fixes the Q table's column order and therefore
# how ties between equal Q values are broken.
A = dict(
    up=(1, 0),
    right=(0, 1),
    down=(-1, 0),
    left=(0, -1),
)

# State labels "(i, j)" for a 4x4 grid with both coordinates running 1..4.
S = [f"({row}, {col})" for row in range(1, 5) for col in range(1, 5)]

# Reward per state: -1 everywhere, then the special cells are overridden.
R = dict.fromkeys(S, -1)
R.update(
    {
        "(3, 2)": -5,
        "(4, 2)": -20,
        "(1, 3)": -20,
        "(2, 4)": -5,
        "(4, 4)": 20,
    }
)

# Terminal states.
T = ["(4, 2)", "(1, 3)", "(4, 4)"]

# Start state of each successive episode (nine episodes in total).
S0 = [
    "(2, 2)", "(3, 1)", "(1, 1)",
    "(1, 2)", "(1, 4)", "(1, 4)",
    "(2, 2)", "(3, 1)", "(1, 1)",
]


In [3]:
# Part (a): train on the first eight start states only, so the Q table shown
# is the one the agent holds when the ninth episode is about to begin.
S0_a = S0[0:8]
data_a = [[None for _ in A] for _ in S]

rl_a = RL(data_a, A, S, R, T, S0_a)
rl_a.episode(8, use_epsilon=False, show_Q=False)

print("Tabela Q no início do 9º episódio (após 8 episódios):")
display(rl_a.Q)


Tabela Q no início do 9º episódio (após 8 episódios):


Unnamed: 0,up,right,down,left
"(1, 1)",-0.398,-0.2,,
"(1, 2)",-0.2,-4.0,,-0.238
"(1, 3)",0.0,0.0,,0.0
"(1, 4)",-1.0,,,-4.0
"(2, 1)",-0.398,-0.2,-0.2,
"(2, 2)",-1.0,-0.2,-0.2,-0.2
"(2, 3)",-0.2,0.0,0.0,0.0
"(2, 4)",0.56,,0.0,0.0
"(3, 1)",-0.2,-1.0,-0.2,
"(3, 2)",-4.0,-0.2,0.0,0.0


In [4]:
# Part (b): run all nine episodes greedily and inspect the trajectory of the
# ninth one.
data_b = [[None for _ in A] for _ in S]

rl_b = RL(data_b, A, S, R, T, S0)
rl_b.episode(8, use_epsilon=False, show_Q=False)

# Zero-based index 8 is the ninth episode.
ep9 = rl_b.episodes_info[8]

print("Estados visitados no 9º episódio:", ep9["states"], sep="\n")
print("Ações tomadas no 9º episódio:", ep9["actions"], sep="\n")


Estados visitados no 9º episódio:
['(1, 1)', '(1, 2)', '(2, 2)', '(2, 3)', '(2, 4)', '(3, 4)', '(4, 4)']
Ações tomadas no 9º episódio:
['right', 'up', 'right', 'right', 'up', 'up']


In [5]:
# Part (c): compare the undiscounted return collected in the first and the
# seventh episode of a fresh greedy run.
data_c = [[None for _ in A] for _ in S]

rl_c = RL(data_c, A, S, R, T, S0)
rl_c.episode(8, use_epsilon=False, show_Q=False)

# Zero-based indices 0 and 6 are episodes 1 and 7.
ep1, ep7 = (rl_c.episodes_info[idx] for idx in (0, 6))
R_total_1, R_total_7 = (sum(ep["rewards"]) for ep in (ep1, ep7))

print("Recompensa total no 1º episódio:", R_total_1)
print("Recompensa total no 7º episódio:", R_total_7)


Recompensa total no 1º episódio: -25
Recompensa total no 7º episódio: -21


In [9]:
# Part (d): probability that the epsilon-greedy policy picks 'down' in (4, 3)
# after the nine training episodes.
data_d = [[None] * len(A) for _ in range(len(S))]

rl_d = RL(data_d, A, S, R, T, S0)
rl_d.episode(steps=8, use_epsilon=False, show_Q=False)

s = "(4, 3)"
valid_actions = [a for a in A.keys() if rl_d.next_state(s, a) in S]

# Exploration branch: with probability epsilon one valid action is drawn
# uniformly, so each valid action receives epsilon / n.
p_down = 0.0
if "down" in valid_actions:
    p_down = rl_d.epsilon / len(valid_actions)

# Exploitation branch: with probability 1 - epsilon the greedy action is taken.
# That mass belongs to 'down' only when 'down' is the greedy action in s;
# leaving it out would under-report the probability in that case.
if rl_d.max_action(s) == "down":
    p_down += 1 - rl_d.epsilon

print("Ações válidas em (4,3):", valid_actions)
print("epsilon:", rl_d.epsilon)
print("Probabilidade de escolher 'down' com política epsilon-greedy:", round(p_down * 100, 2), "%")


Ações válidas em (4,3): ['right', 'down', 'left']
epsilon: 0.1
Probabilidade de escolher 'down' com política epsilon-greedy: 3.33 %
