In [52]:
# imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import zipfile
from tqdm import tqdm
from collections import defaultdict
import math

## Preparação dos dados

In [53]:
with zipfile.ZipFile("/content/output_data.zip", "r") as zip_ref:
    zip_ref.extractall("/content/")

In [54]:
def get_stock_df_full(ticker, path="/content/output_data", df=None):
    file_name = ticker.upper() + "-full.csv"
    header = ["date", "codbdi", "codneg", "tpmerc", "nomres", "especi", "prazot", "modref", "preabe", "premax", "premin", "premed", "preult", "preofc", "preofv", "totneg", "quatot", "voltot", "preexe", "indopc", "datven", "fatcot", "ptoexe", "codisi", "dismes"]
    list_dir = os.listdir(path)

    if type(df) is not pd.DataFrame:
        df = pd.DataFrame(columns=header)

    for file in list_dir:
        file_path = os.path.join(path, file)
        if os.path.isfile(file_path):
            if file == file_name:
                df = pd.concat([df, pd.read_csv(file_path, names=header)], ignore_index=True)
        else:
            sub_dir = file_path
            df = get_stock_df_full(ticker, sub_dir, df=df)

    return df

In [55]:
df_petr = get_stock_df_full("PETR4")
df_petr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5943 entries, 0 to 5942
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    5943 non-null   object
 1   codbdi  5943 non-null   object
 2   codneg  5943 non-null   object
 3   tpmerc  5943 non-null   object
 4   nomres  5943 non-null   object
 5   especi  5943 non-null   object
 6   prazot  5943 non-null   object
 7   modref  5943 non-null   object
 8   preabe  5943 non-null   object
 9   premax  5943 non-null   object
 10  premin  5943 non-null   object
 11  premed  5943 non-null   object
 12  preult  5943 non-null   object
 13  preofc  5943 non-null   object
 14  preofv  5943 non-null   object
 15  totneg  5943 non-null   object
 16  quatot  5943 non-null   object
 17  voltot  5943 non-null   object
 18  preexe  5943 non-null   object
 19  indopc  5943 non-null   object
 20  datven  5943 non-null   object
 21  fatcot  5943 non-null   object
 22  ptoexe  5943 non-null   

In [56]:
df_brfs = get_stock_df_full("BRFS3")
df_brfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3479 entries, 0 to 3478
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    3479 non-null   object
 1   codbdi  3479 non-null   object
 2   codneg  3479 non-null   object
 3   tpmerc  3479 non-null   object
 4   nomres  3479 non-null   object
 5   especi  3479 non-null   object
 6   prazot  3479 non-null   object
 7   modref  3479 non-null   object
 8   preabe  3479 non-null   object
 9   premax  3479 non-null   object
 10  premin  3479 non-null   object
 11  premed  3479 non-null   object
 12  preult  3479 non-null   object
 13  preofc  3479 non-null   object
 14  preofv  3479 non-null   object
 15  totneg  3479 non-null   object
 16  quatot  3479 non-null   object
 17  voltot  3479 non-null   object
 18  preexe  3479 non-null   object
 19  indopc  3479 non-null   object
 20  datven  3479 non-null   object
 21  fatcot  3479 non-null   object
 22  ptoexe  3479 non-null   

In [57]:
df_vale = get_stock_df_full("VALE3")
df_vale.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5937 entries, 0 to 5936
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    5937 non-null   object
 1   codbdi  5937 non-null   object
 2   codneg  5937 non-null   object
 3   tpmerc  5937 non-null   object
 4   nomres  5937 non-null   object
 5   especi  5937 non-null   object
 6   prazot  5937 non-null   object
 7   modref  5937 non-null   object
 8   preabe  5937 non-null   object
 9   premax  5937 non-null   object
 10  premin  5937 non-null   object
 11  premed  5937 non-null   object
 12  preult  5937 non-null   object
 13  preofc  5937 non-null   object
 14  preofv  5937 non-null   object
 15  totneg  5937 non-null   object
 16  quatot  5937 non-null   object
 17  voltot  5937 non-null   object
 18  preexe  5937 non-null   object
 19  indopc  5937 non-null   object
 20  datven  5937 non-null   object
 21  fatcot  5937 non-null   object
 22  ptoexe  5937 non-null   

In [58]:
df_brfs["date"] = pd.to_datetime(df_brfs["date"])
df_petr["date"] = pd.to_datetime(df_petr["date"])
df_vale["date"] = pd.to_datetime(df_vale["date"])

In [59]:
min_date_brfs = df_brfs["date"].min()

df_brfs = df_brfs[df_brfs["date"] >= min_date_brfs]
df_petr = df_petr[df_petr["date"] >= min_date_brfs]
df_vale = df_vale[df_vale["date"] >= min_date_brfs]

In [60]:
df_brfs = df_brfs[["codneg", "preabe", "date"]]
df_petr = df_petr[["codneg", "preabe", "date"]]
df_vale = df_vale[["codneg", "preabe", "date"]]

df_stock = pd.concat([df_brfs, df_petr, df_vale], ignore_index=True)

In [61]:
# Garantir que 'preabe' esteja no formato numérico
df_stock["preabe"] = pd.to_numeric(df_stock["preabe"], errors="coerce")
df_stock["preabe"] = df_stock["preabe"].astype(float) / 100
df_pivot = df_stock.pivot_table(index="date", columns="codneg", values="preabe", aggfunc="first")
df_pivot = df_pivot.reset_index()
df_pivot.columns.name = None  # Remove o nome do índice das colunas
df_pivot.head()

Unnamed: 0,date,BRFS3,PETR4,VALE3
0,2009-12-10,40.19,37.81,48.74
1,2009-12-11,40.28,38.05,49.63
2,2009-12-14,42.16,38.02,50.0
3,2009-12-15,41.3,37.71,49.9
4,2009-12-16,42.2,38.11,50.11


In [62]:
df_stock_train = df_pivot[df_pivot["date"] < "2023-01-01"]
df_stock_test = df_pivot[df_pivot["date"] >= "2023-01-01"]
df_stock_train.info()
print(df_stock_train["date"].min())
print(df_stock_train["date"].max())

<class 'pandas.core.frame.DataFrame'>
Index: 3231 entries, 0 to 3230
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          3231 non-null   datetime64[ns]
 1   BRFS3         3231 non-null   float64       
 2   PETR4         3231 non-null   float64       
 3   VALE3         3231 non-null   float64       
dtypes: datetime64[ns](1), float64(3)
memory usage: 126.2 KB
2009-12-10 00:00:00
2022-12-29 00:00:00


In [63]:
df_stock_train.head()

Unnamed: 0,date,BRFS3,PETR4,VALE3
0,2009-12-10,40.19,37.81,48.74
1,2009-12-11,40.28,38.05,49.63
2,2009-12-14,42.16,38.02,50.0
3,2009-12-15,41.3,37.71,49.9
4,2009-12-16,42.2,38.11,50.11


In [64]:
df_stock_train.describe()

Unnamed: 0,date,BRFS3,PETR4,VALE3
count,3231,3231.0,3231.0,3231.0
mean,2016-06-23 01:25:07.520891392,36.200009,21.093974,46.500987
min,2009-12-10 00:00:00,6.86,4.2,8.75
25%,2013-03-20 12:00:00,23.445,15.83,31.21
50%,2016-06-24 00:00:00,34.5,21.15,43.51
75%,2019-09-28 12:00:00,47.205,26.895,54.59
max,2022-12-29 00:00:00,72.24,38.11,119.8
std,,14.871806,7.191191,22.632235


## Criação do Ambiente

In [85]:
# Criar ambiente do Gym
import numpy as np
import gym
from enum import Enum
from gym import spaces


class Actions(Enum):
    HOLD = 0
    BUY_PETR4 = 1
    BUY_VALE3 = 2
    BUY_BRFS3 = 3
    SELL_PETR4 = 4
    SELL_VALE3 = 5
    SELL_BRFS3 = 6


class StockTradingEnv(gym.Env):
    def __init__(self, df, initial_balance = 10_000.0, dqn_mode = False):
        """
        Inicializa o ambiente.

        args:
            df: pd.Dataframe -> Dataframe com os dados para execução do ambiente.
            initial_balance: float -> Saldo inicial do agente.
        """
        self.df = df.reset_index(drop=True)
        self.df.columns = self.df.columns.str.strip()  # Remove espaços extras
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.available_balance = initial_balance
        self.hold_actions_count = 0
        self.dqn_mode = dqn_mode
        self.shares_held = {
            "PETR4": 0,
            "VALE3": 0,
            "BRFS3": 0
        }
        self.average_price = {
            "PETR4": 0,
            "VALE3": 0,
            "BRFS3": 0
        }

        print("Colunas disponíveis no DataFrame:", self.df.columns)

        self.current_step = 0
        self.action_space = spaces.Discrete(7)
        self.observation_space = spaces.Box(
            # Valores mínimos possíveis
            low=np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
            # Valores máximos estimados (assumindo maximo de R$1000 por ação, 1bi de saldo e máximo de 100k de unidades)
            high=np.array([1_000, 1_000, 1_000, 1_000_000_000, 1_000_000_000, 100_000, 100_000, 100_000, 1_000, 1_000, 1_000, 1, 1, 1, 1, 1, 1]),
            dtype=np.float32
        )
        self.df.head()


    def reset(self, step=None):
        """Reinicia o ambiente."""
        self.balance = self.initial_balance
        self.available_balance = self.initial_balance
        self.hold_actions_count = 0
        self.shares_held = {
            "PETR4": 0,
            "VALE3": 0,
            "BRFS3": 0
        }
        self.average_price = {
            "PETR4": 0,
            "VALE3": 0,
            "BRFS3": 0
        }
        # tornando o step inicial aleatório
        if step is None:
            self.current_step = np.random.randint(0, len(self.df) - 100)
        elif step < len(self.df) - 1:
            self.current_step = step
        else:
            self.current_step = 0
        return self._next_observation()


    def _next_observation(self):
        """Obtém o próximo estado (observação)"""
        current_observation = self.df.iloc[self.current_step]

        # Features para informar se o agente pode comprar ou vender
        can_buy_petr4 = 1 if self.available_balance >= current_observation["PETR4"] else 0
        can_buy_vale3 = 1 if self.available_balance >= current_observation["VALE3"] else 0
        can_buy_brfs3 = 1 if self.available_balance >= current_observation["BRFS3"] else 0

        can_sell_petr4 = 1 if self.shares_held["PETR4"] > 0 else 0
        can_sell_vale3 = 1 if self.shares_held["VALE3"] > 0 else 0
        can_sell_brfs3 = 1 if self.shares_held["BRFS3"] > 0 else 0

        # Normaliza cada variável com base nos limites máximos definidos no observation_space
        normalized_obs = np.array([
            current_observation["PETR4"] / 1_000,
            current_observation["VALE3"] / 1_000,
            current_observation["BRFS3"] / 1_000,
            self.balance / 1_000_000_000,
            self.available_balance / 1_000_000_000,
            self.shares_held["PETR4"] / 100_000,
            self.shares_held["VALE3"] / 100_000,
            self.shares_held["BRFS3"] / 100_000,
            self.average_price["PETR4"] / 1_000,
            self.average_price["VALE3"] / 1_000,
            self.average_price["BRFS3"] / 1_000,
            can_buy_petr4,
            can_buy_vale3,
            can_buy_brfs3,
            can_sell_petr4,
            can_sell_vale3,
            can_sell_brfs3
        ], dtype=np.float32)

        return normalized_obs


    def _update_balance(self):
        """Atualiza o saldo total, considerando o saldo disponível e o valor das ações mantidas."""
        current_value = self.df.loc[self.current_step]
        stock_value = sum(
            self.shares_held[ticker] * current_value[ticker]
            for ticker in self.shares_held
        )
        self.balance = self.available_balance + stock_value


    def step(self, action):
        """
        Executa a ação e retorna o novo estado, recompensa e status do episódio.

        args:
            action: int -> Ação a ser executada (Valor de 0 a 6, de acordo com Enum Actions).
        """
        current_observation = self.df.loc[self.current_step]
        reward = 0
        truncated = False
        terminated = self.current_step >= (len(self.df) - 1)
        asset = None

        previous_balance = self.balance

        if action == Actions.HOLD.value:
            # Penaliza inatividade para incentivar ação
            #  nesse caso, a penalidade irá aumentar conforme ele mantém a posição
            self.hold_actions_count += 1
            if sum(self.shares_held.values()) > 0:
                # Penalidade maior se estiver segurando ações
                reward = -(5 ** self.hold_actions_count)
            else:
                # Penalidade maior se não estiver segurando ações
                reward = -(2 ** self.hold_actions_count)
        else:
            # reseta a variável de controle de operações HOLD,
            #   para evitar que o agente nunca mantenha posição.
            self.hold_actions_count = 0
            if action in [Actions.BUY_PETR4.value, Actions.SELL_PETR4.value]:
                asset = "PETR4"
            elif action in [Actions.BUY_VALE3.value, Actions.SELL_VALE3.value]:
                asset = "VALE3"
            elif action in [Actions.BUY_BRFS3.value, Actions.SELL_BRFS3.value]:
                asset = "BRFS3"

            price = current_observation[asset]

            if "BUY" in Actions(action).name:
                num_shares = self.available_balance // price
                if num_shares > 0:
                    # Atualiza número de ações compradas
                    self.shares_held[asset] += num_shares
                    self.available_balance -= num_shares * price

                    # Atualiza o preço médio de compra
                    total_cost = self.average_price[asset] * (self.shares_held[asset] - num_shares) + (num_shares * price)
                    self.average_price[asset] = total_cost / self.shares_held[asset]

                    # Recompensa por realizar uma transação
                    reward = 500
                else:
                    # Penaliza compra sem saldo
                    reward = -1000
                    # Encerra o episódio
                    truncated = True
            elif "SELL" in Actions(action).name:
                num_shares = self.shares_held[asset]
                if num_shares > 0:
                    self.available_balance += num_shares * price

                    # Calcula lucro da operação como recompensa, com isso,
                    #  a expectativa é que as operações com maior lucro sejam priorizadas
                    profit = ((price * num_shares) - (self.average_price[asset] * num_shares))

                    if profit > 0:
                        reward = profit
                    else:
                        # em caso de prejuizo, reward negativo fixo, para evitar que o agente deixe de realizar operações
                        reward = -1

                    # Zera a posição no ativo
                    self.shares_held[asset] = 0
                    self.average_price[asset] = 0
                else:
                    # Penaliza venda sem ações
                    reward = -1000
                    # Encerra o episódio
                    truncated = True

        # Atualiza saldo total considerando ações mantidas
        self._update_balance()

        # Limita recompensas entre -1000 e 1000
        reward = float(np.clip(reward, -1000, 1000))

        if not terminated:
            self.current_step += 1

        if self.dqn_mode:
            return self._next_observation(), reward, terminated or truncated, {}
        else:
            return self._next_observation(), reward, terminated, truncated, {}


    def render(self):
        """Exibe informações do estado atual"""
        print(f"Step: {self.current_step}, Saldo disponível: {self.available_balance}, Ações: {self.shares_held}, Saldo Total: {self.balance}")

## Agente Q-Learing

### Criação do agente

In [66]:
# criando agent QLearning (seguindo o exemplo do blackjack do gymnasium)
# <https://gymnasium.farama.org/introduction/train_agent/>
# e adicionado a discretização dos valores, visto que os valores dos estados
# do mercado de ações são continuos, e ficaria inviavel treinar o QLearning dessa forma


class QLearningAgent:
    def __init__(
        self,
        env: gym.Env,
        learning_rate: float = 0.1,
        discount_factor: float = 0.95,
        initial_epsilon: float = 1.0,
        epsilon_decay: float = 0.99,
        final_epsilon: float = 0.01,
        num_bins: int = 10,
    ):
        """
        Agente Q-Learning para o ambiente de trading multi-ativo.

        Args:
            env: O ambiente Gym customizado para trading.
            learning_rate: Taxa de aprendizado (α).
            discount_factor: Fator de desconto (γ).
            initial_epsilon: Epsilon inicial para exploração.
            epsilon_decay: Fator de decaimento do epsilon.
            final_epsilon: Valor mínimo de epsilon.
            num_bins: Número de bins para discretizar os estados contínuos.
        """
        self.env = env
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon

        # Q-Table inicializada como um dicionário de valores Q para cada estado-ação
        self.q_values = defaultdict(lambda: np.zeros(env.action_space.n))

        # Definição de bins para discretização do espaço de estados contínuo
        self.num_bins = num_bins
        self.bin_edges = self._create_bins()


    def _create_bins(self):
        """Cria bins para discretizar os valores contínuos dos estados."""
        bins = {}
        for i in range(self.env.observation_space.shape[0]):
            min_val = self.env.observation_space.low[i]
            max_val = self.env.observation_space.high[i]
            bins[i] = np.linspace(min_val, max_val, self.num_bins)
        return bins


    def discretize_state(self, state):
        """Converte um estado contínuo para um estado discreto."""
        return tuple(np.digitize(state[i], self.bin_edges[i]) for i in range(len(state)))


    def get_action(self, state):
        """Escolhe uma ação baseada na política epsilon-greedy."""
        discrete_state = self.discretize_state(state)

        if np.random.rand() < self.epsilon:
            # Exploration
            return self.env.action_space.sample()
        else:
            # Explotation
            return np.argmax(self.q_values[discrete_state])


    def update(self, state, action, reward, next_state, terminated):
        """Atualiza a Q-Table usando a equação de Bellman."""
        discrete_state = self.discretize_state(state)
        discrete_next_state = self.discretize_state(next_state)

        future_q_value = (not terminated) * np.max(self.q_values[discrete_next_state])
        temporal_difference = reward + self.gamma * future_q_value - self.q_values[discrete_state][action]

        self.q_values[discrete_state][action] = self.lr * temporal_difference


    def decay_epsilon(self):
        """Aplica o decaimento do epsilon para reduzir a exploração ao longo do tempo."""
        self.epsilon = max(self.final_epsilon, self.epsilon * self.epsilon_decay)

### Treinando o Agente (criando a Q-Table)

In [67]:
env = StockTradingEnv(df_stock_train)
agent = QLearningAgent(env, epsilon_decay=0.00001, final_epsilon=0.2, num_bins=100, learning_rate=0.01)
num_episodes = 3200 # o df de treino tem pouco mais que 3200 amostras

for episode in tqdm(range(num_episodes)):
    state = env.reset()
    done = False
    total_reward = 0

    while not done:
        action = agent.get_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        agent.update(state, action, reward, next_state, terminated)
        state = next_state
        total_reward += reward

        if terminated or truncated:
            break

    agent.decay_epsilon()

    if episode % 100 == 0:
        print(f"Episódio {episode}/{num_episodes}, Recompensa Total: {total_reward}")

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


Colunas disponíveis no DataFrame: Index(['date', 'BRFS3', 'PETR4', 'VALE3'], dtype='object')


  0%|          | 14/3200 [00:00<00:23, 135.03it/s]

Episódio 0/3200, Recompensa Total: -1000.0


  3%|▎         | 111/3200 [00:01<00:38, 80.16it/s]

Episódio 100/3200, Recompensa Total: 5496.78


  7%|▋         | 220/3200 [00:02<00:22, 133.96it/s]

Episódio 200/3200, Recompensa Total: -500.0


 10%|▉         | 309/3200 [00:02<00:21, 132.48it/s]

Episódio 300/3200, Recompensa Total: -500.0


 13%|█▎        | 416/3200 [00:03<00:15, 178.15it/s]

Episódio 400/3200, Recompensa Total: 498.0


 16%|█▋        | 523/3200 [00:04<00:18, 142.36it/s]

Episódio 500/3200, Recompensa Total: -237.4300000000003


 19%|█▉        | 622/3200 [00:04<00:18, 136.50it/s]

Episódio 600/3200, Recompensa Total: 3601.260000000004


 23%|██▎       | 738/3200 [00:05<00:13, 185.30it/s]

Episódio 700/3200, Recompensa Total: -1000.0


 25%|██▌       | 809/3200 [00:06<00:16, 141.48it/s]

Episódio 800/3200, Recompensa Total: 0.0


 29%|██▉       | 935/3200 [00:07<00:14, 160.74it/s]

Episódio 900/3200, Recompensa Total: 3427.040000000001


 32%|███▏      | 1024/3200 [00:07<00:13, 161.44it/s]

Episódio 1000/3200, Recompensa Total: 0.0


 35%|███▌      | 1130/3200 [00:08<00:14, 140.78it/s]

Episódio 1100/3200, Recompensa Total: 2439.470000000002


 38%|███▊      | 1221/3200 [00:09<00:14, 136.67it/s]

Episódio 1200/3200, Recompensa Total: 5293.470000000002


 41%|████▏     | 1325/3200 [00:09<00:11, 161.44it/s]

Episódio 1300/3200, Recompensa Total: -500.0


 44%|████▍     | 1417/3200 [00:10<00:11, 149.68it/s]

Episódio 1400/3200, Recompensa Total: 220.8000000000011


 48%|████▊     | 1529/3200 [00:11<00:11, 148.81it/s]

Episódio 1500/3200, Recompensa Total: 9431.61


 50%|█████     | 1611/3200 [00:12<00:15, 104.37it/s]

Episódio 1600/3200, Recompensa Total: 1777.6500000000015


 54%|█████▎    | 1714/3200 [00:13<00:14, 105.33it/s]

Episódio 1700/3200, Recompensa Total: 5493.16


 57%|█████▋    | 1817/3200 [00:14<00:13, 101.71it/s]

Episódio 1800/3200, Recompensa Total: -1000.0


 60%|█████▉    | 1918/3200 [00:15<00:11, 110.93it/s]

Episódio 1900/3200, Recompensa Total: 7235.42


 63%|██████▎   | 2017/3200 [00:16<00:09, 118.94it/s]

Episódio 2000/3200, Recompensa Total: 0.0


 67%|██████▋   | 2135/3200 [00:16<00:05, 187.88it/s]

Episódio 2100/3200, Recompensa Total: -500.0


 70%|██████▉   | 2229/3200 [00:17<00:06, 159.30it/s]

Episódio 2200/3200, Recompensa Total: -500.0


 72%|███████▏  | 2313/3200 [00:17<00:06, 141.44it/s]

Episódio 2300/3200, Recompensa Total: 5834.770000000001


 76%|███████▌  | 2430/3200 [00:18<00:04, 159.10it/s]

Episódio 2400/3200, Recompensa Total: -500.0


 78%|███████▊  | 2506/3200 [00:19<00:05, 133.49it/s]

Episódio 2500/3200, Recompensa Total: 999.8


 82%|████████▏ | 2608/3200 [00:20<00:04, 137.02it/s]

Episódio 2600/3200, Recompensa Total: -107.14999999999964


 85%|████████▍ | 2719/3200 [00:20<00:02, 162.72it/s]

Episódio 2700/3200, Recompensa Total: 999.3499999999999


 88%|████████▊ | 2825/3200 [00:21<00:02, 137.39it/s]

Episódio 2800/3200, Recompensa Total: 7954.07


 92%|█████████▏| 2931/3200 [00:22<00:01, 163.03it/s]

Episódio 2900/3200, Recompensa Total: 997.0


 94%|█████████▍| 3006/3200 [00:22<00:01, 172.23it/s]

Episódio 3000/3200, Recompensa Total: 0.0


 98%|█████████▊| 3123/3200 [00:23<00:00, 142.84it/s]

Episódio 3100/3200, Recompensa Total: 2735.4400000000005


100%|██████████| 3200/3200 [00:24<00:00, 130.89it/s]


### Testando o Agente

In [68]:
# Resetando epsilon para garantir que o agente apenas explora o melhor caminho
agent.epsilon = 0.0
# recriando o ambiente, para usar os dados de teste
env = StockTradingEnv(df_stock_test)
df_stock_test.info()
num_test_episodes = 10
for episode in range(num_test_episodes):
    state = env.reset()
    done = False
    total_reward = 0

    print(f"\n🚀 Testando episódio {episode + 1}...\n")

    while not done:
        action = agent.get_action(state)  # Escolhe ação baseada na Q-Table treinada
        next_state, reward, terminated, truncated, _ = env.step(action)  # Executa ação
        state = next_state
        total_reward += reward
        env.render()  # Exibe o estado atual

        if terminated or truncated:
            print(f"{terminated=}")
            print(f"{truncated=}")
            break

    print(f"Episódio {episode + 1} finalizado. Recompensa Total: {total_reward}")


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


Colunas disponíveis no DataFrame: Index(['date', 'BRFS3', 'PETR4', 'VALE3'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
Index: 248 entries, 3231 to 3478
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          248 non-null    datetime64[ns]
 1   BRFS3         248 non-null    float64       
 2   PETR4         248 non-null    float64       
 3   VALE3         248 non-null    float64       
dtypes: datetime64[ns](1), float64(3)
memory usage: 9.7 KB

🚀 Testando episódio 1...

Step: 100, Saldo disponível: 26.68000000000029, Ações: {'PETR4': 372.0, 'VALE3': 0, 'BRFS3': 0}, Saldo Total: 10000.0
Step: 101, Saldo disponível: 3.97000000000029, Ações: {'PETR4': 372.0, 'VALE3': 0, 'BRFS3': 3.0}, Saldo Total: 9977.68
Step: 102, Saldo disponível: 9943.810000000001, Ações: {'PETR4': 0, 'VALE3': 0, 'BRFS3': 3.0}, Saldo Total: 9966.250000000002
Step: 103, Saldo disponível: 44.69000000000051, Ações: 

## Agente DQN (Deep Q-Network)

### Instalação de dependencias

In [69]:
!pip install stable-baselines3[extra] torch gymnasium



In [70]:
!pip install "shimmy>=2.0"



### Treinamento

In [71]:
import gym
from stable_baselines3 import DQN


env = StockTradingEnv(df_stock_train, dqn_mode=True)

# Criando o modelo DQN com a política MLP (rede neural)
model = DQN(
    "MlpPolicy",
    env,
    learning_rate=0.001,
    buffer_size=10000,  # Tamanho do buffer de replay
    learning_starts=1000,  # Passos antes do treinamento começar
    batch_size=32,  # Tamanho do lote de treino
    gamma=0.99,  # Fator de desconto
    tau=1.0,  # Taxa de atualização da rede-alvo
    target_update_interval=500,  # Atualização da rede-alvo a cada 500 passos
    train_freq=4,  # Treina a cada 4 interações
    exploration_fraction=0.1,  # Proporção do tempo para exploração (ε-decay)
    exploration_final_eps=0.01,  # Valor final de epsilon
    verbose=0,  # Mostrar logs de treinamento,
    seed=42
)

model.learn(total_timesteps=100000)

Colunas disponíveis no DataFrame: Index(['date', 'BRFS3', 'PETR4', 'VALE3'], dtype='object')


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  deprecation(


<stable_baselines3.dqn.dqn.DQN at 0x7e2fbda96b50>

### Testando o modelo

In [72]:
env = StockTradingEnv(df_stock_test)
next_state = env.reset()
done = False

while not done:
    action, _states = model.predict(next_state, deterministic=True)
    next_state, reward, done, truncated, _ = env.step(action)
    env.render()

Colunas disponíveis no DataFrame: Index(['date', 'BRFS3', 'PETR4', 'VALE3'], dtype='object')
Step: 121, Saldo disponível: 9.610000000000582, Ações: {'PETR4': 323.0, 'VALE3': 0, 'BRFS3': 0}, Saldo Total: 10000.0
Step: 122, Saldo disponível: 9886.95, Ações: {'PETR4': 0, 'VALE3': 0, 'BRFS3': 0}, Saldo Total: 9886.95
Step: 123, Saldo disponível: 28.950000000000728, Ações: {'PETR4': 318.0, 'VALE3': 0, 'BRFS3': 0}, Saldo Total: 9886.95
Step: 124, Saldo disponível: 9886.95, Ações: {'PETR4': 0, 'VALE3': 0, 'BRFS3': 0}, Saldo Total: 9886.95
Step: 125, Saldo disponível: 19.910000000001673, Ações: {'PETR4': 332.0, 'VALE3': 0, 'BRFS3': 0}, Saldo Total: 9886.95
Step: 126, Saldo disponível: 9966.630000000003, Ações: {'PETR4': 0, 'VALE3': 0, 'BRFS3': 0}, Saldo Total: 9966.630000000003
Step: 127, Saldo disponível: 13.830000000003565, Ações: {'PETR4': 330.0, 'VALE3': 0, 'BRFS3': 0}, Saldo Total: 9966.630000000003
Step: 128, Saldo disponível: 9864.330000000004, Ações: {'PETR4': 0, 'VALE3': 0, 'BRFS3': 0

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


## Avaliação dos modelos

In [88]:
import numpy as np


def total_return(initial_balance, final_balance):
    return (final_balance - initial_balance) / initial_balance


def daily_return(returns):
    return np.mean(returns)


def volatility(returns):
    if len(returns) > 1:
        return np.std(returns, ddof=1)
    return 0.0


def sharpe_ratio(returns, risk_free_rate=0.0):
    return (np.mean(returns) - risk_free_rate) / np.std(returns)


def sortino_ratio(returns, risk_free_rate=0.0):
    negative_returns = [r for r in returns if r < 0]
    downside_risk = np.std(negative_returns) if len(negative_returns) > 0 else 1e-6
    return (np.mean(returns) - risk_free_rate) / downside_risk


def max_drawdown(balance_history):
    peak = np.maximum.accumulate(balance_history)
    drawdowns = (peak - balance_history) / peak
    return np.max(drawdowns)


def pad_sequences(sequences, pad_value=np.nan):
    """Preenche todas as sequências para terem o mesmo tamanho."""
    max_len = max(len(seq) for seq in sequences)
    padded_sequences = np.full((len(sequences), max_len), pad_value)

    for i, seq in enumerate(sequences):
        padded_sequences[i, :len(seq)] = seq

    return padded_sequences


def evaluate_agent(env, agent, is_dqn=False):
    """Avalia o desempenho do agente e calcula métricas financeiras."""
    returns = []
    balance_histories = []
    steps = [0, 10, 30, 50, 90, 100, 110, 130, 150, 220]

    for ep in range(10):
        obs = env.reset(step=steps[ep])
        done = False
        balance_history = [env.initial_balance]

        while not done:
            if is_dqn:
                action, _ = agent.predict(obs, deterministic=True)
            else:
                action = agent.get_action(obs)

            obs, reward, done, truncated, _ = env.step(action)
            balance_history.append(env.balance)

        # Calcula o retorno do episódio
        returns.append((balance_history[-1] - env.initial_balance) / env.initial_balance)
        balance_histories.append(balance_history)

    # Converter para arrays
    returns = np.array(returns)
    balance_histories = pad_sequences(balance_histories)

    # Calcular métricas financeiras
    metrics = {
        "Retorno Total": total_return(env.initial_balance, np.nanmean(balance_histories[:, -1])),
        "Retorno Médio Diário": daily_return(returns),
        "Volatilidade": volatility(returns),
        "Sharpe Ratio": sharpe_ratio(returns),
        "Sortino Ratio": sortino_ratio(returns),
        "Max Drawdown": max_drawdown(np.nanmean(balance_histories, axis=0))
    }

    return metrics


In [89]:
env = StockTradingEnv(df_stock_test)
# Avaliando o agente Q-Learning
qlearning_metrics = evaluate_agent(env, agent)

# Avaliando o agente DQN
dqn_metrics = evaluate_agent(env, model, True)

# Exibir os resultados
print("📊 Avaliação do Q-Learning:")
for key, value in qlearning_metrics.items():
    print(f"{key}: {value:.4f}")

print("\n📊 Avaliação do DQN:")
for key, value in dqn_metrics.items():
    print(f"{key}: {value:.4f}")

Colunas disponíveis no DataFrame: Index(['date', 'BRFS3', 'PETR4', 'VALE3'], dtype='object')


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


📊 Avaliação do Q-Learning:
Retorno Total: -0.1851
Retorno Médio Diário: 0.0160
Volatilidade: 0.1395
Sharpe Ratio: 0.1213
Sortino Ratio: 0.2790
Max Drawdown: 0.2576

📊 Avaliação do DQN:
Retorno Total: 0.5191
Retorno Médio Diário: 0.3972
Volatilidade: 0.1843
Sharpe Ratio: 2.2719
Sortino Ratio: 397230.1000
Max Drawdown: 0.0600
