#### Simple DQN RL Stock Trader (Part1)
 - Simple stock trading RL-based algo trading agaent using S & P 500 data
 - Trading actions are buy, sel or hold
 - Trading window is daily
 - References are:
 - References:
    - Yves J Hilpisch, "Artificial Intelligence in Finance", page 268 - 276, O'Reilly, 2021
    - Mnih, V. et al., "Human-level control through deep reinforcement learning", Nature, 2015.
    - Moody, J., Saffell, M., "Learning to trade via direct reinforcement", IEEE, 2001.
    - Gymnasium API documentation: https://gymnasium.farama.org/
    - PyTorch documentation: https://pytorch.org/docs/stable/index.html

#### Imports

In [1]:
import math
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
from collections import deque
import ta
import random
import os
from typing import Tuple, List, Any, Dict
import pandas as pd
from IPython.display import display
from tqdm import tqdm

from stable_baselines3 import DQN, PPO
from stable_baselines3.common.evaluation import evaluate_policy

#### Define global constants and variables

In [2]:
# Make sure RL computations are reproducible by setting the seed
SEED_VALUE = 100

# S & P 500 data configurations
DATA_PATH = "../data"
S_P_RAW_DATA_PATH = f"{DATA_PATH}/s_and_p_raw_data_with_features.csv"
S_P_SCALED_DATA_PATH = f"{DATA_PATH}/s_and_p_scaled_data_with_features.csv"
ASSET_DATA_PATH = f"{DATA_PATH}/aiif_eikon_eod_data.csv"
DATA_START_DATE="2010-01-01"
DATA_END_DATE="2020-01-01"
S_AND_P_YAHOO_TICKER = "^GSPC"
DATA_BAR_TYPE = "Close"
FEATURE_SMA_10 = "SMA_10"
FEATURE_RSI = "RSI"
FEATURE_MACD = "MACD"
S_AND_P_DATA_COLUMNS = [DATA_BAR_TYPE, FEATURE_SMA_10, FEATURE_RSI, FEATURE_MACD]

# Data partition configuration
TEST_SPLIT_FACTOR=0.2

# RL training configuration
TRAINING_EPISODES_COUNT = 100
TRAINING_AVERAGE_ROLLING_WINDOW = 50

# RL validation/test configuration
TEST_EPISODES_COUNT = 3

# DQN agent hyper-parameter configurations
REPLAY_EXPERIENCE_MEMORY_SIZE = 10_000
LEARNING_RATE = 1e-3
GAMMA = 0.95
EPSILON = 1.0
EPSILON_MIN = 0.01
EPSILON_DECAY = 0.995
BATCH_SIZE = 64

# Model persistence configuration
MODEL_FOLDER = "../model"
MODEL_RESULTS_FOLDER = "../results"
os.makedirs(MODEL_FOLDER, exist_ok=True)
os.makedirs(MODEL_RESULTS_FOLDER, exist_ok=True)
MODEL_FILE_PATH = f"{MODEL_FOLDER}/simple_rl_agent_v1.pt"
TRAIN_REWARDS_FILE_PATH = f"{MODEL_RESULTS_FOLDER}/simple_rl_agent_v1_train_rewards.csv"
TRAIN_AVERAGE_REWARDS_FILE_PATH = f"{MODEL_RESULTS_FOLDER}/simple_rl_agent_v1_train_average_rewards.csv"
TEST_REWARDS_FILE_PATH = f"{MODEL_RESULTS_FOLDER}/simple_rl_agent_v1_test_rewards.csv"


#### Define S & P 500 data provider component

In [3]:
class DataProvider:
    """
    Component used to provide the S & P 500 dataset
    :param s_and_p_raw_data_path: File path of raw data
    :param s_and_p_scaled_data_path: File path of scaled data
    """
    def __init__(
        self,
        s_and_p_raw_data_path: str = S_P_RAW_DATA_PATH,
        s_and_p_scaled_data_path: str = S_P_SCALED_DATA_PATH
    ):
        """
        Constructor
        """
        self._s_and_p_raw_data_path = s_and_p_raw_data_path
        self._s_and_p_scaled_data_path = s_and_p_scaled_data_path
        self._closing_price_raw_df = None
        self._closing_price_raw = None
        self._closing_price_scaled = None
        self._closing_price_train = None
        self._closing_price_test = None
        self._data_scaler = None
        self.features = None

    def _featureEngineerData(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Feature engineer the S & P price data
        :param df: Input dataframe
        :return: Feature engineered data
        """
        df[FEATURE_SMA_10] = ta.trend.sma_indicator(df[DATA_BAR_TYPE], window=10)
        df[FEATURE_RSI] = ta.momentum.RSIIndicator(df[DATA_BAR_TYPE], window=14).rsi()
        df[FEATURE_MACD] = ta.trend.macd_diff(df[DATA_BAR_TYPE])
        df.dropna(inplace=True)
        return df

    def getData(self) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, np.ndarray]:
        """
        Gets the S & P closing price data
        :return: Datasets
        """
        if os.path.exists(self._s_and_p_raw_data_path):
            print(f"{self._s_and_p_raw_data_path} already exists in local file system (cache), ingesting the file locally")
            self._closing_price_raw_df = pd.read_csv(self._s_and_p_raw_data_path, index_col=None)
        else:
            print(f"{self._s_and_p_raw_data_path} does not exists in local file system (cache), so ingesting the file from Yahoo Finance remote endpoint..")
            self._closing_price_raw_df = yf.download(S_AND_P_YAHOO_TICKER, start=DATA_START_DATE, end=DATA_END_DATE, multi_level_index=False)
            self._closing_price_raw_df.to_csv(self._s_and_p_raw_data_path, index=False)             
        if os.path.exists(self._s_and_p_scaled_data_path):
            print(f"{self._s_and_p_scaled_data_path} already exists in local file system (cache), ingesting the file locally")
            closing_price_scaled_df =  pd.read_csv(self._s_and_p_scaled_data_path, index_col=None)            
            self._closing_price_with_features_scaled = closing_price_scaled_df[S_AND_P_DATA_COLUMNS].values
        else:
            print(f"{self._s_and_p_scaled_data_path} does not exists in local file system (cache), so will recompute the data scaling..")
            close_prices_df = self._closing_price_raw_df[[DATA_BAR_TYPE]]
            close_prices_with_features_df = self._featureEngineerData(close_prices_df)
            
            close_prices_with_features = close_prices_with_features_df.values
            self._data_scaler = MinMaxScaler()
            self._closing_price_with_features_scaled = self._data_scaler.fit_transform(close_prices_with_features)
            closing_price_scaled_df = pd.DataFrame(self._closing_price_with_features_scaled, columns=[S_AND_P_DATA_COLUMNS])
            closing_price_scaled_df.to_csv(self._s_and_p_scaled_data_path, index=False)
        
        self._partitionDataset()
        
        return  self._closing_price_raw_df, self._closing_price_with_features_scaled, self._closing_price_train, self._closing_price_test

    def _partitionDataset(self, slit_fraction: float=TEST_SPLIT_FACTOR):
        """
        Partitions data into training and test splits
        :param slit_fraction: Split fraction
        """
        prices = self._closing_price_with_features_scaled
        split = int(len(prices) * 0.8)
        self._closing_price_train, self._closing_price_test = prices[:split], prices[split:]
        
        

In [None]:
data_provider = DataProvider()
close_data_raw_df, close_data_scaled, close_data_train, close_data_test = data_provider.getData()

#### Helpers

In [4]:
class Helpers:
    """
    Helper utilities
    """
    @staticmethod
    def appendTableRow(
            df: pd.DataFrame,
            row: pd.Series):
        """
        :param df: Dataframe to append row to
        :param row: Row to append
        :return: New dataframe with appended row
        """
        return pd.concat([
            df,
            pd.DataFrame([row], columns=row.index)]
        ).reset_index(drop=True)

    @staticmethod
    def createTable(
        columns: List[str]
        
    ) -> pd.DataFrame:
        """
        Creates a new data table
        :param columns: Columns
        """
        df = pd.DataFrame(
            columns=columns
        )
        return df

    @staticmethod
    def displayTable(
        df: pd.DataFrame,
        n_rows: int,
        n_columns: int
    ) -> None:
        """
        Displays sample rows of a data table
        :param df: Data table
        :param n_rows: Number of rows
        """
        with pd.option_context("display.max_rows", n_rows, "display.max_columns", n_columns,
                       "max_colwidth", 100):
            print(display(df[:n_rows]))

    @staticmethod
    def setSeeds(seed: int=SEED_VALUE):
        """
        Sets the seed value for the computation to maintain reproducibility
        :param seed: Seed value
        :return: None
        """
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

    @staticmethod
    def getAssetData(data_path: str=ASSET_DATA_PATH) -> pd.DataFrame:
        """
        Gets raw asset (Hilpisch) data
        :param data_path: Data file path
        :return: Data
        """
        raw_data_df = pd.read_csv(data_path, index_col=0,
                               parse_dates=True).dropna()
        return raw_data_df

In [5]:
df = Helpers.getAssetData()
df.head()

Unnamed: 0_level_0,AAPL.O,MSFT.O,INTC.O,AMZN.O,GS.N,SPY,.SPX,.VIX,EUR=,XAU=,GDX,GLD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2010-01-04,30.572827,30.95,20.88,133.9,173.08,113.33,1132.99,20.04,1.4411,1120.0,47.71,109.8
2010-01-05,30.625684,30.96,20.87,134.69,176.14,113.63,1136.52,19.35,1.4368,1118.65,48.17,109.7
2010-01-06,30.138541,30.77,20.8,132.25,174.26,113.71,1137.14,19.16,1.4412,1138.5,49.34,111.51
2010-01-07,30.082827,30.452,20.6,130.0,177.67,114.19,1141.69,19.06,1.4318,1131.9,49.1,110.82
2010-01-08,30.282827,30.66,20.83,133.52,174.31,114.57,1144.98,18.13,1.4412,1136.1,49.84,111.37


#### Define the custom trading environment

In [6]:
class TradingEnv(gym.Env):
    """
    RL asset trading environment    
    """
    
    def __init__(
        self, 
        symbol: str, 
        features: List[str], 
        window: int, 
        lags: int,
        leverage: int=1, 
        min_performance: float=0.85,
        start: int=0, 
        end: int=None, 
        mu: float=None, 
        std: float=None):
        """
        Constructor
        :param symbol: Asset symbol
        :param features: Features
        :param window: Data window
        :param lag: Lag
        :param leverage: Leverage
        :param min_performance: Minimum performance
        :param start: start
        :param end: End
        :param mu: Mean
        :param std: Standard deviation
        """
        self.symbol = symbol
        self.features = features
        self.n_features = len(features)
        self.window = window
        self.lags = lags
        self.leverage = leverage
        self.min_performance = min_performance
        self.start = start
        self.end = end
        self.mu = mu
        self.std = std
        self.observation_space = gym.spaces.Box(low=-2, high=2, shape=(self.lags, self.n_features), dtype=np.float32)
        self.action_space = gym.spaces.Discrete(2)
        self.raw = Helpers.getAssetData()
        self._prepare_data() 

    def step(self, action):
        """
        Step function
        :param action: Action taken by the agent
        :return: Observations, reward, done, truncated and infos
        """
        self.correct = action == self.data_['d'].iloc[self.bar]
        ret = self.data['r'].iloc[self.bar] * self.leverage
        reward_1 = 1 if self.correct else 0
        reward_2 = abs(ret) if self.correct else -abs(ret)
        self.factor = 1 if self.correct else -1
        self.treward += reward_1
        self.bar += 1
        self.accuracy = self.treward / (self.bar - self.lags)
        self.performance *= math.exp(reward_2)
        if self.bar >= len(self.data):
            done = True
        elif reward_1 == 1:
            done = False
        elif (self.performance < self.min_performance and
              self.bar > self.lags + 5):
            done = True
        else:
            done = False
            
        state = self.getState()
        reward = reward_1 + reward_2 * 5
        terminated = False
        info = self._getInfos()
        return state.values, reward, done, terminated, info      
        

    def reset(self, seed=SEED_VALUE, options=None) -> Tuple[np.ndarray, Dict]:
        """
        Resets the RL environment
        :param seed: Seed
        :param options: Options
        """
        self.treward = 0
        self.accuracy = 0
        self.performance = 1
        self.bar = self.lags
        state = self.data_[self.features].iloc[self.bar-
                        self.lags:self.bar]        
        return state.values, {}

    def _prepare_data(self):
        """
        Prepares the asset data
        """
        self.data = pd.DataFrame(self.raw[self.symbol])
        self.data = self.data.iloc[self.start:]
        self.data['r'] = np.log(self.data / self.data.shift(1))
        self.data.dropna(inplace=True)
        self.data['s'] = self.data[self.symbol].rolling(
                                              self.window).mean() 
        self.data['m'] = self.data['r'].rolling(self.window).mean()
        self.data['v'] = self.data['r'].rolling(self.window).std()
        self.data.dropna(inplace=True)
        if self.mu is None:
            self.mu = self.data.mean()
            self.std = self.data.std()
        self.data_ = (self.data - self.mu) / self.std
        self.data_['d'] = np.where(self.data['r'] > 0, 1, 0)
        self.data_['d'] = self.data_['d'].astype(int)
        if self.end is not None:
            self.data = self.data.iloc[:self.end - self.start]
            self.data_ = self.data_.iloc[:self.end - self.start]

    def seed(self, seed):
        """
        Seed to the random number generation
        :param seed: Seed
        """
        random.seed(seed)
        np.random.seed(seed)

    def getState(self) -> np.ndarray:
        """
        Gets the RL state
        :return: State
        """
        return self.data_[self.features].iloc[self.bar -
                                self.lags:self.bar]

    def _getObservations(self) -> np.ndarray:
        """
        Gets the RL environment onservations
        :param observations: Observations
        """
        return np.array([self.features[self.current_step]], dtype=np.float32)

    def _getInfos(self) -> Dict[str, Any]:
        """
        Gets the RL infos
        :return: RL infos
        """
        info = {
            "correct": self.correct,
            "factor": self.factor,
            "accuracy": self.accuracy,
            "bar": self.bar,
            "performance": self.performance,
    
        }
        return info

    

#### Let us use a random agent to trade the asset portfolio

In [7]:
def testRLWithRandomAgent():
    """
    Test the environment with a random RL agent
    """
    columns = ["action", "state", "next_state", "reward", "done", "truncated", "info"]
    results_df = Helpers.createTable(columns=columns)
    
    n_episodes = 5
    symbol = "EUR="
    features = [symbol, "r", "s", "m", "v"]
    window = 10
    lag = 5
    env = TradingEnv(symbol,features, window, lag )
    assert env != None, "Incorrect env constructed!!"
    state, info = env.reset()    
    for i in range(n_episodes):
        action = env.action_space.sample()
        next_state, reward, done, truncated, info = env.step(action)
        new_row = pd.Series(
            {
                "action": action,
                "state": state,
                "next_state": next_state,
                "reward": reward,
                "done": done,
                "truncated": truncated,
                "info": info,
            })
        results_df = Helpers.appendTableRow(results_df, new_row)
        state = next_state
    Helpers.displayTable(results_df, n_rows=10, n_columns=len(columns))
    print(f"\n\nenv.getState():\n{env.getState()}")
    

testRLWithRandomAgent()

  return pd.concat([


Unnamed: 0,action,state,next_state,reward,done,truncated,info
0,1,"[[1.772060547322853, -1.0213535688271376, 1.8772881381524675, -0.39389674115072043, 0.0350896226...","[[1.5973158387206814, -2.4431843864073235, 1.8535858323718988, -1.0211676868251565, 0.6954352920...",-0.027283,False,False,"{'correct': False, 'factor': -1, 'accuracy': 0.0, 'bar': 6, 'performance': 0.9945583038869258}"
1,1,"[[1.5973158387206814, -2.4431843864073235, 1.8535858323718988, -1.0211676868251565, 0.6954352920...","[[1.5875585199662465, -0.12078195478970356, 1.825001029145894, -1.2423312747278847, 0.5999729990...",-0.019936,False,False,"{'correct': False, 'factor': -1, 'accuracy': 0.0, 'bar': 7, 'performance': 0.9906007067137809}"
2,0,"[[1.5875585199662465, -0.12078195478970356, 1.825001029145894, -1.2423312747278847, 0.5999729990...","[[1.6292488819170177, 0.6083036347667816, 1.8089331739163708, -0.6733641480088325, 0.61232777133...",1.019658,False,False,"{'correct': True, 'factor': 1, 'accuracy': 0.3333333333333333, 'bar': 8, 'performance': 0.994502..."
3,0,"[[1.6292488819170177, 0.6083036347667816, 1.8089331739163708, -0.6733641480088325, 0.61232777133...","[[1.6407802586268063, 0.18066536999252064, 1.7856747315399315, -0.9974713645896238, 0.3693072136...",1.03594,False,False,"{'correct': True, 'factor': 1, 'accuracy': 0.5, 'bar': 9, 'performance': 1.0016772547977972}"
4,1,"[[1.6407802586268063, 0.18066536999252064, 1.7856747315399315, -0.9974713645896238, 0.3693072136...","[[1.5724790273457543, -0.9501925693675054, 1.746614751976445, -1.714318473729268, 0.003519664432...",1.023391,False,False,"{'correct': True, 'factor': 1, 'accuracy': 0.6, 'bar': 10, 'performance': 1.0063741976315772}"


None


env.getState():
                EUR=         r         s         m         v
Date                                                        
2010-01-26  1.572479 -0.950193  1.746615 -1.714318  0.003520
2010-01-27  1.522805 -0.689571  1.704270 -1.868586 -0.011352
2010-01-28  1.474019 -0.679698  1.655623 -2.158711 -0.122578
2010-01-29  1.385316 -1.257311  1.598808 -2.541134 -0.144125
2010-02-01  1.442973  0.847393  1.558417 -1.792761  0.118917


In [8]:
symbol = "EUR="
features = [symbol, "r", "s", "m", "v"]
window = 10
lag = 5
env = TradingEnv(symbol,features, window, lag )
# model = DQN('MlpPolicy', env, verbose=1)
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=int(2e5), progress_bar=True)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 182      |
|    ep_rew_mean     | 84.5     |
| time/              |          |
|    fps             | 288      |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 651          |
|    ep_rew_mean          | 325          |
| time/                   |              |
|    fps                  | 237          |
|    iterations           | 2            |
|    time_elapsed         | 17           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0077628917 |
|    clip_fraction        | 0.0385       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.687       |
|    explained_variance   | -0.0141      |
|    learning_r

<stable_baselines3.ppo.ppo.PPO at 0x23ee7b807d0>

In [10]:
model_file_path = "ppo_asset_trader_rl"
model.save(model_file_path)
model_2 = PPO.load(model_file_path, env=env)
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [11]:
mean_reward

np.float64(1481.349875)

#### Define the Deep Q-Network

In [None]:
class DQN(nn.Module):
    """
    Specification for the DQN network
    """
    def __init__(self, input_dim, output_dim):
        """
        Constructor
        :param input_dim: Input dimension
        :param output_dim: Output dimension
        """
        super(DQN, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, output_dim)
        )

    def forward(self, x):
        """
        Neural net forward pass
        :param x: Input
        """
        return self.fc(x)

#### RL DQN Agent

In [None]:
class DQNAgent:
    """
    DQN RL agent
    """
    def __init__(
        self, 
        state_dim: int, 
        action_dim: int):
        """
        Cnstructor
        :param state_dim: State dimension
        :param action_dim: Action dimension
        """
        self.model = DQN(state_dim, action_dim)
        self.target_model = DQN(state_dim, action_dim)
        self.optimizer = optim.Adam(self.model.parameters(), lr=LEARNING_RATE)
        self.criterion = nn.MSELoss()
        self.memory = deque(maxlen=REPLAY_EXPERIENCE_MEMORY_SIZE)
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.batch_size = 64

    def remember(self, state, action, reward, next_state, done):
        """
        Records the RL experience trajectories
        :param state: State
        :param action: Action
        :param reward: Reward
        :param next_state: Next state
        :param done: Done
        """
        self.memory.append((state, action, reward, next_state, done))

    def act(
        self, 
        state: np.ndarray
    ) -> int:
        """
        Invokes the agents 'act'
        :param state: State
        :return: action
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(3)
        state_tensor = torch.FloatTensor(state)
        with torch.no_grad():
            q_values = self.model(state_tensor)
        return torch.argmax(q_values).item()

    def replay(self):
        """
        Experience replay
        """
        if len(self.memory) < self.batch_size:
            return

        minibatch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, done in minibatch:
            state_tensor = torch.FloatTensor(state)
            next_state_tensor = torch.FloatTensor(next_state)
            target = reward
            if not done:
                target += self.gamma * torch.max(self.target_model(next_state_tensor)).item()

            target_f = self.model(state_tensor)
            target_f = target_f.clone().detach()
            target_f[0, action] = target

            self.model.train()
            output = self.model(state_tensor)
            loss = self.criterion(output, target_f)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


    def update_target_model(self):
        """
        Updates the network model
        """
        self.target_model.load_state_dict(self.model.state_dict())

#### Train the Agent

In [None]:
class TrainAgent:
    """
    Component used to train the RL agent
    """
    def __init__(
        self, 
        price_train_data: np.ndarray,
        n_episodes: int=TRAINING_EPISODES_COUNT
    ):
        """
        Constructor
        :param price_train_data: Train data
        :param n_episodes: Number of episodes for RL validation
        """
        self.n_episodes = n_episodes
        self.price_train_data = price_train_data
        self.train_env = TradingEnv(self.price_train_data)
        self.agent = DQNAgent(state_dim=self.price_train_data.shape[1], action_dim=3)
        self.rewards = []
        self.rewards_average = []

    def run(self):
        """
        Runs the RL agent training cycle
        """
        for episodes in tqdm(range(self.n_episodes), desc="Episodes"):
            state, _ = self.train_env.reset()
            total_reward = 0
            done = False

            for _ in range(len(self.price_train_data)):
                action = self.agent.act(state)
                next_state, reward, done, _, _ = self.train_env.step(action)
                self.agent.remember(state, action, reward, next_state, done)
                self.agent.replay()
                state = next_state
                total_reward += reward
                self.rewards.append(total_reward)
        
                if done: 
                    average_reward = sum(self.rewards[-TRAINING_AVERAGE_ROLLING_WINDOW:]) / TRAINING_AVERAGE_ROLLING_WINDOW
                    self.rewards_average.append(average_reward)
                    break
        
            self.agent.update_target_model()
            
            #print(f"Epoch {e+1}/{epochs}, Total Profit: {total_reward:.4f}, Epsilon: {self.agent.epsilon:.4f}")

        self.saveModel()

    def saveModel(self):
        """
        Save the neural net model
        :return: None
        """
        if os.path.exists(MODEL_FOLDER):
            torch.save(self.agent.model, MODEL_FILE_PATH)
            rewards_df = pd.DataFrame(self.rewards, columns=["rewards"])
            rewards_average_df = pd.DataFrame(self.rewards_average, columns=["average_rewards"])
            rewards_df.to_csv(TRAIN_REWARDS_FILE_PATH, index=False)
            rewards_average_df.to_csv(TRAIN_AVERAGE_REWARDS_FILE_PATH, index=False)
        else:
            print(f"The folder {MODEL_FOLDER} does not exist and the model could not be saved!!")

    def loadModel(self) -> DQN:
        """
        Loads the neural net model
        :return: Loaded model
        """
        model = None
        if os.path.exist(MODEL_FILE_PATH):
            model = torch.load(MODEL_FILE_PATH)
        else:
            print(f"The path {MODEL_FILE_PATH} does not exist and the model could not be loaded!!")
        return model
    
    

#### Run the RL training cycle..

#### Validate/Test the Agent

In [None]:
Helpers.setSeeds()
# train_agent = TrainAgent(price_train_data=close_data_train[:50])
train_agent = TrainAgent(price_train_data=close_data_train)
train_agent.run()

In [None]:
class ValidateAgent:
    """
    Component used to validate/test the RL agent
    """
    def __init__(
        self, 
        price_test_data: np.ndarray,
        n_episodes: int=TEST_EPISODES_COUNT
    ):
        """
        Constructor
        :param price_test_data: Test data
        :param n_episodes: Number of episodes for RL validation/test
        """
        self.n_episodes = n_episodes
        self.price_test_data = price_test_data
        self.test_env = TradingEnv(self.price_train_data)
        self.agent = DQNAgent(state_dim=1, action_dim=3)
        self.reward = []

    def run(self):
        """
        Runs the RL agent validation/test cycle
        """
        self.agent.epsilon = 0.0  # Turn off exploration
        for episodes in range(self.n_episodes):
            state, _ = self.test_env.reset()
            total_reward = 0
            done = False
            while not done:
                action = self.agent.act(state)
                next_state, reward, done, _, _ = self.test_env.step(action)
                state = next_state
                total_reward += reward

            self.rewards.append(total_reward)
            print(f"Epoch {e+1}/{epochs}, Total Profit: {total_reward:.4f}")
        
        

#### Report RL performance

In [None]:
class ReportRLPerformance:
    """
    Component used to report the RL performance
    """
    def __init__(
        self,
        train_rewards: List[float],
        train_rewards_average: List[float]
    ):
        """
        Constructor
        :param train_rewards: Training rewards
        :param train_rewards_average: Validation rewards
        """
        self.train_rewards = train_rewards
        self.train_rewards_average = train_rewards_average

    def plotRewardCurves(
        self,
        rewards: List[float],
        plot_type: str
    ):
        """
        Plots the reward curves
        :param rewards: Rewards
        :param plot_type: Plot type
        :return: None
        """
        plt.plot(rewards)
        plt.xlabel("Episode")
        plt.ylabel("Total Profit")
        plt.title(f"{plot_type} Reward Progress")
        plt.grid(True)
        plt.show()

    def plotTrainingRewardCurves(
        self,
        plot_type: str = "Training"
    ):
        """
        Plots the training reward curves
        :param plot_type: Plot type
        :return: None
        """
        self.plotRewardCurves(self.train_rewards, plot_type=plot_type)

    def plotsmoothedTrainingRewardCurves(
        self,
        plot_type: str = "Smoothed Training"
    ):
        """
        Plots the training reward curves
        :param plot_type: Plot type
        :return: None
        """
        self.plotRewardCurves(self.train_rewards_average, plot_type=plot_type)
        

In [None]:
reporter = ReportRLPerformance(train_rewards = train_agent.rewards, train_rewards_average=train_agent.rewards_average)
reporter.plotTrainingRewardCurves()
reporter.plotsmoothedTrainingRewardCurves()

In [None]:
def foo():
    trewards = []
    for _ in range(10):
        
        treward = _ + 1
        trewards.append(treward)
        print(trewards)


foo()

In [None]:
t = list(range(100))
t[-20:]

In [None]:
df_1 = pd.DataFrame({"prices": range(100)})
df_1.rolling(10).sum().dropna()