In [7]:
import os
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import pandas as pd
from gym import spaces
import csv


# Set random seeds for reproducibility.
# One shared seed value is applied to NumPy's global RNG and to PyTorch's RNG.
# Python's built-in `random` module is not seeded in this file.
auto_seed = 42
np.random.seed(auto_seed)
torch.manual_seed(auto_seed)


# =============================================================================
# 1. Trading Environment Including Standardization, Sentiment Score, Forecast
# =============================================================================
class TradingEnv(gym.Env):
    """
    A custom long-only, single-share trading environment that includes:
      - sentiment score,
      - forecasted price,
      - standard ATR stop-loss,
      - z-score standardization of features.

    Expected CSV columns (``Close``, ``Sentiment_Score`` and
    ``Predicted_Next_Close`` are renamed on load to the names below):
      - Date, High, Low, Price (close), MA5, MA20, RSI, MACD,
        sentiment_score, Forecasted_Price

    State vector (length = n_features + 1):
      [z(Price), z(MA5), z(MA20), z(RSI), z(MACD),
       z(sentiment_score), z(Forecasted_Price), Position]
    where Position is 0 or 1.

    Actions:
      0 = Hold, 1 = Buy, 2 = Sell

    Reward:
      Change in equity (balance + shares * close) from the row the action
      was chosen on to the next row, where the action is executed.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self,
                 data_path="/Users/User/Desktop/DIA/FinancialTradingBot/Merged_Data_For_RL/"
                           "AAPL_stock_data_2022-01-01_to_2024-12-31.csv",
                 initial_balance=10000,
                 stop_loss_multiplier=2,
                 window=5):
        """
        Load the price data and prepare spaces, statistics and initial state.

        Args:
            data_path: CSV file containing the columns listed on the class.
            initial_balance: Cash available at the start of every episode.
            stop_loss_multiplier: Number of ATRs below the entry price at
                which an open position is force-sold.
            window: Maximum number of most recent rows used for the ATR.
        """
        super().__init__()

        # 1) Load data and parse dates
        self.data = pd.read_csv(data_path)
        self.data.rename(columns={
            "Close": "Price",
            "Sentiment_Score": "sentiment_score",
            "Predicted_Next_Close": "Forecasted_Price"
        }, inplace=True)
        self.data["Date"] = pd.to_datetime(self.data["Date"])
        self.num_data = len(self.data)

        # 2) Portfolio params
        self.initial_balance = initial_balance
        self.stop_loss_multiplier = stop_loss_multiplier
        self.window = window

        # 3) Compute z-score stats for features
        feature_cols = [
            "Price", "MA5", "MA20",
            "RSI", "MACD",
            "sentiment_score", "Forecasted_Price"
        ]
        self.feature_cols = feature_cols
        self.feature_mean = self.data[feature_cols].mean()
        feature_std = self.data[feature_cols].std()
        # A constant column has std == 0 (and a one-row file has std == NaN);
        # dividing by that would fill every observation with NaN/inf. Using 1
        # instead maps such a column to a constant z-score of 0.
        self.feature_std = feature_std.where(feature_std > 0, 1.0)

        # 4) Action & Observation spaces
        self.action_space = spaces.Discrete(3)  # 0=Hold,1=Buy,2=Sell
        state_dim = len(feature_cols) + 1       # +1 for Position
        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(state_dim,),
            dtype=np.float32
        )

        # 5) Initialize state
        self.reset()

    def reset(self):
        """
        Resets portfolio and time pointer; returns first observation.
        """
        self.balance = self.initial_balance
        self.shares = 0
        self.position = 0
        self.entry_price = 0.0
        self.current_step = 0
        self.done = False
        self.equity_history = [self.initial_balance]
        return self._get_state()

    def _get_state(self):
        """Return the float32 observation for the row at ``current_step``."""
        row = self.data.iloc[self.current_step]
        vals = (row[self.feature_cols] - self.feature_mean) / self.feature_std
        state = np.concatenate([vals.values, [self.position]])
        return state.astype(np.float32)

    def _close_position(self, price):
        """Sell every held share at ``price`` and clear the position fields."""
        self.balance += price * self.shares
        self.shares = 0
        self.position = 0
        self.entry_price = 0.0

    def _average_true_range(self):
        """
        Mean true range over up to ``window`` rows ending at ``current_step``.

        Row 0 is never included because its true range needs a previous
        close. Returns 0.0 when the window contains no rows.
        """
        last = self.current_step
        first = max(1, last - self.window + 1)
        if first > last:
            return 0.0
        high = self.data.loc[first:last, "High"].to_numpy(dtype=float)
        low = self.data.loc[first:last, "Low"].to_numpy(dtype=float)
        prev_close = self.data.loc[first - 1:last - 1, "Price"].to_numpy(dtype=float)
        true_range = np.maximum(
            high - low,
            np.maximum(np.abs(high - prev_close), np.abs(low - prev_close))
        )
        return float(true_range.mean())

    def step(self, action):
        """
        Advance one row, execute ``action`` at the new row's close, then
        apply the ATR stop-loss.

        Args:
            action: 0 = Hold, 1 = Buy, 2 = Sell. A Buy while already holding,
                a Buy that cannot be afforded, or a Sell with no shares is
                treated as Hold.

        Returns:
            (observation, reward, done, info) where ``info["executed_action"]``
            is "Hold", "Buy" or "Sell" (a stop-loss exit is reported as
            "Sell").

        Raises:
            RuntimeError: If called after the episode has ended.
        """
        if self.done:
            raise RuntimeError("Episode has ended. Please reset().")

        executed_action = "Hold"  # Default

        prev_close = self.data.loc[self.current_step, "Price"]
        prev_equity = self.balance + self.shares * prev_close

        # advance timestep
        self.current_step += 1
        if self.current_step >= self.num_data - 1:
            self.done = True

        current_close = self.data.loc[self.current_step, "Price"]

        # Execute action (at most one share is ever held)
        if action == 1 and self.shares == 0 and self.balance >= current_close:
            self.shares = 1
            self.position = 1
            self.entry_price = current_close
            self.balance -= current_close
            executed_action = "Buy"
        elif action == 2 and self.shares > 0:
            self._close_position(current_close)
            executed_action = "Sell"

        # ATR stop-loss: force-sell once the close drops more than
        # stop_loss_multiplier * ATR below the entry price.
        if self.shares > 0:
            atr = self._average_true_range()
            if current_close < self.entry_price - self.stop_loss_multiplier * atr:
                self._close_position(current_close)
                executed_action = "Sell"

        # Reward
        equity = self.balance + self.shares * current_close
        reward = equity - prev_equity
        self.equity_history.append(equity)

        return self._get_state(), reward, self.done, {"executed_action": executed_action}

    def render(self, action=None, mode='human'):
        """
        Prints the current trading date, action taken, price and portfolio state.
        """
        row = self.data.iloc[self.current_step]
        date_str = row["Date"].strftime("%Y-%m-%d")
        price = row["Price"]
        equity = self.balance + self.shares * price
        action_str = {0: 'Hold', 1: 'Buy', 2: 'Sell'}.get(action, 'N/A')

        print(f"Date: {date_str},  "
              f"Action: {action_str},  "
              f"Price: {price:.2f},  "
              f"Balance: {self.balance:.2f},  "
              f"Shares: {self.shares},  "
              f"Equity: {equity:.2f},  "
              f"Position: {self.position}")




# =============================================================================
# 2. PPO Actor-Critic Network
# =============================================================================
class ActorCritic(nn.Module):
    """
    Actor-critic network with one shared hidden layer feeding two heads.

    The actor head outputs a probability distribution over ``action_dim``
    actions (softmax over the last dimension); the critic head outputs a
    single state-value estimate.
    """

    def __init__(self, input_dim, action_dim, hidden_dim=64):
        super().__init__()

        # Shared feature extractor used by both heads.
        trunk_layers = [nn.Linear(input_dim, hidden_dim), nn.ReLU()]
        self.shared = nn.Sequential(*trunk_layers)

        # Policy head: hidden features -> action probabilities.
        policy_layers = [
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
            nn.Softmax(dim=-1),
        ]
        self.actor = nn.Sequential(*policy_layers)

        # Value head: hidden features -> scalar value.
        value_layers = [
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        ]
        self.critic = nn.Sequential(*value_layers)

    def forward(self, x):
        """Return ``(action_probabilities, state_value)`` for input ``x``."""
        features = self.shared(x)
        action_probs = self.actor(features)
        state_value = self.critic(features)
        return action_probs, state_value




# =============================================================================
# 3. PPO Agent
# =============================================================================
class PPOAgent:
    """
    PPO agent using a clipped surrogate objective and GAE advantages.

    Wraps an ``ActorCritic`` model and an Adam optimizer, and exposes
    action sampling (``select_action``), advantage estimation
    (``compute_gae``) and the policy/value update (``update``).
    """

    def __init__(self, input_dim, action_dim,
                 hidden_dim=64, lr=3e-4,
                 gamma=0.99, lam=0.95,
                 clip_epsilon=0.2,
                 update_epochs=10,
                 batch_size=64):
        """
        Args:
            input_dim: Length of the observation vector.
            action_dim: Number of discrete actions.
            hidden_dim: Width of the network's hidden layers.
            lr: Adam learning rate.
            gamma: Discount factor.
            lam: GAE lambda.
            clip_epsilon: Clipping range for the probability ratio.
            update_epochs: Passes over the trajectory per ``update`` call.
            batch_size: Minibatch size used inside each pass.
        """
        self.gamma = gamma
        self.lam = lam
        self.clip_epsilon = clip_epsilon
        self.update_epochs = update_epochs
        self.batch_size = batch_size

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = ActorCritic(input_dim, action_dim, hidden_dim).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)

    def select_action(self, state):
        """
        Sample an action from the current policy.

        Returns:
            (action, log_prob, value) as plain Python numbers.
        """
        s = torch.as_tensor(state, dtype=torch.float32, device=self.device)
        # Only scalar copies leave this method, so no autograd graph is needed.
        with torch.no_grad():
            probs, val = self.model(s)
        dist = torch.distributions.Categorical(probs)
        a = dist.sample()
        return a.item(), dist.log_prob(a).item(), val.item()

    def compute_gae(self, rewards, values, dones):
        """
        Compute Generalized Advantage Estimation for one trajectory.

        Args:
            rewards: Per-step rewards.
            values: Per-step value estimates. If one extra trailing entry is
                supplied it is used as the bootstrap value after the last
                step; otherwise the bootstrap value is 0.
            dones: Per-step termination flags (1 ends the episode and cuts
                both bootstrapping and advantage accumulation).

        Returns:
            (advantages, returns) as lists with one entry per reward.
        """
        n = len(rewards)
        advantages = [0.0] * n
        next_value = values[n] if len(values) > n else 0
        gae = 0
        # Walk backwards filling by index: O(n), unlike repeated insert(0, ...).
        for t in reversed(range(n)):
            delta = rewards[t] + self.gamma * (1 - dones[t]) * next_value - values[t]
            gae = delta + self.gamma * self.lam * (1 - dones[t]) * gae
            advantages[t] = gae
            next_value = values[t]
        returns = [adv + v for adv, v in zip(advantages, values)]
        return advantages, returns

    def update(self, traj):
        """
        Run ``update_epochs`` passes of minibatch PPO updates on a trajectory.

        Args:
            traj: Dict with equally long lists under the keys 'states',
                'actions', 'log_probs', 'rewards', 'dones' and 'values'.
                An empty trajectory is a no-op.
        """
        rewards = traj['rewards']
        if len(rewards) == 0:
            return

        # Stack through NumPy first: building a tensor straight from a list of
        # ndarrays converts element by element and is very slow.
        states = torch.as_tensor(
            np.asarray(traj['states'], dtype=np.float32), device=self.device)
        actions = torch.as_tensor(
            np.asarray(traj['actions'], dtype=np.int64), device=self.device)
        old_logprobs = torch.as_tensor(
            np.asarray(traj['log_probs'], dtype=np.float32), device=self.device)

        advs, rets = self.compute_gae(rewards, traj['values'], traj['dones'])
        advs = torch.as_tensor(np.asarray(advs, dtype=np.float32), device=self.device)
        rets = torch.as_tensor(np.asarray(rets, dtype=np.float32), device=self.device)
        # The sample std of a single value is NaN, which would turn the loss
        # and then every weight into NaN; only normalize with >= 2 samples.
        if advs.numel() > 1:
            advs = (advs - advs.mean()) / (advs.std() + 1e-8)

        dataset_size = states.size(0)
        for _ in range(self.update_epochs):
            perm = torch.randperm(dataset_size).to(self.device)
            for i in range(0, dataset_size, self.batch_size):
                idx = perm[i:i + self.batch_size]
                bs, ba, blp, ba_adv, br = (
                    states[idx], actions[idx], old_logprobs[idx], advs[idx], rets[idx]
                )
                probs, vals = self.model(bs)
                dist = torch.distributions.Categorical(probs)
                lp = dist.log_prob(ba)
                ent = dist.entropy().mean()

                # Clipped surrogate objective.
                ratio = torch.exp(lp - blp)
                s1 = ratio * ba_adv
                s2 = torch.clamp(ratio, 1 - self.clip_epsilon, 1 + self.clip_epsilon) * ba_adv
                actor_loss = -torch.min(s1, s2).mean()
                critic_loss = nn.functional.mse_loss(vals.squeeze(-1), br)
                loss = actor_loss + 0.5 * critic_loss - 0.01 * ent

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()




# =============================================================================
# 4. PPO Training Loop
# =============================================================================
def train_ppo(env, agent, num_episodes=500, rollout_length=None):
    """
    Train ``agent`` on ``env``: one rollout followed by one update per episode.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning
            ``(obs, reward, done, info)``, and ``num_data``.
        agent: Agent exposing ``select_action(state)`` returning
            ``(action, log_prob, value)`` and ``update(trajectory)``.
        num_episodes: Number of episodes to run.
        rollout_length: Maximum steps per episode; a falsy value means
            ``env.num_data``.

    Returns:
        List with the total reward collected in each episode.
    """
    max_steps = rollout_length or env.num_data
    buffer_keys = ('states', 'actions', 'log_probs', 'rewards', 'dones', 'values')
    episode_returns = []

    for episode in range(1, num_episodes + 1):
        observation = env.reset()
        rollout = {key: [] for key in buffer_keys}
        episode_return = 0

        for _ in range(max_steps):
            action, log_prob, value = agent.select_action(observation)
            next_observation, reward, finished, _info = env.step(action)

            # Store the transition under the observation the action was chosen on.
            transition = (observation, action, log_prob, reward, float(finished), value)
            for key, item in zip(buffer_keys, transition):
                rollout[key].append(item)

            episode_return += reward
            observation = next_observation
            if finished:
                break

        agent.update(rollout)
        episode_returns.append(episode_return)

        # Progress report every 50 episodes.
        if episode % 50 == 0:
            print(f"Episode {episode}/{num_episodes} — Reward {episode_return:.2f}")

    return episode_returns




# =============================================================================
# 5. Performance Metrics
# =============================================================================
def compute_performance_metrics(equity_history, risk_free_rate=0.0):
    """
    Summarize an equity curve as ROI, annualized Sharpe ratio and max drawdown.

    Args:
        equity_history: Sequence of portfolio equity values, oldest first.
        risk_free_rate: Rate subtracted from the mean per-step return before
            annualizing. It is applied as-is, so it must be expressed per
            step, not per year.

    Returns:
        (roi, sharpe, max_drawdown):
          - roi: (last - first) / first.
          - sharpe: (mean - risk_free_rate) / std of per-step returns, scaled
            by sqrt(252); 0.0 when the std is not positive.
          - max_drawdown: largest fractional drop from a running peak.
        A single-point history has no returns and yields (0.0, 0.0, 0.0).

    Raises:
        IndexError: If ``equity_history`` is empty.
    """
    equity = np.asarray(equity_history, dtype=float)
    if equity.size == 0:
        # Same exception type the original indexing raised, with a message.
        raise IndexError("equity_history is empty")
    if equity.size == 1:
        # No returns exist; skip mean/std of an empty array, which would emit
        # RuntimeWarnings and NaN intermediates.
        return 0.0, 0.0, 0.0

    roi = (equity[-1] - equity[0]) / equity[0]

    step_returns = np.diff(equity) / equity[:-1]
    mu, sigma = step_returns.mean(), step_returns.std()
    sharpe = ((mu - risk_free_rate) / sigma * np.sqrt(252)) if sigma > 0 else 0.0

    # Drawdown at each point is measured against the highest equity seen so far.
    running_peak = np.maximum.accumulate(equity)
    max_drawdown = ((running_peak - equity) / running_peak).max()

    return roi, sharpe, max_drawdown




# =============================================================================
# 6. Main: Train, Eval, and Plot with date printing
# =============================================================================
# Script entry point: for every CSV in `folder_path`, train a fresh PPO agent
# on it, run one evaluation episode on the same data, write a per-step log to
# `logs/ppo_trading_log_<symbol>.csv`, and print ROI / Sharpe / max drawdown.
if __name__ == "__main__":
    folder_path = "/Users/wongyule/Documents/Designing Intelligent Agents/FinancialTradingBot/Merged_Data_For_RL"  # CHANGE THIS to your directory
    output_folder = "logs"  # log output folder
    os.makedirs(output_folder, exist_ok=True)

    # Files are processed in whatever order os.listdir returns them.
    for filename in os.listdir(folder_path):
        if not filename.endswith(".csv"):
            continue

        symbol = filename.split("_")[0]  # e.g., AAPL_stock_data_... → AAPL
        file_path = os.path.join(folder_path, filename)

        print(f"\n🚀 Running PPO on: {symbol}")

        # Create environment and agent
        env = TradingEnv(data_path=file_path)
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        agent = PPOAgent(input_dim=state_dim, action_dim=action_dim)

        # Train PPO
        # The returned per-episode rewards are not used further in this block.
        rewards = train_ppo(env, agent, num_episodes=500, rollout_length=env.num_data)

        # Evaluate and collect logs
        # The evaluation episode runs on the same CSV the agent was trained on,
        # and actions are sampled from the policy (select_action samples from
        # the categorical distribution), so evaluation is stochastic.
        state, done = env.reset(), False
        log_rows = []

        while not done:
            action, log_prob, value = agent.select_action(state)
            # Snapshot taken BEFORE stepping: the row the decision is based on.
            obs_row = env.data.iloc[env.current_step]
            price = obs_row["Price"]
            equity = env.balance + env.shares * price
            feature_vals = (obs_row[env.feature_cols] - env.feature_mean) / env.feature_std

            # step() advances to the next row and executes the trade at that
            # row's close, so the fill price is not the `price` logged below.
            state, _, done, info = env.step(action)

            # NOTE(review): each row mixes two points in time. Date, z_*
            # features, Price, Forecasted_Price and Equity are pre-step values;
            # Action, Balance, Shares and Position are post-step values.
            # Confirm this is the intended log layout.
            log_rows.append({
                "Date": obs_row["Date"].strftime("%Y-%m-%d"),
                **{f"z_{col}": feature_vals[col] for col in env.feature_cols},
                "Action": info["executed_action"],
                "Price": price,
                "Forecasted_Price": obs_row.get("Forecasted_Price", np.nan),
                "Balance": env.balance,
                "Shares": env.shares,
                "Equity": equity,
                "Position": env.position
            })


        # Save log
        # Column order follows the key order of the first logged row.
        log_csv = os.path.join(output_folder, f"ppo_trading_log_{symbol}.csv")
        with open(log_csv, mode='w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=log_rows[0].keys())
            writer.writeheader()
            writer.writerows(log_rows)

        # Print metrics
        # env.equity_history was reinitialized by the reset() above, so it
        # holds the evaluation episode only.
        roi, sharpe, maxdd = compute_performance_metrics(env.equity_history)
        print(f"📈 {symbol} — ROI: {roi*100:.2f}%, Sharpe: {sharpe:.2f}, Max Drawdown: {maxdd*100:.2f}%")





🚀 Running PPO on: MSFT
Episode 50/500 — Reward 48.43
Episode 100/500 — Reward 271.03
Episode 150/500 — Reward 380.53
Episode 200/500 — Reward 406.31
Episode 250/500 — Reward 415.74
Episode 300/500 — Reward 473.73
Episode 350/500 — Reward 512.18
Episode 400/500 — Reward 432.30
Episode 450/500 — Reward 598.44
Episode 500/500 — Reward 526.99
📈 MSFT — ROI: 5.82%, Sharpe: 3.28, Max Drawdown: 0.32%

🚀 Running PPO on: VZ
Episode 50/500 — Reward 16.11
Episode 100/500 — Reward 39.43
Episode 150/500 — Reward 39.91
Episode 200/500 — Reward 46.38
Episode 250/500 — Reward 54.91
Episode 300/500 — Reward 50.99
Episode 350/500 — Reward 53.43
Episode 400/500 — Reward 66.49
Episode 450/500 — Reward 63.83
Episode 500/500 — Reward 68.17
📈 VZ — ROI: 0.70%, Sharpe: 4.00, Max Drawdown: 0.04%

🚀 Running PPO on: KO
Episode 50/500 — Reward 9.60
Episode 100/500 — Reward 31.30
Episode 150/500 — Reward 35.00
Episode 200/500 — Reward 38.67
Episode 250/500 — Reward 41.46
Episode 300/500 — Reward 55.00
Episode 350/5