## Training

### Normalizing the Dataset
We'll use the MinMaxScaler from scikit-learn to normalize the features. Here's how we can normalize the dataset while ensuring that the 'Open Time' and 'Close Time' columns are not included in the normalization process.

In [1]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Function to normalize features
def normalize_features(df, columns=None):
    """Min-max scale the selected feature columns of ``df``.

    A fresh ``MinMaxScaler`` is fitted on the selected columns and the
    columns are overwritten with the scaled values. Columns that are not
    selected (for example 'Open Time' and 'Close Time') are left untouched.

    Args:
        df: DataFrame holding every column named in ``columns``. It is
            modified in place.
        columns: Optional iterable of column names to scale. When omitted,
            the price, volume and indicator columns listed below are used.

    Returns:
        The same ``df`` object that was passed in, with the selected
        columns replaced by their scaled values.

    Raises:
        KeyError: If any requested column is absent from ``df``.
    """
    if columns is None:
        # Default feature set: prices, volumes and technical indicators.
        columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Quote Asset Volume',
                   'Taker Buy Base Asset Volume', 'Taker Buy Quote Asset Volume',
                   'Mark_Price', 'VWAP', 'EMA12', 'EMA26', 'MACD', 'RSI', 'ROC', 'OBV',
                   'ATR', 'Mark_Price_Lag1', 'Mark_Price_Lag2']
    columns_to_normalize = list(columns)

    # Report every missing column at once instead of failing deep inside pandas.
    missing = [name for name in columns_to_normalize if name not in df.columns]
    if missing:
        raise KeyError(f'Columns missing from DataFrame: {missing}')

    # NOTE(review): the scaler is fitted on the whole frame and then discarded,
    # so the scaling cannot be inverted or re-applied to new data, and the
    # min/max are taken over every row that is passed in.
    scaler = MinMaxScaler()
    df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])
    return df

# Load our historical data from the CSV file in the working directory
df = pd.read_csv('MarkPricebinance_data_for_model_training.csv')
# normalize_features overwrites the feature columns and returns the same frame
df = normalize_features(df)

# Verify normalization: as the last expression of the cell, this displays
# the first rows of the normalized frame
df.head()


Unnamed: 0,Open Time,Open,High,Low,Close,Volume,Close Time,Quote Asset Volume,Number of Trades,Taker Buy Base Asset Volume,...,RSI,ROC,OBV,ATR,Mark_Price_Lag1,Mark_Price_Lag2,Day_of_Week,Hour_of_Day,Is_Weekend,Is_Holiday
0,2017-08-17 04:13:00,0.020177,0.020177,0.020383,0.020357,0.0,2017-08-17 04:13:59.999,0.0,0,0.0,...,0.5,0.435868,0.000261,0.001594,0.020357,0.020357,3,4,0,0
1,2017-08-17 04:14:00,0.020177,0.020177,0.020383,0.020357,0.0,2017-08-17 04:14:59.999,0.0,0,0.0,...,0.5,0.426372,0.000261,0.001594,0.020357,0.020357,3,4,0,0
2,2017-08-17 04:15:00,0.020177,0.020177,0.020383,0.020357,0.0,2017-08-17 04:15:59.999,0.0,0,0.0,...,0.5,0.435868,0.000261,0.001594,0.020357,0.020357,3,4,0,0
3,2017-08-17 04:16:00,0.020177,0.020177,0.020383,0.020357,0.0,2017-08-17 04:16:59.999,0.0,0,0.0,...,0.0,0.435868,0.000261,0.000797,0.020357,0.020357,3,4,0,0
4,2017-08-17 04:17:00,0.020177,0.020225,0.020383,0.020405,1.3e-05,2017-08-17 04:17:59.999,2e-06,2,2.1e-05,...,1.0,0.437568,0.000261,0.000142,0.020357,0.020357,3,4,0,0


### The Trading Environment
Now, we will incorporate the normalized dataset into the custom trading environment. This environment will simulate trading actions and provide rewards based on the agent's performance.

In [6]:
import gym
from gym import spaces
import numpy as np

class TradingEnv(gym.Env):
    """Minimal long-only trading environment backed by a price DataFrame.

    Every step exposes one row of ``df`` as the observation: all columns
    except 'Open Time' and 'Close Time', cast to float32. The 'Close'
    column is used as the trade price.

    Actions:
        0 -- hold: do nothing.
        1 -- buy: open a long position at the current close if none is open.
        2 -- sell: close the open long position, if there is one.

    The reward is non-zero only on the step that closes a position. A
    position that is still open when the data runs out is never settled.
    No action opens a short position.

    The class follows the classic Gym API: ``reset()`` returns only the
    observation and ``step()`` returns ``(obs, reward, done, info)``.

    Args:
        df: DataFrame with 'Open Time', 'Close Time' and 'Close' columns;
            every remaining column must be convertible to float32.
        initial_balance: Balance assigned on construction and on reset.
        leverage: Multiplier applied to the price difference of a trade.
        transaction_fee: Fraction of the absolute profit deducted as a fee.

    Raises:
        ValueError: If ``df`` has no rows.
        KeyError: If a required column is missing from ``df``.
    """

    def __init__(self, df, initial_balance=10000, leverage=1.0, transaction_fee=0.001):
        super().__init__()
        if len(df) == 0:
            raise ValueError('df must contain at least one row')

        self.df = df
        self.initial_balance = initial_balance
        self.leverage = leverage
        self.transaction_fee = transaction_fee
        self.current_step = 0
        self.balance = initial_balance
        self.position = None  # None (flat) or 'long'
        self.entry_price = 0
        self.done = False

        # Convert the frame once up front. Slicing a row out of the DataFrame
        # on every step is far slower than indexing a NumPy matrix.
        self._features = df.drop(columns=['Open Time', 'Close Time']).to_numpy(dtype=np.float32)
        self._close = df['Close'].to_numpy()
        self._last_step = len(df) - 1

        # Action space: 0 = Hold, 1 = Buy, 2 = Sell
        self.action_space = spaces.Discrete(3)

        # Observation space: one value per column, excluding 'Open Time' and 'Close Time'
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self._features.shape[1],), dtype=np.float32
        )

    def _get_observation(self):
        """Return the float32 feature vector for the current step."""
        # Clamp so that a finished episode keeps returning the final row
        # instead of raising IndexError.
        row = min(self.current_step, self._last_step)
        # Copy so callers cannot corrupt the cached matrix by editing the result.
        return self._features[row].copy()

    def reset(self):
        """Restore the initial balance, clear any position and rewind to row 0.

        Returns:
            The observation for the first row.
        """
        self.balance = self.initial_balance
        self.position = None
        self.entry_price = 0
        self.current_step = 0
        self.done = False
        return self._get_observation()

    def step(self, action):
        """Apply ``action`` at the current close price and advance one row.

        Args:
            action: 0 (hold), 1 (buy) or 2 (sell).

        Returns:
            Tuple ``(observation, reward, done, info)``; ``info`` is always
            an empty dict.
        """
        current_price = self._close[min(self.current_step, self._last_step)]
        reward = 0

        if action == 0:  # Hold
            pass
        elif action == 1:  # Buy
            if self.position is None:
                self.position = 'long'
                self.entry_price = current_price
        elif action == 2:  # Sell
            if self.position == 'long':
                profit = (current_price - self.entry_price) * self.leverage
                # NOTE(review): the fee is charged on the absolute profit, not
                # on the traded notional -- confirm this is the intended model.
                reward = profit - (self.transaction_fee * abs(profit))
                self.balance += reward
                self.position = None

        # Never move past the last row, so stepping a finished episode
        # (or a single-row frame) cannot index out of bounds.
        self.current_step = min(self.current_step + 1, self._last_step)
        self.done = self.current_step >= self._last_step

        return self._get_observation(), reward, self.done, {}

    def render(self, mode='human'):
        """Print the current step index and account balance."""
        print(f'Step: {self.current_step}, Balance: {self.balance}')

# Create the environment with the normalized data
env = TradingEnv(df)

# Test the environment: drive it with randomly sampled actions and print the
# step counter and balance after each one
obs = env.reset()
for _ in range(10):  # Example run for 10 steps
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    env.render()
    # Stop early if the data runs out before 10 steps
    if done:
        break


Step: 1, Balance: 10000
Step: 2, Balance: 10000
Step: 3, Balance: 10000
Step: 4, Balance: 10000.0
Step: 5, Balance: 10000.0
Step: 6, Balance: 10000.0
Step: 7, Balance: 10000.0
Step: 8, Balance: 10000.0
Step: 9, Balance: 10000.0
Step: 10, Balance: 10000.0


### Training the RL Model
With the environment set up, we can now train an RL agent using Stable Baselines3 with the normalized dataset.

In [9]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

In [14]:
# Create a vectorized environment; the lambda hands back the already
# constructed `env` instead of building a new one
vec_env = DummyVecEnv([lambda: env])

# Define the PPO model with an MLP policy; verbose=1 logs training progress
model = PPO('MlpPolicy', vec_env, verbose=1)

# Train the agent
model.learn(total_timesteps=900000)  # we can adjust timesteps as needed

# Save the model so the evaluation cell can reload it with PPO.load
model.save("ppo_trading_model")



Using cpu device
-----------------------------
| time/              |      |
|    fps             | 855  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 633         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.006121232 |
|    clip_fraction        | 0.032       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.1        |
|    explained_variance   | -0.957      |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00856    |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00649    |
|    value_loss           | 0.00572     |
-----------------------------------------
-----------------

### Testing and Evaluation
Finally, let's test the trained agent on the environment and evaluate its performance.

In [None]:
# Load the trained model
model = PPO.load("ppo_trading_model")

# Test the agent on a fresh episode
obs = env.reset()
done = False
total_reward = 0

# Plain-int action ids, in the order the agent chose them
actions_taken = []

while not done:
    action, _states = model.predict(obs, deterministic=True)
    # predict() returns the action as a NumPy array; convert it to a plain
    # int so the recorded history prints as numbers rather than array objects.
    action = int(action)
    actions_taken.append(action)
    obs, reward, done, info = env.step(action)
    total_reward += reward
    env.render()

print(f'Total Reward: {total_reward}')
print(f'Actions Taken: {actions_taken}')


Step: 1, Balance: 10000
Step: 2, Balance: 10000
Step: 3, Balance: 10000
Step: 4, Balance: 10000
Step: 5, Balance: 10000
Step: 6, Balance: 10000
Step: 7, Balance: 10000
Step: 8, Balance: 10000
Step: 9, Balance: 10000
Step: 10, Balance: 10000
Step: 11, Balance: 10000
Step: 12, Balance: 10000
Step: 13, Balance: 10000
Step: 14, Balance: 10000
Step: 15, Balance: 10000
Step: 16, Balance: 10000
Step: 17, Balance: 10000
Step: 18, Balance: 10000
Step: 19, Balance: 10000
Step: 20, Balance: 10000
Step: 21, Balance: 10000
Step: 22, Balance: 10000
Step: 23, Balance: 10000
Step: 24, Balance: 10000
Step: 25, Balance: 10000
Step: 26, Balance: 10000
Step: 27, Balance: 10000
Step: 28, Balance: 10000
Step: 29, Balance: 10000
Step: 30, Balance: 10000
Step: 31, Balance: 10000
Step: 32, Balance: 10000
Step: 33, Balance: 10000
Step: 34, Balance: 10000
Step: 35, Balance: 10000
Step: 36, Balance: 10000
Step: 37, Balance: 10000
Step: 38, Balance: 10000
Step: 39, Balance: 10000
Step: 40, Balance: 10000
Step: 41,