# Import data

In [1]:
import pandas as pd

# Small dataset: time series of stock values, loaded from CSV
df = pd.read_csv("data/dataset_full.csv")
# Show the label and the first two rows in a single call
print("Dataset Head:", df.head(2), sep="\n")


Dataset Head:
         Date     BTC-USD         SPY        XLB        XLE        XLF  \
0  2015-10-09  243.931000  170.126617  36.063808  45.904068  15.627900   
1  2015-10-12  245.307999  170.287201  35.753407  45.311443  15.641348   

         XLI        XLK        XLP       XLRE        XLU        XLV  \
0  44.584923  36.932663  37.893860  21.381659  31.748842  57.945053   
1  44.593281  36.977180  37.993462  21.516359  32.031609  58.097439   

         XLY    VIX  
0  69.639816  17.08  
1  69.971359  16.17  


# Environment Setup, Continuous case

Env setup, reward, state_space definition

In [None]:
import numpy as np
import pandas as pd
from portfolio_env import PortfolioEnv

# reward function
def reward_log_return(env):
    """
    Log-return reward: R = ln(V_t / V_{t-1}).

    Uses the last two entries of env.history['portfolio_value'];
    returns 0.0 while fewer than two values have been recorded.
    """
    values = env.history['portfolio_value']
    if len(values) >= 2:
        previous, current = values[-2], values[-1]
        return np.log(current / previous)
    return 0.0

# state function (enables setting the form of the states we want)
def state_fn(env):
    """
    State = [Price Returns (window), Current Weights]

    Reads env.df, env.asset_names, env.current_step, env.window_size and
    env.weights. Simple returns (p_t - p_{t-1}) / p_{t-1} are computed over
    the price rows ending at env.current_step.

    When fewer than env.window_size past rows are available (early steps),
    the missing leading return rows are filled with zeros so the state keeps
    a fixed length of window_size * n_assets + len(env.weights).

    Returns:
        1-D numpy array: flattened returns followed by the current weights.
    """
    # Clamp at the first row of the dataframe on early steps.
    start_idx = max(env.current_step - env.window_size, 0)

    # window_size + 1 price rows yield window_size return rows.
    raw_window = env.df.iloc[start_idx : env.current_step + 1][env.asset_names].values

    returns = np.diff(raw_window, axis=0) / raw_window[:-1]

    # Zero-pad at the front so a short history does not shrink the state.
    missing_rows = env.window_size - returns.shape[0]
    if missing_rows > 0:
        padding = np.zeros((missing_rows, returns.shape[1]), dtype=returns.dtype)
        returns = np.concatenate([padding, returns], axis=0)

    # Flatten the matrix
    # If we use a CNN in the future, we might keep the 2D shape.
    flat_returns = returns.flatten()

    return np.concatenate([flat_returns, env.weights])

# dummy policy for test
def uniform_policy(obs, n_actions=6):
    """
    Dummy policy: ignore the observation and give the same weight to each action.

    Args:
        obs: Current observation (unused).
        n_actions: Length of the returned action vector. Defaults to 6, the
            previously hard-coded size; pass the environment's action
            dimension (e.g. env.action_space.shape[0]) to match it.

    Returns:
        numpy array of ones with shape (n_actions,).
    """
    return np.ones(n_actions)


# Build the environment with a continuous state space and action space
env = PortfolioEnv(
    df=df,
    state_fn=state_fn,
    reward_fn=reward_log_return,
    window_size=5,     # number of lags (past time steps) passed to the observation function
    initial_amount=1,  # initial amount of cash; doesn't really matter for the RL algo
)

# Report both spaces, one per line
print(
    f"Action Space: {env.action_space}",
    f"State Space: {env.state_space}",
    sep="\n",
)

Action Space: Box(-1.0, 1.0, (14,), float32)
State Space: Box(-inf, inf, (79,), float32)


Continuous test: Running a random policy

In [26]:
# Start a new episode; the options below are optional overrides.
obs, _ = env.reset(
    options={
        # 'start_date': '2021-01-04',  # optionally pass the starting date
        # 'episode_length': 126,       # optionally pass the episode length
    }
)

# A Gymnasium episode ends when it is either terminated or truncated.
# Checking only `terminated` would keep stepping after a truncation.
terminated = truncated = False
while not (terminated or truncated):
    # 1. Get random action
    action = env.action_space.sample()

    # 2. Step
    next_obs, reward, terminated, truncated, info = env.step(action)

    # Periodic progress log.
    # NOTE(review): with a Box action space `action` is a vector, not an index.
    if env.current_step % 250 == 0:
        print(f"Step: {env.current_step}, Value: {env.portfolio_value:.2f}, Action Index: {action}")

Step: 250, Value: 13103.76, Action Index: 110371
Step: 500, Value: 18221.25, Action Index: 677561
Step: 750, Value: 22226.74, Action Index: 628539
Step: 1000, Value: 24669.34, Action Index: 1372584
Step: 1250, Value: 37278.85, Action Index: 705054
Step: 1500, Value: 47976.74, Action Index: 1048617
Step: 1750, Value: 39879.13, Action Index: 1506336
Step: 2000, Value: 45631.09, Action Index: 1520775
Step: 2250, Value: 60133.61, Action Index: 912525


# Environment Setup, Discrete case

Env setup, reward, state_space definition

The discrete case is not particularly well suited for this task, which explains why Q-Learning / SARSA are rarely used in finance. 

Indeed, the number of possible actions grows exponentially with the number of stocks: (n_bins)**(n_stocks). 
With 10 bins and 10 stocks, that's 10**10 = 1e10 possible actions. Way too big for any DNN to learn.


A workaround to limit this is to change the action space in the following way: for each stock, we consider selling, buying or keeping a fixed portion of it (e.g. 5%). With this workaround we have 3**(n_stocks) actions.

For 10 stocks, that's 3**10 ≈ 5.9e4 possible actions (still a lot, but roughly 1e5 times fewer). The price to pay is that portfolio weights can only be changed less precisely and less quickly.

We will likely have to start by considering a small number of stocks to study the performances and we can then iterate to a larger set of stocks if the computational power / quantity of data allows.

In [None]:
import numpy as np
import pandas as pd
import gymnasium as gym

#reward function
def reward_log_return(env):
    """R = ln(V_t / V_{t-1}); 0.0 until two portfolio values are recorded."""
    series = env.history['portfolio_value']
    has_previous = len(series) >= 2
    if not has_previous:
        return 0.0
    # Ratio of the latest portfolio value to the one before it
    growth = series[-1] / series[-2]
    return np.log(growth)

# state function (enables setting the form of the states we want)
def state_fn(env):
    """
    State = [Price Returns (window), Current Weights]

    Reads env.df, env.asset_names, env.current_step, env.window_size and
    env.weights, and computes simple returns (p_t - p_{t-1}) / p_{t-1} over
    the price rows ending at env.current_step.

    If the history is shorter than env.window_size (early steps), the
    missing leading return rows are zero-filled, so the result always has
    window_size * n_assets + len(env.weights) entries.

    Returns:
        1-D numpy array: flattened returns followed by the current weights.
    """
    # Do not reach before the first row of the dataframe.
    start_idx = max(env.current_step - env.window_size, 0)

    # One extra price row is needed: N + 1 prices give N returns.
    raw_window = env.df.iloc[start_idx : env.current_step + 1][env.asset_names].values

    returns = np.diff(raw_window, axis=0) / raw_window[:-1]

    # Keep the state length constant by padding absent rows with zeros.
    n_missing = env.window_size - returns.shape[0]
    if n_missing > 0:
        zero_rows = np.zeros((n_missing, returns.shape[1]), dtype=returns.dtype)
        returns = np.concatenate([zero_rows, returns], axis=0)

    # Flatten the matrix
    # If we use a CNN in the future, we might keep the 2D shape.
    flat_returns = returns.flatten()

    state = np.concatenate([flat_returns, env.weights])

    return state



# Build the discrete environment
env = PortfolioEnv(
    df=df,
    state_fn=state_fn,
    reward_fn=reward_log_return,
    window_size=5,
    initial_amount=10000,

    # -------------------- Discrete settings ---------------------------
    state_space_type='Discrete',   # state is mapped to bins
    action_space_type='Discrete',  # action is an integer index

    # More bins capture smaller return movements, at a considerably
    # higher compute/memory cost.
    n_bins=10,

    # Binning limits for returns: from -20% (low) up to +20% (high).
    state_space_lim=(-0.2, +0.2),

    # How much the weights are shifted per action (e.g. 5%).
    step_size=0.05,
)

# The action count is 3**(len(list(df.columns))-1)
print(
    f"Action Space: {env.action_space}",
    f"State Space: {env.state_space}",
    sep="\n",
)



Discrete Mode: Created 1594323 unique portfolio shift actions.
Action Space: Discrete(1594323)
State Space: MultiDiscrete([10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10])


Discrete test: Running a random policy

In [40]:
# Start a new episode and show the initial binned observation.
obs, _ = env.reset()
print(f"\nInitial Discrete State (Bins): {obs}")

# A Gymnasium episode ends when it is either terminated or truncated.
# Checking only `terminated` would keep stepping after a truncation.
terminated = truncated = False
while not (terminated or truncated):
    # 1. Get Discrete Action (Integer)
    action = env.action_space.sample()

    # 2. Step
    next_obs, reward, terminated, truncated, info = env.step(action)

    # Periodic progress log
    if env.current_step % 250 == 0:
        print(f"Step: {env.current_step}, Value: {env.portfolio_value:.2f}, Action Index: {action}")


Initial Discrete State (Bins): [5 5 4 4 5 5 5 5 5 5 5 5 3 5 4 4 4 4 4 4 4 4 4 4 4 7 5 4 5 5 4 4 4 4 5 4 4
 4 5 5 5 5 5 5 5 5 5 4 5 5 5 2 5 5 5 5 5 4 5 5 5 5 5 5 3 5 5 5 5 5 5 5 5 5
 5 5 5 5 9]
Step: 250, Value: 10718.11, Action Index: 678621
Step: 500, Value: 14038.59, Action Index: 1257877
Step: 750, Value: 19197.87, Action Index: 37322
Step: 1000, Value: 22460.21, Action Index: 567157
Step: 1250, Value: 38565.53, Action Index: 1512738
Step: 1500, Value: 56328.80, Action Index: 324618
Step: 1750, Value: 48801.01, Action Index: 1576784
Step: 2000, Value: 53061.93, Action Index: 507774
Step: 2250, Value: 69143.06, Action Index: 478076
