In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the SPY price history; the first CSV column is used as the index and
# parsed as dates.
df = pd.read_csv(
    r"SPY.csv",
    index_col=0,
    parse_dates=True,
)
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,112.370003,113.389999,111.510002,113.330002,92.246048,118944600
2010-01-05,113.260002,113.68,112.849998,113.629997,92.490204,111579900
2010-01-06,113.519997,113.989998,113.43,113.709999,92.555328,116074400
2010-01-07,113.5,114.330002,113.18,114.190002,92.94606,131091100
2010-01-08,113.889999,114.620003,113.660004,114.57,93.255348,126402800


In [3]:
# make features

# Look-back lengths (in rows) of the two simple moving averages of Close.
SLOW_WINDOW = 33
FAST_WINDOW = 16

# With pandas' default min_periods, the first (window - 1) rows of each
# rolling mean are NaN.
df['SlowSMA'] = df['Close'].rolling(SLOW_WINDOW).mean()
df['FastSMA'] = df['Close'].rolling(FAST_WINDOW).mean()

# Columns that make up the state vector, in (fast, slow) order.
feats = ['FastSMA', 'SlowSMA']

In [4]:
# Row-over-row log return of Close: log(Close[t]) - log(Close[t-1]).
# The first row has no predecessor and is therefore NaN.
df['LogReturn'] = np.log(df['Close']).diff(periods=1)

In [5]:
# split into train and test

Ntest = 1000
# Positional hold-out: everything except the last Ntest rows is used for
# training, the last Ntest rows for testing. Copies keep the two frames
# independent of `df`.
train_data = df.head(-Ntest).copy()
test_data = df.tail(Ntest).copy()

In [6]:
# Env Class
class Env:
    """Minimal single-asset trading environment backed by a DataFrame.

    Each row of the frame is one time step. The state at a step is the row's
    feature columns; the reward for a step is the row's ``LogReturn`` when the
    environment is invested after applying the action, and 0 otherwise.

    Actions: 0 = BUY (become invested), 1 = SELL (become flat); any other
    value (2 = HOLD by convention) leaves the position unchanged.
    """

    def __init__(self, df, feature_cols=None):
        """Build the environment from a price/feature frame.

        Args:
            df: DataFrame containing the feature columns and a ``LogReturn``
                column.
            feature_cols: list of column names used as the state vector. When
                omitted, the notebook-level ``feats`` list is used, so
                existing ``Env(df)`` calls behave as before.
        """
        self.df = df  # keep the frame as an instance variable
        self.n = len(df)
        self.current_idx = 0
        self.action_space = [0, 1, 2]  # BUY, SELL, HOLD
        self.invested = 0

        if feature_cols is None:
            # Backward-compatible fallback to the module-level feature list.
            feature_cols = feats
        self.feature_cols = feature_cols

        self.states = self.df[feature_cols].to_numpy()
        self.rewards = self.df['LogReturn'].to_numpy()
        self.total_buy_and_hold = 0

    def reset(self):
        """Rewind to the first row, go flat, zero the baseline; return the initial state."""
        self.current_idx = 0
        self.invested = 0
        self.total_buy_and_hold = 0
        return self.states[self.current_idx]  # initial state

    def step(self, action):
        """Apply ``action`` and advance one row.

        Returns:
            (next_state, reward, done) where ``done`` is True once the last
            row has been reached.

        Raises:
            RuntimeError: if called after the last row was already reached.
                The environment is left unchanged in that case.
        """
        next_idx = self.current_idx + 1
        if next_idx >= self.n:
            # Check before mutating so a stray call cannot push the index
            # past the end of the data.
            raise RuntimeError("Exception already done")
        self.current_idx = next_idx

        if action == 0:  # BUY
            self.invested = 1
        elif action == 1:  # SELL
            self.invested = 0

        # Reward: the new row's log return if a position is held, else nothing.
        if self.invested:
            reward = self.rewards[self.current_idx]
        else:
            reward = 0

        # State transition.
        next_state = self.states[self.current_idx]

        # Baseline: what always being invested would have accumulated.
        self.total_buy_and_hold += self.rewards[self.current_idx]

        # Done flag.
        done = (self.current_idx == self.n - 1)
        return next_state, reward, done

In [7]:
class Agent:
    """Rule-based moving-average crossover trader.

    Remembers whether it currently holds a position and only emits BUY or
    SELL when the fast/slow relationship contradicts that position.
    """

    def __init__(self):
        self.is_invested = False

    def act(self, state):
        """Pick an action for a two-element state ``(fast, slow)``.

        Returns 0 (buy) when flat and fast > slow, 1 (sell) when invested and
        fast < slow, and 2 (hold) in every other case. That includes NaN
        inputs, for which both comparisons are false.
        """
        assert len(state) == 2
        fast = state[0]
        slow = state[1]

        if self.is_invested:
            # Already long: leave only once the fast average drops below the slow one.
            if fast < slow:
                self.is_invested = False
                return 1  # Sell
        elif fast > slow:
            # Flat: enter once the fast average rises above the slow one.
            self.is_invested = True
            return 0  # Buy

        return 2

In [8]:
def play_one_episode(agent, env):
    """Run ``agent`` through ``env`` from reset until the env reports done.

    The agent's ``is_invested`` flag is cleared before the first action so
    that the same agent object can be reused across episodes.

    Returns:
        The sum of the rewards returned by every ``env.step`` call.
    """
    observation = env.reset()
    agent.is_invested = False
    cumulative_reward = 0

    while True:
        chosen_action = agent.act(observation)
        observation, step_reward, finished = env.step(chosen_action)
        cumulative_reward += step_reward
        if finished:
            return cumulative_reward

In [9]:
# One environment per split; each Env instance tracks its own row index,
# position flag and buy-and-hold total.
train_env = Env(train_data)
test_env = Env(test_data)

In [10]:
# A single crossover agent is reused for both the train and the test episode.
agent = Agent()

In [11]:
# Total reward (sum of per-step rewards) collected by the agent on the training split.
train_reward = play_one_episode(agent, train_env)

In [12]:
# Same agent on the held-out split; play_one_episode clears agent.is_invested
# first, so no position carries over from the training run.
test_reward = play_one_episode(agent, test_env)

In [13]:
# Training split: strategy total next to the buy-and-hold total accumulated by the env.
train_reward, train_env.total_buy_and_hold

(0.43459304796456966, 0.5970866514889392)

In [14]:
# Test split: strategy total next to the buy-and-hold total accumulated by the env.
test_reward, test_env.total_buy_and_hold

(0.08889132894199214, 0.19307543946998518)