In [11]:
import pandas as pd

data = pd.read_csv('data_sample_one_year.csv')

data = data.iloc[:, [0,3]]
data.rename(columns={'high':'close'}, inplace=True)

from talib import RSI
data['rsi'] = RSI(data['close'])

data.dropna(axis=0)

Unnamed: 0,timestamp,close,rsi
14,1675864800,1661.40,43.483899
15,1675868400,1651.57,35.387550
16,1675879200,1641.08,29.150096
17,1675882800,1642.42,30.827374
18,1675886400,1636.22,27.574649
...,...,...,...
8092,1707336000,2418.66,79.801442
8093,1707339600,2424.86,81.156058
8094,1707343200,2421.03,77.689871
8095,1707346800,2421.81,77.896916


In [2]:
states_trade = {'offtrade': 0, 'long': 1, 'short': 2}
states_relative = {'neutral': 0, 'positive': 1, 'negative': 2}
states_global = {'neutral': 0, 'profit': 1, 'loss': 2}

In [3]:
import itertools

state_space = []
for combo in itertools.product(states_trade.values(), states_relative.values(), states_global.values()):
    state_space.append(combo)

In [4]:
actions = {'hold': 0, 'buy': 1, 'sell': 2}
action_space = [i for i in actions.values()]

In [8]:
import numpy as np

class MySarsaAgent:
    def __init__(self, data, states, actions, alpha=0.3, gamma=0.7, epsilon=0.3):
        self.data = data
        
        self.states = states
        self.actions = actions
        self.pseudo_actions = []
        
        self.num_states = len(states)
        self.num_actions = len(actions)
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.Q = np.zeros((self.num_states, self.num_actions))

    def choose_action(self, state):
        if np.random.uniform(0, 1) < self.epsilon:
            return np.random.choice(self.actions)
        else:
            return self.actions[np.argmax(self.Q[self.states.index(state)])]

    def update_q_table(self, state, action, reward, next_state, next_action):
        # Get current Q-value
        current_q_value = self.Q[state, action]
        # Get next state's Q-value for the next action
        next_q_value = self.Q[next_state, next_action]
        # Calculate TD error
        td_error = reward + self.gamma * next_q_value - current_q_value
        # Update Q-value using SARSA update rule
        self.Q[state, action] += self.alpha * td_error

    def train(self, num_episodes):
        for episode in range(num_episodes):
            total_reward = 0
            state = [0, 0, 0]

            for i in range(1, len(self.data)):
                price_open = self.data['close'].iloc[i] # position open price
                price_recent = self.data['close'].iloc[i-1] - price_open

                if price_recent > 0:
                    state[1] = 1
                elif price_recent < 0:
                    state[1] = 2
                else:
                    state[1] = 0

                if state[0] == 0:
                    self.pseudo_actions = [1, 2]
                elif state[0] == 1:
                    self.pseudo_actions = [0, 2]
                elif state[0] == 2:
                    self.pseudo_actions = [0, 1]

                j = i+1
                while j < len(self.data):
                    action = None
                    while not action in self.pseudo_actions:
                        action = self.choose_action(tuple(state))

                    if action == 1:
                        price_delta = self.data['close'].iloc[j] - price_open
                        price_recent = self.data['close'].iloc[j] - self.data['close'].iloc[j-1]
                        
                        reward = self.get_reward(tuple(state), action)

                        if price_delta > 0:
                            next_state = [ state[0], state[1], 1]
                        elif price_delta < 0:
                            next_state = [ state[0], state[1], 2]
                        else:
                            next_state = [ state[0], state[1], 0]

                        if price_recent > 0:
                            next_state = [ state[0], 1, next_state[2]]
                        elif price_recent < 0:
                            next_state = [ state[0], 2, next_state[2]]
                        else:
                            next_state = [ state[0], 0, next_state[2]]

                        if state[0] == 0:
                            next_state = [ 1, next_state[1], next_state[2]]
                        elif state[0] == 2:
                            next_state = [ 0, next_state[1], next_state[2]]

                        self.update_q_table(tuple(state), action, reward, tuple(next_state), 0)
                        state = next_state

                    elif action == 2:
                        price_delta = self.data['close'].iloc[j] - price_open
                        price_recent = self.data['close'].iloc[j] - self.data['close'].iloc[j-1]

                        reward = self.get_reward(tuple(state), action)

                        if price_delta > 0:
                            next_state = [ state[0], state[1], 1]
                        elif price_delta < 0:
                            next_state = [ state[0], state[1], 2]
                        else:
                            next_state = [ state[0], state[1], 0]

                        if price_recent > 0:
                            next_state = [ state[0], 1, next_state[2]]
                        elif price_recent < 0:
                            next_state = [ state[0], 2, next_state[2]]
                        else:
                            next_state = [ state[0], 0, next_state[2]]

                        if state[0] == 0:
                            next_state = [ 2, next_state[1], next_state[2]]
                        elif state[0] == 1:
                            next_state = [ 0, next_state[1], next_state[2]]

                        self.update_q_table(tuple(state), action, reward, tuple(next_state), 0)
                        state = next_state

                    elif action == 0:
                        price_delta = self.data['close'].iloc[j] - price_open
                        price_recent = self.data['close'].iloc[j] - self.data['close'].iloc[j-1]

                        reward = self.get_reward(tuple(state), action)

                        if price_delta > 0:
                            next_state = [ state[0], state[1], 1]
                        elif price_delta < 0:
                            next_state = [ state[0], state[1], 2]
                        else:
                            next_state = [ state[0], state[1], 0]

                        if price_recent > 0:
                            next_state = [ state[0], 1, next_state[2]]
                        elif price_recent < 0:
                            next_state = [ state[0], 2, next_state[2]]
                        else:
                            next_state = [ state[0], 0, next_state[2]]

                        if state[0] == 1:
                            next_state = [ state[0], next_state[1], next_state[2]]
                        elif state[0] == 2:
                            next_state = [ state[0], next_state[1], next_state[2]]

                        self.update_q_table(tuple(state), action, reward, tuple(next_state), np.random.choice([1, 2]))
                        state = next_state

                    j += 1
                    
                    # Check for terminal state
                    if state[0] == 0:
                        break

                total_reward += reward

            print("Episode {}: Total Reward = {}".format(episode + 1, total_reward))
            # ---------------------------------------------------
    
    def get_reward(self, state, action):
        
        reward = 0
        
        if state[0] == 0 and state[2] == 1:
            reward += 5
        if state[0] == 0 and state[2] == 2:
            reward += -5
        
        if state[0] == 1 and state[1] == 1:
            reward += 1
        if state[0] == 1 and state[1] == 2:
            reward += -1
        if state[0] == 1 and state[2] == 1 and action == 2:
            reward += 7
        if state[0] == 1 and state[2] == 2 and action == 0:
            reward += 1
        if state[0] == 1 and state[2] == 2 and action == 2:
            reward += -3
        
        if state[0] == 2 and state[1] == 1:
            reward += -1
        if state[0] == 2 and state[1] == 2:
            reward += 1
        if state[0] == 2 and state[2] == 1 and action == 1:
            reward += -3
        if state[0] == 2 and state[2] == 2 and action == 1:
            reward += 7
        if state[0] == 2 and state[2] == 1 and action == 1:
            reward += -3

        return reward

In [9]:
agent = MySarsaAgent(data, state_space, action_space)
agent.train(num_episodes=5)

Episode 1: Total Reward = 9950
Episode 2: Total Reward = 9451
Episode 3: Total Reward = 8979
Episode 4: Total Reward = 9256
Episode 5: Total Reward = 8676


In [10]:
agent.Q

array([[0.        , 4.25865349, 5.85766286],
       [3.21844415, 2.13111633, 6.56371406],
       [2.03152356, 1.62081512, 3.33245233],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.

# Fifth Trial