In [84]:
import numpy as np

class QLearningAgent_mine:
    def __init__(self, data, states, actions, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.data = data
        
        self.states = states
        self.actions = actions
        self.pseudo_actions = []
        
        self.num_states = len(states)
        self.num_actions = len(actions)
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.Q = np.zeros((self.num_states, self.num_actions))

    def choose_action(self, state):
        if np.random.uniform(0, 1) < self.epsilon:
            return np.random.choice(self.actions)
        else:
            return self.actions[np.argmax(self.Q[self.states.index(state)])]

    def update_q_table(self, state, action, reward, next_state):
        self.Q[self.states.index(state), self.actions.index(action)] += self.alpha * (
                reward + self.gamma * np.max(self.Q[self.states.index(next_state)]) -
                self.Q[self.states.index(state), self.actions.index(action)])

    def train(self, num_episodes):
        for episode in range(num_episodes):
            state = np.random.choice(self.states)
            #print(state)

            if state == 'long':
                self.pseudo_actions = ['sell', 'hold']
            elif state == 'short':
                self.pseudo_actions = ['buy', 'hold']

            i = 0
            total_reward = 0

            while True:
                price_open = self.data['close'].iloc[i] # position open price

                for j in range(i+1, len(self.data['close'])):
                    action = None
                    while not action in self.pseudo_actions:
                        action = self.choose_action(state)

                    #print(action)

                    if action == 'hold':
                        price_close = self.data['close'].iloc[j]
                        reward = self.get_reward(state, action, price_open, price_close)  # Define your reward function
                        
                        self.update_q_table(state, action, reward, state)
                        
                        continue
                    else:
                        price_close = self.data['close'].iloc[j]

                        # Simulate environment and get reward and next state
                        # Update state based on your financial data
                        reward = self.get_reward(state, action, price_open, price_close)  # Define your reward function
                        
                        if reward >= 0:
                            next_state = state
                        elif reward < 0:
                            next_idx = 1 if self.states.index(state) == 0 else 0
                            next_state = self.states[next_idx]
                            
                        self.update_q_table(state, action, reward, next_state)
                        state = next_state
                i += 1

                total_reward += reward
                
                # Check for terminal state
                if i == len(self.data['close'])-1:
                    break

            print("Episode {}: Total Reward = {}".format(episode + 1, total_reward))

    def get_reward(self, state, action, price_open, price_close):

        if state == 'long' and action == 'sell':
            return 2 if price_close > price_open else -2
        elif state == 'short' and action == 'buy':
            return 2 if price_close < price_open else -2
        elif state == 'long' and action == 'hold':
            return 1 if price_close > price_open else -1
        elif state == 'short' and action == 'hold':
            return 1 if price_close < price_open else -1
        else:
            return 0

In [85]:
import pandas as pd

data = pd.read_csv('data_sample.csv')
data = data.iloc[:500]

In [86]:
# Example usage
states = ['long', 'short']
actions = ['buy', 'sell', 'hold']
agent = QLearningAgent_mine(data, states, actions)
agent.train(num_episodes=50)

Episode 1: Total Reward = 447
Episode 2: Total Reward = -209
Episode 3: Total Reward = 447
Episode 4: Total Reward = -243
Episode 5: Total Reward = -217
Episode 6: Total Reward = -218
Episode 7: Total Reward = -224
Episode 8: Total Reward = -254
Episode 9: Total Reward = -232
Episode 10: Total Reward = 439
Episode 11: Total Reward = 445
Episode 12: Total Reward = 446
Episode 13: Total Reward = 456
Episode 14: Total Reward = 453
Episode 15: Total Reward = -226
Episode 16: Total Reward = 448
Episode 17: Total Reward = 442
Episode 18: Total Reward = -246
Episode 19: Total Reward = -257
Episode 20: Total Reward = -253
Episode 21: Total Reward = 483
Episode 22: Total Reward = -231
Episode 23: Total Reward = 479
Episode 24: Total Reward = -255
Episode 25: Total Reward = 479
Episode 26: Total Reward = 476
Episode 27: Total Reward = -254
Episode 28: Total Reward = -257
Episode 29: Total Reward = 481
Episode 30: Total Reward = 482
Episode 31: Total Reward = -243
Episode 32: Total Reward = 483
E

In [87]:
agent.Q

array([[ 5.6410695 ,  6.2348898 ,  6.20885497],
       [10.96925115,  9.87232604,  9.08574894]])

In [67]:
import pickle

# Specify the file path where you want to save the pickled Q-matrix
file_path = 'q_matrix.pkl'

# Pickle the Q-matrix and save it to a file
with open(file_path, 'wb') as f:
    pickle.dump(agent.Q, f)