In [1]:
import numpy as np

class QLearningAgent:
    def __init__(self, num_states, num_actions, learning_rate=0.1, discount_factor=0.9, exploration_rate=0.1):
        self.num_states = num_states
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.q_table = np.zeros((num_states, num_actions))

    def select_action(self, state):
        # Epsilon-greedy policy for action selection
        if np.random.rand() < self.exploration_rate:
            # Explore: select a random action
            return np.random.choice(self.num_actions)
        else:
            # Exploit: select action with highest Q-value for current state
            return np.argmax(self.q_table[state])

    def update_q_table(self, state, action, reward, next_state):
        # Q-value update using Q-learning equation
        best_next_action = np.argmax(self.q_table[next_state])
        td_target = reward + self.discount_factor * self.q_table[next_state, best_next_action]
        td_error = td_target - self.q_table[state, action]
        self.q_table[state, action] += self.learning_rate * td_error

In [2]:
# Example usage:
# Define environment parameters
num_states = 3
num_actions = 2

# Create Q-learning agent
agent = QLearningAgent(num_states, num_actions)

# Simulate episodes
num_episodes = 1000
for episode in range(num_episodes):
    # Reset environment and initialize state
    state = 0
    done = False
    total_reward = 0
    
    while not done:
        # Select action based on current state
        action = agent.select_action(state)
        
        # Simulate environment (transition to next state and receive reward)
        if state == 0 and action == 1:
            next_state = 1
            reward = 1
        elif state == 1 and action == 0:
            next_state = 0
            reward = 2
        elif state == 1 and action == 1:
            next_state = 2
            reward = 3
        else:
            next_state = state
            reward = 0
        
        # Update Q-table based on observed transition
        agent.update_q_table(state, action, reward, next_state)
        
        # Update current state
        state = next_state
        
        # Accumulate total reward
        total_reward += reward
        
        # Check if episode is done
        if state == num_states - 1:
            done = True
    
    # Print total reward for episode
    print("Episode {}: Total Reward = {}".format(episode + 1, total_reward))

Episode 1: Total Reward = 70
Episode 2: Total Reward = 79
Episode 3: Total Reward = 19
Episode 4: Total Reward = 22
Episode 5: Total Reward = 124
Episode 6: Total Reward = 31
Episode 7: Total Reward = 40
Episode 8: Total Reward = 265
Episode 9: Total Reward = 31
Episode 10: Total Reward = 16
Episode 11: Total Reward = 118
Episode 12: Total Reward = 85
Episode 13: Total Reward = 22
Episode 14: Total Reward = 31
Episode 15: Total Reward = 103
Episode 16: Total Reward = 52
Episode 17: Total Reward = 49
Episode 18: Total Reward = 148
Episode 19: Total Reward = 109
Episode 20: Total Reward = 7
Episode 21: Total Reward = 10
Episode 22: Total Reward = 10
Episode 23: Total Reward = 127
Episode 24: Total Reward = 100
Episode 25: Total Reward = 88
Episode 26: Total Reward = 31
Episode 27: Total Reward = 94
Episode 28: Total Reward = 28
Episode 29: Total Reward = 55
Episode 30: Total Reward = 28
Episode 31: Total Reward = 55
Episode 32: Total Reward = 64
Episode 33: Total Reward = 19
Episode 34: 

In [35]:
import numpy as np

class QLearningAgent_mine:
    def __init__(self, data, states, actions, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.data = data
        
        self.states = states
        self.actions = actions
        self.pseudo_actions = []
        
        self.num_states = len(states)
        self.num_actions = len(actions)
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.Q = np.zeros((self.num_states, self.num_actions))

    def choose_action(self, state):
        if np.random.uniform(0, 1) < self.epsilon:
            return np.random.choice(self.actions)
        else:
            return self.actions[np.argmax(self.Q[self.states.index(state)])]

    def update_q_table(self, state, action, reward, next_state):
        self.Q[self.states.index(state), self.actions.index(action)] += self.alpha * (
                reward + self.gamma * np.max(self.Q[self.states.index(next_state)]) -
                self.Q[self.states.index(state), self.actions.index(action)])

    def train(self, num_episodes):
        for episode in range(num_episodes):
            #state = np.random.choice(self.states)
            state = self.states[2] # begin 'neutral'
            i = 0
            total_reward = 0

            while True:
                action = self.choose_action(state) # strategy applied
                price_open = self.data['close'].iloc[i] # position open price

                for j in range(i, len(self.data['close'])):
                    price_close = self.data['close'].iloc[j]
                    # Simulate environment and get reward and next state
                    # Update state based on your financial data
                    reward = self.get_reward(state, action, price_open, price_close)  # Define your reward function
                    if reward > 0:
                        next_state = self.states[0]
                    elif reward < 0:
                        next_state = self.states[1]
                    else:
                        next_state = self.states[2]
                    self.update_q_table(state, action, reward, next_state)
                    state = next_state
                i += 1

                total_reward += reward
                
                # Check for terminal state
                if i == len(self.data['close'])-1:
                    break

            print("Episode {}: Total Reward = {}".format(episode + 1, total_reward))

    def get_reward(self, state, action, price_open, price_close):

        if action == 'buy' and price_close > price_open:
            return 1
        elif action == 'buy' and price_close < price_open:
            return -1
        elif action == 'sell' and price_open > price_close:
            return 1
        elif action == 'sell' and price_open < price_close:
            return -1
        else:
            return 0

In [36]:
import pandas as pd

data = pd.read_csv('data_sample.csv')
data = data.iloc[:100]

In [37]:
# Example usage
states = ['long', 'short', 'await']
actions = ['buy', 'sell', 'hold']
agent = QLearningAgent_mine(data, states, actions)
agent.train(num_episodes=100)

Episode 1: Total Reward = 34
Episode 2: Total Reward = 53
Episode 3: Total Reward = 25
Episode 4: Total Reward = 43
Episode 5: Total Reward = 10
Episode 6: Total Reward = 21
Episode 7: Total Reward = 39
Episode 8: Total Reward = 28
Episode 9: Total Reward = 40
Episode 10: Total Reward = -12
Episode 11: Total Reward = -4
Episode 12: Total Reward = 41
Episode 13: Total Reward = 49
Episode 14: Total Reward = 18
Episode 15: Total Reward = 32
Episode 16: Total Reward = 31
Episode 17: Total Reward = 41
Episode 18: Total Reward = 24
Episode 19: Total Reward = 22
Episode 20: Total Reward = 34
Episode 21: Total Reward = 25
Episode 22: Total Reward = 19
Episode 23: Total Reward = 24
Episode 24: Total Reward = -13
Episode 25: Total Reward = 34
Episode 26: Total Reward = -9
Episode 27: Total Reward = 22
Episode 28: Total Reward = 23
Episode 29: Total Reward = 23
Episode 30: Total Reward = 35
Episode 31: Total Reward = -5
Episode 32: Total Reward = 34
Episode 33: Total Reward = 20
Episode 34: Total

In [38]:
agent.Q

array([[6.03899801, 6.3616363 , 5.35597354],
       [4.30093833, 4.78168404, 5.35599407],
       [4.96314273, 6.15857983, 5.72002849]])

In [17]:
agent.Q[states.index('neutral')]

array([4.99888917, 5.19097445, 4.58921745])

In [21]:
np.argmax(agent.Q[states.index('neutral')])

1

In [23]:
import pickle

# Specify the file path where you want to save the pickled Q-matrix
file_path = 'q_matrix.pkl'

# Pickle the Q-matrix and save it to a file
with open(file_path, 'wb') as f:
    pickle.dump(agent.Q, f)

In [24]:
1706504400 - 1706522460

-18060