In [5]:
!pip install -q yfinance gym



In [6]:
import numpy as np
import random
from collections import deque
import yfinance as yf
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import gym
from gym import spaces

class NasdaqMarketEnv(gym.Env):
    def __init__(self, ticker, initial_cash=10000, start_date='2010-01-01', end_date='2023-12-31'):
        super(NasdaqMarketEnv, self).__init__()
        self.ticker = ticker
        self.start_date = start_date
        self.end_date = end_date
        self.initial_cash = initial_cash
        self.data = self._load_data()
        self.action_space = spaces.Discrete(2)  # Buy, Sell
        self.observation_space = spaces.Box(low=0, high=np.inf, shape=(4,))
        self.reset()

    def _load_data(self):
        stock_data = yf.download(self.ticker, start=self.start_date, end=self.end_date)
        return stock_data['Close'].values

    def reset(self):
        # Reset environment to initial state
        self.current_step = 0
        self.cash = self.initial_cash
        self.shares = 0
        self.portfolio_value = self.initial_cash
        return self._get_observation()

    def step(self, action):
        # Execute one time step within the environment
        reward = 0
        done = False

        # Execute action
        if action == 0:  # Buy
            if self.cash > self.data[self.current_step]:
                self.shares += 1
                self.cash -= self.data[self.current_step]
            else:
                reward -= 1  # Penalize for invalid action
        elif action == 1:  # Sell
            if self.shares > 0:
                self.shares -= 1
                self.cash += self.data[self.current_step]
            else:
                reward -= 1  # Penalize for invalid action

        # Move to the next time step
        self.current_step += 1

        # Calculate portfolio value
        self.portfolio_value = self.cash + self.shares * self.data[self.current_step]

        # Check if done
        if self.current_step >= len(self.data) - 1:
            done = True

        # Calculate reward (simple return)
        reward += (self.portfolio_value - self.initial_cash) / self.initial_cash

        return self._get_observation(), reward, done, {}

    def _get_observation(self):
        # Get the observation
        return np.array([self.cash, self.shares, self.portfolio_value, self.data[self.current_step]])

class QNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 24)
        self.fc2 = nn.Linear(24, 24)
        self.fc3 = nn.Linear(24, action_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

class DQNAgent:
    def __init__(self, state_size, action_size, learning_rate=0.001, gamma=0.95, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = QNetwork(state_size, action_size).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
        self.loss = []

    def remember(self, state, action, reward, next_state, done):
        # Store the experience in memory
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        # Epsilon-greedy action selection
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            with torch.no_grad():
                state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
                q_values = self.model(state)
                return q_values.max(1)[1].item()

    def replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        
        # Sample minibatch from the memory
        minibatch = random.sample(self.memory, batch_size)
        
        for state, action, reward, next_state, done in minibatch:
            state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            next_state = torch.FloatTensor(next_state).unsqueeze(0).to(self.device)
            target = reward
            if not done:
                target = reward + self.gamma * torch.max(self.model(next_state)[0])
            
            q_values = self.model(state)
            old_q_value = q_values[0][action]
            q_values[0][action] = target
            loss = F.mse_loss(q_values, self.model(state))
            
            # Optimize the model
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            self.loss.append(abs(loss.item() - old_q_value.item()))

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


In [9]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ticker = '^IXIC'  # Nasdaq index ticker symbol
start_date = '2010-01-01'
end_date = '2023-12-31'

env = NasdaqMarketEnv(ticker, start_date=start_date, end_date=end_date)
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
# agent.model.to(device)

batch_size = 256
episodes = 100

# 2시간에 43th episode...
for e in range(episodes):
    state = env.reset()
    agent.loss = []
    for time in range(500):
        
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        reward = reward if not done else -10
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        # print(e, time, action, reward)
        if done:
            break
        agent.replay(batch_size)
    print(f"Episode: {e + 1}/{episodes}, Loss: {np.mean(agent.loss)}, Last Reward: {reward}")

# batch_size 128에서는 23분에 13th episode...
# batch_size 256에서는 12분에 4th episode...
# batch_size 256에서는 91분에 27th episode...

[*********************100%%**********************]  1 of 1 completed




Episode: 1/100, Loss: 25.16925684812544, Last Reward: -0.87581484375
Episode: 2/100, Loss: 1.903290557432102, Last Reward: -1.0524020751953125
Episode: 3/100, Loss: 5.156110075053837, Last Reward: -1.0032190185546874
Episode: 4/100, Loss: 7.918836825633438, Last Reward: -0.9467000244140625
Episode: 5/100, Loss: 9.719792309220042, Last Reward: -0.8752760498046875
Episode: 6/100, Loss: 13.919986788634898, Last Reward: -0.8633280029296875
Episode: 7/100, Loss: 17.19683413287164, Last Reward: -0.766784033203125
Episode: 8/100, Loss: 17.46519942345452, Last Reward: -1.0400640380859376
Episode: 9/100, Loss: 16.918044972061608, Last Reward: -0.773969091796875
Episode: 10/100, Loss: 15.426840573503245, Last Reward: 0.1772359619140625
Episode: 11/100, Loss: 13.373699585697949, Last Reward: 0.1458140869140625
Episode: 12/100, Loss: 10.931734827897744, Last Reward: -0.903178271484375
Episode: 13/100, Loss: 8.357315390813106, Last Reward: 0.1594358642578125
Episode: 14/100, Loss: 6.476055632636213

KeyboardInterrupt: 