In [2]:
import pandas as pd
import quandl
import datetime
 
# Price window for the download: calendar year 2001 (Jan 1 through Dec 31, inclusive).
start = datetime.datetime(year=2001, month=1, day=1)
end = datetime.datetime(year=2001, month=12, day=31)

In [8]:
# Ticker to download; the dataset code is the "WIKI/" prefix followed by the ticker.
s = "AAPL"
apple = quandl.get("WIKI/{}".format(s), start_date=start, end_date=end)

In [9]:
# Turn the index into a regular "Date" column and keep only the columns used below.
apple = apple.reset_index().loc[
    :, ["Date", "Open", "High", "Low", "Close", "Adj. Close", "Volume"]
]

In [11]:
# Save the trimmed frame to disk (without the row index) and preview the first rows.
apple.to_csv(path_or_buf="stock_data.csv", index=False)
apple.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj. Close,Volume
0,2001-01-02,14.88,15.25,14.56,14.88,0.956144,8077000.0
1,2001-01-03,14.5,16.69,14.44,16.37,1.051887,14590600.0
2,2001-01-04,18.14,18.5,16.81,17.06,1.096224,13203500.0
3,2001-01-05,16.94,17.37,16.06,16.37,1.051887,7363500.0
4,2001-01-08,16.94,16.98,15.94,16.56,1.064095,6673200.0


In [12]:
#make our agent
import numpy as np
import random
class DQN:
    """Deep Q-learning agent with a replay memory and a separate target network.

    Two identical networks are kept: ``model`` is fitted on replayed
    transitions, while ``target_model`` supplies the Q-value estimates used to
    build the training targets and is blended towards ``model`` by
    ``target_train``.

    Parameters
    ----------
    env :
        Environment handle; stored on the instance but not otherwise used by
        the methods of this class.
    state_size : int, default 30
        Number of values in one state (the look-back window length).
    n_actions : int, default 3
        Number of discrete actions; also the width of the network output.
    memory_size : int, default 100
        Maximum number of transitions kept in the replay memory.
    batch_size : int, default 5
        Number of transitions sampled by each call to ``replay``.
    """

    def __init__(self, env, state_size=30, n_actions=3, memory_size=100, batch_size=5):
        self.env = env
        self.state_size = state_size
        self.n_actions = n_actions
        self.batch_size = batch_size
        # Bounded replay memory: once full, the oldest transition is dropped.
        self.memory = deque(maxlen=memory_size)

        # Q-learning hyper-parameters.
        self.gamma = 0.85           # discount applied to the estimated future reward
        self.epsilon = 1.0          # current exploration probability
        self.epsilon_min = 0.01     # floor for the exploration probability
        self.epsilon_decay = 0.995  # multiplicative decay applied on every act()
        self.learning_rate = 0.005
        self.tau = .125             # blend factor used by target_train()

        self.model = self.create_model()
        self.target_model = self.create_model()

    def create_model(self):
        """Build and compile the Q-network: ``state_size`` inputs -> ``n_actions`` outputs."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation="relu"))
        model.add(Dense(48, activation="relu"))
        model.add(Dense(24, activation="relu"))
        model.add(Dense(self.n_actions))
        # NOTE(review): newer Keras releases spell this argument `learning_rate`;
        # `lr` is kept here to match the Keras version the notebook was written for.
        model.compile(loss="mean_squared_error",
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def _as_input(self, state):
        """Return ``state`` as a ``(1, state_size)`` array suitable for the network."""
        return np.asarray(state).reshape(1, self.state_size)

    def act(self, state):
        """Pick an action for ``state`` using an epsilon-greedy policy.

        Epsilon is decayed (down to ``epsilon_min``) on every call. With
        probability epsilon a uniformly random action index is returned;
        otherwise the index of the largest predicted Q-value.
        """
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        if np.random.random() < self.epsilon:
            return random.randint(0, self.n_actions - 1)
        return np.argmax(self.model.predict(self._as_input(state))[0])

    def remember(self, state, action, reward, new_state, done):
        """Append one transition to the replay memory."""
        self.memory.append([state, action, reward, new_state, done])

    def replay(self):
        """Fit ``model`` on ``batch_size`` transitions sampled from memory.

        Does nothing until the memory holds at least ``batch_size`` transitions.
        """
        if len(self.memory) < self.batch_size:
            return

        for state, action, reward, new_state, done in random.sample(self.memory, self.batch_size):
            net_input = self._as_input(state)
            # Start from the target network's Q-values for every action and
            # overwrite only the entry of the action that was actually taken;
            # the other entries stay at the target network's own predictions.
            target = self.target_model.predict(net_input)
            if done:
                # Terminal transition: there is no future reward to add.
                target[0][action] = reward
            else:
                # Bellman update: immediate reward plus the discounted best
                # Q-value the target network predicts for the next state.
                Q_future = max(self.target_model.predict(self._as_input(new_state))[0])
                target[0][action] = reward + Q_future * self.gamma
            self.model.fit(net_input, target, epochs=1, verbose=0)

    def target_train(self):
        """Move the target network's weights a fraction ``tau`` towards ``model``'s."""
        online_weights = self.model.get_weights()
        blended = [
            online * self.tau + target * (1 - self.tau)
            for online, target in zip(online_weights, self.target_model.get_weights())
        ]
        self.target_model.set_weights(blended)

In [13]:
import numpy as np
# Adjusted closing prices as a standalone Series; the environment and the
# training loop index into this by position.
stock_close = pd.Series(data=apple["Adj. Close"])
stock_close.shape

(248,)

In [137]:
#we will need to make our environment with a class
from gym.utils import seeding
from gym import spaces

class stock_env:
    """Toy single-stock trading environment driven by a price series.

    The environment itself keeps almost no episode state: the caller carries
    ``counter``, ``num_stocks`` and ``money_sum`` between calls and passes them
    back into ``step``.

    Actions understood by ``step``: 0 = hold, 1 = buy, 2 = sell.
    """

    def __init__(self):
        self.observation = 0
        # NOTE(review): step() reports observations 1, 2 and 3, while this space is
        # declared with 3 discrete values — confirm the intended encoding.
        self.observation_space = spaces.Discrete(3)
        # Episode length limit: step() reports done once counter reaches this value.
        self.guess_max = 50
        self.ending_value = 0
        self.starting_value = 0
        self.counter = 0
        self.seed()
        self.reset()

    def seed(self, seed=None):
        """Create the environment's random generator and return the seed used, in a list."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def _price_at(self, counter, end_index, prices):
        """Return the price at position ``end_index + counter``.

        When ``prices`` or ``end_index`` is not supplied, the notebook-level
        ``stock_close`` / ``ending_value`` globals are used instead, which is
        what callers that do not pass them rely on.
        """
        if prices is None:
            prices = stock_close
        if end_index is None:
            end_index = ending_value
        position = end_index + counter
        # Positional lookup without copying the whole series into a list.
        value = prices.iloc[position] if hasattr(prices, "iloc") else prices[position]
        return float(value)

    def step(self, action, counter, num_stocks, money_sum, end_index=None, prices=None):
        """Apply ``action`` and return the updated bookkeeping.

        Parameters
        ----------
        action : int
            0 = hold, 1 = buy, 2 = sell.
        counter : int
            Number of steps taken in the current episode.
        num_stocks : int
            Remaining number of shares that may still be bought.
        money_sum : float
            Current cash balance; it is also what is returned as the reward.
        end_index : int, optional
            Position of the episode's starting price; defaults to the global
            ``ending_value``.
        prices : sequence or pandas Series, optional
            Price series; defaults to the global ``stock_close``.

        Returns
        -------
        tuple
            ``(observation, reward, done, num_stocks, money_sum)``.
        """
        # Illegal state: more shares were bought than allowed. Penalise the
        # balance and end the episode.
        if num_stocks < 0:
            money_sum -= 500
            return 2, money_sum, True, num_stocks, money_sum
        # Trying to buy with a negative balance: fixed penalty, episode ends.
        if money_sum < 0 and action == 1:
            money_sum = -100
            return 1, money_sum, True, num_stocks, money_sum

        if action == 0:
            # Hold: small charge so that doing nothing is not free.
            self.observation = 1
            money_sum = money_sum - 1
        elif action == 1:
            # Buy: one fewer share may still be bought; pay the price.
            self.observation = 2
            num_stocks = num_stocks - 1
            money_sum = money_sum - self._price_at(counter, end_index, prices)
        elif action == 2:
            # Sell: one more share may be bought again; collect the price.
            self.observation = 3
            num_stocks = num_stocks + 1
            money_sum = money_sum + self._price_at(counter, end_index, prices)

        reward = money_sum
        # Episodes are capped at guess_max steps so the agent does not learn a
        # bias tied to one long stretch of the series.
        done = counter >= self.guess_max
        return self.observation, reward, done, num_stocks, money_sum

    def reset(self):
        """Start a new episode at a random position in ``stock_close``.

        The end position is drawn so that at least 30 earlier prices exist for
        the look-back window.

        Returns
        -------
        tuple
            ``(observation, ending_value, starting_value, counter, num_stocks,
            money_sum)`` with counter 0, 10 shares available to buy and a cash
            balance of 10.
        """
        from random import randint
        self.observation = 0
        ending_value = randint(30, len(stock_close) - 1)
        starting_value = ending_value - 30
        # Keep the chosen window on the object as well as returning it.
        self.ending_value = ending_value
        self.starting_value = starting_value
        self.counter = 0
        num_stocks = 10
        money_sum = 10
        return self.observation, ending_value, starting_value, self.counter, num_stocks, money_sum

In [138]:
# Run configuration.
# `env` is bound to the environment class itself, not to an instance; an
# instance is created wherever `env()` is called.
env = stock_env
# `trials` is the number of training episodes. The DQN class sets its own
# gamma and epsilon in its constructor.
gamma, epsilon, trials = 0.9, .95, 500
env().observation_space

Discrete(3)

In [139]:
from collections import deque
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
# Build the agent; this constructs both of its networks.
dqn_agent = DQN(env=env)

In [140]:
# Training loop.
# Every episode starts at a random position that has at least 30 earlier prices,
# so a full look-back window is always available.

# One environment instance is reused for the whole run instead of constructing
# (and re-seeding / re-resetting) a new one on every step.
episode_env = env()

for trial in range(trials):

    # Fresh episode: random window position, step counter at 0, starting budget.
    observation, ending_value, starting_value, counter, num_stocks, money_sum = episode_env.reset()

    # The range only bounds the number of steps; an episode that starts too
    # close to the end of the series may stop before `done` is ever reached,
    # in which case nothing is printed for that trial.
    for _ in range(ending_value, len(stock_close) - 30):
        # State = the 30-price window that ends just before the current position.
        current_state = stock_close.iloc[starting_value + counter:ending_value + counter]
        counter += 1

        # Choose an action (epsilon-greedy).
        action = dqn_agent.act(current_state)

        # Apply it; the environment looks up the price using `ending_value` and `counter`.
        observation, reward, done, num_stocks, money_sum = episode_env.step(action, counter, num_stocks, money_sum)

        # The window shifted forward by one price.
        next_state = stock_close.iloc[starting_value + counter:ending_value + counter]

        # Store the transition, then train on a replayed batch and nudge the
        # target network towards the online network.
        dqn_agent.remember(current_state, action, reward, next_state, done)
        dqn_agent.replay()
        dqn_agent.target_train()

        if done:
            print("--------------------------------")
            print("Total Profit at {}: ".format(trial) + str(reward))
            print("--------------------------------")
            break

--------------------------------
Total Profit at 0: 7.240952561215606
--------------------------------
--------------------------------
Total Profit at 2: 32.29764549635448
--------------------------------
--------------------------------
Total Profit at 5: 33.658927535604995
--------------------------------
--------------------------------
Total Profit at 6: 62.870791823890784
--------------------------------
--------------------------------
Total Profit at 7: 51.51256844925598
--------------------------------
--------------------------------
Total Profit at 9: 76.39094275996518
--------------------------------
--------------------------------
Total Profit at 10: 60.32959027193
--------------------------------
--------------------------------
Total Profit at 12: 71.77520312416709
--------------------------------
--------------------------------
Total Profit at 14: 74.20299297146438
--------------------------------
--------------------------------
Total Profit at 16: 61.27818469686949
