In [61]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from collections import deque
from datetime import datetime
from decimal import Decimal
from enum import Enum
from itertools import cycle
from math import floor
import numpy as np
import pandas as pd
from random import randrange, seed
from os.path import join

from pprint import pprint

import matplotlib.pyplot as plt
from sklearn import preprocessing

from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.optimizers import Adam

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [62]:
#path = '/home/b3arjuden/crocket/sql_data/PRODUCTION40'

path = '/Users/bhsu/crypto/sql_data/PRODUCTION40'

file = 'BTC-ETH.csv'

In [63]:
file_path = join(path, file)

data = pd.read_csv(file_path, 
                   dtype={'time': str, 'buy_order': int, 'sell_order': int},
                   converters={'price': Decimal,
                               'wprice': Decimal,
                               'base_volume': Decimal,
                               'buy_volume': Decimal,
                               'sell_volume': Decimal})

In [64]:
def time_transform(df, n):
    
    df1 = pd.DataFrame()
    
    nrows = n * floor(df.shape[0] / n)
    df = df.iloc[:nrows, :]
    
    df1['time'] = df.loc[df.index[::n], 'time'].reset_index(drop=True)
    df1['wprice'] = ((df.loc[:, 'wprice'] * df.loc[:, 'base_volume']).groupby(df.index // n).sum() / 
                     df.loc[:, 'base_volume'].groupby(df.index // n).sum()).apply(lambda x: float(x.quantize(Decimal(10) ** -8)))
    df1['buy_volume'] = df.loc[:, 'buy_volume'].groupby(df.index // n).sum().apply(float)
    df1['sell_volume'] = df.loc[:, 'sell_volume'].groupby(df.index // n).sum().apply(float)
    df1['buy_order'] = df.loc[:, 'buy_order'].groupby(df.index // n).sum()
    df1['sell_order'] = df.loc[:, 'sell_order'].groupby(df.index // n).sum()
    
    return df1

def vectorize(array):
    
    return np.transpose(array).reshape(1, array.size)

def unvectorize(vector, x, y):
    
    return vector.reshape(x, y)

In [65]:
class Parameters:
    
    def __init__(self):
        
        # Time series parameters
        self.num_actions = 3
        
        # Neural network parameters
        self.discount_factor = 0.97
        self.epsilon = 0.99
        self.epsilon_min = 0.1
        self.epsilon_decay = 0.95
        
        # Replay memory parameters
        self.replay_memory_size = 10000
        self.batch_size = 128
        
        # Training parameters
        self.update_target_weights_step_size = 100
        self.train_model_step_size = 100
        self.interpolation_factor = 0.003

In [66]:
class NeuralNetwork:
    
    def __init__(self, 
                 num_actions, 
                 checkpoint_path=None):
        
        self.checkpoint_path = checkpoint_path
        
        self.count_states = 0
        self.count_episodes = 0
    
    def build(self):
        
        model = Sequential()
        model.add(Dense(units=30, input_dim=20, activation='relu'))
        model.add(Dense(units=3, activation='softmax'))
        
        # Optimizer: adam, RMSProp
        model.compile(optimizer='adam', loss='mse')
        
        self.model = model
    
    def save(self):
        
        save_file = 'network_S{}_E{}.h5'.format(self.count_states, self.count_episodes)
        model.save(join(self.checkpoint_path, save_file))
    
    def load(self, model_path):
        
        self.model = load_model(model_path)
        
        # TODO: load count_states and count_episodes from file
    
    def get_weights(self):
        
        return [layer.get_weights() for layer in self.model.layers]
    
    def set_weights(self, weights):
        
        for layer, new_weights in zip(self.model.layers, weights):
            layer.set_weights(new_weights)
    
    def interpolate_weights(self, weights, interpolation_factor=0.001):
        
        for layer, new_weights in zip(self.model.layers, weights):
            layer.set_weights([w1 * interpolation_factor + (1 - interpolation_factor) * w0 for w0, w1 in zip(layer.get_weights(), new_weights)])
    
    # TODO: add initialization function?

In [67]:
class Agent:
    
    def __init__(self, parameters):
        
        self.num_actions = parameters.num_actions
        self.discount_factor = parameters.discount_factor
        self.epsilon = parameters.epsilon
        self.epsilon_min = parameters.epsilon_min
        self.epsilon_decay = parameters.epsilon_decay
        
        self._initialize()
        
    def _initialize(self, model_path=None):
        
        self.train_model = NeuralNetwork(num_actions=self.num_actions)
        self.target_model = NeuralNetwork(num_actions=self.num_actions)
        
        if model_path:
            pass
            # TODO: implement load model from folder (network_weights, replay_memory, additional_params)
        else:
            self.replay_memory = ReplayMemory(memory_size=parameters.replay_memory_size,
                                              state_shape=parameters.state_length,
                                              num_actions = self.num_actions,
                                              discount_factor=self.discount_factor)
            
            self.train_model.build()
            self.target_model.build()
            
            self.target_model.set_weights(self.train_model.get_weights())
        
    def act(self, state):
        
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.num_actions)
        
        values = self.target_model.model.predict(state)
        
        return np.argmax(values[0])
    
    def replay(self, batch_size):
        
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.gamma * \
                       np.amax(self.model.predict(next_state)[0])
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
            
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            

In [77]:
class ReplayMemory:
    
    def __init__(self, memory_size, state_shape, num_actions, discount_factor):
        
        self.memory_size = memory_size
        self.state_shape = state_shape
        self.num_actions = num_actions
        self.discount_factor = discount_factor
        
        self.states = np.zeros(shape=[memory_size] + state_shape, dtype=np.float64)
        self.states_next = np.zeros(shape=[memory_size] + state_shape, dtype=np.float64)
        self.actions = np.zeros(shape=[memory_size, num_actions], dtype=np.int8)
        self.rewards = np.zeros(shape=[memory_size, num_actions], dtype=np.float64)
        
        self.indexes = cycle(range(memory_size))
        self.pointer = next(self.indexes)
        
    def add(self, state, action, reward, next_state):
        
        k = self.pointer

        self.states[k, :] = state
        self.actions[k] = action
        self.rewards[k] = reward  # TODO: consider clipping
        self.states_next[k, :] = next_state
        
        self.pointer = next(self.indexes)
        
    def random_batch(self, batch_size):
        
        idx = np.random.randint(self.memory_size, size=batch_size)
        
        return self.states[idx], self.actions[idx], self.rewards[idx], self.states_next[idx]
    
    # TODO: implement prioritized state-action values

In [140]:
class Environment:
    
    def __init__(self,
                 input_data,
                 steps=None):
        
        self.steps = steps
        self._load(input_data)
        
        self.episode = {}
        
    def _load(self, input_data):
        
        self.data = input_data
        
        self.shape = input_data.shape
        
        if self.steps is None or self.steps > self.shape[0]:
            self.steps = self.shape[0] - 1
    
    def _seed(self):
        
        seed()
    
    def _add_state(self):
        
        np.insert(self.data, self.shape[1], np.zeros(self.shape[1]), axis=1)
        
    def reset(self):
        
        self._seed()
        self.episode['start'] = 0
        self.episode['current_index'] = self.episode.get('start')
        self.episode['buy_price'] = []
        #self.data[:, -1] = 0  # Reset buy state
        
        return self.data[self.episode.get('current_index')]
        
    def step(self, action, price_index):
        
        current_index = self.episode.get('current_index')
        state = self.data[current_index]
        next_state = self.data[current_index + 1]
        print(state)
        price = state[price_index]  # TODO: add correct index for current price
        total = np.sum(self.episode.get('buy_price')) 
        
        if action == 1:
            self.episode['buy_price'].append(price)
        elif action == 2:
            self.episode['buy_price'] = []
            
        # TODO: add only buy once behavior maybe (penalty on every buy/sell should train not to buy many times)
        
        # Return rewards 
        reward = np.array([0, -price, price*len(self.episode.get('buy_price'))-total])       
        
        self.episode['current_index'] += 1
        
        return next_state, reward
    
    def start_new_episode(self):
        
        self.episode['start'] = np.random.randint(0, self.shape[0] - self.steps)
        self.episode['current_index'] = self.episode.get('start')
        
        return self.data[self.episode.get('current_index')]
    
class Action(Enum):

    HOLD = 0
    BUY = 1
    SELL = 2

In [70]:
df = time_transform(data, 15)
array = df.iloc[:,1:].as_matrix()

In [71]:
df.head()

Unnamed: 0,time,wprice,buy_volume,sell_volume,buy_order,sell_order
0,2017-12-11 21:24:36.000000,0.032071,35.10247,47.00551,826,680
1,2017-12-11 21:39:36.000000,0.032045,31.962237,36.13473,515,520
2,2017-12-11 21:54:36.000000,0.031504,52.720665,44.804259,748,805
3,2017-12-11 22:09:36.000000,0.031444,37.816393,46.708993,662,574
4,2017-12-11 22:24:36.000000,0.0315,27.480918,13.696069,477,309


In [72]:
# Data processing parameters
n = 2
m = 4

In [73]:
# Combine data
df = time_transform(data, n)
array = df.iloc[:,1:].as_matrix()

In [74]:
# Normalize data 
# TODO: change normalization if necessary
min_max_scaler = preprocessing.MinMaxScaler()
normalized_array = min_max_scaler.fit_transform(array)

In [75]:
states = np.stack([vectorize(normalized_array[x:x+m, :]) for x in range(normalized_array.shape[0] - m)])

In [153]:
[vectorize(normalized_array[x:x+m, :]) for x in range(normalized_array.shape[0] - m)]

[array([[ 0.06686944,  0.06218316,  0.0610968 ,  0.06062606,  0.05121947,
          0.10096536,  0.04969294,  0.12471206,  0.10883151,  0.15335675,
          0.10973378,  0.08206551,  0.22743682,  0.17148014,  0.30505415,
          0.29422383,  0.11575563,  0.09860665,  0.08360129,  0.06859593]]),
 array([[ 0.06218316,  0.0610968 ,  0.06062606,  0.06713849,  0.10096536,
          0.04969294,  0.12471206,  0.02777308,  0.15335675,  0.10973378,
          0.08206551,  0.06066503,  0.17148014,  0.30505415,  0.29422383,
          0.12274368,  0.09860665,  0.08360129,  0.06859593,  0.07717042]]),
 array([[ 0.0610968 ,  0.06062606,  0.06713849,  0.07271737,  0.04969294,
          0.12471206,  0.02777308,  0.0400495 ,  0.10973378,  0.08206551,
          0.06066503,  0.0601016 ,  0.30505415,  0.29422383,  0.12274368,
          0.07039711,  0.08360129,  0.06859593,  0.07717042,  0.07931404]]),
 array([[ 0.06062606,  0.06713849,  0.07271737,  0.0689215 ,  0.12471206,
          0.02777308,  0.0400

In [133]:
# Training parameters
EPISODES = 10
EPISODE_LENGTH = 48

In [141]:
## Main loop

parameters = Parameters()
parameters.state_length = [states.shape[-1]]
agent = Agent(parameters)
env = Environment(states, steps=EPISODE_LENGTH)
step = 0

# Populate the replay memory with initial experiences
state = env.reset()
for i in range(parameters.replay_memory_size):
    
    action = agent.act(state)
    next_state, reward = env.step(action, price_index=n-1)
    agent.replay_memory.add(state, action, reward, next_state)
    state = next_state

for e in range(EPISODES):
    
    state = env.start_new_episode()
    
    for ii in range(EPISODE_LENGTH):
        
        # Update the target network weights every X iterations
        if step % parameters.update_target_weights_step_size == 0:
            print('Step {}: Updating target Q network weights with latest Q network weights.'.format(step))
            agent.target_model.interpolate_weights(agent.train_model.get_weights(), parameters.interpolation_factor)
                
        action = agent.act(state)
        next_state, reward = env.step(action)
        agent.replay_memory.add(state, action, reward, next_state)
        state = next_state
        
        if step % parameters.train_model_step_size == 0:
            print('Step {}: Sampling replay memory and training Q network.'.format(step))
            samples = agent.replay_memory.random_batch(parameters.batch_size)
            states_batch, action_batch, reward_batch, next_states_batch = map(np.array, samples)

            q_values_next = agent.train_model.model.predict(next_states_batch)
            best_actions = np.argmax(q_values_next, axis=1)
            q_values_next_target = agent.target_model.model.predict(next_states_batch)
            targets_batch = reward_batch + parameters.discount_factor * np.repeat(q_values_next_target[np.arange(parameters.batch_size), best_actions].reshape(parameters.batch_size, 1), parameters.num_actions, axis=1)

            agent.train_model.model.fit(states_batch, targets_batch)
            
        step += 1

[[ 0.06686944  0.06218316  0.0610968   0.06062606  0.05121947  0.10096536
   0.04969294  0.12471206  0.10883151  0.15335675  0.10973378  0.08206551
   0.22743682  0.17148014  0.30505415  0.29422383  0.11575563  0.09860665
   0.08360129  0.06859593]]


IndexError: index 1 is out of bounds for axis 0 with size 1