# Bitcoin Reinforcement Learning

## Assumptions
* The agent's actions are small enough that they will not affect the future price of bitcoin. (If so, will reinforcement learning be effective?)
* The agent makes decisions and executes transactions at a fixed timestep.
* The price of the latest transaction is taken as the price of bitcoin at that time.

In [6]:
from enum import Enum, unique
from collections import deque
import random

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [8]:
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution1D, MaxPooling1D
from keras.utils import np_utils

In [4]:
# Read columns 0 and 1 of the headerless CSV and label them timestamp / price.
btc_csv = pd.read_csv(
    "coinbaseUSD.csv",
    header=None,
    usecols=[0, 1],
    names=["timestamp", "price"],
)

In [9]:
# Quick look at the raw data: schema, first rows, and summary statistics.
btc_csv.info()
print(btc_csv.head(5))
print(btc_csv.describe())

# pandas opens its own figure whenever no `ax` is supplied, so calling
# plt.clf() / plt.figure() beforehand only leaves empty figures behind.
# Create the axes explicitly and hand them to pandas instead.
_, hist_ax = plt.subplots()
btc_csv.hist(column="price", bins=100, ax=hist_ax)

_, price_ax = plt.subplots()
btc_csv.plot(x="timestamp", y="price", ax=price_ax)

plt.show()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18192817 entries, 0 to 18192816
Data columns (total 2 columns):
timestamp    int64
price        float64
dtypes: float64(1), int64(1)
memory usage: 277.6 MB
    timestamp  price
0  1417412036  300.0
1  1417412423  300.0
2  1417415048  370.0
3  1417416612  370.0
4  1417498166  377.0
          timestamp         price
count  1.819282e+07  1.819282e+07
mean   1.465828e+09  8.738391e+02
std    2.354160e+07  7.622207e+02
min    1.417412e+09  6.000000e-02
25%    1.446888e+09  3.479200e+02
50%    1.464890e+09  5.389900e+02
75%    1.489606e+09  1.142120e+03
max    1.500333e+09  2.999990e+03


In [10]:
# timestamps = btc_np[:, 0]
# rolled_timestamps = np.roll(timestamps, 1)

# timestamps = timestamps[1:-1]
# rolled_timestamps = rolled_timestamps[1:-1]

# deltas = timestamps - rolled_timestamps

In [17]:
# Resample the irregular trade stream onto a fixed time grid.
# TIMESTEP is expressed in the same units as the timestamp column
# (NOTE(review): the sample values look like Unix seconds -- confirm).
TIMESTEP = 1000

btc_np = btc_csv.values
current_timestamp = btc_np[0, 0]
current_price = btc_np[0, 1]

# Accumulate rows in a Python list and convert once at the end.
# np.append copies the entire array on every call, which made the original
# loop quadratic in the number of emitted rows.
fixed_rows = []

for x in btc_np[1:]:
    target_timestamp = x[0]

    # Emit one row per grid point that falls before this trade, carrying the
    # price of the most recent earlier trade forward.
    while current_timestamp < target_timestamp:
        fixed_rows.append((current_timestamp, current_price))
        current_timestamp += TIMESTEP

    current_price = x[1]

# reshape keeps the (n, 2) layout even when no rows were emitted.
fixed_btc_data = np.array(fixed_rows, dtype=float).reshape(-1, 2)

In [106]:
class BTCAgent(object):
    """
    Q-learning trading agent that moves funds between USD and BTC.

    The agent walks through a fixed-timestep price series one row at a time,
    picks one of the ``Action`` members with an epsilon-greedy policy, records
    the transition in a replay memory and trains a 1-D convolutional network
    on random minibatches of past transitions.

    Will this make me rich?
    """

    # Discount applied to the best next-state value when building Q targets.
    DISCOUNT = 0.99

    @unique
    class Action(Enum):
        BUY_HALF = 0
        BUY_MAX = 1
        HOLD = 2
        SELL_HALF = 3
        SELL_MAX = 4

    def __init__(self, btc_data, replay_memory_size = 300000, context_size = 100, num_iterations = 1, batch_size = 32):
        """
        Args:
            btc_data: 2-D array indexed as ``btc_data[row, 1]`` for the price
                (column 0 is not read by this class).
            replay_memory_size: Maximum number of transitions kept in the
                replay memory; older ones are dropped.
            context_size: Number of most recent prices that form one state.
            num_iterations: Stored on the instance; not used by the code in
                this class.
            batch_size: Number of transitions sampled for each training step.
        """
        self.btc_data = btc_data
        self.usd_amount = 200
        self.btc_amount = 0
        self.replay_memory = deque(maxlen=replay_memory_size)
        self.state = np.zeros((1, context_size, 1))
        self.num_iterations = num_iterations
        self.batch_size = batch_size
        self.epsilon = 1.0

    def _calculate_total_asset(self, usd_amount, btc_amount, btc_price):
        """Return the portfolio value in USD at the given BTC price."""
        return usd_amount + btc_amount * btc_price

    def _execute_order(self, action_index, usd_amount, btc_amount, btc_price, next_btc_price):
        """
        Apply one action to a wallet and compute the resulting reward.

        The trade is executed at ``btc_price``; the reward is the change in
        total USD value once the holdings are re-priced at ``next_btc_price``.

        Args:
            action_index: Integer value of an ``Action`` member.
            usd_amount: USD held before the action.
            btc_amount: BTC held before the action.
            btc_price: Price at which the trade is executed.
            next_btc_price: Price used to value the wallet after the action.

        Returns:
            Tuple ``(next_usd_amount, next_btc_amount, reward)``.

        Raises:
            ValueError: If ``action_index`` is not a valid ``Action`` value.
        """
        action = self.Action(action_index)

        if action == self.Action.BUY_HALF:
            next_usd_amount = usd_amount / 2.0
            next_btc_amount = btc_amount + usd_amount / (2.0 * btc_price)
        elif action == self.Action.BUY_MAX:
            next_usd_amount = 0.0
            next_btc_amount = btc_amount + usd_amount / btc_price
        elif action == self.Action.HOLD:
            next_usd_amount = usd_amount
            next_btc_amount = btc_amount
        elif action == self.Action.SELL_HALF:
            next_usd_amount = usd_amount + (btc_amount / 2.0) * btc_price
            next_btc_amount = btc_amount / 2.0
        else:  # self.Action.SELL_MAX -- the enum lookup above rejects anything else
            next_usd_amount = usd_amount + btc_amount * btc_price
            next_btc_amount = 0.0

        total_asset = self._calculate_total_asset(usd_amount, btc_amount, btc_price)
        next_total_asset = self._calculate_total_asset(next_usd_amount, next_btc_amount, next_btc_price)
        reward = next_total_asset - total_asset

        return next_usd_amount, next_btc_amount, reward

    def _initialize_q_function(self):
        """
        Build and compile the network that maps a state to one value per action.

        Returns:
            A compiled Keras ``Sequential`` model taking inputs of shape
            ``(context_size, 1)`` and producing ``len(Action)`` outputs.
        """
        model = Sequential()

        model.add(Convolution1D(filters=128, kernel_size=2, padding='valid', activation='relu', input_shape=(self.state.shape[-2], 1)))
        model.add(MaxPooling1D(pool_size=2, padding='valid'))
        model.add(Dropout(0.05))

        model.add(Convolution1D(filters=256, kernel_size=2, padding='valid', activation='relu'))
        model.add(MaxPooling1D(pool_size=2, padding='valid'))
        model.add(Dropout(0.10))

        model.add(Convolution1D(filters=512, kernel_size=2, padding='valid', activation='relu'))
        model.add(MaxPooling1D(pool_size=2, padding='valid'))
        model.add(Dropout(0.05))

        model.add(Flatten())

        model.add(Dense(1024, activation='relu'))
        model.add(Dropout(0.25))
        # Linear output: the regression targets are reward + discounted value,
        # which are unbounded and do not sum to one, so a softmax layer could
        # never fit them under the MSE loss used below.
        model.add(Dense(len(self.Action.__members__), activation='linear'))

        model.compile(loss='mse', optimizer='adam')

        return model

    def _get_action(self):
        """
        Choose an action index with an epsilon-greedy policy.

        With probability ``self.epsilon`` a uniformly random action is chosen;
        otherwise the action with the highest predicted value for
        ``self.state`` is returned.
        """
        if random.random() < self.epsilon:
            return random.randrange(len(self.Action.__members__))

        action_values = self.model.predict(self.state)
        return int(np.argmax(action_values))

    def run(self):
        """
        Trade through ``self.btc_data`` once while training the Q-network.

        Builds a fresh model, then for every consecutive pair of rows: picks
        an action, applies it, stores the transition in the replay memory and,
        once the memory holds more than ``batch_size`` transitions, trains on
        one random minibatch. Progress is printed every 100 steps.
        """
        num_actions = len(self.Action.__members__)
        self.model = self._initialize_q_function()

        usd_wallet = self.usd_amount
        btc_wallet = self.btc_amount

        # Start from a window filled with the first price. A new array is
        # allocated (rather than overwriting in place) so that a state already
        # stored in the replay memory by an earlier run() is never modified.
        self.state = np.full(self.state.shape, self.btc_data[0, 1], dtype=float)

        loss = float('Inf')
        store = [0] * num_actions

        for index in range(self.btc_data.shape[0] - 1):
            # Exploration rate decays linearly over the first 900 steps, then stays at 0.1.
            self.epsilon = max(1 - (index / float(1000)), 0.1)

            btc_price = self.btc_data[index, 1]
            next_btc_price = self.btc_data[index+1, 1]

            if index % 100 == 0:
                total_asset = self._calculate_total_asset(usd_wallet, btc_wallet, btc_price)
                print("Index %d, epsilon %.02f" % (index, self.epsilon))

                # Separate loop variable: reusing `index` here would clobber
                # the step counter of the enclosing loop.
                for action_index, count in enumerate(store):
                    print("%s -- %d / 100" % (self.Action(action_index), count))

                print("Total asset: $%.02f USD ($%.02f USD, %0.2f BTC)" % (total_asset, usd_wallet, btc_wallet))
                print("Loss: %.02f" % (loss))

                print("\n\n")

                # Loss and action counts are accumulated per reporting window.
                loss = 0
                store = [0] * num_actions

            ### Run
            action = self._get_action()
            store[action] += 1
            next_usd_wallet, next_btc_wallet, reward = self._execute_order(action, usd_wallet, btc_wallet, btc_price, next_btc_price)

            # Newest price goes to position 0; the oldest price falls off the end.
            next_state = np.append(np.array([[[next_btc_price]]]), self.state[:, :-1, :], axis=1)

            self.replay_memory.append((self.state, action, reward, next_state))

            ### Training
            if len(self.replay_memory) > self.batch_size:
                minibatch = random.sample(self.replay_memory, self.batch_size)

                # Stack the sampled transitions so the network is evaluated
                # once per batch instead of twice per sample.
                states = np.concatenate([transition[0] for transition in minibatch], axis=0)
                actions = np.array([transition[1] for transition in minibatch])
                rewards = np.array([transition[2] for transition in minibatch], dtype=float)
                next_states = np.concatenate([transition[3] for transition in minibatch], axis=0)

                # Targets equal the current predictions everywhere except at
                # the action actually taken, which gets the bootstrapped value.
                targets = np.array(self.model.predict(states), dtype=float)
                next_action_values = self.model.predict(next_states)
                targets[np.arange(self.batch_size), actions] = rewards + self.DISCOUNT * np.max(next_action_values, axis=1)

                loss += self.model.train_on_batch(states, targets)

            ### Update
            usd_wallet = next_usd_wallet
            btc_wallet = next_btc_wallet
            self.state = next_state

In [107]:
# Build the agent on the fixed-timestep price series.
agent = BTCAgent(btc_data=fixed_btc_data)

In [None]:
# Clear the Keras backend session first, then start the agent's
# trading/training loop (run() builds its own model).
K.clear_session()
agent.run()

Index 0, epsilon 1.00
Action.BUY_HALF -- 0 / 100
Action.BUY_MAX -- 0 / 100
Action.HOLD -- 0 / 100
Action.SELL_HALF -- 0 / 100
Action.SELL_MAX -- 0 / 100
Total asset: $200.00 USD ($200.00 USD, 0.00 BTC)
Loss: inf



Index 100, epsilon 0.90
Action.BUY_HALF -- 17 / 100
Action.BUY_MAX -- 23 / 100
Action.HOLD -- 16 / 100
Action.SELL_HALF -- 24 / 100
Action.SELL_MAX -- 20 / 100
Total asset: $246.67 USD ($246.67 USD, 0.00 BTC)
Loss: 527.23



Index 200, epsilon 0.80
Action.BUY_HALF -- 15 / 100
Action.BUY_MAX -- 37 / 100
Action.HOLD -- 16 / 100
Action.SELL_HALF -- 21 / 100
Action.SELL_MAX -- 11 / 100
Total asset: $246.67 USD ($246.67 USD, 0.00 BTC)
Loss: 313.42



Index 300, epsilon 0.70
Action.BUY_HALF -- 11 / 100
Action.BUY_MAX -- 41 / 100
Action.HOLD -- 18 / 100
Action.SELL_HALF -- 16 / 100
Action.SELL_MAX -- 14 / 100
Total asset: $246.99 USD ($0.00 USD, 0.65 BTC)
Loss: 231.03



