In [4]:
! pip install imblearn
! pip install skorch

Collecting skorch
  Using cached https://files.pythonhosted.org/packages/c7/df/1e0be91bf4c91fce5f99cc4edd89d3dfc16930d3fc77588493558036a8d2/skorch-0.6.0-py3-none-any.whl
Collecting tabulate>=0.7.7 (from skorch)
  Using cached https://files.pythonhosted.org/packages/66/d4/977fdd5186b7cdbb7c43a7aac7c5e4e0337a84cb802e154616f3cfc84563/tabulate-0.8.5.tar.gz
Building wheels for collected packages: tabulate
  Running setup.py bdist_wheel for tabulate ... [?25ldone
[?25h  Stored in directory: /Users/yhhan/Library/Caches/pip/wheels/e1/41/5e/e201f95d90fc84f93aa629b6638adacda680fe63aac47174ab
Successfully built tabulate
Installing collected packages: tabulate, skorch
Successfully installed skorch-0.6.0 tabulate-0.8.5


In [2]:
import pandas as pd
import sqlite3
import os, sys
from sklearn.preprocessing import MinMaxScaler
import torch
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
import time

idx = os.getcwd().index("trade")
PROJECT_HOME = os.getcwd()[:idx] + "trade/"
sys.path.append(PROJECT_HOME)

from common.global_variables import *
from upbit.upbit_api import Upbit
from db.sqlite_handler import *

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



In [4]:
def get_invest_krw(current_price, total_ask_size, total_bid_size):
    base_price = current_price * (total_ask_size + total_bid_size) * 0.001
    if base_price > 300000:
        return 300000
    elif 150000 < base_price <= 300000:
        return 200000
    else:
        return 100000

In [5]:
def get_expected_buy_coin_price_for_krw_and_ask_list(ask_price_lst, ask_size_lst, krw, transaction_fee_rate):
    original_krw = krw

    fee = krw * transaction_fee_rate
    krw = krw - fee

    calc_size_sum = 0.0

    # print(0, krw, calc_size_sum, 0)
    for i, ask_size in enumerate(ask_size_lst):
        calc_krw_sum = ask_price_lst[i] * ask_size
        if calc_krw_sum > krw:
            calc_size_sum += krw / ask_price_lst[i]
            # print(i+1, krw, calc_size_sum)
            break
        else:
            calc_size_sum += ask_size
            krw = krw - calc_krw_sum
            # print(i+1, krw, calc_size_sum)

    calc_price = (original_krw - fee) / calc_size_sum

    # 매수원금: 1000000, 수수료: 500.0, 매수단가: 1823.7691975619496, 확보한 코인수량: 548.0408383561644
    return original_krw, fee, calc_price, calc_size_sum

In [6]:
def get_expected_sell_coin_price_for_volume_and_bid_list(bid_price_lst, bid_size_lst, volume, transaction_fee_rate):
    calc_krw_sum = 0.0
    original_volume = volume

    #print(0, volume, calc_krw_sum)
    for i, bid_size in enumerate(bid_size_lst):
        if bid_size > volume:
            calc_krw_sum += bid_price_lst[i] * volume
            #print(i+1, volume, calc_krw_sum)
            break
        else:
            calc_krw_sum += bid_price_lst[i] * bid_size
            volume = volume - bid_size
            #print(i+1, volume, calc_krw_sum)

    calc_price = calc_krw_sum / original_volume

    fee = calc_krw_sum * transaction_fee_rate

    calc_krw_sum = calc_krw_sum - fee

    # 매도 코인수량: 548.0408383561644, 매도단가: 1805.0, 수수료: 494.79924644171336, 매도결과금:989103.693636985
    return original_volume, calc_price, fee, calc_krw_sum

In [7]:
sql = select_all_from_order_book_for_one_coin.format("BTC").replace("\n", "")

In [8]:
def build_timeseries(data, data_normalized, window_size, future_target_size, up_rate):
    future_target = future_target_size - 1

    dim_0 = data.shape[0] - window_size - future_target
    dim_1 = data.shape[1]

    x = torch.zeros((dim_0, window_size, dim_1)).to(DEVICE)
    x_normalized = torch.zeros((dim_0, window_size, dim_1)).to(DEVICE)

    y = torch.zeros((dim_0,)).to(DEVICE)
    y_up = torch.zeros((dim_0,)).float().to(DEVICE)

    for i in range(dim_0):
        x[i] = data[i: i + window_size]
        x_normalized[i] = data_normalized[i: i + window_size]

    count_one = 0
    for i in range(dim_0):
        max_price = -1.0

        ask_price_lst = []
        ask_size_lst = []
        for w in range(0, 60, 4):
            ask_price_lst.append(x[i][-1][1 + w].item())
            ask_size_lst.append(x[i][-1][3 + w].item())

        invest_krw = get_invest_krw(
            current_price=x[i][-1][1].item(),
            total_ask_size=x[i][-1][121],
            total_bid_size=x[i][-1][123]
        )

        original_krw, fee, calc_price, calc_size_sum = get_expected_buy_coin_price_for_krw_and_ask_list(
            ask_price_lst=ask_price_lst,
            ask_size_lst=ask_size_lst,
            krw=invest_krw,
            transaction_fee_rate=TRANSACTION_FEE_RATE
        )

        for j in range(future_target + 1):
            bid_price_lst = []
            bid_size_lst = []
            for w in range(0, 60, 4):
                bid_price_lst.append(data[i + window_size + j][61 + w].item())
                bid_size_lst.append(data[i + window_size + j][63 + w].item())

            original_volume, future_price, fee, future_krw_sum = get_expected_sell_coin_price_for_volume_and_bid_list(
                bid_price_lst=bid_price_lst,
                bid_size_lst=bid_size_lst,
                volume=calc_size_sum,
                transaction_fee_rate=TRANSACTION_FEE_RATE
            )

            if future_price > max_price:
                max_price = future_price

        y[i] = max_price

        if y[i] > calc_price * (1 + up_rate):
            y_up[i] = 1
            count_one += 1

    return x, x_normalized, y, y_up, count_one / dim_0, dim_0

In [9]:
coin_name = "ADA"
sqlite3_order_book_db_filename = PROJECT_HOME + "db/upbit_order_book_info.db"

In [10]:
def get_dataset(data_length, split=True):
    df = pd.read_sql_query(
        select_all_from_order_book_for_one_coin.format(coin_name),
        sqlite3.connect(sqlite3_order_book_db_filename, timeout=10, check_same_thread=False)
    )

    df = df.drop(["base_datetime", "collect_timestamp"], axis=1)[:data_length]
    
    data = torch.from_numpy(df.values).to(DEVICE)

    min_max_scaler = MinMaxScaler()
    data_normalized = min_max_scaler.fit_transform(df.values)
    data_normalized = torch.from_numpy(data_normalized).to(DEVICE)

    x, x_normalized, y, y_up, one_rate, total_size = build_timeseries(
        data=data,
        data_normalized=data_normalized,
        window_size=WINDOW_SIZE,
        future_target_size=FUTURE_TARGET_SIZE,
        up_rate=UP_RATE
    )
    
    print(one_rate, total_size)
    
    # Imbalanced Preprocessing - Start
    if one_rate > 0.01:
        x_normalized = x_normalized.cpu()
        y_up = y_up.cpu()

        try:
            x_samp, y_up_samp = RandomUnderSampler(sampling_strategy=0.75).fit_sample(
                x_normalized.reshape((x_normalized.shape[0], x_normalized.shape[1] * x_normalized.shape[2])),
                y_up
            )
            x_normalized = torch.from_numpy(
                x_samp.reshape(x_samp.shape[0], x_normalized.shape[1], x_normalized.shape[2])
            ).to(DEVICE)
            y_up = torch.from_numpy(y_up_samp).to(DEVICE)
        except ValueError:
            logger.info("{0} - {1}".format(coin_name, "RandomUnderSampler - ValueError"))
            x_normalized = x_normalized.to(DEVICE)
            y_up = y_up.to(DEVICE)
    # Imbalanced Preprocessing - End
    
    total_size = len(x_normalized)
        
    if split:

        indices = list(range(total_size))
        np.random.shuffle(indices)

        train_indices = list(set(indices[:int(total_size * 0.8)]))
        validation_indices = list(set(range(total_size)) - set(train_indices))

        x_train_normalized = x_normalized[train_indices]
        x_valid_normalized = x_normalized[validation_indices]

        y_up_train = y_up[train_indices]
        y_up_valid = y_up[validation_indices]

        one_rate_train = y_up_train.sum().float() / y_up_train.size(0)
        one_rate_valid = y_up_valid.sum().float() / y_up_valid.size(0)

        train_size = x_train_normalized.size(0)
        valid_size = x_valid_normalized.size(0)

        return x_train_normalized, y_up_train, one_rate_train, train_size,\
               x_valid_normalized, y_up_valid, one_rate_valid, valid_size
    else:
        one_rate = y_up.sum().float() / y_up.size(0)
        return x_normalized, y_up, one_rate, total_size

In [11]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import GradientBoostingClassifier


def get_best_model_by_nested_cv(X, y, inner_cv, outer_cv, Classifier, parameter_grid):
    outer_score_list = []
    best_param_list = []
    model_list = []
    
    num_outer_split = 1
    for training_samples_idx, test_samples_idx in outer_cv.split(X, y):
        print("[Outer Split: #{0}]".format(num_outer_split))
        best_parms = {}
        best_score = -np.inf

        for parameters in parameter_grid:
#             print("Parameters: {0}".format(parameters))
            cv_scores = []
            num_inner_split = 1
            for inner_train_idx, inner_test_idx in inner_cv.split(X[training_samples_idx], y[training_samples_idx]):
                clf = Classifier(**parameters)
                clf.fit(X[inner_train_idx], y[inner_train_idx])
                score = clf.score(X[inner_test_idx], y[inner_test_idx])
                
                cv_scores.append(score)
#                 print("Inner Split: #{0}, Score: #{1}".format(
#                     num_inner_split,
#                     score
#                 ))
                num_inner_split += 1

            mean_score = np.mean(cv_scores)
            if mean_score > best_score:
                best_score = mean_score
                best_params = parameters
#             print("Mean Score:{0}, Best Score:{1}".format(mean_score, best_score))

        print("* Outer Split: #{0}, Best Score: {1}, Best Parameter: #{2} ***\n".format(
            num_outer_split,
            best_score,
            best_params
        ))

        clf = Classifier(**best_params)
        clf.fit(X[training_samples_idx], y[training_samples_idx])

        best_param_list.append(best_params)
        outer_score_list.append(clf.score(X[test_samples_idx], y[test_samples_idx]))
        model_list.append(clf)
        
        num_outer_split += 1

    best_score = -np.inf
    best_model = None
    for idx, score in enumerate(outer_score_list):        
        if score > best_score:
            best_score = score
            best_model = model_list[idx]

    return best_score, best_model

In [12]:
DATA_LENGTH = 480

x_normalized_original, y_up_original, one_rate, total_size = get_dataset(DATA_LENGTH, split=False)

print("x_normalized_original: {0}, y_up_original: {1}, one_rate: {2}, total_size: {3}".format(
    x_normalized_original.size(),
    y_up_original.size(),
    one_rate,
    total_size
))

0.13258426966292136 445
x_normalized_original: torch.Size([137, 18, 125]), y_up_original: torch.Size([137]), one_rate: 0.43065693974494934, total_size: 137


In [13]:
def make_sklearn_model(coin_name, x_normalized_original, y_up_original, total_size, one_rate):    
    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': np.linspace(1, 9, 4, endpoint=True),
        'n_estimators': [32, 64, 128],
        'max_features': list(range(int(x_normalized_original.shape[1] / 2), x_normalized_original.shape[1], 2)),
    }
        
    is_high_quality = False

    batch_size = 16
    patience = 30

    coin_model_start_time = time.time()
    
    X = x_normalized_original.numpy().reshape(total_size, -1)
    y = y_up_original.numpy()
    
#     print("X.shape: {0}".format(X.shape))
#     print("y.shape: {0}".format(y.shape))

    best_score, best_model = get_best_model_by_nested_cv(
        X=X,
        y=y,
        inner_cv=StratifiedKFold(n_splits=4, shuffle=True),
        outer_cv=StratifiedKFold(n_splits=4, shuffle=True),
        Classifier=GradientBoostingClassifier,
        parameter_grid=ParameterGrid(param_grid)
    )
    
    print(best_model)
    return best_model

In [6]:
def get_dataset_for_buy(coin_name, model_type="LSTM"):
    df = pd.read_sql_query(
        select_all_from_order_book_for_one_coin_recent_window.format(coin_name, WINDOW_SIZE),
        sqlite3.connect(sqlite3_order_book_db_filename, timeout=10, check_same_thread=False)
    )

    df = df.sort_values(['collect_timestamp', 'base_datetime'], ascending=True)
    df = df.drop(["base_datetime", "collect_timestamp"], axis=1)

    min_max_scaler = MinMaxScaler()
    data_normalized = min_max_scaler.fit_transform(df.values)
    data_normalized = torch.from_numpy(data_normalized).float().to(DEVICE)
    
    if model_type == "LSTM":
        return data_normalized.unsqueeze(dim=0)
    else:
        data_normalized = data_normalized.flatten()
        return data_normalized.unsqueeze(dim=0)
        

In [13]:
df = pd.read_sql_query(
        select_all_from_order_book_for_one_coin.format("BTC"),
        sqlite3.connect(sqlite3_order_book_db_filename, timeout=10, check_same_thread=False)
    )
df

Unnamed: 0,base_datetime,daily_base_timestamp,collect_timestamp,ask_price_0,ask_price_0_btc,ask_size_0,ask_size_0_btc,ask_price_1,ask_price_1_btc,ask_size_1,...,bid_size_13,bid_size_13_btc,bid_price_14,bid_price_14_btc,bid_size_14,bid_size_14_btc,total_ask_size,total_ask_size_btc,total_bid_size,total_bid_size_btc
0,2019-08-23 17:20:00,1720,1566548408744,12267000.0,12267000.0,1.346000,1.346000,12268000.0,12268000.0,0.322926,...,1.878479,1.878479,12250000.0,12250000.0,1.180089,1.180089,9.148790,9.148790,10.544709,10.544709
1,2019-08-23 17:30:00,1730,1566549029542,12270000.0,12270000.0,0.897563,0.897563,12272000.0,12272000.0,0.029113,...,0.410482,0.410482,12253000.0,12253000.0,0.021083,0.021083,8.410809,8.410809,4.258245,4.258245
2,2019-08-23 17:40:00,1740,1566549628911,12286000.0,12286000.0,1.071652,1.071652,12289000.0,12289000.0,0.029502,...,0.035302,0.035302,12267000.0,12267000.0,0.091292,0.091292,10.744477,10.744477,8.759968,8.759968
3,2019-08-23 17:50:00,1750,1566550214444,12268000.0,12268000.0,0.101908,0.101908,12274000.0,12274000.0,0.900721,...,0.301900,0.301900,12246000.0,12246000.0,0.490251,0.490251,13.090079,13.090079,6.447169,6.447169
4,2019-08-23 18:00:00,1800,1566550820369,12256000.0,12256000.0,0.930588,0.930588,12259000.0,12259000.0,0.178970,...,10.153142,10.153142,12238000.0,12238000.0,0.098706,0.098706,19.962658,19.962658,17.441340,17.441340
5,2019-08-23 18:10:00,1810,1566551405545,12250000.0,12250000.0,0.122890,0.122890,12251000.0,12251000.0,0.449994,...,0.452764,0.452764,12233000.0,12233000.0,0.001025,0.001025,19.668914,19.668914,17.967293,17.967293
6,2019-08-23 18:20:00,1820,1566552019441,12252000.0,12252000.0,0.040306,0.040306,12256000.0,12256000.0,1.346439,...,0.143700,0.143700,12236000.0,12236000.0,2.372531,2.372531,20.322819,20.322819,14.989362,14.989362
7,2019-08-23 18:30:00,1830,1566552621808,12270000.0,12270000.0,0.282101,0.282101,12271000.0,12271000.0,0.720865,...,0.100000,0.100000,12246000.0,12246000.0,0.065526,0.065526,9.401994,9.401994,4.920960,4.920960
8,2019-08-23 18:40:00,1840,1566553216548,12262000.0,12262000.0,0.000106,0.000106,12263000.0,12263000.0,0.025795,...,0.186700,0.186700,12236000.0,12236000.0,0.086700,0.086700,10.029046,10.029046,6.192228,6.192228
9,2019-08-23 18:50:00,1850,1566553829954,12248000.0,12248000.0,0.745966,0.745966,12252000.0,12252000.0,0.078363,...,0.921546,0.921546,12228000.0,12228000.0,0.026567,0.026567,13.689478,13.689478,7.465113,7.465113


In [11]:
df = pd.read_sql_query(
        select_all_from_order_book_for_one_coin_recent_window.format("BTC", WINDOW_SIZE),
        sqlite3.connect(sqlite3_order_book_db_filename, timeout=10, check_same_thread=False)
    )
df = df.sort_values(['collect_timestamp', 'base_datetime'], ascending=True)
df

Unnamed: 0,base_datetime,daily_base_timestamp,collect_timestamp,ask_price_0,ask_price_0_btc,ask_size_0,ask_size_0_btc,ask_price_1,ask_price_1_btc,ask_size_1,...,bid_size_13,bid_size_13_btc,bid_price_14,bid_price_14_btc,bid_size_14,bid_size_14_btc,total_ask_size,total_ask_size_btc,total_bid_size,total_bid_size_btc
17,2019-09-26 05:20:00,520,1569442810711,10084000.0,10084000.0,1.09991,1.09991,10095000.0,10095000.0,0.05,...,0.923364,0.923364,10039000.0,10039000.0,1.299933,1.299933,9.682059,9.682059,6.188109,6.188109
16,2019-09-26 05:40:00,540,1569444055475,10159000.0,10159000.0,0.165097,0.165097,10162000.0,10162000.0,0.011868,...,1.36115,1.36115,10127000.0,10127000.0,0.2673,0.2673,11.501531,11.501531,7.944073,7.944073
15,2019-09-26 07:30:00,730,1569450642490,10194000.0,10194000.0,0.4,0.4,10195000.0,10195000.0,2.313921,...,0.27,0.27,10151000.0,10151000.0,0.491841,0.491841,15.510737,15.510737,11.451062,11.451062
14,2019-09-26 07:50:00,750,1569451829971,10145000.0,10145000.0,2.055608,2.055608,10159000.0,10159000.0,0.291,...,0.8,0.8,10111000.0,10111000.0,0.394472,0.394472,7.565756,7.565756,8.032207,8.032207
13,2019-09-26 08:00:00,800,1569452423449,10109000.0,10109000.0,0.783312,0.783312,10111000.0,10111000.0,0.45,...,0.01,0.01,10075000.0,10075000.0,1.196923,1.196923,14.571007,14.571007,14.51918,14.51918
12,2019-09-26 08:10:00,810,1569453025614,10057000.0,10057000.0,0.682926,0.682926,10058000.0,10058000.0,0.264071,...,0.001679,0.001679,10027000.0,10027000.0,0.199461,0.199461,8.548207,8.548207,3.796122,3.796122
11,2019-09-26 08:20:00,820,1569453606570,10062000.0,10062000.0,0.132772,0.132772,10069000.0,10069000.0,0.009986,...,0.01,0.01,10036000.0,10036000.0,0.03,0.03,10.765888,10.765888,3.778154,3.778154
10,2019-09-26 08:30:00,830,1569454223267,10075000.0,10075000.0,2.548977,2.548977,10076000.0,10076000.0,0.653518,...,0.196284,0.196284,10049000.0,10049000.0,0.4217,0.4217,17.219948,17.219948,3.31459,3.31459
9,2019-09-26 08:40:00,840,1569454826102,10123000.0,10123000.0,0.164163,0.164163,10124000.0,10124000.0,1.354098,...,1.0,1.0,10091000.0,10091000.0,0.8,0.8,15.098455,15.098455,7.695795,7.695795
8,2019-09-26 08:50:00,850,1569455430682,10153000.0,10153000.0,0.359567,0.359567,10155000.0,10155000.0,0.501,...,0.3078,0.3078,10120000.0,10120000.0,0.026989,0.026989,7.526906,7.526906,12.924913,12.924913


In [16]:
upbit = Upbit(CLIENT_ID_UPBIT, CLIENT_SECRET_UPBIT, fmt)
for coin_name in upbit.get_all_coin_names():
    print("***************", coin_name)
    best_model = make_sklearn_model(coin_name, x_normalized_original, y_up_original, total_size, one_rate)
    
    X_prediction = get_dataset_for_buy(coin_name)
    y_prediction = best_model.predict_proba(X_prediction)

    print(y_prediction)

*************** QKC
[Outer Split: #1]
* Outer Split: #1, Best Score: 0.9703846153846154, Best Parameter: #{'learning_rate': 0.01, 'max_depth': 3.6666666666666665, 'max_features': 13, 'n_estimators': 128} ***

[Outer Split: #2]
* Outer Split: #2, Best Score: 0.97, Best Parameter: #{'learning_rate': 0.1, 'max_depth': 1.0, 'max_features': 11, 'n_estimators': 32} ***

[Outer Split: #3]
* Outer Split: #3, Best Score: 0.9615384615384616, Best Parameter: #{'learning_rate': 0.1, 'max_depth': 1.0, 'max_features': 9, 'n_estimators': 128} ***

[Outer Split: #4]
* Outer Split: #4, Best Score: 0.9518660968660968, Best Parameter: #{'learning_rate': 0.1, 'max_depth': 3.6666666666666665, 'max_features': 17, 'n_estimators': 32} ***

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=1.0,
                           max_features=11, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_

ValueError: cannot reshape array of size 308250 into shape (4137,newaxis)

In [2]:
from upbit.upbit_order_book_based_data import UpbitOrderBookBasedData, get_data_loader
from predict.model_rnn import LSTM
from skorch import NeuralNetClassifier
upbit_order_book_data = UpbitOrderBookBasedData("ADA")

ModuleNotFoundError: No module named 'upbit'

In [6]:
x_normalized_original, y_up_original, one_rate, total_size = upbit_order_book_data.get_dataset(split=False)

In [53]:
def make_lstm_model(coin_name, x_normalized_original, y_up_original, total_size, one_rate):
    lstm_model = LSTM(input_size=INPUT_SIZE).to(DEVICE)
    net = NeuralNetClassifier(
        lstm_model,
        max_epochs=10,
        lr=0.1,
        # Shuffle training data on each epoch
        iterator_train__shuffle=True,
    )

    X = x_normalized_original.numpy()
    y = y_up_original.numpy()
    
    print(type(X), type(y))
    print(X.shape, y.shape)
#    print(X.size(), y.size())        
#    print(X.dim(), y.dim())    
    
    net.fit(X=X, y=y)

In [54]:
best_model = make_lstm_model("ADA", x_normalized_original, y_up_original, total_size, one_rate)

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
(1064, 18, 125) (1064,)


ValueError: Expected 2 or more dimensions (got 1)