In [1]:
! pip install imblearn



In [3]:
import pandas as pd
import sqlite3
import os, sys
from sklearn.preprocessing import MinMaxScaler
import torch
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
import time

idx = os.getcwd().index("trade")
PROJECT_HOME = os.getcwd()[:idx] + "trade/"
sys.path.append(PROJECT_HOME)

from common.global_variables import *
from upbit.upbit_api import Upbit
from db.sqlite_handler import *

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



In [4]:
def get_invest_krw(current_price, total_ask_size, total_bid_size):
    base_price = current_price * (total_ask_size + total_bid_size) * 0.001
    if base_price > 300000:
        return 300000
    elif 150000 < base_price <= 300000:
        return 200000
    else:
        return 100000

In [5]:
def get_expected_buy_coin_price_for_krw_and_ask_list(ask_price_lst, ask_size_lst, krw, transaction_fee_rate):
    original_krw = krw

    fee = krw * transaction_fee_rate
    krw = krw - fee

    calc_size_sum = 0.0

    # print(0, krw, calc_size_sum, 0)
    for i, ask_size in enumerate(ask_size_lst):
        calc_krw_sum = ask_price_lst[i] * ask_size
        if calc_krw_sum > krw:
            calc_size_sum += krw / ask_price_lst[i]
            # print(i+1, krw, calc_size_sum)
            break
        else:
            calc_size_sum += ask_size
            krw = krw - calc_krw_sum
            # print(i+1, krw, calc_size_sum)

    calc_price = (original_krw - fee) / calc_size_sum

    # 매수원금: 1000000, 수수료: 500.0, 매수단가: 1823.7691975619496, 확보한 코인수량: 548.0408383561644
    return original_krw, fee, calc_price, calc_size_sum

In [6]:
def get_expected_sell_coin_price_for_volume_and_bid_list(bid_price_lst, bid_size_lst, volume, transaction_fee_rate):
    calc_krw_sum = 0.0
    original_volume = volume

    #print(0, volume, calc_krw_sum)
    for i, bid_size in enumerate(bid_size_lst):
        if bid_size > volume:
            calc_krw_sum += bid_price_lst[i] * volume
            #print(i+1, volume, calc_krw_sum)
            break
        else:
            calc_krw_sum += bid_price_lst[i] * bid_size
            volume = volume - bid_size
            #print(i+1, volume, calc_krw_sum)

    calc_price = calc_krw_sum / original_volume

    fee = calc_krw_sum * transaction_fee_rate

    calc_krw_sum = calc_krw_sum - fee

    # 매도 코인수량: 548.0408383561644, 매도단가: 1805.0, 수수료: 494.79924644171336, 매도결과금:989103.693636985
    return original_volume, calc_price, fee, calc_krw_sum

In [7]:
sql = select_all_from_order_book_for_one_coin.format("BTC").replace("\n", "")

In [8]:
def build_timeseries(data, data_normalized, window_size, future_target_size, up_rate):
    future_target = future_target_size - 1

    dim_0 = data.shape[0] - window_size - future_target
    dim_1 = data.shape[1]

    x = torch.zeros((dim_0, window_size, dim_1)).to(DEVICE)
    x_normalized = torch.zeros((dim_0, window_size, dim_1)).to(DEVICE)

    y = torch.zeros((dim_0,)).to(DEVICE)
    y_up = torch.zeros((dim_0,)).float().to(DEVICE)

    for i in range(dim_0):
        x[i] = data[i: i + window_size]
        x_normalized[i] = data_normalized[i: i + window_size]

    count_one = 0
    for i in range(dim_0):
        max_price = -1.0

        ask_price_lst = []
        ask_size_lst = []
        for w in range(0, 60, 4):
            ask_price_lst.append(x[i][-1][1 + w].item())
            ask_size_lst.append(x[i][-1][3 + w].item())

        invest_krw = get_invest_krw(
            current_price=x[i][-1][1].item(),
            total_ask_size=x[i][-1][121],
            total_bid_size=x[i][-1][123]
        )

        original_krw, fee, calc_price, calc_size_sum = get_expected_buy_coin_price_for_krw_and_ask_list(
            ask_price_lst=ask_price_lst,
            ask_size_lst=ask_size_lst,
            krw=invest_krw,
            transaction_fee_rate=TRANSACTION_FEE_RATE
        )

        for j in range(future_target + 1):
            bid_price_lst = []
            bid_size_lst = []
            for w in range(0, 60, 4):
                bid_price_lst.append(data[i + window_size + j][61 + w].item())
                bid_size_lst.append(data[i + window_size + j][63 + w].item())

            original_volume, future_price, fee, future_krw_sum = get_expected_sell_coin_price_for_volume_and_bid_list(
                bid_price_lst=bid_price_lst,
                bid_size_lst=bid_size_lst,
                volume=calc_size_sum,
                transaction_fee_rate=TRANSACTION_FEE_RATE
            )

            if future_price > max_price:
                max_price = future_price

        y[i] = max_price

        if y[i] > calc_price * (1 + up_rate):
            y_up[i] = 1
            count_one += 1

    return x, x_normalized, y, y_up, count_one / dim_0, dim_0

In [9]:
coin_name = "ADA"
sqlite3_order_book_db_filename = PROJECT_HOME + "db/upbit_order_book_info.db"

In [10]:
def get_dataset(data_length, split=True):
    df = pd.read_sql_query(
        select_all_from_order_book_for_one_coin.format(coin_name),
        sqlite3.connect(sqlite3_order_book_db_filename, timeout=10, check_same_thread=False)
    )

    df = df.drop(["base_datetime", "collect_timestamp"], axis=1)[:data_length]
    
    data = torch.from_numpy(df.values).to(DEVICE)

    min_max_scaler = MinMaxScaler()
    data_normalized = min_max_scaler.fit_transform(df.values)
    data_normalized = torch.from_numpy(data_normalized).to(DEVICE)

    x, x_normalized, y, y_up, one_rate, total_size = build_timeseries(
        data=data,
        data_normalized=data_normalized,
        window_size=WINDOW_SIZE,
        future_target_size=FUTURE_TARGET_SIZE,
        up_rate=UP_RATE
    )
    
    print(one_rate, total_size)
    
    # Imbalanced Preprocessing - Start
    if one_rate > 0.01:
        x_normalized = x_normalized.cpu()
        y_up = y_up.cpu()

        try:
            x_samp, y_up_samp = RandomUnderSampler(sampling_strategy=0.75).fit_sample(
                x_normalized.reshape((x_normalized.shape[0], x_normalized.shape[1] * x_normalized.shape[2])),
                y_up
            )
            x_normalized = torch.from_numpy(
                x_samp.reshape(x_samp.shape[0], x_normalized.shape[1], x_normalized.shape[2])
            ).to(DEVICE)
            y_up = torch.from_numpy(y_up_samp).to(DEVICE)
        except ValueError:
            logger.info("{0} - {1}".format(coin_name, "RandomUnderSampler - ValueError"))
            x_normalized = x_normalized.to(DEVICE)
            y_up = y_up.to(DEVICE)
    # Imbalanced Preprocessing - End
    
    total_size = len(x_normalized)
        
    if split:

        indices = list(range(total_size))
        np.random.shuffle(indices)

        train_indices = list(set(indices[:int(total_size * 0.8)]))
        validation_indices = list(set(range(total_size)) - set(train_indices))

        x_train_normalized = x_normalized[train_indices]
        x_valid_normalized = x_normalized[validation_indices]

        y_up_train = y_up[train_indices]
        y_up_valid = y_up[validation_indices]

        one_rate_train = y_up_train.sum().float() / y_up_train.size(0)
        one_rate_valid = y_up_valid.sum().float() / y_up_valid.size(0)

        train_size = x_train_normalized.size(0)
        valid_size = x_valid_normalized.size(0)

        return x_train_normalized, y_up_train, one_rate_train, train_size,\
               x_valid_normalized, y_up_valid, one_rate_valid, valid_size
    else:
        one_rate = y_up.sum().float() / y_up.size(0)
        return x_normalized, y_up, one_rate, total_size

In [11]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import GradientBoostingClassifier


def get_best_model_by_nested_cv(X, y, inner_cv, outer_cv, Classifier, parameter_grid):
    outer_score_list = []
    best_param_list = []
    model_list = []
    
    num_outer_split = 1
    for training_samples_idx, test_samples_idx in outer_cv.split(X, y):
        print("[Outer Split: #{0}]".format(num_outer_split))
        best_parms = {}
        best_score = -np.inf

        for parameters in parameter_grid:
#             print("Parameters: {0}".format(parameters))
            cv_scores = []
            num_inner_split = 1
            for inner_train_idx, inner_test_idx in inner_cv.split(X[training_samples_idx], y[training_samples_idx]):
                clf = Classifier(**parameters)
                clf.fit(X[inner_train_idx], y[inner_train_idx])
                score = clf.score(X[inner_test_idx], y[inner_test_idx])
                
                cv_scores.append(score)
#                 print("Inner Split: #{0}, Score: #{1}".format(
#                     num_inner_split,
#                     score
#                 ))
                num_inner_split += 1

            mean_score = np.mean(cv_scores)
            if mean_score > best_score:
                best_score = mean_score
                best_params = parameters
#             print("Mean Score:{0}, Best Score:{1}".format(mean_score, best_score))

        print("* Outer Split: #{0}, Best Score: {1}, Best Parameter: #{2} ***\n".format(
            num_outer_split,
            best_score,
            best_params
        ))

        clf = Classifier(**best_params)
        clf.fit(X[training_samples_idx], y[training_samples_idx])

        best_param_list.append(best_params)
        outer_score_list.append(clf.score(X[test_samples_idx], y[test_samples_idx]))
        model_list.append(clf)
        
        num_outer_split += 1

    best_score = -np.inf
    best_model = None
    for idx, score in enumerate(outer_score_list):        
        if score > best_score:
            best_score = score
            best_model = model_list[idx]

    return best_score, best_model

In [12]:
DATA_LENGTH = 480

x_normalized_original, y_up_original, one_rate, total_size = get_dataset(DATA_LENGTH, split=False)

print("x_normalized_original: {0}, y_up_original: {1}, one_rate: {2}, total_size: {3}".format(
    x_normalized_original.size(),
    y_up_original.size(),
    one_rate,
    total_size
))

0.13258426966292136 445
x_normalized_original: torch.Size([137, 18, 125]), y_up_original: torch.Size([137]), one_rate: 0.43065693974494934, total_size: 137


In [13]:
def make_sklearn_model(coin_name, x_normalized_original, y_up_original, total_size, one_rate):    
    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': np.linspace(1, 9, 4, endpoint=True),
        'n_estimators': [32, 64, 128],
        'max_features': list(range(int(x_normalized_original.shape[1] / 2), x_normalized_original.shape[1], 2)),
    }
        
    is_high_quality = False

    batch_size = 16
    patience = 30

    coin_model_start_time = time.time()
    
    X = x_normalized_original.numpy().reshape(total_size, -1)
    y = y_up_original.numpy()
    
#     print("X.shape: {0}".format(X.shape))
#     print("y.shape: {0}".format(y.shape))

    best_score, best_model = get_best_model_by_nested_cv(
        X=X,
        y=y,
        inner_cv=StratifiedKFold(n_splits=4, shuffle=True),
        outer_cv=StratifiedKFold(n_splits=4, shuffle=True),
        Classifier=GradientBoostingClassifier,
        parameter_grid=ParameterGrid(param_grid)
    )
    
    print(best_model)
    return best_model

In [34]:
def get_dataset_for_buy(coin_name, model_type="LSTM"):
    df = pd.read_sql_query(
        select_all_from_order_book_for_one_coin_recent_window.format(coin_name, WINDOW_SIZE),
        sqlite3.connect(sqlite3_order_book_db_filename, timeout=10, check_same_thread=False)
    )

    df = df.sort_values(['collect_timestamp', 'base_datetime'], ascending=True)
    df = df.drop(["base_datetime", "collect_timestamp"], axis=1)

    min_max_scaler = MinMaxScaler()
    data_normalized = min_max_scaler.fit_transform(df.values)
    data_normalized = torch.from_numpy(data_normalized).float().to(DEVICE)
    
    if model_type == "LSTM":
        return data_normalized.unsqueeze(dim=0)
    else:
        data_normalized = data_normalized.flatten()
        return data_normalized.unsqueeze(dim=0)
        

In [35]:
X_prediction = get_dataset_for_buy("BTC")
print(X_prediction.size())

torch.Size([1, 18, 125])


In [16]:
upbit = Upbit(CLIENT_ID_UPBIT, CLIENT_SECRET_UPBIT, fmt)
for coin_name in upbit.get_all_coin_names():
    print("***************", coin_name)
    best_model = make_sklearn_model(coin_name, x_normalized_original, y_up_original, total_size, one_rate)
    
    X_prediction = get_dataset_for_buy(coin_name)
    
    total_size = X_prediction.size(0)
    X_prediction = X_prediction.view(total_size, -1)
    y_prediction = best_model.predict_proba(X_prediction[-1].unsqueeze(dim=0))

    print(y_prediction)

*************** QKC
[Outer Split: #1]
* Outer Split: #1, Best Score: 0.9703846153846154, Best Parameter: #{'learning_rate': 0.01, 'max_depth': 3.6666666666666665, 'max_features': 13, 'n_estimators': 128} ***

[Outer Split: #2]
* Outer Split: #2, Best Score: 0.97, Best Parameter: #{'learning_rate': 0.1, 'max_depth': 1.0, 'max_features': 11, 'n_estimators': 32} ***

[Outer Split: #3]
* Outer Split: #3, Best Score: 0.9615384615384616, Best Parameter: #{'learning_rate': 0.1, 'max_depth': 1.0, 'max_features': 9, 'n_estimators': 128} ***

[Outer Split: #4]
* Outer Split: #4, Best Score: 0.9518660968660968, Best Parameter: #{'learning_rate': 0.1, 'max_depth': 3.6666666666666665, 'max_features': 17, 'n_estimators': 32} ***

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=1.0,
                           max_features=11, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_

ValueError: cannot reshape array of size 308250 into shape (4137,newaxis)