In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold

import tensorflow as tf
# tf.random.set_seed(87)

import keras.backend as K
from keras.models import Model
from keras.layers import Input, BatchNormalization, GaussianNoise, Dense, Flatten
from keras.layers import Activation, Dropout, Concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam

import keras_tuner as kt
from keras_tuner.tuners import RandomSearch
from keras_tuner.engine.hyperparameters import HyperParameters

from livelossplot import PlotLossesKeras
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

### 一、資料前處理

### 二、建立資料集

2.1. open, high, low, close, volume, financing, fi, ii, di, rp, capital, 5ma, 10ma, 20ma (看5天後答案)

In [2]:
def get_label_data(shift):
    """
    open_time: K 線開盤時間 (UNIX 時間格式)
    open: 開盤價格
    high: 最高價
    low: 最低價
    close: 收盤價格
    volume: 交易量
    close_time: K 線收盤時間 (UNIX 時間格式)
    quote_volume: 報價資產量
    count: 交易數量
    taker_buy_volume: 吃單方在此期間買入的基礎資產數量
    taker_buy_quote_volume: 吃單方在此期間買入的報價資產數量
    """
    ticker_df = pd.read_csv('BTC_Test.csv')
    ticker_df['timestamp'] = pd.to_datetime(ticker_df['timestamp'])
    ticker_df = ticker_df.drop(columns=['close_time']) # 這個column沒有用

    data = ticker_df.copy()

    ### n個K棒後的答案 ###
    shift = 4 # 設定
    data[f'close_{shift}'] = data['close'].shift(-shift)
    data['ans'] = data[f'close_{shift}'] - data['close']

    ### 將timestamp變成index ###
    data.set_index('timestamp', inplace=True) # 先讓兩邊的index一樣才能併資料
    ticker_df.set_index('timestamp', inplace=True)

    data = data.dropna() # 為了讓NaN不會變0要先dropna()
    data['ans'] = (data['ans'] >= 0).astype(int)
    data = data.drop(columns=[f'close_{shift}'])

    ticker_df['ans'] = data['ans'] # 併資料
    ticker_df = ticker_df.dropna()

    ### 還原index ###
    data = data.reset_index()
    ticker_df = ticker_df.reset_index()

    return data, ticker_df

In [3]:
data, ticker_df = get_label_data(shift=4)
data.tail()

Unnamed: 0,timestamp,open,high,low,close,volume,quote_volume,count,taker_buy_volume,taker_buy_quote_volume,ignore,ema_9,ema_12,ema_26,macd,signal,rsi_14,ans
958,2024-03-05 06:00:00,67062.4,67165.0,66700.1,67058.1,4975.596,333150000.0,61523,2412.848,161556500.0,0,67353.263428,67353.263428,67584.987174,-231.723746,-69.699478,42.771435,1
959,2024-03-05 06:15:00,67058.0,67435.0,67052.6,67419.2,4636.991,311985400.0,67732,2694.75,181306000.0,0,67363.407516,67363.407516,67572.706642,-209.299126,-97.619408,47.918953,0
960,2024-03-05 06:30:00,67419.2,67565.3,67287.2,67370.1,3797.674,256186900.0,61614,1798.001,121297700.0,0,67364.437129,67364.437129,67557.698743,-193.261614,-116.747849,47.29601,0
961,2024-03-05 06:45:00,67370.0,67566.7,67273.9,67368.4,3027.573,204141900.0,46184,1381.318,93153990.0,0,67365.046801,67365.046801,67543.676614,-178.629812,-129.124242,47.273095,0
962,2024-03-05 07:00:00,67368.5,67438.5,67161.8,67263.0,2437.294,163953500.0,40707,1170.93,78770320.0,0,67349.347293,67349.347293,67522.885753,-173.53846,-138.007085,45.791784,0


2.2. 分拆X, y

In [8]:
def data_preprocess(data, window_size): # target在data裡面

    ### 將data標準化成0~1 ###
    scaled_data = data.copy()
    scaled_data = scaled_data.drop(columns=['timestamp']) # 把timestamp和Ticker拿掉
    columns_to_scale = list(scaled_data.columns)

    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()

    scaled_data[columns_to_scale] = scaler_X.fit_transform(scaled_data[columns_to_scale])
    scaled_data = scaled_data.drop(columns=['ans'])

    X = np.array(scaled_data)
    y = scaler_y.fit_transform(data['ans'].values.reshape(-1, 1))

    ### 加入window_size ###
    data_X, data_y = [], []
    for i in range(len(X)-window_size-1):
        a = X[i:(i+window_size), :]
        data_X.append(a)
        data_y.append(y[i+window_size])

    data_X, data_y = np.array(data_X), np.array(data_y)

    return data_X, data_y

In [9]:
X, y = data_preprocess(data=data, window_size=20)
X.shape

(942, 20, 16)

2.2.5. Flatten (MLP限定)

In [10]:
def make_X_flatten(X):
    X_flatten = X.reshape((X.shape[0], X.shape[1] * X.shape[2]))

    return X_flatten

In [11]:
X_flatten = make_X_flatten(X)

2.4. 分拆train, valid

In [8]:
### 製作 trainning 資料 ###
train_X, train_y = get_label_data(unique_ticker=unique_ticker,
                                       start_date='2021-02-24', end_date='2023-03-04', shift=10,
                                       window_size=20)

### 製作 Validation 資料 ###
valid_X, valid_y = get_label_data(unique_ticker=unique_ticker,
                                       start_date='2023-03-05', end_date='2023-03-07', shift=10,
                                       window_size=20)

4938 is failed
6153 is failed
4130 is failed
6901 is failed
6526 is failed
6805 is failed
4938 is failed
6153 is failed
4130 is failed
6526 is failed
6805 is failed


### 三、訓練AE-MLP

3.1. 建構AE-MLP model

In [9]:
def create_ae_mlp(hp, num_columns, num_labels):
    
    ##########
    ##########
    units_1 = hp.Int(name="units_1", min_value=16, max_value=256, step=16)
    units_2 = hp.Int(name="units_2", min_value=16, max_value=256, step=16)
    units_3 = hp.Int(name="units_3", min_value=16, max_value=256, step=16)
    units_4 = hp.Int(name="units_4", min_value=16, max_value=256, step=16)
    units_5 = hp.Int(name="units_5", min_value=16, max_value=256, step=16)
    units_6 = hp.Int(name="units_6", min_value=16, max_value=256, step=16)

    hidden_units = [units_1, units_2,
                    units_3,
                    units_4, units_5, units_6]
    
    dropout_1 = hp.Choice("dropout_1", [0.0, 0.2, 0.5])
    dropout_2 = hp.Choice("dropout_2", [0.0, 0.2, 0.5])
    dropout_3 = hp.Choice("dropout_3", [0.0, 0.2, 0.5])
    dropout_4 = hp.Choice("dropout_4", [0.0, 0.2, 0.5])
    dropout_5 = hp.Choice("dropout_5", [0.0, 0.2, 0.5])
    dropout_6 = hp.Choice("dropout_6", [0.0, 0.2, 0.5])
    dropout_7 = hp.Choice("dropout_7", [0.0, 0.2, 0.5])
    dropout_8 = hp.Choice("dropout_8", [0.0, 0.2, 0.5])

    dropout_rates = [dropout_1, dropout_2, dropout_3, dropout_4,
                     dropout_5, dropout_6, dropout_7, dropout_8]
    
    ls = hp.Choice('ls',[1e-2, 1e-3, 1e-5])
    lr = hp.Choice('lr',[1e-2, 1e-3, 1e-5])
    ##########
    ##########

    inp = tf.keras.layers.Input(shape = (num_columns, ))
    x0 = tf.keras.layers.BatchNormalization()(inp)
    
    encoder = tf.keras.layers.GaussianNoise(dropout_rates[0])(x0)
    encoder = tf.keras.layers.Dense(hidden_units[0])(encoder)
    encoder = tf.keras.layers.BatchNormalization()(encoder)
    encoder = tf.keras.layers.Activation('swish')(encoder)
    
    decoder = tf.keras.layers.Dropout(dropout_rates[1])(encoder)
    decoder = tf.keras.layers.Dense(num_columns, name = 'decoder')(decoder)  

    x_ae = tf.keras.layers.Dense(hidden_units[1])(decoder)
    x_ae = tf.keras.layers.BatchNormalization()(x_ae)
    x_ae = tf.keras.layers.Activation('swish')(x_ae)
    x_ae = tf.keras.layers.Dropout(dropout_rates[2])(x_ae)

    out_ae = tf.keras.layers.Dense(num_labels, activation = 'sigmoid', name = 'ae_action')(x_ae)
    
    x = tf.keras.layers.Concatenate()([x0, encoder])
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dropout(dropout_rates[3])(x)
    
    for i in range(2, len(hidden_units)):
        x = tf.keras.layers.Dense(hidden_units[i])(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation('swish')(x)
        x = tf.keras.layers.Dropout(dropout_rates[i + 2])(x)
        
    out = tf.keras.layers.Dense(num_labels, activation = 'sigmoid', name = 'action')(x)
    
    model = tf.keras.models.Model(inputs = inp, outputs = [decoder, out_ae, out])
    model.compile(optimizer = tf.keras.optimizers.legacy.Adam(learning_rate = lr),
                  loss = {'decoder': tf.keras.losses.MeanSquaredError(), 
                          'ae_action': tf.keras.losses.BinaryCrossentropy(label_smoothing = ls),
                          'action': tf.keras.losses.BinaryCrossentropy(label_smoothing = ls), 
                         },
                  metrics = {'decoder': tf.keras.metrics.MeanAbsoluteError(name = 'MAE'), 
                             'ae_action': tf.keras.metrics.AUC(name = 'AUC'), 
                             'action': tf.keras.metrics.AUC(name = 'AUC'), 
                            }, 
                 )
    
    return model

3.2. 訓練, 找AE-MLP參數

In [10]:
TUNNING = True

if TUNNING:
    model_fn = lambda hp: create_ae_mlp(hp, num_columns=train_X.shape[1], num_labels=1)
    tuner = kt.BayesianOptimization(
                                    model_fn,
                                    objective=kt.Objective("val_action_AUC", direction="max"),
                                    max_trials=1,
                                    executions_per_trial=1,
                                    directory="AE-MLP_kt_test",
                                    overwrite=True,
                                    seed=87
                                    )
    # tuner.search_space_summary()
    es = EarlyStopping(monitor = 'val_action_AUC', min_delta = 1e-4, patience = 10, mode = 'max', 
                   baseline = None, restore_best_weights = True, verbose = 0)
    
    tuner.search(train_X, [train_X, train_y, train_y], validation_split=0.2, epochs=100, 
                 batch_size = 4096, callbacks = [es], verbose = 1)
    AE_MLP_model = tuner.get_best_models()[0]

Trial 1 Complete [00h 02m 13s]
val_action_AUC: 0.5888948440551758

Best val_action_AUC So Far: 0.5888948440551758
Total elapsed time: 00h 02m 13s


3.3. 輸出最佳參數

In [12]:
best_hyperparameters = tuner.get_best_hyperparameters()[0]

print("Best Hyperparameters:")
print(best_hyperparameters.values)

# 0.588 # {'units_1': 48, 'units_2': 112, 'units_3': 32, 'units_4': 64, 'units_5': 32, 'units_6': 112, 'dropout_1': 0.5, 'dropout_2': 0.2, 'dropout_3': 0.5, 'dropout_4': 0.2, 'dropout_5': 0.0, 'dropout_6': 0.2, 'dropout_7': 0.2, 'dropout_8': 0.0, 'ls': 0.001, 'lr': 0.01}

Best Hyperparameters:
{'units_1': 48, 'units_2': 112, 'units_3': 32, 'units_4': 64, 'units_5': 32, 'units_6': 112, 'dropout_1': 0.5, 'dropout_2': 0.2, 'dropout_3': 0.5, 'dropout_4': 0.2, 'dropout_5': 0.0, 'dropout_6': 0.2, 'dropout_7': 0.2, 'dropout_8': 0.0, 'ls': 0.001, 'lr': 0.01}


### 四、評估訓練出來的model

4.1. 測試model在所有股票上

In [14]:
def test_my_model_whole_data(valid_X, valid_y, model):

    predictions = model.predict(valid_X)
    pred_k = predictions[2]
    pred_k = (pred_k > 0.5).astype(int)

    result_df = pd.DataFrame(pred_k, columns=['Pred'])
    result_df['True'] = valid_y

    match_count = (result_df['Pred'] == result_df['True']).sum()
    correct = match_count / len(result_df)

    print(f'ACC: {correct}\n')

    return result_df

result_df = test_my_model_whole_data(valid_X=valid_X, valid_y=valid_y, model=AE_MLP_model)

ACC: 0.6094329514547103



4.2. 測試model在單一股票上

In [15]:
def test_my_model_one_ticker(ticker, model):

    test_X, test_y = make_concat_data_v1(unique_ticker=[ticker], start_date='2023-07-01', end_date='2023-11-30', shift=10, window_size=20)

    predictions = model.predict(test_X)
    pred_k = predictions[2]
    pred_k = (pred_k > 0.5).astype(int)

    result_df = pd.DataFrame(pred_k, columns=['Pred'])
    result_df['True'] = test_y

    match_count = (result_df['Pred'] == result_df['True']).sum()
    correct = match_count / len(result_df)

    print(f'\n{ticker} ACC: {correct}\n')
    print(result_df.head(60))

    return result_df

result_df = test_my_model_one_ticker(ticker=2330, model=AE_MLP_model)


2330 ACC: 0.6181818181818182

    Pred  True
0      0   0.0
1      0   0.0
2      0   0.0
3      0   1.0
4      0   1.0
5      0   0.0
6      0   0.0
7      1   0.0
8      1   0.0
9      1   0.0
10     0   0.0
11     1   0.0
12     0   0.0
13     0   0.0
14     0   0.0
15     0   0.0
16     1   0.0
17     1   0.0
18     1   1.0
19     1   1.0
20     1   1.0
21     1   1.0
22     1   1.0
23     1   1.0
24     1   1.0
25     1   1.0
26     1   1.0
27     1   1.0
28     1   1.0
29     1   1.0
30     1   0.0
31     1   0.0
32     1   0.0
33     1   0.0
34     1   0.0
35     1   1.0
36     1   0.0
37     1   1.0
38     1   1.0
39     1   1.0
40     1   1.0
41     1   1.0
42     1   1.0
43     1   1.0
44     1   1.0
45     1   1.0
46     0   1.0
47     0   1.0
48     0   1.0
49     0   1.0
50     0   1.0
51     0   1.0
52     0   0.0
53     1   1.0
54     1   0.0


### 五、利用找出的參數(model)進行CV, 完整訓練, 預測

5.1. 我的model

In [16]:
def create_ae_mlp(num_columns, num_labels, hidden_units, dropout_rates, ls, lr):
    
    tf.random.set_seed(87)
    
    inp = Input(shape = (num_columns, ))
    x0 = BatchNormalization()(inp)
    
    encoder = GaussianNoise(dropout_rates[0])(x0)
    encoder = Dense(hidden_units[0])(encoder)
    encoder = BatchNormalization()(encoder)
    encoder = Activation('swish')(encoder)
    
    decoder = Dropout(dropout_rates[1])(encoder)
    decoder = Dense(num_columns, name='decoder')(decoder)

    x_ae = Dense(hidden_units[1])(decoder)
    x_ae = BatchNormalization()(x_ae)
    x_ae = Activation('swish')(x_ae)
    x_ae = Dropout(dropout_rates[2])(x_ae)

    out_ae = Dense(num_labels, activation = 'sigmoid', name='ae_action')(x_ae)
    
    x = Concatenate()([x0, encoder])
    x = BatchNormalization()(x)
    x = Dropout(dropout_rates[3])(x)

    for i in range(2, len(hidden_units)):
        x = Dense(hidden_units[i])(x)
        x = BatchNormalization()(x)
        x = Activation('swish')(x)
        x = Dropout(dropout_rates[i + 2])(x)
        
    out = Dense(num_labels, activation = 'sigmoid', name = 'action')(x)
    
    model = Model(inputs = inp, outputs = [decoder, out_ae, out])
    model.compile(optimizer = Adam(learning_rate = lr),
                  loss = {'decoder': tf.keras.losses.MeanSquaredError(), 
                          'ae_action': tf.keras.losses.BinaryCrossentropy(label_smoothing = ls),
                          'action': tf.keras.losses.BinaryCrossentropy(label_smoothing = ls), 
                         },
                  metrics = {'decoder': tf.keras.metrics.MeanAbsoluteError(name = 'MAE'), 
                             'ae_action': tf.keras.metrics.AUC(name = 'AUC'), 
                             'action': tf.keras.metrics.AUC(name = 'AUC'), 
                            }, 
                 )
    
    return model

params = {'num_columns': train_X.shape[1],
          'num_labels': 1,
          'hidden_units': [48, 112, 32, 64, 32, 112],
          'dropout_rates': [0.5, 0.2, 0.5, 0.2, 0.0, 0.2, 0.2, 0.0],
          'ls': 0.001, 'lr': 0.01}

5.2. 利用找出的參數進行model的CV

In [17]:
def AE_MLP_CV(batch_size, X, y):

    # import gc
    
    scores = []
    kf = KFold(n_splits=5, shuffle=True, random_state=87)
    split_indices = kf.split(X)
    for fold, (tr, te) in enumerate(split_indices):

        AE_MLP_path = f'AE_MLP_{fold+1}.hdf5'
        model = create_ae_mlp(**params)
        ckp = ModelCheckpoint(AE_MLP_path, monitor = 'val_action_AUC', verbose = 0, 
                            save_best_only = True, save_weights_only = True, mode = 'max')
        es = EarlyStopping(monitor = 'val_action_AUC', min_delta = 1e-4, patience = 10, mode = 'max', 
                        baseline = None, restore_best_weights = True, verbose = 0)
        history = model.fit(X[tr], [X[tr], y[tr], y[tr]], validation_data = (X[te], [X[te], y[te], y[te]]), 
                            # sample_weight = sw[tr], 
                            epochs = 100, batch_size = batch_size, callbacks = [ckp, es], verbose = 1)
        
        hist = pd.DataFrame(history.history)
        score = hist['val_action_AUC'].max()
        print(f'Fold {fold+1} \tROC AUC:', score)
        scores.append(score)

        K.clear_session()
        del model
        
        # rubbish = gc.collect()

    print('Average ROC AUC:', np.mean(scores))

    return hist, scores

hist, scores = AE_MLP_CV(batch_size=4096, X=train_X, y=train_y)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# ### 偷懶不CV ###
# def AE_MLP_noCV(batch_size, X, y):

#     AE_MLP_path = f'AE_MLP.hdf5'
#     model = create_ae_mlp(**params)
#     ckp = ModelCheckpoint(AE_MLP_path, monitor = 'val_action_AUC', verbose = 0, 
#                         save_best_only = True, save_weights_only = True, mode = 'max')
#     es = EarlyStopping(monitor = 'val_action_AUC', min_delta = 1e-4, patience = 10, mode = 'max', 
#                     baseline = None, restore_best_weights = True, verbose = 0)
#     history = model.fit(X, [X, y, y], validation_split=0.2, 
#                         # sample_weight = sw[tr], 
#                         epochs = 100, batch_size = batch_size, callbacks = [ckp, es], verbose = 1)

#     hist = pd.DataFrame(history.history)
#     score = hist['val_action_AUC'].max()
#     print(f'ROC AUC:', score)

#     return history, model

# history, model = AE_MLP_noCV(batch_size=4096, X=train_X, y=train_y)

5.3. 取出CV最佳的model

In [18]:
model = create_ae_mlp(**params)  
best_model_path = 'AE_MLP_4.hdf5'  # 選CV中預測能力最好的權重
model.load_weights(best_model_path) 

5.4. 測試在所有股票上

In [19]:
def test_my_model_whole_data(valid_X, valid_y, model):

    predictions = model.predict(valid_X)
    pred_k = predictions[2]
    pred_k = (pred_k > 0.5).astype(int)

    result_df = pd.DataFrame(pred_k, columns=['Pred'])
    result_df['True'] = valid_y

    match_count = (result_df['Pred'] == result_df['True']).sum()
    correct = match_count / len(result_df)

    print(f'ACC: {correct}\n')

    return result_df

result_df = test_my_model_whole_data(valid_X=valid_X, valid_y=valid_y, model=model)

ACC: 0.6175100964312207



5.5. 測試在單一股票上

In [25]:
def test_my_model_one_ticker(ticker, model):

    test_X, test_y = make_concat_data_v1(unique_ticker=[ticker], start_date='2023-07-01', end_date='2023-11-30', shift=10, window_size=20)

    predictions = model.predict(test_X)
    pred_k = predictions[2]
    pred_k = (pred_k > 0.5).astype(int)

    result_df = pd.DataFrame(pred_k, columns=['Pred'])
    result_df['True'] = test_y

    match_count = (result_df['Pred'] == result_df['True']).sum()
    correct = match_count / len(result_df)

    print(f'\n{ticker} ACC: {correct}\n')
    print(result_df.head(60))

    return result_df

result_df = test_my_model_one_ticker(ticker=2330, model=model)


2330 ACC: 0.6909090909090909

    Pred  True
0      1   0.0
1      1   0.0
2      1   0.0
3      1   1.0
4      0   1.0
5      0   0.0
6      0   0.0
7      0   0.0
8      0   0.0
9      0   0.0
10     0   0.0
11     1   0.0
12     1   0.0
13     0   0.0
14     0   0.0
15     0   0.0
16     1   0.0
17     0   0.0
18     1   1.0
19     1   1.0
20     1   1.0
21     1   1.0
22     1   1.0
23     1   1.0
24     1   1.0
25     1   1.0
26     1   1.0
27     1   1.0
28     1   1.0
29     1   1.0
30     1   0.0
31     1   0.0
32     1   0.0
33     1   0.0
34     1   0.0
35     1   1.0
36     1   0.0
37     1   1.0
38     1   1.0
39     1   1.0
40     1   1.0
41     1   1.0
42     1   1.0
43     1   1.0
44     1   1.0
45     1   1.0
46     0   1.0
47     0   1.0
48     1   1.0
49     1   1.0
50     1   1.0
51     1   1.0
52     1   0.0
53     1   1.0
54     1   0.0
