In [1]:
import numpy as np
import pandas as pd

from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import tensorflow as tf
tf.random.set_seed(87)

from keras.models import Model
from keras.layers import Input, Dense ,Dropout 
from keras.layers import BatchNormalization, GaussianNoise, Activation, Concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint

import keras_tuner as kt
from keras_tuner.engine.hyperparameters import HyperParameters
from livelossplot import PlotLossesKeras

import warnings
warnings.filterwarnings('ignore')

# Data preprocessing

1.1. Import data

In [2]:
#############
TICKER = 2330
TP = 1
#############

### import data ###
data = pd.read_csv('/Users/yitsung/Desktop/MasterThesis/data/TaiwanStockData_Top100_EMA')
ticker_data = data[data['ticker']==TICKER].reset_index(drop=True)
ticker_data = ticker_data.drop(columns=['ticker'])

# (SMA-P/P, 2class) #
ticker_data[f'y_{TP}'] = ticker_data['close'].rolling(window=TP).mean()
ticker_data[f'y_{TP}'] = ticker_data[f'y_{TP}'].shift(-TP)
ticker_data = ticker_data.dropna().reindex()
ticker_data[f'y_{TP}'] = ((ticker_data[f'y_{TP}'] - ticker_data['close']) >= 0).astype(int)

ticker_data

Unnamed: 0,Date,open,high,low,close,volume,financing,fi,ii,di,rp,capital,EMA9,EMA12,EMA26,MACD,Signal,RSI14,y_1
0,2021-01-04,530.0,540.0,528.0,536.0,39490.0,454.0,12463.0,-33.0,865.0,2342.0,6.0443,521.295251,518.980386,513.251221,5.729165,3.933239,84.477581,1
1,2021-01-05,536.0,542.0,535.0,542.0,34839.0,-355.0,2884.0,179.0,-451.0,-1374.0,5.3592,525.437881,522.532126,515.535238,6.996887,4.619674,88.417310,1
2,2021-01-06,555.0,555.0,541.0,549.0,55614.0,-256.0,5355.0,105.0,-4163.0,1.0,6.9696,530.151835,526.614084,518.179719,8.434365,5.454306,91.005801,1
3,2021-01-07,554.0,570.0,553.0,565.0,53393.0,2200.0,1671.0,-75.0,2060.0,-402.0,8.7664,537.123278,532.531850,521.861371,10.670478,6.574521,93.325963,1
4,2021-01-08,580.0,580.0,571.0,580.0,62957.0,-502.0,3278.0,187.0,1176.0,-5041.0,9.0658,545.700404,539.847445,526.412277,13.435169,8.026473,94.939847,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
703,2023-11-24,577.0,578.0,574.0,575.0,12503.0,243.0,-854.0,70.0,-118.0,-2263.0,2.8318,575.073961,572.538736,562.541337,9.997398,9.134503,90.744592,0
704,2023-11-27,573.0,577.0,568.0,568.0,20322.0,-112.0,-2153.0,59.0,-56.0,-3554.0,4.1507,573.659169,571.840469,562.945683,8.894786,9.086560,81.069290,1
705,2023-11-28,565.0,576.0,565.0,575.0,26932.0,478.0,3323.0,-98.0,687.0,-416.0,5.1624,573.927335,572.326550,563.838595,8.487955,8.966839,76.500832,0
706,2023-11-29,578.0,579.0,570.0,574.0,27787.0,357.0,-180.0,55.0,-553.0,-2383.0,4.8624,573.941868,572.584004,564.591292,7.992712,8.772014,71.301362,1


1.2.Splite data into train(Library) and test(Prediction)

In [3]:
Library = ticker_data[ticker_data['Date'] <= '2023-06-30'] # windows=20, the last prediction from Library is 6/30
Prediction = ticker_data[(ticker_data['Date'] >= '2023-06-01')&(ticker_data['Date'] <= '2023-10-31')] # windows=20, start from using 6/1 to predict 7/3

In [4]:
Library

Unnamed: 0,Date,open,high,low,close,volume,financing,fi,ii,di,rp,capital,EMA9,EMA12,EMA26,MACD,Signal,RSI14,y_1
0,2021-01-04,530.0,540.0,528.0,536.0,39490.0,454.0,12463.0,-33.0,865.0,2342.0,6.0443,521.295251,518.980386,513.251221,5.729165,3.933239,84.477581,1
1,2021-01-05,536.0,542.0,535.0,542.0,34839.0,-355.0,2884.0,179.0,-451.0,-1374.0,5.3592,525.437881,522.532126,515.535238,6.996887,4.619674,88.417310,1
2,2021-01-06,555.0,555.0,541.0,549.0,55614.0,-256.0,5355.0,105.0,-4163.0,1.0,6.9696,530.151835,526.614084,518.179719,8.434365,5.454306,91.005801,1
3,2021-01-07,554.0,570.0,553.0,565.0,53393.0,2200.0,1671.0,-75.0,2060.0,-402.0,8.7664,537.123278,532.531850,521.861371,10.670478,6.574521,93.325963,1
4,2021-01-08,580.0,580.0,571.0,580.0,62957.0,-502.0,3278.0,187.0,1176.0,-5041.0,9.0658,545.700404,539.847445,526.412277,13.435169,8.026473,94.939847,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
598,2023-06-26,576.0,578.0,574.0,574.0,29870.0,164.0,-4194.0,-3.0,314.0,-2859.0,5.6543,579.105659,576.716989,562.864922,13.852067,15.124979,50.802228,0
599,2023-06-27,570.0,575.0,569.0,572.0,22447.0,-71.0,-3850.0,-2006.0,501.0,-168.0,4.2795,577.684527,575.991298,563.541594,12.449704,14.589924,45.390598,1
600,2023-06-28,579.0,579.0,571.0,574.0,18685.0,-113.0,-3437.0,-93.0,378.0,-1260.0,4.0474,576.947622,575.684944,564.316291,11.368654,13.945670,41.019391,0
601,2023-06-29,578.0,580.0,570.0,573.0,18046.0,28.0,-1355.0,-121.0,1407.0,-434.0,3.7922,576.158098,575.271876,564.959529,10.312347,13.219005,34.887970,1


In [5]:
Prediction

Unnamed: 0,Date,open,high,low,close,volume,financing,fi,ii,di,rp,capital,EMA9,EMA12,EMA26,MACD,Signal,RSI14,y_1
583,2023-06-01,550.0,554.0,550.0,551.0,25258.0,550.0,-5422.0,142.0,295.0,-2965.0,4.9781,548.620470,543.823220,530.582370,13.240851,8.936702,97.433397,1
584,2023-06-02,559.0,564.0,557.0,562.0,34705.0,-58.0,5621.0,167.0,37.0,6360.0,5.3101,551.296376,546.619648,532.909601,13.710047,9.891371,98.303245,0
585,2023-06-05,560.0,560.0,555.0,555.0,17484.0,292.0,-2385.0,-301.0,-245.0,-3400.0,3.2038,552.037101,547.908933,534.545927,13.363006,10.585698,96.448220,1
586,2023-06-06,554.0,562.0,553.0,560.0,21562.0,-183.0,-1619.0,-342.0,393.0,2066.0,4.3610,553.629681,549.769097,536.431414,13.337683,11.136095,95.958002,1
587,2023-06-07,561.0,568.0,560.0,568.0,29092.0,-175.0,1827.0,1451.0,339.0,5880.0,5.5581,556.503745,552.573851,538.769828,13.804023,11.669681,95.496446,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681,2023-10-25,544.0,551.0,544.0,544.0,17137.0,-99.0,-2573.0,651.0,-142.0,1185.0,3.9095,544.649068,543.673118,541.716547,1.956572,0.734048,91.909547,0
682,2023-10-26,530.0,535.0,530.0,531.0,31683.0,487.0,-10712.0,-35173.0,-1744.0,-10261.0,6.9033,541.919255,541.723408,540.922728,0.800679,0.747374,85.178131,1
683,2023-10-27,534.0,536.0,532.0,533.0,17051.0,17.0,-5262.0,1478.0,-73.0,-1739.0,4.1968,540.135404,540.381345,540.335860,0.045485,0.606996,78.927028,0
684,2023-10-30,531.0,534.0,528.0,532.0,23299.0,265.0,-11811.0,487.0,378.0,-5803.0,5.6532,538.508323,539.091907,539.718389,-0.626481,0.360301,72.836426,0


1.3.Data Normalize

In [6]:
def make_data_minmax(Library, Prediction):

    # MinMax #
    scaler = MinMaxScaler()
    feature_to_standardize = Library.columns.to_list()[1 : ] # exclude 'Date'
    Library[feature_to_standardize] = scaler.fit_transform(Library[feature_to_standardize])
    Prediction[feature_to_standardize] = scaler.fit_transform(Prediction[feature_to_standardize])

    return Library, Prediction

### splite train set and validation set ###
train_Library = Library[: int((len(Library) * 0.8))]
valid_Library = Library[int((len(Library) * 0.8)): ]
train_Library, valid_Library = make_data_minmax(Library=train_Library, Prediction=valid_Library)

### splite whole data ###
Library, Prediction = make_data_minmax(Library=Library, Prediction=Prediction)

1.4.Make window data: X, y

In [7]:
def data_preprocess(data, window_size):

    X = np.array(data.iloc[:, 1: -1])
    y = data.iloc[:, -1].values.reshape(-1, 1)

    data_X, data_y = [], []
    for i in range(len(data) - window_size + 1):
        data_X.append(X[i : (i + window_size), :])
        data_y.append(y[i + window_size - 1])

    data_X, data_y = np.array(data_X), np.array(data_y)
        
    return data_X, data_y

### train set and validation set ###
train_X, train_y = data_preprocess(data=train_Library, window_size=20)
valid_X, valid_y = data_preprocess(data=valid_Library, window_size=20)

### whole data ###
# full_X, full_y = data_preprocess(data=Library, window_size=20) # just test 
test_X, test_y = data_preprocess(data=Prediction, window_size=20)

1.4.5.Flatten(MLP only)

In [8]:
def make_X_flatten(X):
    X_flatten = X.reshape((X.shape[0], X.shape[1] * X.shape[2]))

    return X_flatten

### train set and validation set ###
train_X = make_X_flatten(train_X)
valid_X = make_X_flatten(valid_X)

### whole data ###
# full_X = make_X_flatten(full_X) # just test 
test_X = make_X_flatten(test_X)

1.5.Over-smapling

In [9]:
### train set and validation set ###
ros = RandomOverSampler(random_state=87)
train_X_resampled, train_y_resampled = ros.fit_resample(train_X, train_y)
train_y_resampled = train_y_resampled.reshape(-1,1) # just test

print("Shape of resampled train_X:", train_X_resampled.shape)
print("Shape of resampled train_y:", train_y_resampled.shape)
print("Number of positive samples after resampling:", train_y_resampled.sum())

# ### whole data ###
# ros = RandomOverSampler(random_state=87)
# full_X_resampled, full_y_resampled = ros.fit_resample(full_X, full_y)
# full_y_resampled = full_y_resampled.reshape(-1,1) # just test

# print("Shape of resampled full_X:", full_X_resampled.shape)
# print("Shape of resampled full_y:", full_y_resampled.shape)
# print("Number of positive samples after resampling:", full_y_resampled.sum())

Shape of resampled train_X: (478, 340)
Shape of resampled train_y: (478, 1)
Number of positive samples after resampling: 239.0


# Create model

In [10]:
#############
TUNNING = False

params = {'X_shape': train_X.shape,
          'hidden_units': [160, 176, 144, 16, 80], 
          'dropout_rates': [0.2, 0.0, 0.0, 0.8, 0.0, 0.2, 0.2],
          'ls': 0.001, 'lr': 0.01}
#############

2.1.Create model and find hyperparameter

In [11]:
def tunning_model(hp, X_shape):

    tf.random.set_seed(87)

    #############################################
    hidden_units = [hp.Int(name=f"units_{i}", min_value=16, max_value=256, step=16) for i in range(1, 6)]
    dropout_rates = [hp.Choice(f"dropout_{i}", [0.0, 0.2, 0.5, 0.8]) for i in range(1, 8)]
    ls = hp.Choice('ls',[1e-2, 1e-3, 1e-5])
    lr = hp.Choice('lr',[1e-2, 1e-3, 1e-5])
    #############################################
    
    inp = Input(shape = (X_shape[1], ))
    x0 = BatchNormalization()(inp)

    encoder = GaussianNoise(dropout_rates[0])(x0)
    encoder = Dense(hidden_units[0])(encoder)
    encoder = BatchNormalization()(encoder)
    encoder = Activation('swish')(encoder)
    
    decoder = Dropout(dropout_rates[1])(encoder)
    decoder = Dense(X_shape[1], name = 'decoder')(decoder)  

    x_ae = Dense(hidden_units[1])(decoder)
    x_ae = BatchNormalization()(x_ae)
    x_ae = Activation('swish')(x_ae)
    x_ae = Dropout(dropout_rates[2])(x_ae)

    out_ae = Dense(1, activation = 'sigmoid', name = 'ae_action')(x_ae)
    
    x = Concatenate()([x0, encoder])
    x = BatchNormalization()(x)
    x = Dropout(dropout_rates[3])(x)

    for i in range(2, len(hidden_units)):
        x = Dense(hidden_units[i])(x)
        x = BatchNormalization()(x)
        x = Activation('swish')(x)
        x = Dropout(dropout_rates[i + 2])(x)
        
    out = Dense(1, activation = 'sigmoid', name = 'action')(x)

    model = tf.keras.models.Model(inputs=inp, outputs=[decoder, out_ae, out])
    model.compile(optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=lr),
                  loss = {'decoder': tf.keras.losses.MeanSquaredError(), 
                          'ae_action': tf.keras.losses.BinaryCrossentropy(label_smoothing=ls),
                          'action': tf.keras.losses.BinaryCrossentropy(label_smoothing=ls), 
                         },
                  metrics = {'decoder': tf.keras.metrics.MeanAbsoluteError(name='MAE'), 
                             'ae_action': tf.keras.metrics.AUC(name='AUC'), 
                             'action': tf.keras.metrics.AUC(name='AUC'), 
                            }, 
                 )
    
    return model

if TUNNING:
    model_fn = lambda hp: tunning_model(hp, X_shape=train_X.shape)
    tuner = kt.BayesianOptimization(model_fn,
                                    objective=kt.Objective("val_action_AUC", direction="max"),
                                    max_trials=10,
                                    executions_per_trial=2,
                                    directory="model_kt",
                                    overwrite=True,
                                    seed=87)
    path = f'model.hdf5'
    ckp = ModelCheckpoint(path, monitor='val_action_AUC', verbose = 0, 
                          save_best_only=True, save_weights_only=True, mode='max')
    es = EarlyStopping(monitor='val_action_AUC', min_delta=1e-4, patience=10, mode='max', 
                       baseline=None, restore_best_weights=True, verbose=1)
    
    tuner.search(train_X, [train_X, train_y, train_y], validation_split=0.2, epochs=100, 
                 batch_size=16, callbacks=[ckp, es], verbose=1)
    model = tuner.get_best_models()[0]

    best_hyperparameters = tuner.get_best_hyperparameters()[0]
    print("Best Hyperparameters:")
    print(best_hyperparameters.values)

2.2.Train model(with parameter)

In [12]:
def create_model(X_shape, hidden_units, dropout_rates, lr, ls):

    tf.random.set_seed(87)

    inp = Input(shape = (X_shape[1], ))
    x0 = BatchNormalization()(inp)

    encoder = GaussianNoise(dropout_rates[0])(x0)
    encoder = Dense(hidden_units[0])(encoder)
    encoder = BatchNormalization()(encoder)
    encoder = Activation('swish')(encoder)
    
    decoder = Dropout(dropout_rates[1])(encoder)
    decoder = Dense(X_shape[1], name = 'decoder')(decoder)  

    x_ae = Dense(hidden_units[1])(decoder)
    x_ae = BatchNormalization()(x_ae)
    x_ae = Activation('swish')(x_ae)
    x_ae = Dropout(dropout_rates[2])(x_ae)

    out_ae = Dense(1, activation = 'sigmoid', name = 'ae_action')(x_ae)
    
    x = Concatenate()([x0, encoder])
    x = BatchNormalization()(x)
    x = Dropout(dropout_rates[3])(x)

    for i in range(2, len(hidden_units)):
        x = Dense(hidden_units[i])(x)
        x = BatchNormalization()(x)
        x = Activation('swish')(x)
        x = Dropout(dropout_rates[i + 2])(x)
        
    out = Dense(1, activation = 'sigmoid', name = 'action')(x)

    model = tf.keras.models.Model(inputs=inp, outputs=[decoder, out_ae, out])
    model.compile(optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=lr),
                  loss = {'decoder': tf.keras.losses.MeanSquaredError(), 
                          'ae_action': tf.keras.losses.BinaryCrossentropy(label_smoothing=ls),
                          'action': tf.keras.losses.BinaryCrossentropy(label_smoothing=ls), 
                         },
                  metrics = {'decoder': tf.keras.metrics.MeanAbsoluteError(name='MAE'), 
                             'ae_action': tf.keras.metrics.AUC(name='AUC'), 
                             'action': tf.keras.metrics.AUC(name='AUC'), 
                            }, 
                 )

    return model

if TUNNING == False:

    path = f'model.hdf5'
    model = create_model(**params)
    ckp = ModelCheckpoint(path, monitor='val_action_AUC', verbose = 0,                    # If you want to use, uncomment
                          save_best_only=True, save_weights_only=True, mode='max')
    es = EarlyStopping(monitor='val_action_AUC', min_delta=1e-4, patience=10, mode='max', # If you want to use, uncomment
                       baseline=None, restore_best_weights=True, verbose=1)
    
    history = model.fit(train_X_resampled, train_y_resampled,  # full_X_resampled, full_y_resampled # train_X_resampled, train_y_resampled
                        validation_data=(valid_X, valid_y), # validation_data=(valid_X, valid_y) # validation_split=0.2, shuffle=True
                        # sample_weight = sw[tr], 
                        epochs=100, # coose n by experience
                        batch_size=16, 
                        callbacks=[ckp, es],                                              # If you want to use, uncomment
                        verbose=1)
    
    hist = pd.DataFrame(history.history)
    score = hist['val_action_AUC'].max()
    print(f'AUC:', score)

Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB

Epoch 1/100


2024-03-24 14:24:14.423482: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 21: early stopping
AUC: 0.5828846096992493


2.3.Test model on one stock

In [14]:
pred_dir = model.predict(test_X) 
pred_dir = pred_dir[2]
pred_dir = (pred_dir > 0.5).astype(int)

result_df = pd.DataFrame(pred_dir, columns=['Pred'])
result_df['True'] = test_y

match_count = (result_df['Pred'] == result_df['True']).sum()
correct = match_count / len(result_df)

print(f'ACC: {correct}\n')
result_df.head(60)

ACC: 0.5595238095238095



Unnamed: 0,Pred,True
0,0,1.0
1,1,1.0
2,1,0.0
3,1,0.0
4,1,1.0
5,1,1.0
6,1,1.0
7,1,1.0
8,1,1.0
9,1,1.0
