In [1]:
import numpy as np
import pandas as pd

from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
tf.random.set_seed(87)

from keras.models import Model
from keras.layers import Input, Dense ,Dropout 
from keras.layers import BatchNormalization, GaussianNoise, Activation, Concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint

import keras_tuner as kt
from keras_tuner.engine.hyperparameters import HyperParameters
from livelossplot import PlotLossesKeras

import warnings
warnings.filterwarnings('ignore')

# Data preprocessing

1.1. Import data

In [2]:
#############
TICKER = 2330
TP = 5
#############

# import data #
data = pd.read_csv('/Users/yitsung/Desktop/MasterThesis/data/TaiwanStockData_Top100_EMA')
ticker_data = data[data['ticker']==TICKER].reset_index(drop=True)
ticker_data = ticker_data.drop(columns=['ticker'])

# (SMA-P/P, 2class) #
ticker_data[f'y_{TP}'] = ticker_data['close'].rolling(window=TP).mean()
ticker_data[f'y_{TP}'] = ticker_data[f'y_{TP}'].shift(-TP)
ticker_data = ticker_data.dropna().reindex()
ticker_data[f'y_{TP}'] = ((ticker_data[f'y_{TP}'] - ticker_data['close']) >= 0).astype(int)

ticker_data

Unnamed: 0,Date,open,high,low,close,volume,financing,fi,ii,di,rp,capital,EMA9,EMA12,EMA26,MACD,Signal,RSI14,y_1
0,2021-01-04,530.0,540.0,528.0,536.0,39490.0,454.0,12463.0,-33.0,865.0,2342.0,6.0443,521.295251,518.980386,513.251221,5.729165,3.933239,84.477581,1
1,2021-01-05,536.0,542.0,535.0,542.0,34839.0,-355.0,2884.0,179.0,-451.0,-1374.0,5.3592,525.437881,522.532126,515.535238,6.996887,4.619674,88.417310,1
2,2021-01-06,555.0,555.0,541.0,549.0,55614.0,-256.0,5355.0,105.0,-4163.0,1.0,6.9696,530.151835,526.614084,518.179719,8.434365,5.454306,91.005801,1
3,2021-01-07,554.0,570.0,553.0,565.0,53393.0,2200.0,1671.0,-75.0,2060.0,-402.0,8.7664,537.123278,532.531850,521.861371,10.670478,6.574521,93.325963,1
4,2021-01-08,580.0,580.0,571.0,580.0,62957.0,-502.0,3278.0,187.0,1176.0,-5041.0,9.0658,545.700404,539.847445,526.412277,13.435169,8.026473,94.939847,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
703,2023-11-24,577.0,578.0,574.0,575.0,12503.0,243.0,-854.0,70.0,-118.0,-2263.0,2.8318,575.073961,572.538736,562.541337,9.997398,9.134503,90.744592,0
704,2023-11-27,573.0,577.0,568.0,568.0,20322.0,-112.0,-2153.0,59.0,-56.0,-3554.0,4.1507,573.659169,571.840469,562.945683,8.894786,9.086560,81.069290,1
705,2023-11-28,565.0,576.0,565.0,575.0,26932.0,478.0,3323.0,-98.0,687.0,-416.0,5.1624,573.927335,572.326550,563.838595,8.487955,8.966839,76.500832,0
706,2023-11-29,578.0,579.0,570.0,574.0,27787.0,357.0,-180.0,55.0,-553.0,-2383.0,4.8624,573.941868,572.584004,564.591292,7.992712,8.772014,71.301362,1


1.2.Splite data into train(Library) and test(Prediction)

In [3]:
Library = ticker_data[ticker_data['Date'] <= '2023-06-30'] # windows=20, the last prediction from Library is 6/30
Prediction = ticker_data[(ticker_data['Date'] >= '2023-06-01')&(ticker_data['Date'] <= '2023-10-31')] # windows=20, start from using 6/1 to predict 7/3

1.3.Data Normalize

In [4]:
def make_data_minmax(Library, Prediction):

    # MinMax #
    scaler = MinMaxScaler()
    feature_to_standardize = Library.columns.to_list()[1 : ] # exclude 'Date'
    Library[feature_to_standardize] = scaler.fit_transform(Library[feature_to_standardize])
    Prediction[feature_to_standardize] = scaler.fit_transform(Prediction[feature_to_standardize])

    return Library, Prediction

### splite train set and validation set ###
train_Library = Library[: int((len(Library) * 0.8))]
valid_Library = Library[int((len(Library) * 0.8)): ]
train_Library, valid_Library = make_data_minmax(Library=train_Library, Prediction=valid_Library)

### splite whole data ###
Library, Prediction = make_data_minmax(Library=Library, Prediction=Prediction)

1.4.Make window data: X, y

In [5]:
def data_preprocess(data, window_size):

    X = np.array(data.iloc[:, 1: -1])
    y = data.iloc[:, -1].values.reshape(-1, 1)

    data_X, data_y = [], []
    for i in range(len(data) - window_size + 1):
        data_X.append(X[i : (i + window_size), :])
        data_y.append(y[i + window_size - 1])

    data_X, data_y = np.array(data_X), np.array(data_y)
        
    return data_X, data_y

### train set and validation set ###
train_X, train_y = data_preprocess(data=train_Library, window_size=20)
valid_X, valid_y = data_preprocess(data=valid_Library, window_size=20)

### whole data ###
# full_X, full_y = data_preprocess(data=Library, window_size=20) # just test 
test_X, test_y = data_preprocess(data=Prediction, window_size=20)

1.4.5.Flatten(MLP only)

In [6]:
def make_X_flatten(X):
    X_flatten = X.reshape((X.shape[0], X.shape[1] * X.shape[2]))

    return X_flatten

### train set and validation set ###
train_X = make_X_flatten(train_X)
valid_X = make_X_flatten(valid_X)

### whole data ###
# full_X = make_X_flatten(full_X) # just test 
test_X = make_X_flatten(test_X)

1.5.Over-smapling

In [7]:
### train set and validation set ###
ros = RandomOverSampler(random_state=87)
train_X_resampled, train_y_resampled = ros.fit_resample(train_X, train_y)
train_y_resampled = train_y_resampled.reshape(-1,1) # just test

print("Shape of resampled train_X:", train_X_resampled.shape)
print("Shape of resampled train_y:", train_y_resampled.shape)
print("Number of positive samples after resampling:", train_y_resampled.sum())

# ### whole data ###
# ros = RandomOverSampler(random_state=87)
# full_X_resampled, full_y_resampled = ros.fit_resample(full_X, full_y)
# full_y_resampled = full_y_resampled.reshape(-1,1) # just test

# print("Shape of resampled full_X:", full_X_resampled.shape)
# print("Shape of resampled full_y:", full_y_resampled.shape)
# print("Number of positive samples after resampling:", full_y_resampled.sum())

Shape of resampled train_X: (478, 340)
Shape of resampled train_y: (478, 1)
Number of positive samples after resampling: 239.0


# Create model

In [8]:
#############
TUNNING = True
params = {'X_shape': train_X.shape,
          'hidden_units': [80, 80, 48, 64, 64], 
          'dropout_rates': [0.2, 0.5, 0.5, 0.0, 0.0, 0.0, 0.8],
          'ls': 0.001, 'lr': 0.001}
#############

2.1.Create model and find hyperparameter

In [9]:
def tunning_model(hp, X_shape):

    tf.random.set_seed(87)

    #############################################
    hidden_units = [hp.Int(name=f"units_{i}", min_value=16, max_value=128, step=16) for i in range(1, 6)]
    dropout_rates = [hp.Choice(f"dropout_{i}", [0.0, 0.2, 0.5, 0.8]) for i in range(1, 8)]
    ls = hp.Choice('ls',[1e-2, 1e-3, 1e-5])
    lr = hp.Choice('lr',[1e-2, 1e-3, 1e-5])
    #############################################
    
    inp = Input(shape = (X_shape[1], ))
    x0 = BatchNormalization()(inp)

    encoder = GaussianNoise(dropout_rates[0])(x0)
    encoder = Dense(hidden_units[0])(encoder)
    encoder = BatchNormalization()(encoder)
    encoder = Activation('swish')(encoder)
    
    decoder = Dropout(dropout_rates[1])(encoder)
    decoder = Dense(X_shape[1], name = 'decoder')(decoder)  

    x_ae = Dense(hidden_units[1])(decoder)
    x_ae = BatchNormalization()(x_ae)
    x_ae = Activation('swish')(x_ae)
    x_ae = Dropout(dropout_rates[2])(x_ae)

    out_ae = Dense(1, activation = 'sigmoid', name = 'ae_action')(x_ae)
    
    x = Concatenate()([x0, encoder])
    x = BatchNormalization()(x)
    x = Dropout(dropout_rates[3])(x)

    for i in range(2, len(hidden_units)):
        x = Dense(hidden_units[i])(x)
        x = BatchNormalization()(x)
        x = Activation('swish')(x)
        x = Dropout(dropout_rates[i + 2])(x)
        
    out = Dense(1, activation = 'sigmoid', name = 'action')(x)

    model = tf.keras.models.Model(inputs=inp, outputs=[decoder, out_ae, out])
    model.compile(optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=lr),
                  loss = {'decoder': tf.keras.losses.MeanSquaredError(), 
                          'ae_action': tf.keras.losses.BinaryCrossentropy(label_smoothing=ls),
                          'action': tf.keras.losses.BinaryCrossentropy(label_smoothing=ls), 
                         },
                  metrics = {'decoder': tf.keras.metrics.MeanAbsoluteError(name='MAE'), 
                             'ae_action': tf.keras.metrics.AUC(name='AUC'), 
                             'action': tf.keras.metrics.AUC(name='AUC'), 
                            }, 
                 )
    
    return model

if TUNNING:
    model_fn = lambda hp: tunning_model(hp, X_shape=train_X.shape)
    tuner = kt.BayesianOptimization(model_fn,
                                    objective=kt.Objective("val_action_AUC", direction="max"),
                                    max_trials=10,
                                    executions_per_trial=1,
                                    directory="model_kt",
                                    overwrite=True,
                                    seed=87)
    path = f'model.hdf5'
    ckp = ModelCheckpoint(path, monitor='val_action_AUC', verbose = 0,                    # If you want to use, uncomment
                          save_best_only=True, save_weights_only=True, mode='max')
    es = EarlyStopping(monitor='val_action_AUC', min_delta=1e-4, patience=30, mode='max', # If you want to use, uncomment # or choose patience=n by experience
                       baseline=None, restore_best_weights=True, verbose=1)
    
    tuner.search(train_X_resampled, [train_X_resampled, train_y_resampled, train_y_resampled],
                 validation_data=(valid_X, [valid_X, valid_y, valid_y]), # validation_data=(valid_X, [valid_X, valid_y, valid_y]) # validation_split=0.2, shuffle=True
                 epochs=100,                                                              # 100 or coose epochs=n by experience
                 batch_size=16, 
                 callbacks=[ckp, es],                                                     # If you want to use, uncomment 
                 verbose=1)
    
    model = tuner.get_best_models()[0]

    tf.keras.backend.clear_session() # clear memory

    best_hyperparameters = tuner.get_best_hyperparameters()[0]
    print("Best Hyperparameters:")
    print(best_hyperparameters.values)

Trial 10 Complete [00h 02m 52s]
val_action_AUC: 0.6060576736927032

Best val_action_AUC So Far: 0.641634613275528
Total elapsed time: 00h 27m 45s
Best Hyperparameters:
{'units_1': 80, 'units_2': 80, 'units_3': 48, 'units_4': 64, 'units_5': 64, 'dropout_1': 0.2, 'dropout_2': 0.5, 'dropout_3': 0.5, 'dropout_4': 0.0, 'dropout_5': 0.0, 'dropout_6': 0.0, 'dropout_7': 0.8, 'ls': 0.001, 'lr': 0.001}


2.2.Train model(with parameter)

In [10]:
def create_model(X_shape, hidden_units, dropout_rates, lr, ls):

    tf.random.set_seed(87)

    inp = Input(shape = (X_shape[1], ))
    x0 = BatchNormalization()(inp)

    encoder = GaussianNoise(dropout_rates[0])(x0)
    encoder = Dense(hidden_units[0])(encoder)
    encoder = BatchNormalization()(encoder)
    encoder = Activation('swish')(encoder)
    
    decoder = Dropout(dropout_rates[1])(encoder)
    decoder = Dense(X_shape[1], name = 'decoder')(decoder)  

    x_ae = Dense(hidden_units[1])(decoder)
    x_ae = BatchNormalization()(x_ae)
    x_ae = Activation('swish')(x_ae)
    x_ae = Dropout(dropout_rates[2])(x_ae)

    out_ae = Dense(1, activation = 'sigmoid', name = 'ae_action')(x_ae)
    
    x = Concatenate()([x0, encoder])
    x = BatchNormalization()(x)
    x = Dropout(dropout_rates[3])(x)

    for i in range(2, len(hidden_units)):
        x = Dense(hidden_units[i])(x)
        x = BatchNormalization()(x)
        x = Activation('swish')(x)
        x = Dropout(dropout_rates[i + 2])(x)
        
    out = Dense(1, activation = 'sigmoid', name = 'action')(x)

    model = tf.keras.models.Model(inputs=inp, outputs=[decoder, out_ae, out])
    model.compile(optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=lr),
                  loss = {'decoder': tf.keras.losses.MeanSquaredError(), 
                          'ae_action': tf.keras.losses.BinaryCrossentropy(label_smoothing=ls),
                          'action': tf.keras.losses.BinaryCrossentropy(label_smoothing=ls), 
                         },
                  metrics = {'decoder': tf.keras.metrics.MeanAbsoluteError(name='MAE'), 
                             'ae_action': tf.keras.metrics.AUC(name='AUC'), 
                             'action': tf.keras.metrics.AUC(name='AUC'), 
                            }, 
                 )

    return model

if TUNNING == False:

    path = f'model.hdf5'
    model = create_model(**params)
    ckp = ModelCheckpoint(path, monitor='val_action_AUC', verbose = 0,                    # If you want to use, uncomment
                          save_best_only=True, save_weights_only=True, mode='max')
    es = EarlyStopping(monitor='val_action_AUC', min_delta=1e-4, patience=30, mode='max', # If you want to use, uncomment # or choose patience=n by experience
                       baseline=None, restore_best_weights=True, verbose=1)
    
    history = model.fit(train_X_resampled, [train_X_resampled, train_y_resampled, train_y_resampled],  # full_X_resampled, [full_X_resampled, full_y_resampled, full_y_resampled]
                        validation_data=(valid_X, [valid_X, valid_y, valid_y]), # validation_data=(valid_X, [valid_X, valid_y, valid_y]) # validation_split=0.2, shuffle=True
                        # sample_weight = sw[tr], 
                        epochs=100,                                                       # 100 or coose epochs=n by experience
                        batch_size=16, 
                        callbacks=[ckp, es],                                              # If you want to use, uncomment
                        verbose=1)
    
    tf.keras.backend.clear_session() # clear memory

    hist = pd.DataFrame(history.history)
    score = hist['val_action_AUC'].max()
    print(f'AUC:', score)

2.3.Test model on one stock

In [11]:
pred_dir = model.predict(test_X) 
pred_dir = pred_dir[2]
pred_dir = (pred_dir > 0.5).astype(int)

result_df = pd.DataFrame(pred_dir, columns=['Pred'])
result_df['True'] = test_y

match_count = (result_df['Pred'] == result_df['True']).sum()
correct = match_count / len(result_df)

print(f'ACC: {correct}\n')
result_df.head(60)

ACC: 0.5595238095238095



Unnamed: 0,Pred,True
0,1,1.0
1,1,1.0
2,1,0.0
3,1,0.0
4,0,1.0
5,1,1.0
6,1,1.0
7,1,1.0
8,1,1.0
9,1,1.0


# Whole experiment 

In [None]:
#############
TP = 5
TUNNING = True
params = {'X_shape': train_X.shape,
          'hidden_units': [32, 64, 16, 32, 16], 
          'dropout_rates': [0.2, 0.8, 0.5, 0.8, 0.2, 0.0, 0.2],
          'ls': 0.001, 'lr': 0.01}

# constituent = [2330, 2454, 2317, 2308, 2382, 2303, 2891, 3711, 2881, 2412,
#                2886, 2882, 2884, 1216, 2885, 3231, 3034, 2357, 2002, 2892,
#                1303, 2379, 5880, 2301, 3037, 2345, 1301, 3008, 3661, 2890,
#                5871, 2880, 2327, 2883, 2887, 2207, 4938, 1101, 6669, 1326,
#                2395, 3045, 5876, 2603, 1590, 2912, 4904, 2801, 6505, 2408]
constituent = [2330, 2454, 2317, 2308, 2382, 2303, 2891, 3711, 2881, 2412] # just test
#############

experiment_0050_result = pd.DataFrame()

for TICKER in constituent:

    print(f'\n now: processing {TICKER} \n')

    try:
        ##### import data #####
        data = pd.read_csv('/Users/yitsung/Desktop/MasterThesis/data/TaiwanStockData_Top100_EMA')
        ticker_data = data[data['ticker']==TICKER].reset_index(drop=True)
        ticker_data = ticker_data.drop(columns=['ticker'])

        ticker_data[f'y_{TP}'] = ticker_data['close'].rolling(window=TP).mean()
        ticker_data[f'y_{TP}'] = ticker_data[f'y_{TP}'].shift(-TP)
        ticker_data = ticker_data.dropna().reindex()
        ticker_data[f'y_{TP}'] = ((ticker_data[f'y_{TP}'] - ticker_data['close']) >= 0).astype(int)

        ##### Splite data into train(Library) and test(Prediction) #####
        Library = ticker_data[ticker_data['Date'] <= '2023-06-30']
        Prediction = ticker_data[(ticker_data['Date'] >= '2023-06-01')&(ticker_data['Date'] <= '2023-10-31')]

        ##### Data Normalize #####
        train_Library = Library[: int((len(Library) * 0.8))]
        valid_Library = Library[int((len(Library) * 0.8)): ]

        train_Library, valid_Library = make_data_minmax(Library=train_Library, Prediction=valid_Library)
        Library, Prediction = make_data_minmax(Library=Library, Prediction=Prediction)

        ##### Make window data: X, y #####
        train_X, train_y = data_preprocess(data=train_Library, window_size=20)
        valid_X, valid_y = data_preprocess(data=valid_Library, window_size=20)
        test_X, test_y = data_preprocess(data=Prediction, window_size=20)

        ##### Flatten(MLP only) #####
        train_X = make_X_flatten(train_X)
        valid_X = make_X_flatten(valid_X)
        test_X = make_X_flatten(test_X)

        ##### Over-smapling #####
        ros = RandomOverSampler(random_state=87)
        train_X_resampled, train_y_resampled = ros.fit_resample(train_X, train_y)
        train_y_resampled = train_y_resampled.reshape(-1,1) # just test

        ###### Create model and find hyperparameter #####
        if TUNNING:
            model_fn = lambda hp: tunning_model(hp, X_shape=train_X.shape)
            tuner = kt.BayesianOptimization(model_fn,
                                            objective=kt.Objective("val_action_AUC", direction="max"),
                                            max_trials=10,
                                            executions_per_trial=1,
                                            directory="model_kt",
                                            overwrite=True,
                                            seed=87)
            path = f'model.hdf5'
            ckp = ModelCheckpoint(path, monitor='val_action_AUC', verbose = 0,                    # If you want to use, uncomment
                                save_best_only=True, save_weights_only=True, mode='max')
            es = EarlyStopping(monitor='val_action_AUC', min_delta=1e-4, patience=30, mode='max', # If you want to use, uncomment # or choose patience=n by experience
                            baseline=None, restore_best_weights=True, verbose=1)
            
            tuner.search(train_X_resampled, [train_X_resampled, train_y_resampled, train_y_resampled],
                        validation_data=(valid_X, [valid_X, valid_y, valid_y]), # validation_data=(valid_X, [valid_X, valid_y, valid_y]) # validation_split=0.2, shuffle=True
                        epochs=100,                                                               # 100 or coose epochs=n by experience 
                        batch_size=16, 
                        callbacks=[ckp, es],                                                      # If you want to use, uncomment 
                        verbose=1)
            
            model = tuner.get_best_models()[0]

            tf.keras.backend.clear_session() # clear memory

            best_hyperparameters = tuner.get_best_hyperparameters()[0]
            print("Best Hyperparameters:")
            print(best_hyperparameters.values)

        ##### Train model(with parameter) #####
        else:
            path = f'model.hdf5'
            model = create_model(**params)
            ckp = ModelCheckpoint(path, monitor='val_action_AUC', verbose = 0,                    # If you want to use, uncomment
                                save_best_only=True, save_weights_only=True, mode='max')
            es = EarlyStopping(monitor='val_action_AUC', min_delta=1e-4, patience=30, mode='max', # If you want to use, uncomment # or choose patience=n by experience
                            baseline=None, restore_best_weights=True, verbose=1)
            
            history = model.fit(train_X_resampled, [train_X_resampled, train_y_resampled, train_y_resampled],  # full_X_resampled, [full_X_resampled, full_y_resampled, full_y_resampled]
                                validation_data=(valid_X, [valid_X, valid_y, valid_y]), # validation_data=(valid_X, [valid_X, valid_y, valid_y]) # validation_split=0.2, shuffle=True
                                # sample_weight = sw[tr], 
                                epochs=100,                                                       # 100 or coose epochs=n by experience
                                batch_size=16, 
                                callbacks=[ckp, es],                                              # If you want to use, uncomment
                                verbose=1)
            
            hist = pd.DataFrame(history.history)
            score = hist['val_action_AUC'].max()
            print(f'AUC:', score)

        ##### Test model on one stock #####
        pred_dir = model.predict(test_X) 
        pred_dir = pred_dir[2]
        pred_dir = (pred_dir > 0.5).astype(int)

        result_df = pd.DataFrame(pred_dir, columns=['Pred'])
        result_df['True'] = test_y

        match_count = (result_df['Pred'] == result_df['True']).sum()
        correct = match_count / len(result_df)
        print(f'\n ACC: {correct} \n')

        tf.keras.backend.clear_session() # clear memory

        ##### Add to result dataframe #####
        experiment_0050_result = pd.concat([experiment_0050_result, result_df], axis=0, ignore_index=True)

    except:
        print(f'{TICKER} process failed.')
        continue

#### Final ACC ####
whole_match_count = (experiment_0050_result['Pred'] == experiment_0050_result['True']).sum()
whole_correct = whole_match_count / len(experiment_0050_result)
print(f'\n Whole 0050 ACC: {whole_correct} \n')

In [None]:
experiment_0050_result.to_csv(f'AE-MLP_Tp={TP}_result.csv', index=False)
print(f'\n Whole 0050 ACC: {whole_correct} \n')

experiment_0050_result.tail(60)

Unnamed: 0,Pred,True
276,0,1.0
277,0,0.0
278,1,0.0
279,0,0.0
280,1,1.0
281,0,1.0
282,0,1.0
283,1,1.0
284,1,1.0
285,1,0.0
