In [1]:
import utils.helper_functions as hf

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from collections import Counter

import hyperopt
from hyperopt import hp, fmin, tpe


db_file_path = './db/ohlcv_ntickers_1254_2000-08-01_to_2023-12-23.pkl'
start_date = '2013-01-01'

thresholds = [1.08, 1.04, 1.02, 1]

epochs = 2

In [2]:
df = pd.read_pickle(db_file_path)
df = hf.get_rows_after_date(df, start_date)

def get_dfs_buy_sell(buying_time, selling_time):
    df_buy = df[[buying_time]]
    df_buy = hf.remove_top_column_name(df_buy)

    df_sell = df[[selling_time]]
    df_sell = hf.remove_top_column_name(df_sell)

    df_buy.tail(5)

    return df_buy, df_sell

df_volume = df[['Volume']]
df_close = df[['Close']]

In [3]:
def calculate_var(df, past_days, future_days):
    var = hf.calculate_variations(df, past_days, future_days)
    var_stacked = hf.stack(var, f'var_past_{past_days}d_future_{future_days}d')

    return var_stacked

def calculate_var_vs_close(df, df_close, past_days):
    var = df / df_close.shift(past_days)
    var_stacked = hf.stack(var, f'var_past_close_{past_days}d')

    return var_stacked
    
def min_max_var(df, past_days):
    rolling_min = df.rolling(window=past_days + 1, min_periods=1).min()
    min_var = df / rolling_min
    min_var_stacked = hf.stack(min_var, f'min_var_past_{past_days}d')

    rolling_max = df.rolling(window=past_days + 1, min_periods=1).max()
    max_var = df / rolling_max
    max_var_stacked = hf.stack(max_var, f'max_var_past_{past_days}d')

    return min_var_stacked, max_var_stacked

def get_future_max_var(df, future_days):
    future_rolling_max = hf.get_future_rolling_max(df, future_days+1)
    future_max_var = future_rolling_max / df
    future_max_var_stacked = hf.stack(future_max_var, f'buy_var')
    
    return future_max_var_stacked

def days_since_min_max(df, past_days):
    days_since_min = hf.get_days_since_min(df, past_days)
    days_since_min_stacked = hf.stack(days_since_min, f'days_since_min_{past_days}d')

    days_since_max = hf.get_days_since_max(df, past_days)
    days_since_max_stacked = hf.stack(days_since_max, f'days_since_max_{past_days}d')

    return days_since_min_stacked, days_since_max_stacked

def get_volatility(df, past_days):
    volatility = hf.calculate_volatility(df, past_days)
    volatility_stacked = hf.stack(volatility, f'volatility_{past_days}d')

    return volatility_stacked

def get_n_ups(df, past_days):
    n_ups = hf.calculate_n_ups(df, past_days)
    n_ups_stacked = hf.stack(n_ups, f'n_ups_{past_days}d')

    return n_ups_stacked

def get_rank(df, past_days):
    rank = hf.calculate_rank(df, past_days)   
    rank_stacked = hf.stack(rank, f'rank_{past_days}d')
    
    return rank_stacked

def get_performance_vs_market(df, past_days):
    performance_vs_market = hf.calculate_performance_vs_market(df, past_days)
    performance_vs_market_stacked = hf.stack(performance_vs_market, f'perf_vs_market_{past_days}d')

    return performance_vs_market_stacked

def classify_var(df_var, thresholds, col_name):
    df_thresholds = hf.classify_var(df_var, thresholds)

    df_thresholds_stacked = hf.stack(df_thresholds, col_name)
    df_thresholds_stacked = df_thresholds_stacked.droplevel(level=-1)

    return df_thresholds_stacked

In [4]:
def get_dfs_input_output(df_buy, df_sell, target_future_days):
    var_30 = calculate_var(df_buy, past_days=30, future_days=0)
    var_10 = calculate_var(df_buy, past_days=10, future_days=0)
    var_5 = calculate_var(df_buy, past_days=5, future_days=0)
    var_2 = calculate_var(df_buy, past_days=2, future_days=0)
    var_1 = calculate_var(df_buy, past_days=1, future_days=0)

    var_vs_close_1 = calculate_var_vs_close(df_buy, df_close, past_days=1)

    min_var_30, max_var_30 = min_max_var(df_buy, past_days=30)
    min_var_10, max_var_10 = min_max_var(df_buy, past_days=10)
    min_var_5, max_var_5 = min_max_var(df_buy, past_days=5)
    min_var_2, max_var_2 = min_max_var(df_buy, past_days=2)

    days_since_min_30, days_since_max_30 = days_since_min_max(df_buy, past_days=30)
    days_since_min_10, days_since_max_10 = days_since_min_max(df_buy, past_days=10)

    volatility_30 = get_volatility(df_buy, past_days=30)
    volatility_10 = get_volatility(df_buy, past_days=10)
    volatility_2 = get_volatility(df_buy, past_days=2)

    n_ups_30 = get_n_ups(df_buy, past_days=30)
    n_ups_5 = get_n_ups(df_buy, past_days=5)

    rank_30 = get_rank(df_buy, past_days=30)
    rank_10 = get_rank(df_buy, past_days=10)
    rank_5 = get_rank(df_buy, past_days=5)
    rank_2 = get_rank(df_buy, past_days=2)
    rank_1 = get_rank(df_buy, past_days=1)

    perf_vs_market_30 = get_performance_vs_market(df_buy, past_days=30)
    perf_vs_market_10 = get_performance_vs_market(df_buy, past_days=10)
    perf_vs_market_5 = get_performance_vs_market(df_buy, past_days=5)
    perf_vs_market_2 = get_performance_vs_market(df_buy, past_days=2)
    perf_vs_market_1 = get_performance_vs_market(df_buy, past_days=1)

    # buy_var = calculate_var(df_sell, past_days=0, future_days=target_future_days)
    buy_var = get_future_max_var(df_sell, target_future_days)
    buy_class = classify_var(buy_var, thresholds, 'buy_class')

    df_input_output = pd.concat(
            [var_30, var_10, var_5, var_2, var_1,
            var_vs_close_1,
            min_var_30, min_var_10, min_var_5, min_var_2,
            max_var_30, max_var_10, max_var_5, max_var_2,
            days_since_min_30, days_since_min_10,
            days_since_max_30, days_since_max_10,
            volatility_30, volatility_10, volatility_2,
            n_ups_30, n_ups_5,
            rank_30, rank_10, rank_5, rank_2, rank_1,
            perf_vs_market_30, perf_vs_market_10, perf_vs_market_5,
            perf_vs_market_2, perf_vs_market_1, 
            buy_class],
        axis='columns')

    df_input_output = df_input_output.dropna()

    input_columns = [col for col in df_input_output.columns if not col.startswith('buy')]
    df_input = df_input_output[input_columns]
    df_output = df_input_output[['buy_class']]

    # df_input_output.tail(5)

    return df_input, df_output

In [5]:
def get_class_cumulative_percentages(df_output):
    value_counts = df_output['buy_class'].value_counts()
    percentages = (value_counts / len(df_output)) * 100
    percentages = percentages.sort_index()
    cumulative_percentages = percentages.cumsum()

    # print(f'''
    # Percentage of each class:
    # {percentages}

    # Cumulative percentages:
    # {cumulative_percentages}''')

    return cumulative_percentages

In [6]:
def get_test_train_data(df_input, df_output):
    X_all = StandardScaler().fit_transform(df_input)
    y_all = df_output.values.ravel()

    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, train_size=0.8, test_size=0.2, random_state=42)
    y_train = y_train.astype(int)
    y_test = y_test.astype(int)

    return X_train, X_test, y_train, y_test

def create_model(X_train, X_test, y_train, y_test,
                      size_layer_1, size_layer_2, size_layer_3,
                      dropout_rate, balance_data, batch_size):
    last_layers_size = len(thresholds) + 1

    model = Sequential()

    model.add(Dense(size_layer_1, input_shape=(X_train.shape[1],), activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(dropout_rate))
    model.add(Dense(size_layer_2, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(dropout_rate))
    model.add(Dense(size_layer_3, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(dropout_rate))
    model.add(Dense(last_layers_size, activation='softmax'))

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    if (balance_data):
        counter = Counter(y_train)
        max_count = max(counter.values())
        class_weights = {cls: max_count / count for cls, count in counter.items()}
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), class_weight=class_weights)
    else:
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))

    return model

In [7]:
def get_model_performance(model, X_test, y_test,
                          predicted_n_first_classes, accepted_n_first_classes,
                          cumulated_probs_target):
    y_prediction = model.predict(X_test)
    df_prediction = pd.DataFrame(y_prediction, columns=['prob_0', 'prob_1', 'prob_2', 'prob_3', 'prob_4'])
    df_test = pd.DataFrame({'real_class': y_test})

    df_comparison = pd.concat([df_prediction, df_test], axis=1)
    df_comparison['cumulated_probs'] = df_comparison.iloc[:, :predicted_n_first_classes].sum(axis=1)
    df_comparison['predicted_true'] = (df_comparison['cumulated_probs'] > cumulated_probs_target)
    df_comparison['real_true'] = (df_comparison['real_class'] <= accepted_n_first_classes)

    tp = ((df_comparison['real_true'] == True) & (df_comparison['predicted_true'] == True)).sum()
    tn = ((df_comparison['real_true'] == False) & (df_comparison['predicted_true'] == False)).sum()
    fp = ((df_comparison['real_true'] == False) & (df_comparison['predicted_true'] == True)).sum()
    fn = ((df_comparison['real_true'] == True) & (df_comparison['predicted_true'] == False)).sum()

    # print(f"True Positives (TP), Correctly bought, earned money: {tp}")
    # print(f"True Negatives (TN), Correctly not bought: {tn}")
    # print(f"False Positives (FP), Incorrectly bought, may have lost money : {fp}")
    # print(f"False Negatives (FN), Missed buying opportunity: {fn}")

    return tp, tn, fp, fn

In [8]:
from itertools import product

# param_grid = {
#     'buying_times': ['Open'],
#     'selling_times': ['High'],
#     'target_future_days': [i for i in list(np.arange(1, 20, 3))],
#     'sizes_layer_1': [64, 128, 256],
#     'sizes_layer_2': [64, 128, 256],
#     'sizes_layer_3': [64, 128, 256],
#     'dropout_rates': [i for i in list(np.arange(0, 0.3, 0.1))],
#     'balance_data': [True, False],
#     'batch_sizes': [32, 64, 128],
#     'n_first_classes': [[0,0], [0,1], [0,2], [1,1], [1,2], [2, 2]],
#     'cumulated_probs_targets': [i for i in list(np.arange(0.4, 1, 0.2))]
# }

param_grid = {
    'buying_times': ['Open'],
    'selling_times': ['High'],
    'target_future_days': [1, 3, 5, 10, 19, 25],
    'sizes_layer_1': [256],
    'sizes_layer_2': [64],
    'sizes_layer_3': [64],
    'dropout_rates': [0.079],
    'balance_data': [True],
    'batch_sizes': [64],
    'n_first_classes': [[2, 2]],
    'cumulated_probs_targets': [0.636]
}

performance_scores = []

for params in product(*param_grid.values()):
    hyperparams = dict(zip(param_grid.keys(), params))
    buying_time = hyperparams['buying_times']
    selling_time = hyperparams['selling_times']
    target_future_days = hyperparams['target_future_days']
    size_layer_1 = hyperparams['sizes_layer_1']
    size_layer_2 = hyperparams['sizes_layer_2']
    size_layer_3 = hyperparams['sizes_layer_3']
    dropout_rate = hyperparams['dropout_rates']
    balance_data = hyperparams['balance_data']
    batch_size = hyperparams['batch_sizes']
    predicted_n_first_classes = hyperparams['n_first_classes'][0]
    accepted_n_first_classes = hyperparams['n_first_classes'][1]
    cumulated_probs_target = hyperparams['cumulated_probs_targets']

    df_buy, df_sell = get_dfs_buy_sell(buying_time, selling_time)
    df_input, df_output = get_dfs_input_output(df_buy, df_sell, target_future_days)
    class_cumulative_percentages = get_class_cumulative_percentages(df_output)

    X_train, X_test, y_train, y_test = get_test_train_data(df_input, df_output)
    trained_model = create_model(X_train, X_test, y_train, y_test,
                                      size_layer_1, size_layer_2, size_layer_3,
                                      dropout_rate, balance_data, batch_size)
    tp, tn, fp, fn = get_model_performance(trained_model, X_test, y_test,
                                           predicted_n_first_classes, accepted_n_first_classes,
                                           cumulated_probs_target)
    
    performance_score = {
        'buying_time': buying_time,
        'selling_time': selling_time,
        'target_future_days': target_future_days,
        'size_layer_1': size_layer_1,
        'size_layer_2': size_layer_2,
        'size_layer_3': size_layer_3,
        'dropout_rate': dropout_rate,
        'balance_data': balance_data,
        'batch_size': batch_size,
        'predicted_n_first_classes': predicted_n_first_classes,
        'accepted_n_first_classes': accepted_n_first_classes,
        'cumulated_probs_target': cumulated_probs_target,
        'tp': tp,
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'winning_rate': tp / (tp + fp)
    }
    print(performance_score)
    performance_scores.append(performance_score)

df_performance = pd.DataFrame(performance_scores)
df_performance.sort_values(by='winning_rate', ascending=False)
df_performance.head(10)

2024-01-17 18:52:44.462508: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/2
Epoch 2/2
{'buying_time': 'Open', 'selling_time': 'High', 'target_future_days': 1, 'size_layer_1': 256, 'size_layer_2': 64, 'size_layer_3': 64, 'dropout_rate': 0.079, 'balance_data': True, 'batch_size': 64, 'predicted_n_first_classes': 2, 'accepted_n_first_classes': 2, 'cumulated_probs_target': 0.636, 'tp': 3130, 'tn': 309747, 'fp': 9024, 'fn': 40088, 'winning_rate': 0.2575283857166365}
Epoch 1/2
Epoch 2/2
{'buying_time': 'Open', 'selling_time': 'High', 'target_future_days': 3, 'size_layer_1': 256, 'size_layer_2': 64, 'size_layer_3': 64, 'dropout_rate': 0.079, 'balance_data': True, 'batch_size': 64, 'predicted_n_first_classes': 2, 'accepted_n_first_classes': 2, 'cumulated_probs_target': 0.636, 'tp': 94, 'tn': 251164, 'fp': 123, 'fn': 103080, 'winning_rate': 0.43317972350230416}
Epoch 1/2
Epoch 2/2
{'buying_time': 'Open', 'selling_time': 'High', 'target_future_days': 5, 'size_layer_1': 256, 'size_layer_2': 64, 'size_layer_3': 64, 'dropout_rate': 0.079, 'balance_data': True, 'b

Unnamed: 0,buying_time,selling_time,target_future_days,size_layer_1,size_layer_2,size_layer_3,dropout_rate,balance_data,batch_size,predicted_n_first_classes,accepted_n_first_classes,cumulated_probs_target,tp,tn,fp,fn,winning_rate
0,Open,High,1,256,64,64,0.079,True,64,2,2,0.636,3130,309747,9024,40088,0.257528
1,Open,High,3,256,64,64,0.079,True,64,2,2,0.636,94,251164,123,103080,0.43318
2,Open,High,5,256,64,64,0.079,True,64,2,2,0.636,4962,206202,4057,131717,0.550172
3,Open,High,10,256,64,64,0.079,True,64,2,2,0.636,185,151832,87,176065,0.680147
4,Open,High,19,256,64,64,0.079,True,64,2,2,0.636,217,101734,103,193122,0.678125
5,Open,High,25,256,64,64,0.079,True,64,2,2,0.636,57,83078,32,190197,0.640449


In [9]:
search_space = {
    'buying_time': hp.choice('buying_time', ['Open', 'Close']),
    'selling_time': hp.choice('selling_time', ['Open', 'Close', 'High']),
    'target_future_days': hp.quniform('target_future_days', 1, 20, 1),
    'size_layer_1': hp.choice('size_layer_1', [64, 128, 256]),
    'size_layer_2': hp.choice('size_layer_2', [64, 128, 256]),
    'size_layer_3': hp.choice('size_layer_3', [64, 128, 256]),
    'dropout_rate': hp.uniform('dropout_rate', 0, 0.3),
    'balance_data': hp.choice('balance_data', [True, False]),
    'batch_size': hp.choice('batch_size', [32, 64, 128]),
    'n_first_classes': hp.choice('n_first_classes', [[0, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 2]]),
    'cumulated_probs_target': hp.uniform('cumulated_probs_target', 0.4, 1),
}

results = []

def objective(params):
    buying_time = params['buying_time']
    selling_time = params['selling_time']
    target_future_days = int(params['target_future_days'])
    size_layer_1 = params['size_layer_1']
    size_layer_2 = params['size_layer_2']
    size_layer_3 = params['size_layer_3']
    dropout_rate = params['dropout_rate']
    balance_data = params['balance_data']
    batch_size = params['batch_size']
    predicted_n_first_classes = params['n_first_classes'][0]
    accepted_n_first_classes = params['n_first_classes'][1]
    cumulated_probs_target = params['cumulated_probs_target']

    df_buy, df_sell = get_dfs_buy_sell(buying_time, selling_time)
    df_input, df_output = get_dfs_input_output(df_buy, df_sell, target_future_days)
    # class_cumulative_percentages = get_class_cumulative_percentages(df_output)

    X_train, X_test, y_train, y_test = get_test_train_data(df_input, df_output)
    trained_model = create_model(X_train, X_test, y_train, y_test,
                                      size_layer_1, size_layer_2, size_layer_3,
                                      dropout_rate, balance_data, batch_size)
    tp, tn, fp, fn = get_model_performance(trained_model, X_test, y_test,
                                           predicted_n_first_classes, accepted_n_first_classes,
                                           cumulated_probs_target)

    winning_rate = tp / (tp + fp)
    performance = winning_rate * tp

    result = {'params': params, 'tp': tp, 'tn': tn, 'fp': fp, 'fn': fn,
              'winning_rate': winning_rate, 'performance': performance}
    results.append(result)

    return -performance

n_iter = 20

# rstate = hyperopt.RandomState(seed=42, print_node=lambda s: print(s, end="", flush=True))
best = fmin(objective, search_space, algo=tpe.suggest, max_evals=n_iter)

print('All results:')
df_results = pd.DataFrame(results)
results.head(1000)

print("Best parameters:")
print(best)