In [1]:
import utils.helper_functions as hf
import utils.inputs as inputs

import os
from IPython.display import display, clear_output

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from collections import Counter

import hyperopt
from hyperopt import hp, fmin, tpe

# ---- Runtime options -------------------------------------------------------
pd.options.mode.copy_on_write = True # avoid making unnecessary copies of DataFrames or Series
os.environ['PYDEVD_DISABLE_FILE_VALIDATION'] = '1' # disable file validation in the debugger
# NOTE(review): tensorflow is already imported earlier in this cell; this variable
# may need to be set before that import to have any effect — confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' #0: All logs (default setting), 1: Filter out INFO logs, up to 3

# ---- File locations --------------------------------------------------------
# Pickled OHLCV price history read by the data-loading cell.
db_path = './db/ohlcv_ntickers_593_2000-08-01_to_2024-06-15.pkl'
# Cache of the engineered input features (written and optionally re-read by get_df_data).
transformed_data_path = './outputs/classifier_transformed_data.pkl'
# Where create_model saves the trained network and load_model reads it back.
model_path = './outputs/classifier_model.keras'

# Trading fee, turned into a multiplicative coefficient by hf.get_fee_coef in add_output_profit.
fee = 0.002
# num_tickers = 1250

# ---- Run switches ----------------------------------------------------------
use_hyperopt = False                # True: hyperopt search over search_space; False: sweep over param_grid
use_saved_transformed_data = False  # True: reuse transformed_data_path when the file exists
use_saved_model = False             # True: reuse model_path when the file exists instead of retraining

start_date = '2008-01-01' #'2013-01-01'  # passed to hf.get_rows_after_date when loading the data
test_size = 60000                   # number of trailing rows held out as the test set
epochs = 3                          # training epochs passed to model.fit
hyperopt_n_iterations = 40          # max_evals for hyperopt's fmin

# Column the classifier is trained to predict.
output_class_name = 'output_var_class' #'output_var_class' or 'output_rank_class'

# Grid used when use_hyperopt is False: every key maps to a list of candidate
# values and the driver cell evaluates the Cartesian product of all lists.
# 'n_first_classes' is a pair: element 0 is read by add_predictions (how many
# leading classes' probabilities are summed), element 1 by add_output_is_buy,
# add_output_profit and get_market_rate (highest class still counted as a buy).
param_grid = {
    'buying_time': ['Open'], 'selling_time': ['Open'],
    'target_future_days': [1], 'loss_limit': [0.997],
    'sell_at_target': [False],
    'size_layer_1': [128], 'size_layer_2': [128], 'size_layer_3': [128],
    'dropout_rate': [0.1], 'balance_data': [True], 'batch_size': [32], #'dropout_rates': [i for i in list(np.arange(0, 0.3, 0.1))], 'batch_sizes': [32, 64, 128],
    'n_first_classes': [[0,0]],
    'cumulated_probs_target': [0.9],
    'thresholds': [[1.0184]],
    'rank_pct_thresholds': [[0.45]]
}
# Total number of grid points; also used by evaluate_model to decide whether to
# dump the per-row table for a single-combination run.
num_combinations = hf.get_num_combinations(param_grid)

# Search space used when use_hyperopt is True. 'thresholds' and
# 'rank_pct_thresholds' are sampled as single floats here; objective() wraps
# each of them in a one-element list before use.
# NOTE(review): 'batch_size' is [128] here but [32] in param_grid — confirm the
# difference is intentional.
search_space = {
    'buying_time': hp.choice('buying_time', ['Open']),
    'selling_time': hp.choice('selling_time', ['Open']),
    'target_future_days': hp.choice('target_future_days', [1]), #hp.randint('param', 1, 60), #1, 60
    'loss_limit': hp.uniform('loss_limit', 0.98, 1),
    'sell_at_target': hp.choice('sell_at_target', [False]), #[True, False]
    'size_layer_1': hp.choice('size_layer_1', [128]),
    'size_layer_2': hp.choice('size_layer_2', [128]),
    'size_layer_3': hp.choice('size_layer_3', [128]), #[64, 128, 256]
    'dropout_rate': hp.choice('dropout_rate', [0.1]), #hp.uniform('dropout_rate', 0.05, 0.1), #0, 0.3
    'balance_data': hp.choice('balance_data', [True]),
    'batch_size': hp.choice('batch_size', [128]), #[32, 64, 128]
    'n_first_classes': hp.choice('n_first_classes', [[0, 0]]),
    'cumulated_probs_target': hp.uniform('cumulated_probs_target', 0.7, 1),
    'thresholds': hp.uniform('thresholds', 1, 1.05),
    # 'thresholds': hp.choice('thresholds', [[1.08, 1.04, 1.02, 1], [1.06, 1.03, 1.01], [1.05, 1.025, 1], [1.1, 1.05, 1.01]]),
    'rank_pct_thresholds': hp.uniform('rank_pct_thresholds', 0.002, 0.5),
    #'rank_pct_thresholds': hp.choice('rank_pct_thresholds', [[0.08, 0.2, 0.33], [0.15, 0.3, 0.5]])
}

number of combinations: 1


In [2]:
# Load the pickled price history, keep the rows selected by start_date and
# fill missing values (both steps are delegated to project helpers).
# NOTE: read_pickle can execute arbitrary code; only load files you trust.
df = pd.read_pickle(db_path)
df = hf.get_rows_after_date(df, start_date)
df = hf.fillnavalues(df)

def get_single_level_df(df, ohlcv):
    """Return the `ohlcv` column group of `df` without its top column label.

    Args:
        df: price frame whose columns are selectable by a field name such as
            'Open' or 'Close'.
        ohlcv: name of the field to extract.

    Returns:
        The result of hf.remove_top_column_name applied to `df[[ohlcv]]`.
    """
    field_only = df[[ohlcv]]
    return hf.remove_top_column_name(field_only)

def get_ohlcv_dfs(df):
    """Split `df` into one single-level frame per OHLCV field.

    Returns:
        dict with the keys 'df_open', 'df_high', 'df_low', 'df_close' and
        'df_volume' (in that order), each holding the frame produced by
        get_single_level_df for the matching field.
    """
    field_by_key = {
        'df_open': 'Open',
        'df_high': 'High',
        'df_low': 'Low',
        'df_close': 'Close',
        'df_volume': 'Volume',
    }
    return {key: get_single_level_df(df, field) for key, field in field_by_key.items()}

# Ticker count derived from the 'Open' frame; read as a module-level value by
# add_output_rank_class and get_performance_score.
num_tickers = hf.get_num_tickers(get_single_level_df(df, 'Open'))
print(f'number of tickers: {num_tickers}')


number of tickers: 593


In [3]:
def get_future_end_var(df_buy, df_sell, future_days):
    """Ratio of the sell price `future_days` rows ahead to the current buy price.

    Args:
        df_buy: frame of buy prices.
        df_sell: frame of sell prices, same layout as `df_buy`.
        future_days: number of rows to look ahead.

    Returns:
        The ratio frame stacked by hf.stack under the name
        'output_future_end_var'.
    """
    sell_at_horizon = df_sell.shift(-future_days)
    end_ratio = sell_at_horizon / df_buy
    return hf.stack(end_ratio, 'output_future_end_var')

def get_future_max_var(df_buy, df_sell, future_days):
    """Ratio of the forward rolling maximum of the sell price to the buy price.

    The rolling window passed to hf.get_future_rolling_max is
    `future_days + 1`.

    Returns:
        The ratio frame stacked by hf.stack under the name
        'output_future_max_var'.
    """
    window = future_days + 1
    max_ratio = hf.get_future_rolling_max(df_sell, window) / df_buy
    return hf.stack(max_ratio, 'output_future_max_var')

def get_future_min_var(df_buy, df_low, future_days):
    """Ratio of the forward rolling minimum of the low price to the buy price.

    The rolling window passed to hf.get_future_rolling_min is
    `future_days + 1`.

    Returns:
        The ratio frame stacked by hf.stack under the name
        'output_future_min_var'.
    """
    window = future_days + 1
    min_ratio = hf.get_future_rolling_min(df_low, window) / df_buy
    return hf.stack(min_ratio, 'output_future_min_var')

def get_future_min_var_before_max(df_buy, df_sell, df_low, future_days):
    """Ratio of a forward minimum of the low price to the buy price, bounded by the future maximum.

    Presumably the minimum is taken only over the rows up to the position of
    the future sell-price maximum; the exact window is defined by
    hf.get_future_rolling_min_value — confirm there.

    Args:
        df_buy: frame of buy prices; its index is restored on the result.
        df_sell: frame of sell prices used to locate the future maximum.
        df_low: frame of low prices.
        future_days: look-ahead passed to hf.get_future_rolling_max_position.

    Returns:
        The ratio frame stacked by hf.stack under the name
        'output_future_min_var_before_max'.
    """
    rolling_max_positions = hf.get_future_rolling_max_position(df_sell, future_days)

    # Work on a positional (0..n-1) index so each row label handed to the helper is an integer position.
    df_low = df_low.reset_index(drop=True)
    # One helper call per (row, column) cell: this is a Python-level loop over
    # the whole frame and can be slow on large inputs.
    rolling_min = df_low.apply(lambda col: col.index.map(
            lambda row: hf.get_future_rolling_min_value(row, df_low.columns.get_loc(col.name), df_low, rolling_max_positions)
        ))
    # Put the original index back so the division below aligns with df_buy.
    rolling_min.index = df_buy.index
    
    var = rolling_min / df_buy
    var_stacked = hf.stack(var, f'output_future_min_var_before_max')

    return var_stacked



In [4]:
def classify_var(df_var, thresholds, col_name):
    """Classify `df_var` against `thresholds` and return it stacked as `col_name`.

    The classification is delegated to hf.classify_var; the result is stacked
    with hf.stack and its last index level is dropped.
    """
    classified = hf.classify_var(df_var, thresholds)
    stacked = hf.stack(classified, col_name)
    return stacked.droplevel(level=-1)

def classify_rank(df_rank, thresholds, col_name):
    """Classify `df_rank` against rank `thresholds` and return it stacked as `col_name`.

    The classification is delegated to hf.classify_rank; the result is stacked
    with hf.stack and its last index level is dropped.
    """
    classified = hf.classify_rank(df_rank, thresholds)
    stacked = hf.stack(classified, col_name)
    return stacked.droplevel(level=-1)

In [5]:
def add_future_vars(df_data, df_buy, df_sell, dfs_ohlcv, **hyperparams):
    """Append the forward-looking price-ratio columns to `df_data`.

    Always adds the end, max and min ratios; when `sell_at_target` is truthy
    the "min before max" ratio is appended as well.

    Hyperparameters read: 'target_future_days', 'sell_at_target'.

    Returns:
        A new frame: `df_data` concatenated column-wise with the new outputs.
    """
    horizon = hyperparams.get('target_future_days')
    sell_at_target = hyperparams.get('sell_at_target')
    df_low = dfs_ohlcv['df_low']

    base_outputs = [
        get_future_end_var(df_buy, df_sell, horizon),
        get_future_max_var(df_buy, df_sell, horizon),
        # get_future_min_var adds 1 to its window, so the min spans `horizon` rows
        get_future_min_var(df_buy, df_low, horizon - 1),
    ]
    df_data = pd.concat([df_data, *base_outputs], axis='columns')

    if not sell_at_target:
        return df_data

    min_before_max = get_future_min_var_before_max(df_buy, df_sell, df_low, horizon)
    return pd.concat([df_data, min_before_max], axis='columns')

def add_output_loss_min_var(df, **hyperparams):
    """Add 'output_loss_min_var', the minimum ratio used for the loss-limit check.

    With a truthy `sell_at_target` the column is copied from
    'output_future_min_var_before_max', otherwise from 'output_future_min_var'.
    `df` is modified in place and returned.
    """
    if hyperparams.get('sell_at_target'):
        source_column = 'output_future_min_var_before_max'
    else:
        source_column = 'output_future_min_var'

    df['output_loss_min_var'] = df[source_column]
    return df

def add_output_is_loss_limit_reached(df, **hyperparams):
    """Flag rows whose 'output_loss_min_var' is at or below `loss_limit`.

    Writes the boolean column 'output_is_loss_limit_reached' into `df`
    (in place) and returns `df`. NaN ratios compare as False.
    """
    limit = hyperparams.get('loss_limit')
    min_ratio = df['output_loss_min_var']
    limit_reached = min_ratio <= limit
    df['output_is_loss_limit_reached'] = limit_reached
    return df

def add_output_var_class(df_data, **hyperparams):
    """Append the 'output_var_class' column to `df_data`.

    The class comes from classifying the max ratio (when `sell_at_target` is
    truthy) or the end ratio (otherwise) against `thresholds`. Rows where the
    loss limit was reached are forced into the last class, whose index is
    `len(thresholds)`.

    Returns:
        A new frame: `df_data` concatenated column-wise with the class column.
    """
    thresholds = hyperparams.get('thresholds')
    loss_class = len(thresholds)

    if hyperparams.get('sell_at_target'):
        ratio_column = 'output_future_max_var'
    else:
        ratio_column = 'output_future_end_var'
    var_class = classify_var(df_data[[ratio_column]], thresholds, 'output_var_class')

    # A stopped-out trade overrides whatever class the ratio alone would give.
    stopped_out = df_data['output_is_loss_limit_reached']
    var_class.loc[stopped_out, 'output_var_class'] = loss_class

    return pd.concat([df_data, var_class], axis='columns')

def add_output_is_buy(df, output_class_name, **hyperparams):
    """Add the boolean 'output_is_buy' column: True where the class is accepted.

    A row is a buy when `df[output_class_name]` is at most the second element
    of the `n_first_classes` hyperparameter. `df` is modified in place and
    returned.
    """
    highest_accepted_class = hyperparams.get('n_first_classes')[1]
    actual_class = df[output_class_name]
    df['output_is_buy'] = actual_class <= highest_accepted_class
    return df

def add_output_profit(df, fee, **hyperparams):
    """Add 'output_profit', the per-row return ratio after fees.

    Per row, the first matching rule wins:
      1. loss limit reached          -> `loss_limit`
      2. sell_at_target and the max ratio exceeded the accepted threshold
                                     -> that threshold
      3. otherwise                   -> 'output_future_end_var'
    The result is multiplied by the coefficient from hf.get_fee_coef(fee).

    Hyperparameters read: 'thresholds', 'n_first_classes', 'loss_limit',
    'sell_at_target'. `df` is modified in place and returned.
    """
    loss_limit = hyperparams.get('loss_limit')
    sell_at_target = hyperparams.get('sell_at_target')
    accepted_class = hyperparams.get('n_first_classes')[1]
    # Threshold of the highest accepted class doubles as the take-profit level.
    target_var = hyperparams.get('thresholds')[accepted_class]

    stopped_out = df['output_loss_min_var'] <= loss_limit
    target_hit = sell_at_target & (df['output_future_max_var'] > target_var)

    conditions = [stopped_out, target_hit]
    outcomes = [loss_limit, target_var]
    df['output_profit'] = np.select(conditions, outcomes, default=df['output_future_end_var'])

    df['output_profit'] *= hf.get_fee_coef(fee)

    return df

In [6]:
def add_future_rank(df_data, df_buy, **hyperparams):
    """Add 'output_future_end_rank' computed by inputs.get_rank.

    The rank is requested with `past_days=0` and `future_days` set to the
    'target_future_days' hyperparameter. `df_data` is modified in place and
    returned.
    """
    horizon = hyperparams.get('target_future_days')
    future_rank = inputs.get_rank(df_buy, past_days=0, future_days=horizon)
    df_data['output_future_end_rank'] = future_rank
    return df_data

def add_output_rank_class(df_data, **hyperparams):
    """Append the 'output_rank_class' column to `df_data`.

    The fractional 'rank_pct_thresholds' are converted to absolute ranks by
    multiplying with the module-level `num_tickers` and flooring, then
    'output_future_end_rank' is classified against them.

    Returns:
        A new frame: `df_data` concatenated column-wise with the class column.
    """
    pct_thresholds = np.array(hyperparams.get('rank_pct_thresholds'))
    absolute_thresholds = np.floor(pct_thresholds * num_tickers).astype(int)

    rank_class = classify_rank(
        df_data[['output_future_end_rank']], absolute_thresholds, 'output_rank_class'
    )
    return pd.concat([df_data, rank_class], axis='columns')

In [7]:
def get_df_data(hyperparams):
    """Build the full modelling table (inputs plus every output column).

    Input features are read from `transformed_data_path` when
    `use_saved_transformed_data` is set and the file exists; otherwise they
    are recomputed with inputs.get_inputs and written to that path. The
    output columns are always recomputed from `hyperparams`.

    Args:
        hyperparams: dict holding at least 'buying_time' and 'selling_time'
            plus the keys read by the add_* helpers.

    Returns:
        The table with rows containing any NaN removed.
    """
    df_buy = get_single_level_df(df, hyperparams['buying_time'])
    df_sell = get_single_level_df(df, hyperparams['selling_time'])
    dfs_ohlcv = get_ohlcv_dfs(df)

    # Check the flag first so the filesystem is only touched when caching is on.
    # NOTE(review): the cache is not keyed by 'buying_time'; a cached file is
    # reused even if it was built for a different buy price — confirm.
    if use_saved_transformed_data and os.path.exists(transformed_data_path):
        # NOTE: read_pickle can execute arbitrary code; only load trusted files.
        df_data = pd.read_pickle(transformed_data_path)
        print(f'using existing {transformed_data_path}')
    else:
        print(f'need to create {transformed_data_path}')

        # Create the target directory before the expensive computation so a
        # missing folder cannot discard the result at save time.
        cache_dir = os.path.dirname(transformed_data_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)

        df_data = inputs.get_inputs(df_buy, dfs_ohlcv)

        df_data.to_pickle(transformed_data_path)
        print(f'saved new {transformed_data_path}')

    # Variation-based outputs.
    df_data = add_future_vars(df_data, df_buy, df_sell, dfs_ohlcv, **hyperparams)
    df_data = add_output_loss_min_var(df_data, **hyperparams)
    df_data = add_output_is_loss_limit_reached(df_data, **hyperparams)
    df_data = add_output_var_class(df_data, **hyperparams)

    # Rank-based outputs.
    df_data = add_future_rank(df_data, df_buy, **hyperparams)
    df_data = add_output_rank_class(df_data, **hyperparams)

    # Targets derived from the selected class column.
    df_data = add_output_is_buy(df_data, output_class_name, **hyperparams)
    df_data = add_output_profit(df_data, fee, **hyperparams)

    df_data = df_data.dropna()

    return df_data

In [8]:
def get_dfs_input_output(df_data, output_class_name):
    """Split `df_data` into the feature frame and the single-column target frame.

    Features are all columns whose name starts with 'input_', in their
    original order.

    Returns:
        (df_input, df_output) where df_output holds only `output_class_name`.
    """
    input_columns = []
    for column in df_data.columns:
        if column.startswith('input_'):
            input_columns.append(column)

    return df_data[input_columns], df_data[[output_class_name]]

def get_test_train_data(df_input, df_output, test_size):
    """Split chronologically into train/test arrays and standardize the features.

    The last `test_size` rows form the test set, everything before them the
    training set. The scaler is fitted on the training features only, applied
    to both sets, and saved to './outputs/scaler.pkl'.

    Returns:
        dict with 'X_train', 'X_test', 'y_train', 'y_test'; the label arrays
        are flattened and cast to int.
    """
    train_features, test_features = df_input[:-test_size], df_input.tail(test_size)
    train_labels, test_labels = df_output[:-test_size], df_output.tail(test_size)

    X_train = train_features.values
    y_train = train_labels.values.ravel().astype(int)
    X_test = test_features.values
    y_test = test_labels.values.ravel().astype(int)

    # Fit on train only so no test-set statistics leak into the scaling.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    hf.save_object(scaler, './outputs/scaler.pkl')

    print(f"number of elements in y_train: {len(y_train)}")
    print(f"number of elements in y_test: {len(y_test)}")

    split = {'X_train': X_train, 'X_test': X_test}
    split['y_train'] = y_train
    split['y_test'] = y_test
    return split

def create_model(**kwargs):
    """Train the dense softmax classifier and save it to `model_path`.

    Keyword args:
        X_train, X_test: 2-D feature arrays (the test set is used as
            validation data during fitting).
        y_train, y_test: integer class labels.
        thresholds: class thresholds; the network has at least
            `len(thresholds) + 1` outputs.
        size_layer_1, size_layer_2, size_layer_3: hidden layer widths.
        dropout_rate: dropout applied after each hidden block.
        balance_data: when truthy, weight classes inversely to their
            frequency in `y_train`.
        batch_size: mini-batch size.

    Also reads the module-level `epochs` and `model_path`. Returns None; the
    trained model is only persisted to disk.
    """
    X_train = kwargs.get('X_train')
    X_test = kwargs.get('X_test')
    y_train = kwargs.get('y_train')
    y_test = kwargs.get('y_test')

    thresholds = kwargs.get('thresholds')

    hidden_sizes = (
        kwargs.get('size_layer_1'),
        kwargs.get('size_layer_2'),
        kwargs.get('size_layer_3'),
    )
    dropout_rate = kwargs.get('dropout_rate')
    balance_data = kwargs.get('balance_data')
    batch_size = kwargs.get('batch_size')

    last_layers_size = len(thresholds) + 1
    # The label column is chosen by `output_class_name` and may not be the one
    # built from `thresholds`; widen the output layer if the labels need it,
    # since sparse categorical cross-entropy requires label < number of outputs.
    highest_labels = [int(np.max(labels)) for labels in (y_train, y_test) if len(labels) > 0]
    if highest_labels:
        last_layers_size = max(last_layers_size, max(highest_labels) + 1)

    model = Sequential()
    # An explicit Input layer replaces `input_shape=` on the first Dense layer,
    # which recent Keras versions warn about.
    model.add(tf.keras.Input(shape=(X_train.shape[1],)))
    for layer_size in hidden_sizes:
        model.add(Dense(layer_size, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(dropout_rate))
    model.add(Dense(last_layers_size, activation='softmax'))

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # class_weight=None is the Keras default, so one fit call covers both cases.
    class_weights = None
    if balance_data:
        counter = Counter(y_train)
        max_count = max(counter.values())
        # Plain int keys: the labels come out of the array as numpy integers.
        class_weights = {int(cls): max_count / count for cls, count in counter.items()}

    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
              validation_data=(X_test, y_test), class_weight=class_weights)

    # Make sure the output folder exists so training time is not lost at save.
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    model.save(model_path)

def load_model(df_data, hyperparams):
    """Prepare the train/test split and return it with a model loaded from disk.

    A new model is trained (and saved by create_model) unless
    `use_saved_model` is set and `model_path` already exists. In both cases
    the returned model is read back from `model_path`.

    Returns:
        (test_train_data, model)
    """
    df_input, df_output = get_dfs_input_output(df_data, output_class_name)
    test_train_data = get_test_train_data(df_input, df_output, test_size)

    reuse_saved_model = os.path.exists(model_path) and use_saved_model
    if not reuse_saved_model:
        print(f'need to create {model_path}')
        training_kwargs = {**test_train_data, **hyperparams}
        create_model(**training_kwargs)
    else:
        print(f'using existing {model_path}')

    return test_train_data, tf.keras.models.load_model(model_path)

In [9]:
def slice_df_test(df_data, test_size):
    """Return the last `test_size` rows of `df_data` (the held-out test rows)."""
    test_rows = df_data.tail(test_size)
    return test_rows

def add_predictions(df, model, X_test, **hyperparams):
    """Attach the model's predictions for `X_test` to `df`.

    Columns written (in place):
        prediction_probs: per-row list of class probabilities.
        prediction_cumulated_probs: sum of the probabilities of classes
            0..n_first_classes[0] inclusive.
        prediction_is_buy: cumulated probability strictly above
            `cumulated_probs_target`.
        prediction_is_buy_is_correct: prediction equals 'output_is_buy'.

    Returns:
        `df` with the four columns added.
    """
    print(f'X_test shape: {X_test.shape}')

    last_summed_class = hyperparams['n_first_classes'][0]
    min_cumulated_prob = hyperparams['cumulated_probs_target']

    probabilities = np.array(model.predict(X_test))
    df['prediction_probs'] = probabilities.tolist()

    cumulated = []
    for class_probs in df['prediction_probs']:
        cumulated.append(sum(class_probs[:last_summed_class + 1]))
    df['prediction_cumulated_probs'] = cumulated

    df['prediction_is_buy'] = df['prediction_cumulated_probs'] > min_cumulated_prob
    df['prediction_is_buy_is_correct'] = df['output_is_buy'] == df['prediction_is_buy']

    return df

def get_class_cumulative_percentages(y_test):
    """Cumulative share of samples per class, indexed by class label.

    Element k of the result is the fraction of `y_test` whose label is <= k.
    Classes with no samples keep their slot (contributing 0), so position k
    always refers to class k; callers index the result by class label, and a
    result compacted to the classes actually present would shift that index
    whenever a class is missing.

    Args:
        y_test: array-like of non-negative integer class labels.

    Returns:
        1-D float array of length `max(y_test) + 1` (empty for empty input).

    Raises:
        ValueError: if a label is negative (raised by np.bincount).
    """
    labels = np.asarray(y_test).astype(int).ravel()

    # bincount yields one slot per label 0..max, already in label order.
    counts = np.bincount(labels)
    percentages = counts / len(labels)
    cumulative_percentages = np.cumsum(percentages)

    print(f'market cumulative % per class: {cumulative_percentages}')

    return cumulative_percentages

def get_market_rate(y_test, **hyperparams):
    """Share of test rows whose class is accepted as a buy (the base rate).

    Reads the highest accepted class from `n_first_classes[1]` and looks it
    up in the cumulative class percentages of `y_test`. The lookup assumes
    that position k of that array corresponds to class k.
    """
    highest_accepted_class = hyperparams['n_first_classes'][1]
    cumulative_by_class = get_class_cumulative_percentages(y_test)
    return cumulative_by_class[highest_accepted_class]

def get_binary_classification(df):
    """Confusion-matrix counts for the buy / no-buy decision.

    Compares 'output_is_buy' (actual) with 'prediction_is_buy' (predicted).

    Returns:
        dict with 'true_positives', 'true_negatives', 'false_positives',
        'false_negatives' and 'winning_rate' (the precision tp / (tp + fp),
        or 0 when nothing was predicted as a buy).
    """
    actual_buy = df['output_is_buy'].eq(True)
    actual_skip = df['output_is_buy'].eq(False)
    predicted_buy = df['prediction_is_buy'].eq(True)
    predicted_skip = df['prediction_is_buy'].eq(False)

    true_positives = (actual_buy & predicted_buy).sum()
    true_negatives = (actual_skip & predicted_skip).sum()
    false_positives = (actual_skip & predicted_buy).sum()
    false_negatives = (actual_buy & predicted_skip).sum()

    predicted_buy_total = true_positives + false_positives
    if predicted_buy_total > 0:
        winning_rate = float(true_positives / predicted_buy_total)
    else:
        winning_rate = 0

    return {
        'true_positives': true_positives, 'true_negatives': true_negatives,
        'false_positives': false_positives, 'false_negatives': false_negatives,
        'winning_rate': winning_rate
    }

def get_profits(df_prediction_is_buy):
    """Summarize 'output_profit' over the rows predicted as buys.

    Returns:
        dict with 'trimmed_average_profit' (hf.get_trimmed_average with
        pct_to_trim=0.03 and min_num_to_trim=8), 'average_profit' and
        'median_profit'.
    """
    profit = df_prediction_is_buy['output_profit']

    summary = {}
    summary['trimmed_average_profit'] = hf.get_trimmed_average(profit, pct_to_trim=0.03, min_num_to_trim=8)
    summary['average_profit'] = profit.mean()
    summary['median_profit'] = profit.median()
    return summary

def get_loss_limit_pct(df):
    """Fraction of rows in `df` where the loss limit was reached (0 if `df` is empty)."""
    row_count = len(df)
    if not row_count:
        return 0
    return df['output_is_loss_limit_reached'].sum() / row_count

def get_performance_score(trimmed_average_profit, is_buy_count):
    """Score a run from its trimmed average profit and number of buy signals.

    score = trimmed_average_profit ** 8 * min(is_buy_count, test_size / num_tickers)

    Args:
        trimmed_average_profit: average return ratio of the predicted buys.
        is_buy_count: number of rows predicted as buys.

    Returns:
        The score, or 0.0 when there are no buys or the profit is missing
        (None/NaN). Without this guard an empty selection yields a NaN score,
        which would then be handed to the optimizer as a NaN loss.
    """
    if is_buy_count <= 0 or pd.isna(trimmed_average_profit):
        return 0.0

    # Cap the count at the approximate number of trading days in the test set
    # so extra signals on the same day do not inflate the score.
    estimated_days = test_size / num_tickers
    # Exponent 8 strongly penalizes ratios below 1, e.g. 0.8^8 ~= 0.17.
    adjusted_profit = trimmed_average_profit ** 8
    performance_score = adjusted_profit * min(is_buy_count, estimated_days)

    return performance_score

def evaluate_model(df_data, model, test_train_data, hyperparams):
    """Score `model` on the held-out rows and return a flat dict of metrics.

    For a single grid run (no hyperopt and exactly one combination) the rows
    predicted as buys are also printed as markdown and exported to Excel.

    Returns:
        dict with the performance score, profit statistics, buy count,
        loss-limit share, market base rate, confusion-matrix counts and the
        winning rate relative to the market rate.
    """
    df_test = add_predictions(
        slice_df_test(df_data, test_size),
        model,
        test_train_data['X_test'],
        **hyperparams,
    )

    market_rate = get_market_rate(test_train_data['y_test'], **hyperparams)
    binary_classification = get_binary_classification(df_test)

    predicted_buys = df_test[df_test['prediction_is_buy'] == True]

    is_single_grid_run = not use_hyperopt and num_combinations == 1
    if is_single_grid_run:
        print(predicted_buys.to_markdown())
        predicted_buys.to_excel(f'./outputs/{hf.get_date()}_classifier_df_prediction_is_buy.xlsx')

    profits = get_profits(predicted_buys)
    buy_count = len(predicted_buys['output_profit'])
    loss_limit_share = get_loss_limit_pct(predicted_buys)
    score = get_performance_score(profits['trimmed_average_profit'], buy_count)

    winning_rate_vs_market = binary_classification['winning_rate'] - market_rate

    return {
        'performance_score': score,
        **profits,
        'prediction_is_buy_count': buy_count,
        'loss_limit_reached_pct': loss_limit_share,
        'market_rate': market_rate,
        **binary_classification,
        'winning_rate_vs_market': winning_rate_vs_market,
    }

In [10]:
from itertools import product

# Module-level run state, reset every time this cell is executed.
i = 0          # 1-based counter of the grid combination being evaluated
results = []   # one dict per evaluated configuration (metrics + hyperparameters)

def objective(hyperparams):
    """Hyperopt objective: build, train and evaluate one configuration.

    The scalar 'thresholds' and 'rank_pct_thresholds' samples are wrapped
    into one-element lists (modifying `hyperparams` in place). The metrics
    and hyperparameters are printed and appended to the module-level
    `results` list.

    Returns:
        The negated performance score, because hyperopt minimizes.
    """
    for scalar_key in ('thresholds', 'rank_pct_thresholds'):
        hyperparams[scalar_key] = [hyperparams[scalar_key]]

    df_data = get_df_data(hyperparams)
    test_train_data, model = load_model(df_data, hyperparams)
    metrics = evaluate_model(df_data, model, test_train_data, hyperparams)

    result = {**metrics, **hyperparams, 'epochs': epochs}
    print(result)
    results.append(result)

    return -result['performance_score']

# Run the search: hyperopt's TPE optimizer when use_hyperopt is set, otherwise
# an exhaustive sweep over the Cartesian product of the param_grid value lists.
if use_hyperopt:
    # NOTE(review): for hp.choice dimensions `best` presumably holds the index
    # of the chosen option rather than its value — confirm (hyperopt.space_eval).
    best = fmin(objective, search_space, algo=tpe.suggest, max_evals=hyperopt_n_iterations)
    print(f'best parameters: {best}')
else:
    for params in product(*param_grid.values()):
        i += 1
        # clear_output(wait=True) # clear printed outputs
        hf.print_combination(i, num_combinations)

        # Re-attach the parameter names to this combination's values.
        hyperparams = dict(zip(param_grid.keys(), params))

        df_data = get_df_data(hyperparams)
        test_train_data, model = load_model(df_data, hyperparams)
        performance_metrics = evaluate_model(df_data, model, test_train_data, hyperparams)

        # One flat record per combination: metrics, hyperparameters and epochs.
        result = {**performance_metrics, **hyperparams, 'epochs': epochs}
        print(result)
        results.append(result)


step: 1/1
need to create ./outputs/classifier_transformed_data.pkl
saved new ./outputs/classifier_transformed_data.pkl
number of elements in y_train: 1605684
number of elements in y_test: 60000
need to create ./outputs/classifier_model.keras


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/3
[1m50178/50178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 1ms/step - accuracy: 0.5824 - loss: 1.2756 - val_accuracy: 0.5527 - val_loss: 0.6751
Epoch 2/3
[1m50178/50178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 1ms/step - accuracy: 0.5957 - loss: 1.2369 - val_accuracy: 0.5312 - val_loss: 0.6809
Epoch 3/3
[1m50178/50178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 1ms/step - accuracy: 0.5905 - loss: 1.2341 - val_accuracy: 0.5850 - val_loss: 0.6796
X_test shape: (60000, 52)
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 411us/step
market cumulative % per class: [0.06546667 1.        ]
|                                                    |   input_var_past_90d_future_0d |   input_var_past_60d_future_0d |   input_var_past_30d_future_0d |   input_var_past_10d_future_0d |   input_var_past_5d_future_0d |   input_var_past_2d_future_0d |   input_var_past_1d_future_0d |   input_var_past_close_1d |   input_var_past_high_1d |   in

In [11]:
# Lift pandas' display limits so the results table is shown without truncation.
# Note that set_option changes the setting for the rest of the session.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# One row per evaluated configuration, best performance score first.
df_results = pd.DataFrame(results)
df_results = df_results.sort_values(by='performance_score', ascending=False)
df_results.head(1000)

Unnamed: 0,performance_score,trimmed_average_profit,average_profit,median_profit,prediction_is_buy_count,loss_limit_reached_pct,market_rate,true_positives,true_negatives,false_positives,false_negatives,winning_rate,winning_rate_vs_market,buying_time,selling_time,target_future_days,loss_limit,sell_at_target,size_layer_1,size_layer_2,size_layer_3,dropout_rate,balance_data,batch_size,n_first_classes,cumulated_probs_target,thresholds,rank_pct_thresholds,epochs
0,112.053346,1.01284,1.031456,0.99302,126,0.571429,0.065467,32,55978,94,3896,0.253968,0.188502,Open,Open,1,0.997,False,128,128,128,0.1,True,32,"[0, 0]",0.9,[1.0184],[0.45],3


In [12]:
# Export the sorted results table; the file name is prefixed with hf.get_date().
df_results.to_excel(f'./outputs/{hf.get_date()}_classifier_results.xlsx')