In [1]:
import utils.helper_functions as hf

import os
from IPython.display import display, clear_output

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from collections import Counter

import hyperopt
from hyperopt import hp, fmin, tpe

# --- Notebook-wide configuration -------------------------------------------
# Everything below is a module-level global that later cells read directly.

pd.options.mode.copy_on_write = True # avoid making unnecessary copies of DataFrames or Series
os.environ['PYDEVD_DISABLE_FILE_VALIDATION'] = '1' # disable file validation in the debugger

# Input database (read with pd.read_pickle) and cache/output locations.
db_path = './db/ohlcv_ntickers_593_2000-08-01_to_2023-12-23.pkl'
transformed_data_path = './outputs/classifier_transformed_data.pkl'
model_path = './outputs/classifier_model.keras'

# Trading fee; converted to a multiplicative coefficient by hf.get_fee_coef in add_output_profit.
fee = 0.002
# num_tickers = 1250

# Run switches: hyperopt search vs. exhaustive grid, and whether cached artefacts may be reused.
use_hyperopt = False
use_saved_transformed_data = False
use_saved_model = False

start_date = '2008-01-01' #'2013-01-01'
# Number of trailing rows of the stacked data set that are held out as the test set.
test_size = 60000
# Training epochs passed to model.fit.
epochs = 3
# max_evals handed to hyperopt's fmin.
hyperopt_n_iterations = 250

# Column used as the classification target.
output_class_name = 'output_var_class' #'output_var_class' or 'output_rank_class'

# Grid used when use_hyperopt is False: every value is a list of candidates and the
# driver cell iterates over the cartesian product of all lists.
#   - n_first_classes: [index 0] is the last class summed into the predicted buy
#     probability (add_predictions); [index 1] is the last class counted as an
#     actual buy (add_output_is_buy / add_output_profit / get_market_rate).
#   - thresholds / rank_pct_thresholds: each candidate is itself a list of class boundaries;
#     the network gets len(thresholds) + 1 output classes.
#   - loss_limit: compared against 'output_loss_min_var' to flag stop-loss rows.
#   - cumulated_probs_target: minimum cumulated probability required to predict a buy.
param_grid = {
    'buying_time': ['Open'], 'selling_time': ['Open'],
    'target_future_days': [1], 'loss_limit': [0.95,0.955,0.96,0.965,0.97,0.975,0.98,0.985,0.99],
    'sell_at_target': [False],
    'size_layer_1': [128], 'size_layer_2': [128], 'size_layer_3': [128],
    'dropout_rate': [0.075], 'balance_data': [True], 'batch_size': [32], #'dropout_rates': [i for i in list(np.arange(0, 0.3, 0.1))], 'batch_sizes': [32, 64, 128],
    'n_first_classes': [[0,0]],
    'cumulated_probs_target': [0.68],
    'thresholds': [[1.0065]],
    'rank_pct_thresholds': [[0.45]]
}
# Used for progress reporting and to decide whether per-trade details are exported.
num_combinations = hf.get_num_combinations(param_grid)

# Hyperopt search space, only sampled when `use_hyperopt` is True.
# Every hyperopt label matches its dictionary key so that the `best` dict returned
# by fmin can be read back unambiguously.
# 'thresholds' and 'rank_pct_thresholds' are sampled as scalars here; `objective`
# wraps each of them in a one-element list before running the pipeline.
search_space = {
    'buying_time': hp.choice('buying_time', ['Open']),
    'selling_time': hp.choice('selling_time', ['Open']),
    # Label was 'param', which made fmin report this value under a meaningless name.
    # Per the hyperopt docs, randint(label, low, high) draws integers from [low, high).
    'target_future_days': hp.randint('target_future_days', 1, 3),
    'loss_limit': hp.uniform('loss_limit', 0.96, 1),
    'sell_at_target': hp.choice('sell_at_target', [True, False]),
    'size_layer_1': hp.choice('size_layer_1', [128]),
    'size_layer_2': hp.choice('size_layer_2', [128]),
    'size_layer_3': hp.choice('size_layer_3', [128, 256]),
    'dropout_rate': hp.uniform('dropout_rate', 0, 0.3),
    'balance_data': hp.choice('balance_data', [True]),
    'batch_size': hp.choice('batch_size', [32, 64, 128]),
    'n_first_classes': hp.choice('n_first_classes', [[0, 0]]),
    'cumulated_probs_target': hp.uniform('cumulated_probs_target', 0.6, 1),
    'thresholds': hp.uniform('thresholds', 1.005, 1.1),
    # 'thresholds': hp.choice('thresholds', [[1.08, 1.04, 1.02, 1], [1.06, 1.03, 1.01], [1.05, 1.025, 1], [1.1, 1.05, 1.01]]),
    'rank_pct_thresholds': hp.uniform('rank_pct_thresholds', 0.002, 0.5),
    #'rank_pct_thresholds': hp.choice('rank_pct_thresholds', [[0.08, 0.2, 0.33], [0.15, 0.3, 0.5]])
}

number of combinations: 9


In [2]:
# Load the raw OHLCV table and keep only the rows selected by hf.get_rows_after_date
# for `start_date`. NOTE: unpickling executes arbitrary code, so only point `db_path`
# at files you trust.
df = pd.read_pickle(db_path)
df = hf.get_rows_after_date(df, start_date)

def get_single_level_df(df, ohlcv):
    """Select one top-level column group (e.g. 'Open') and flatten its header.

    Args:
        df: Source frame; `df[[ohlcv]]` must be a valid selection.
        ohlcv: Name of the top-level column to extract.

    Returns:
        The result of hf.remove_top_column_name applied to the selection.
    """
    return hf.remove_top_column_name(df[[ohlcv]])

def get_ohlcv_dfs(df):
    """Split the combined frame into one single-level frame per OHLCV field.

    Returns:
        Dict with keys 'df_open', 'df_high', 'df_low', 'df_close' and 'df_volume'
        (in that order), each produced by get_single_level_df.
    """
    fields = ('Open', 'High', 'Low', 'Close', 'Volume')
    return {f'df_{field.lower()}': get_single_level_df(df, field) for field in fields}

# Ticker count as reported by hf.get_num_tickers for the 'Open' frame. Kept as a
# module-level global: add_output_rank_class and get_performance_score read it.
num_tickers = hf.get_num_tickers(get_single_level_df(df, 'Open'))
print(f'number of tickers: {num_tickers}')


number of tickers: 593


In [3]:
def calculate_var(df, past_days, future_days):
    """Stack the output of hf.calculate_variations as one named column.

    The column is named 'input_var_past_{past_days}d_future_{future_days}d'.
    """
    column_name = f'input_var_past_{past_days}d_future_{future_days}d'
    return hf.stack(hf.calculate_variations(df, past_days, future_days), column_name)

def calculate_var_vs_past_ohlcv(df, df_past, past_days, title):
    """Ratio of `df` to `df_past` shifted down by `past_days` rows, stacked.

    Args:
        df: Numerator frame.
        df_past: Frame whose values from `past_days` rows earlier form the denominator.
        past_days: Number of rows to shift `df_past` by.
        title: Label embedded in the column name 'input_var_past_{title}_{past_days}d'.
    """
    ratio = df / df_past.shift(past_days)
    return hf.stack(ratio, f'input_var_past_{title}_{past_days}d')

def calculate_volume_var(df_volume, past_start_day, past_end_day):
    """Ratio between the volume `past_end_day` rows ago and `past_start_day` rows ago.

    Side effect: `df_volume` is cleaned IN PLACE (zeros and +/-inf become NaN), so
    the caller's frame stays modified after the call. Later feature builders that
    receive the same frame therefore see the cleaned values.

    Returns:
        The stacked ratio, named 'input_volume_var_{past_start_day}-{past_end_day}d'.
    """
    # Zero volume would produce divisions by zero; infinities are treated as missing too.
    df_volume.replace(0, np.nan, inplace=True)
    df_volume.replace([np.inf, -np.inf], np.nan, inplace=True)

    recent_volume = df_volume.shift(past_end_day)
    older_volume = df_volume.shift(past_start_day)

    return hf.stack(recent_volume / older_volume,
                    f'input_volume_var_{past_start_day}-{past_end_day}d')

def calculate_market_var(df, past_days):
    """Stack hf.calculate_market_variations as 'input_market_var_{past_days}d'."""
    return hf.stack(hf.calculate_market_variations(df, past_days),
                    f'input_market_var_{past_days}d')

def min_max_var(df, past_days):
    """Current value relative to its rolling minimum and rolling maximum.

    The rolling window spans `past_days + 1` rows (the current row included) and
    accepts partial windows at the start of the frame (min_periods=1).

    Returns:
        Tuple (min_var_stacked, max_var_stacked) named
        'input_min_var_past_{past_days}d' and 'input_max_var_past_{past_days}d'.
    """
    window = df.rolling(window=past_days + 1, min_periods=1)

    vs_lowest = hf.stack(df / window.min(), f'input_min_var_past_{past_days}d')
    vs_highest = hf.stack(df / window.max(), f'input_max_var_past_{past_days}d')

    return vs_lowest, vs_highest

def get_future_end_var(df_buy, df_sell, future_days):
    """Ratio of the selling price `future_days` rows ahead to the buying price.

    Returns:
        The stacked ratio, named 'output_future_end_var'.
    """
    sell_price_at_end = df_sell.shift(-future_days)
    return hf.stack(sell_price_at_end / df_buy, 'output_future_end_var')

def get_future_max_var(df_buy, df_sell, future_days):
    """Ratio of hf.get_future_rolling_max(df_sell, future_days + 1) to the buying price.

    Returns:
        The stacked ratio, named 'output_future_max_var'.
    """
    highest_ahead = hf.get_future_rolling_max(df_sell, future_days + 1)
    return hf.stack(highest_ahead / df_buy, 'output_future_max_var')

def get_future_min_var(df_buy, df_low, future_days):
    """Ratio of hf.get_future_rolling_min(df_low, future_days + 1) to the buying price.

    Returns:
        The stacked ratio, named 'output_future_min_var'.
    """
    lowest_ahead = hf.get_future_rolling_min(df_low, future_days + 1)
    return hf.stack(lowest_ahead / df_buy, 'output_future_min_var')

def get_future_min_var_before_max(df_buy, df_sell, df_low, future_days):
    """Build 'output_future_min_var_before_max': a future low relative to the buy price.

    For every cell of `df_low`, hf.get_future_rolling_min_value is called with the
    row position, the column position, the positional-index copy of `df_low` and the
    positions returned by hf.get_future_rolling_max_position(df_sell, future_days).
    Judging by the helper names this is the lowest low reached before the future
    maximum of the selling price -- TODO confirm against the helper implementation.

    Args:
        df_buy: Buying prices; its index is re-applied to the result and it is the divisor.
        df_sell: Selling prices used to locate the future maximum.
        df_low: Low prices scanned for the minimum.
        future_days: Look-ahead handed to hf.get_future_rolling_max_position.

    Returns:
        The stacked ratio, named 'output_future_min_var_before_max'.
    """
    # NOTE(review): get_future_max_var uses a window of future_days + 1 while this call
    # passes future_days unchanged -- confirm both helpers share the same convention.
    rolling_max_positions = hf.get_future_rolling_max_position(df_sell, future_days)

    # Work on positional row labels so `row` below is an integer position.
    # Rebinding the name leaves the caller's frame untouched.
    df_low = df_low.reset_index(drop=True)
    # Cell-by-cell Python-level evaluation: one helper call per (row, column) pair.
    rolling_min = df_low.apply(lambda col: col.index.map(
            lambda row: hf.get_future_rolling_min_value(row, df_low.columns.get_loc(col.name), df_low, rolling_max_positions)
        ))
    # Restore the original row labels so the division aligns with df_buy.
    rolling_min.index = df_buy.index
    
    var = rolling_min / df_buy
    var_stacked = hf.stack(var, f'output_future_min_var_before_max')

    return var_stacked

def days_since_min_max(df, past_days):
    """Stack hf.get_days_since_min and hf.get_days_since_max for a `past_days` look-back.

    Returns:
        Tuple (days_since_min_stacked, days_since_max_stacked) named
        'input_days_since_min_{past_days}d' and 'input_days_since_max_{past_days}d'.
    """
    since_low = hf.stack(hf.get_days_since_min(df, past_days),
                         f'input_days_since_min_{past_days}d')
    since_high = hf.stack(hf.get_days_since_max(df, past_days),
                          f'input_days_since_max_{past_days}d')

    return since_low, since_high

def get_volatility(df, past_days):
    """Stack hf.calculate_volatility(df, past_days) as 'input_volatility_{past_days}d'."""
    return hf.stack(hf.calculate_volatility(df, past_days),
                    f'input_volatility_{past_days}d')

def get_market_volatility(df, past_days):
    """Volatility of the hf.calculate_averages(df) series, stacked.

    The column is named 'input_market_volatility_{past_days}d'.
    """
    market_volatility = hf.calculate_volatility(hf.calculate_averages(df), past_days)
    return hf.stack(market_volatility, f'input_market_volatility_{past_days}d')

def get_volume_volability(df, past_days):
    """Stack hf.calculate_volatility(df, past_days) as 'input_volume_volatility_{past_days}d'.

    Same computation as get_volatility; only the column name differs. Callers in
    this notebook pass the volume frame.
    """
    return hf.stack(hf.calculate_volatility(df, past_days),
                    f'input_volume_volatility_{past_days}d')

def get_n_ups(df, past_days):
    """Stack hf.calculate_n_ups(df, past_days) as 'input_n_ups_{past_days}d'."""
    return hf.stack(hf.calculate_n_ups(df, past_days), f'input_n_ups_{past_days}d')

def get_rank(df, past_days, future_days):
    """Stack hf.calculate_rank as an input (past) or output (future) column.

    Args:
        df: Price frame handed to hf.calculate_rank.
        past_days: Look-back; must be 0 when a future rank is requested.
        future_days: Look-ahead; 0 yields the input column 'input_rank_{past_days}d',
            otherwise (with past_days == 0) 'output_rank_{future_days}d'.

    Raises:
        ValueError: If both `past_days` and `future_days` are non-zero.
    """
    ranks = hf.calculate_rank(df, past_days, future_days)

    # The future_days check comes first, so (0, 0) is treated as an input rank.
    if future_days == 0:
        column_name = f'input_rank_{past_days}d'
    elif past_days == 0:
        column_name = f'output_rank_{future_days}d'
    else:
        raise ValueError('Either past_days or future_days must be 0')

    return hf.stack(ranks, column_name)

def get_performance_vs_market(df, past_days):
    """Stack hf.calculate_performance_vs_market as 'input_perf_vs_market_{past_days}d'."""
    return hf.stack(hf.calculate_performance_vs_market(df, past_days),
                    f'input_perf_vs_market_{past_days}d')

def classify_var(df_var, thresholds, col_name):
    """Classify variations with hf.classify_var and return them as column `col_name`.

    The classified frame is stacked and the innermost index level created by the
    stacking is dropped again.
    """
    classes = hf.classify_var(df_var, thresholds)
    return hf.stack(classes, col_name).droplevel(level=-1)

def classify_rank(df_rank, thresholds, col_name):
    """Classify ranks with hf.classify_rank and return them as column `col_name`.

    The classified frame is stacked and the innermost index level created by the
    stacking is dropped again.
    """
    classes = hf.classify_rank(df_rank, thresholds)
    return hf.stack(classes, col_name).droplevel(level=-1)


In [4]:
def get_inputs(df_buy, dfs_ohlcv):
    """Assemble every model input feature into one frame.

    Feature families are built for a fixed set of look-back horizons and
    concatenated column-wise. The column order is part of the contract, since
    downstream code turns the 'input_*' columns into the feature matrix in this order.
    Market-wide variation and market volatility features exist as helpers
    (calculate_market_var, get_market_volatility) but are not part of the set.

    Args:
        df_buy: Prices at the buying time.
        dfs_ohlcv: Dict from get_ohlcv_dfs ('df_open', 'df_high', 'df_low',
            'df_close', 'df_volume'). The volume frame is cleaned in place by
            calculate_volume_var.

    Returns:
        Frame of input features with every row containing a NaN removed.
    """
    price_vars = [
        calculate_var(df_buy, past_days=days, future_days=0)
        for days in (90, 60, 30, 10, 5, 2, 1)
    ]

    # Buy price versus the previous row's close / high / low.
    previous_day_vars = [
        calculate_var_vs_past_ohlcv(df_buy, dfs_ohlcv[f'df_{field}'], past_days=1, title=field)
        for field in ('close', 'high', 'low')
    ]

    # The first call cleans the volume frame in place; later calls see the cleaned data.
    # The 2-day column intentionally precedes the 3-day one (existing column order).
    df_volume = dfs_ohlcv['df_volume']
    volume_vars = [
        calculate_volume_var(df_volume, past_start_day=start_day, past_end_day=1)
        for start_day in (90, 60, 30, 10, 2, 3)
    ]

    min_max_pairs = [min_max_var(df_buy, past_days=days) for days in (90, 30, 10, 5, 2)]
    min_vars = [pair[0] for pair in min_max_pairs]
    max_vars = [pair[1] for pair in min_max_pairs]

    days_since_pairs = [days_since_min_max(df_buy, past_days=days) for days in (30, 10)]
    days_since_mins = [pair[0] for pair in days_since_pairs]
    days_since_maxs = [pair[1] for pair in days_since_pairs]

    volatilities = [get_volatility(df_buy, past_days=days) for days in (30, 10, 2)]

    # Must run after the volume variations so it receives the cleaned volume frame.
    volume_volatilities = [
        get_volume_volability(df_volume, past_days=days) for days in (90, 30, 10, 2)
    ]

    n_ups = [get_n_ups(df_buy, past_days=days) for days in (90, 30, 5)]

    ranks = [
        get_rank(df_buy, past_days=days, future_days=0)
        for days in (90, 30, 10, 5, 2, 1)
    ]

    perfs_vs_market = [
        get_performance_vs_market(df_buy, past_days=days)
        for days in (90, 30, 10, 5, 2, 1)
    ]

    feature_frames = (
        price_vars
        + previous_day_vars
        + volume_vars
        + min_vars + max_vars
        + days_since_mins + days_since_maxs
        + volatilities
        + volume_volatilities
        + n_ups
        + ranks
        + perfs_vs_market
    )

    return pd.concat(feature_frames, axis='columns').dropna()

In [5]:
def add_future_vars(df_data, df_buy, df_sell, dfs_ohlcv, **hyperparams):
    """Append the forward-looking price ratios used to build the targets.

    Adds 'output_future_end_var', 'output_future_max_var' and
    'output_future_min_var'; when hyperparams['sell_at_target'] is truthy,
    'output_future_min_var_before_max' is appended as well.

    Hyperparams read: 'target_future_days', 'sell_at_target'.

    Returns:
        A new frame: `df_data` concatenated column-wise with the new columns.
    """
    horizon = hyperparams.get('target_future_days')
    df_low = dfs_ohlcv['df_low']

    end_var = get_future_end_var(df_buy, df_sell, horizon)
    max_var = get_future_max_var(df_buy, df_sell, horizon)
    # get_future_min_var adds 1 to its window itself, hence horizon - 1 here.
    min_var = get_future_min_var(df_buy, df_low, horizon - 1)

    df_data = pd.concat([df_data, end_var, max_var, min_var], axis='columns')

    if not hyperparams.get('sell_at_target'):
        return df_data

    min_before_max = get_future_min_var_before_max(df_buy, df_sell, df_low, horizon)
    return pd.concat([df_data, min_before_max], axis='columns')

def add_output_loss_min_var(df, **hyperparams):
    """Add 'output_loss_min_var', the low that is compared against the loss limit.

    With hyperparams['sell_at_target'] truthy the column copies
    'output_future_min_var_before_max'; otherwise it copies 'output_future_min_var'.

    Returns:
        The same frame `df`, with the column added.
    """
    if hyperparams.get('sell_at_target'):
        source_column = 'output_future_min_var_before_max'
    else:
        source_column = 'output_future_min_var'

    df['output_loss_min_var'] = df[source_column]
    return df

def add_output_is_loss_limit_reached(df, **hyperparams):
    """Flag rows whose 'output_loss_min_var' is at or below hyperparams['loss_limit'].

    Returns:
        The same frame `df`, with boolean column 'output_is_loss_limit_reached' added.
    """
    limit = hyperparams.get('loss_limit')
    limit_reached = df['output_loss_min_var'] <= limit

    df['output_is_loss_limit_reached'] = limit_reached
    return df

def add_output_var_class(df_data, **hyperparams):
    """Append the 'output_var_class' target column.

    The class comes from classify_var on 'output_future_max_var' (when
    hyperparams['sell_at_target'] is truthy) or on 'output_future_end_var'.
    Rows flagged in 'output_is_loss_limit_reached' are forced into the last
    class, whose index is len(thresholds).

    Returns:
        A new frame: `df_data` concatenated column-wise with the class column.
    """
    thresholds = hyperparams.get('thresholds')
    stop_loss_class = len(thresholds)

    source_column = ('output_future_max_var' if hyperparams.get('sell_at_target')
                     else 'output_future_end_var')
    var_class = classify_var(df_data[[source_column]], thresholds, 'output_var_class')

    # A triggered stop-loss overrides whatever class the price variation earned.
    stop_loss_rows = df_data['output_is_loss_limit_reached']
    var_class.loc[stop_loss_rows, 'output_var_class'] = stop_loss_class

    return pd.concat([df_data, var_class], axis='columns')

def add_output_is_buy(df, output_class_name, **hyperparams):
    """Flag rows whose target class counts as a buy.

    A row is a buy when its `output_class_name` value is at most
    hyperparams['n_first_classes'][1].

    Returns:
        The same frame `df`, with boolean column 'output_is_buy' added.
    """
    last_accepted_class = hyperparams.get('n_first_classes')[1]
    is_buy = df[output_class_name] <= last_accepted_class

    df['output_is_buy'] = is_buy
    return df

def add_output_profit(df, fee, **hyperparams):
    """Add 'output_profit': the fee-adjusted outcome of buying on each row.

    Outcome per row, first matching rule wins:
      1. 'output_loss_min_var' <= loss_limit       -> loss_limit
      2. sell_at_target and 'output_future_max_var' > accepted threshold
                                                   -> accepted threshold
      3. otherwise                                 -> 'output_future_end_var'
    The accepted threshold is thresholds[n_first_classes[1]]. The outcome is then
    multiplied by hf.get_fee_coef(fee).

    Returns:
        The same frame `df`, with the column added.
    """
    loss_limit = hyperparams.get('loss_limit')
    target_var = hyperparams.get('thresholds')[hyperparams.get('n_first_classes')[1]]

    stopped_out = df['output_loss_min_var'] <= loss_limit
    target_hit = hyperparams.get('sell_at_target') & (df['output_future_max_var'] > target_var)

    # np.select checks the conditions in order, so the stop-loss wins over the target.
    df['output_profit'] = np.select(
        [stopped_out, target_hit],
        [loss_limit, target_var],
        default=df['output_future_end_var'],
    )
    df['output_profit'] = df['output_profit'] * hf.get_fee_coef(fee)

    return df

In [6]:
def add_future_rank(df_data, df_buy, **hyperparams):
    """Add 'output_future_end_rank' from get_rank over hyperparams['target_future_days'].

    Returns:
        The same frame `df_data`, with the column added.
    """
    horizon = hyperparams.get('target_future_days')
    future_rank = get_rank(df_buy, past_days=0, future_days=horizon)

    df_data['output_future_end_rank'] = future_rank
    return df_data

def add_output_rank_class(df_data, **hyperparams):
    """Append the 'output_rank_class' target column.

    hyperparams['rank_pct_thresholds'] holds fractions of the ticker universe; each
    is multiplied by the module-level `num_tickers` and floored to an integer rank
    before 'output_future_end_rank' is classified with classify_rank.

    Returns:
        A new frame: `df_data` concatenated column-wise with the class column.
    """
    pct_thresholds = np.array(hyperparams.get('rank_pct_thresholds'))
    absolute_thresholds = np.floor(pct_thresholds * num_tickers).astype(int)

    rank_class = classify_rank(df_data[['output_future_end_rank']],
                               absolute_thresholds, 'output_rank_class')

    return pd.concat([df_data, rank_class], axis='columns')

In [7]:
def get_df_data(hyperparams):
    """Build the full modelling data set (inputs + targets) for one hyperparameter set.

    Input features are either loaded from `transformed_data_path` (when the file
    exists and `use_saved_transformed_data` is True) or recomputed with get_inputs
    and written to that path. Target columns are always recomputed, since they
    depend on the hyperparameters.

    Reads the module-level globals `df`, `transformed_data_path`,
    `use_saved_transformed_data`, `output_class_name` and `fee`.

    Args:
        hyperparams: Dict with at least 'buying_time' and 'selling_time', plus the
            keys consumed by the add_* helpers.

    Returns:
        Frame with 'input_*' and 'output_*' columns, rows containing NaN removed.
    """
    df_buy = get_single_level_df(df, hyperparams['buying_time'])
    df_sell = get_single_level_df(df, hyperparams['selling_time'])
    dfs_ohlcv = get_ohlcv_dfs(df)

    if os.path.exists(transformed_data_path) and use_saved_transformed_data:
        # NOTE: the cache is not keyed by 'buying_time'; a file saved for another
        # buying time is reused as-is. Only unpickle files you trust.
        df_data = pd.read_pickle(transformed_data_path)
        print(f'using existing {transformed_data_path}')
    else:
        print(f'need to create {transformed_data_path}')

        # Create the output folder up front so a missing directory cannot make the
        # save fail after the expensive feature computation has already run.
        output_dir = os.path.dirname(transformed_data_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        df_data = get_inputs(df_buy, dfs_ohlcv)

        df_data.to_pickle(transformed_data_path)
        print(f'saved new {transformed_data_path}')

    # Targets based on price variations.
    df_data = add_future_vars(df_data, df_buy, df_sell, dfs_ohlcv, **hyperparams)
    df_data = add_output_loss_min_var(df_data, **hyperparams)
    df_data = add_output_is_loss_limit_reached(df_data, **hyperparams)
    df_data = add_output_var_class(df_data, **hyperparams)

    # Targets based on future rank.
    df_data = add_future_rank(df_data, df_buy, **hyperparams)
    df_data = add_output_rank_class(df_data, **hyperparams)

    # Buy flag and simulated profit; both rely on the columns added above.
    df_data = add_output_is_buy(df_data, output_class_name, **hyperparams)
    df_data = add_output_profit(df_data, fee, **hyperparams)

    # The column-wise concatenations leave NaN wherever inputs or targets are undefined.
    df_data = df_data.dropna()

    return df_data

In [8]:
def get_dfs_input_output(df_data, output_class_name):
    """Split the data set into feature columns and the target column.

    Returns:
        Tuple (df_input, df_output): every column whose name starts with 'input_'
        (original order), and a one-column frame holding `output_class_name`.
    """
    feature_columns = list(filter(lambda name: name.startswith('input_'), df_data.columns))

    return df_data[feature_columns], df_data[[output_class_name]]

def get_test_train_data(df_input, df_output, test_size):
    """Split into train/test by position and standardise the features.

    The last `test_size` rows form the test set and everything before them the
    training set. The StandardScaler is fitted on the training features only and
    then applied to the test features.

    Returns:
        Dict with keys 'X_train', 'X_test', 'y_train', 'y_test'; the targets are
        flattened integer arrays.
    """
    def to_labels(frame):
        # Flatten the one-column target frame into a 1-D integer vector.
        return frame.values.ravel().astype(int)

    train_features = df_input[:-test_size].values
    train_labels = to_labels(df_output[:-test_size])

    test_features = df_input.tail(test_size).values
    test_labels = to_labels(df_output.tail(test_size))

    feature_scaler = StandardScaler()
    train_features = feature_scaler.fit_transform(train_features)
    test_features = feature_scaler.transform(test_features)

    print(f"number of elements in y_train: {len(train_labels)}")
    print(f"number of elements in y_test: {len(test_labels)}")

    return {
        'X_train': train_features,
        'X_test': test_features,
        'y_train': train_labels,
        'y_test': test_labels,
    }

def create_model(**kwargs):
    """Build, train and save the dense classifier.

    The network is three Dense(relu) -> BatchNormalization -> Dropout stages
    followed by a softmax layer with len(thresholds) + 1 units. It is trained for
    the module-level `epochs` and written to the module-level `model_path`.

    Keyword Args:
        X_train, y_train: Training features and integer class labels.
        X_test, y_test: Hold-out data, used as validation data during fitting.
        thresholds: Class boundaries; determines the number of output classes.
        size_layer_1, size_layer_2, size_layer_3: Units of the hidden layers.
        dropout_rate: Dropout rate applied after every hidden stage.
        balance_data: When truthy, classes are weighted by max_count / class_count.
        batch_size: Mini-batch size.

    Returns:
        None. The trained model is only persisted to `model_path`.
    """
    X_train = kwargs.get('X_train')
    X_test = kwargs.get('X_test')
    y_train = kwargs.get('y_train')
    y_test = kwargs.get('y_test')

    thresholds = kwargs.get('thresholds')

    hidden_layer_sizes = (
        kwargs.get('size_layer_1'),
        kwargs.get('size_layer_2'),
        kwargs.get('size_layer_3'),
    )
    dropout_rate = kwargs.get('dropout_rate')
    balance_data = kwargs.get('balance_data')
    batch_size = kwargs.get('batch_size')

    last_layers_size = len(thresholds) + 1

    model = Sequential()

    # Declare the input explicitly: passing `input_shape` to the first Dense layer
    # makes recent Keras versions emit a UserWarning on every model build.
    model.add(tf.keras.Input(shape=(X_train.shape[1],)))
    for layer_size in hidden_layer_sizes:
        model.add(Dense(layer_size, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(dropout_rate))
    model.add(Dense(last_layers_size, activation='softmax'))

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    fit_options = {}
    if balance_data:
        # Weight every class inversely to its frequency; the largest class gets weight 1.
        counter = Counter(y_train)
        max_count = max(counter.values())
        fit_options['class_weight'] = {cls: max_count / count for cls, count in counter.items()}

    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
              validation_data=(X_test, y_test), **fit_options)

    # Make sure the destination folder exists so training time is not lost on save.
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    model.save(model_path)

def load_model(df_data, hyperparams):
    """Prepare the train/test split and return it with a model loaded from disk.

    A new model is trained (and saved by create_model) unless `model_path` exists
    and `use_saved_model` is True. In both cases the model that is returned is the
    one read back from `model_path`.

    Returns:
        Tuple (test_train_data, model).
    """
    features, target = get_dfs_input_output(df_data, output_class_name)
    test_train_data = get_test_train_data(features, target, test_size)

    reuse_saved = os.path.exists(model_path) and use_saved_model
    if reuse_saved:
        print(f'using existing {model_path}')
    else:
        print(f'need to create {model_path}')
        training_kwargs = {**test_train_data, **hyperparams}
        create_model(**training_kwargs)

    return test_train_data, tf.keras.models.load_model(model_path)

In [9]:
def slice_df_test(df_data, test_size):
    """Return the last `test_size` rows of `df_data` (the hold-out part)."""
    test_rows = df_data.tail(test_size)
    return test_rows

def add_predictions(df, model, X_test, **hyperparams):
    """Attach the model's predictions and the derived buy decision to `df`.

    Columns added:
      - 'prediction_probs': per-row list of class probabilities from model.predict.
      - 'prediction_cumulated_probs': sum of the probabilities of classes
        0..n_first_classes[0].
      - 'prediction_is_buy': cumulated probability > cumulated_probs_target.
      - 'prediction_is_buy_is_correct': prediction equals 'output_is_buy'.

    Returns:
        The same frame `df`, with the columns added.
    """
    last_buy_class = hyperparams['n_first_classes'][0]
    min_cumulated_prob = hyperparams['cumulated_probs_target']

    class_probs = np.array(model.predict(X_test))
    df['prediction_probs'] = class_probs.tolist()

    cumulated = []
    for probs in df['prediction_probs']:
        cumulated.append(sum(probs[:last_buy_class + 1]))
    df['prediction_cumulated_probs'] = cumulated

    df['prediction_is_buy'] = df['prediction_cumulated_probs'] > min_cumulated_prob
    df['prediction_is_buy_is_correct'] = df['output_is_buy'] == df['prediction_is_buy']

    return df

def get_class_cumulative_percentages(y_test):
    """Cumulative share of each class present in `y_test`, in ascending class order.

    Prints the result. Classes absent from `y_test` have no entry, so positions only
    match class labels when every class up to the one of interest occurs.

    Returns:
        1-D array of cumulative fractions; the last entry is 1.0.
    """
    # np.unique returns the labels sorted, so the counts are already in class order.
    _, class_counts = np.unique(y_test, return_counts=True)
    cumulative_percentages = np.cumsum(class_counts / len(y_test))

    print(f'market cumulative % per class: {cumulative_percentages}')

    return cumulative_percentages

def get_market_rate(y_test, **hyperparams):
    """Share of test rows whose class is within the accepted (buy) classes.

    Reads position hyperparams['n_first_classes'][1] of the cumulative class
    percentages returned by get_class_cumulative_percentages.
    """
    last_accepted_class = hyperparams['n_first_classes'][1]
    cumulative_shares = get_class_cumulative_percentages(y_test)

    return cumulative_shares[last_accepted_class]

def get_binary_classification(df):
    """Confusion-matrix counts of 'prediction_is_buy' against 'output_is_buy'.

    Returns:
        Dict with 'true_positives', 'true_negatives', 'false_positives',
        'false_negatives' and 'winning_rate' (tp / (tp + fp), or 0 when no buy
        was predicted).
    """
    actual_buy = df['output_is_buy'] == True
    actual_skip = df['output_is_buy'] == False
    predicted_buy = df['prediction_is_buy'] == True
    predicted_skip = df['prediction_is_buy'] == False

    tp = (actual_buy & predicted_buy).sum()
    tn = (actual_skip & predicted_skip).sum()
    fp = (actual_skip & predicted_buy).sum()
    fn = (actual_buy & predicted_skip).sum()

    predicted_buy_count = tp + fp
    if predicted_buy_count > 0:
        winning_rate = float(tp / predicted_buy_count)
    else:
        winning_rate = 0

    return {
        'true_positives': tp, 'true_negatives': tn,
        'false_positives': fp, 'false_negatives': fn,
        'winning_rate': winning_rate
    }

def get_profits(df_prediction_is_buy):
    """Summary statistics of 'output_profit' over the predicted buys.

    Returns:
        Dict with 'trimmed_average_profit' (hf.get_trimmed_average with
        pct_to_trim=0.03 and min_num_to_trim=8), 'average_profit' and
        'median_profit'.
    """
    profit = df_prediction_is_buy['output_profit']

    return {
        'trimmed_average_profit': hf.get_trimmed_average(profit, pct_to_trim=0.03, min_num_to_trim=8),
        'average_profit': profit.mean(),
        'median_profit': profit.median()
    }

def get_loss_limit_pct(df):
    """Fraction of rows with 'output_is_loss_limit_reached' set; 0 for an empty frame."""
    row_count = len(df)
    if row_count > 0:
        return df['output_is_loss_limit_reached'].sum() / row_count
    return 0

def get_performance_score(trimmed_average_profit, is_buy_count):
    """Score a run from its trimmed average profit and its number of predicted buys.

    The profit is raised to the 8th power, which widens the gap between profits
    below and above 1 (e.g. 0.8 ** 8 is about 0.17). The buy count is capped at
    test_size / num_tickers (both module-level globals), so a run gains nothing
    from buys beyond that cap.
    """
    buy_count_cap = test_size / num_tickers
    amplified_profit = trimmed_average_profit ** 8

    return amplified_profit * min(is_buy_count, buy_count_cap)

def evaluate_model(df_data, model, test_train_data, hyperparams):
    """Score the model on the hold-out rows and collect every reported metric.

    When a single grid combination is run without hyperopt, the predicted buys are
    also exported to a dated Excel file under ./outputs.

    Returns:
        Dict of metrics: the performance score, profit statistics, number of
        predicted buys, stop-loss share, market rate, confusion-matrix counts and
        the winning rate relative to the market rate.
    """
    holdout = slice_df_test(df_data, test_size)
    holdout = add_predictions(holdout, model, test_train_data['X_test'], **hyperparams)

    market_rate = get_market_rate(test_train_data['y_test'], **hyperparams)
    confusion = get_binary_classification(holdout)

    predicted_buys = holdout[(holdout['prediction_is_buy'] == True)]

    export_details = num_combinations == 1 and not use_hyperopt
    if export_details:
        predicted_buys.to_excel(f'./outputs/{hf.get_date()}_classifier_df_prediction_is_buy.xlsx')

    profit_stats = get_profits(predicted_buys)
    buy_count = len(predicted_buys['output_profit'])
    stop_loss_share = get_loss_limit_pct(predicted_buys)
    score = get_performance_score(profit_stats['trimmed_average_profit'], buy_count)

    # Key order determines the column order of the results table.
    return {
        'performance_score': score,
        **profit_stats,
        'prediction_is_buy_count': buy_count,
        'loss_limit_reached_pct': stop_loss_share,
        'market_rate': market_rate,
        **confusion,
        'winning_rate_vs_market': confusion['winning_rate'] - market_rate,
    }

In [10]:
from itertools import product

# Run state shared with the search below: `i` counts the grid combinations processed so far,
# `results` collects one metrics dict per evaluated hyperparameter set.
i = 0
results = []

def objective(hyperparams):
    """Hyperopt objective: run the full pipeline once and return the negated score.

    `hyperparams` is modified in place: the scalar 'thresholds' and
    'rank_pct_thresholds' samples are wrapped in one-element lists, the form the
    pipeline expects. The metrics of the run are printed and appended to the
    module-level `results`.

    Returns:
        -performance_score, so that minimising the loss maximises the score.
    """
    hyperparams.update(
        thresholds=[hyperparams['thresholds']],
        rank_pct_thresholds=[hyperparams['rank_pct_thresholds']],
    )

    df_data = get_df_data(hyperparams)
    test_train_data, model = load_model(df_data, hyperparams)
    performance_metrics = evaluate_model(df_data, model, test_train_data, hyperparams)

    result = {**performance_metrics, **hyperparams, 'epochs': epochs}
    print(result)
    results.append(result)

    return -result['performance_score']

# Run the search: either hyperopt's TPE over `search_space`, or an exhaustive
# sweep over the cartesian product of `param_grid`.
if use_hyperopt:
    best = fmin(objective, search_space, algo=tpe.suggest, max_evals=hyperopt_n_iterations)
    print(f'best parameters: {best}')
else:
    # product() yields one tuple per combination, ordered like param_grid's keys.
    for params in product(*param_grid.values()):
        i += 1
        # clear_output(wait=True) # clear printed outputs
        hf.print_combination(i, num_combinations)

        hyperparams = dict(zip(param_grid.keys(), params))

        # Same pipeline as `objective`, minus the list-wrapping: grid values are already lists.
        df_data = get_df_data(hyperparams)
        test_train_data, model = load_model(df_data, hyperparams)
        performance_metrics = evaluate_model(df_data, model, test_train_data, hyperparams)

        # One row of the results table: metrics first, then the hyperparameters used.
        result = {**performance_metrics, **hyperparams, 'epochs': epochs}
        print(result)
        results.append(result)


step: 1/9
need to create ./outputs/classifier_transformed_data.pkl
saved new ./outputs/classifier_transformed_data.pkl
number of elements in y_train: 487242
number of elements in y_test: 60000
need to create ./outputs/classifier_model.keras
Epoch 1/3


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m15227/15227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 600us/step - accuracy: 0.5411 - loss: 0.9325 - val_accuracy: 0.5515 - val_loss: 0.6874
Epoch 2/3
[1m15227/15227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 564us/step - accuracy: 0.5536 - loss: 0.9040 - val_accuracy: 0.5293 - val_loss: 0.6911
Epoch 3/3
[1m15227/15227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 542us/step - accuracy: 0.5563 - loss: 0.9028 - val_accuracy: 0.5231 - val_loss: 0.6927
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 231us/step
market cumulative % per class: [0.36178333 1.        ]
{'performance_score': 103.63660831470132, 'trimmed_average_profit': 1.003002647356023, 'average_profit': 1.0070697782765758, 'median_profit': 1.0057094537550837, 'prediction_is_buy_count': 670, 'loss_limit_reached_pct': 0.14029850746268657, 'market_rate': 0.36178333333333335, 'true_positives': 360, 'true_negatives': 37983, 'false_positives': 310, 'false_negatives': 21347,

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m15227/15227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 562us/step - accuracy: 0.5383 - loss: 0.9364 - val_accuracy: 0.5582 - val_loss: 0.6850
Epoch 2/3
[1m15227/15227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 563us/step - accuracy: 0.5503 - loss: 0.9051 - val_accuracy: 0.5577 - val_loss: 0.6830
Epoch 3/3
[1m15227/15227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 570us/step - accuracy: 0.5584 - loss: 0.9030 - val_accuracy: 0.5633 - val_loss: 0.6840
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 238us/step
market cumulative % per class: [0.36093333 1.        ]
{'performance_score': 103.26899701379142, 'trimmed_average_profit': 1.0025572345555047, 'average_profit': 1.0066703212626047, 'median_profit': 1.0045860579245456, 'prediction_is_buy_count': 647, 'loss_limit_reached_pct': 0.17619783616692428, 'market_rate': 0.36093333333333333, 'true_positives': 336, 'true_negatives': 38033, 'false_positives': 311, 'false_negatives': 21320,

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m15227/15227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 591us/step - accuracy: 0.5371 - loss: 0.9350 - val_accuracy: 0.5128 - val_loss: 0.6979
Epoch 2/3
[1m15227/15227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 589us/step - accuracy: 0.5497 - loss: 0.9085 - val_accuracy: 0.5360 - val_loss: 0.6892
Epoch 3/3
[1m15227/15227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 563us/step - accuracy: 0.5584 - loss: 0.9056 - val_accuracy: 0.5161 - val_loss: 0.6912
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 233us/step
market cumulative % per class: [0.35955 1.     ]
{'performance_score': 103.58466357522789, 'trimmed_average_profit': 1.0029397929500428, 'average_profit': 1.0069844858812764, 'median_profit': 1.0047421764487172, 'prediction_is_buy_count': 446, 'loss_limit_reached_pct': 0.21973094170403587, 'market_rate': 0.35955, 'true_positives': 231, 'true_negatives': 38212, 'false_positives': 215, 'false_negatives': 21342, 'winning_rate': 

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m15227/15227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 572us/step - accuracy: 0.5365 - loss: 0.9448 - val_accuracy: 0.5146 - val_loss: 0.6905
Epoch 2/3
[1m15227/15227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 571us/step - accuracy: 0.5492 - loss: 0.9122 - val_accuracy: 0.5202 - val_loss: 0.6940
Epoch 3/3
[1m15227/15227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 573us/step - accuracy: 0.5488 - loss: 0.9102 - val_accuracy: 0.5473 - val_loss: 0.6901
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 233us/step
market cumulative % per class: [0.35756667 1.        ]
{'performance_score': 101.66200008609323, 'trimmed_average_profit': 1.0005936941524995, 'average_profit': 1.003849204467717, 'median_profit': 1.0005110025117863, 'prediction_is_buy_count': 1114, 'loss_limit_reached_pct': 0.18850987432675045, 'market_rate': 0.35756666666666664, 'true_positives': 529, 'true_negatives': 37961, 'false_positives': 585, 'false_negatives': 20925,

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m15227/15227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 583us/step - accuracy: 0.5349 - loss: 0.9480 - val_accuracy: 0.5258 - val_loss: 0.6945
Epoch 2/3
[1m15227/15227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 603us/step - accuracy: 0.5425 - loss: 0.9178 - val_accuracy: 0.5535 - val_loss: 0.6842
Epoch 3/3
[1m15227/15227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 567us/step - accuracy: 0.5499 - loss: 0.9159 - val_accuracy: 0.5418 - val_loss: 0.6868
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 238us/step
market cumulative % per class: [0.35401667 1.        ]
{'performance_score': nan, 'trimmed_average_profit': nan, 'average_profit': 1.0670573162332917, 'median_profit': 1.0609344052345988, 'prediction_is_buy_count': 4, 'loss_limit_reached_pct': 0.0, 'market_rate': 0.35401666666666665, 'true_positives': 4, 'true_negatives': 38759, 'false_positives': 0, 'false_negatives': 21237, 'winning_rate': 1.0, 'winning_rate_vs_market': 0.6

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m15227/15227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 589us/step - accuracy: 0.5286 - loss: 0.9576 - val_accuracy: 0.5064 - val_loss: 0.7050
Epoch 2/3
[1m15227/15227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 587us/step - accuracy: 0.5403 - loss: 0.9270 - val_accuracy: 0.5160 - val_loss: 0.6905
Epoch 3/3
[1m15227/15227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 655us/step - accuracy: 0.5457 - loss: 0.9239 - val_accuracy: 0.5769 - val_loss: 0.6814
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 238us/step
market cumulative % per class: [0.34875 1.     ]
{'performance_score': 105.29473982030133, 'trimmed_average_profit': 1.0049946847416886, 'average_profit': 1.009142244940025, 'median_profit': 1.003188258310416, 'prediction_is_buy_count': 372, 'loss_limit_reached_pct': 0.29301075268817206, 'market_rate': 0.34875, 'true_positives': 187, 'true_negatives': 38890, 'false_positives': 185, 'false_negatives': 20738, 'winning_rate': 0

In [None]:
# Lift pandas' display limits so the whole results table is rendered.
# These options stay in effect for the rest of the session.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# One row per evaluated hyperparameter set, best performance score first.
df_results = pd.DataFrame(results)
df_results = df_results.sort_values(by='performance_score', ascending=False)
df_results.head(1000)

In [None]:
# Persist the sorted results table; the file name is prefixed with hf.get_date().
df_results.to_excel(f'./outputs/{hf.get_date()}_classifier_results.xlsx')