Classify a sample as True if the highest price during the next n days reaches the target threshold.

In [1]:
import utils.helper_functions as hf
import pandas as pd

# Path of the pickled price table read with pd.read_pickle in the next cell.
db_file_path = './db/ohlcv_ntickers_1254_2000-08-01_to_2023-12-23.pkl'

# Date handed to hf.get_rows_after_date to restrict the rows that are used.
start_date = '2013-01-01'

# Column selected from the price table as the buy-side price (feature source).
buying_time = 'Open'
# Column selected from the price table as the sell-side price (label source).
selling_time = 'High'
# Look-ahead horizon; passed as future_days when the label is computed.
target_future_days = 5
# Thresholds handed to hf.classify_var; the network's output layer has
# len(thresholds) + 1 units.
thresholds = [1.1, 1.05, 1.01, 1]
# prob_0 + prob_1 must exceed this value for a sample to count as a predicted buy.
cumulated_probs_target = 0.5

In [2]:
# Load the pickled price table and keep the rows hf.get_rows_after_date
# selects for start_date.
# NOTE(review): pd.read_pickle can execute arbitrary code while loading;
# only point db_file_path at files from a trusted source.
df = hf.get_rows_after_date(pd.read_pickle(db_file_path), start_date)

# Buy-side and sell-side price tables, each passed through
# hf.remove_top_column_name after selecting the configured column.
df_buy = hf.remove_top_column_name(df[[buying_time]])
df_sell = hf.remove_top_column_name(df[[selling_time]])

df_buy.tail(5)

Unnamed: 0_level_0,1CALL.MI,2020.OL,5PG.OL,A2A.MI,A3M.MC,AAK.ST,AALB.AS,AB.PA,ABB.ST,ABCA.PA,...,XXL.OL,YAR.OL,YEXR.MC,YIPS.MC,YIV.MI,ZAL.OL,ZAP.OL,ZEAL.CO,ZENA.OL,ZV.MI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-07-24 00:00:00,,97.0,2.08,1.682,3.56,201.800003,38.959999,3.98,418.899994,5.86,...,1.75,402.700012,0.65,1.3,0.022,42.0,29.32,238.0,0.0588,15.36
2023-07-25 00:00:00,,98.75,2.09,1.7095,3.6,201.399994,39.450001,4.19,420.399994,5.95,...,1.71,406.700012,0.65,1.3,0.0216,42.0,29.299999,231.800003,0.062,15.36
2023-07-26 00:00:00,,99.699997,2.085,1.6995,3.61,204.600006,39.490002,4.04,421.899994,5.92,...,1.71,411.399994,0.65,1.3,0.0216,42.0,28.9,225.399994,0.062,15.68
2023-07-27 00:00:00,,100.400002,2.12,1.71,3.638,205.0,39.619999,4.07,419.799988,5.9,...,1.71,411.799988,0.65,1.3,0.0216,42.0,29.66,221.0,0.059,16.16
2023-07-28 00:00:00,,101.599998,2.085,1.71,3.65,204.399994,40.599998,3.975,425.0,5.9,...,1.642,413.700012,0.65,1.3,0.0224,44.0,30.68,220.199997,0.0586,16.6


In [3]:
def calculate_var(df, past_days, future_days):
    """Compute variations with ``hf.calculate_variations`` and stack them.

    Args:
        df: Price table forwarded to ``hf.calculate_variations``.
        past_days: Look-back argument forwarded to the helper.
        future_days: Look-ahead argument forwarded to the helper.

    Returns:
        The result of ``hf.stack`` applied to the variations, labelled
        ``var_past_{past_days}d_future_{future_days}d``.
    """
    label = f'var_past_{past_days}d_future_{future_days}d'
    return hf.stack(hf.calculate_variations(df, past_days, future_days), label)

def min_max_var(df, past_days):
    """Relate each value to its rolling minimum and rolling maximum.

    The rolling window spans ``past_days + 1`` rows; ``min_periods=1`` lets
    the first rows use however many observations are available.

    Args:
        df: pandas DataFrame of prices.
        past_days: Number of rows before the current one to include.

    Returns:
        Tuple ``(min_ratio, max_ratio)``: ``df`` divided by its rolling
        minimum and by its rolling maximum, each passed through ``hf.stack``
        with the labels ``min_var_past_{past_days}d`` and
        ``max_var_past_{past_days}d``.
    """
    window = df.rolling(window=past_days + 1, min_periods=1)

    ratio_to_min = hf.stack(df / window.min(), f'min_var_past_{past_days}d')
    ratio_to_max = hf.stack(df / window.max(), f'max_var_past_{past_days}d')

    return ratio_to_min, ratio_to_max

def days_since_min_max(df, past_days):
    """Stack the outputs of ``hf.get_days_since_min`` / ``hf.get_days_since_max``.

    Args:
        df: Price table forwarded to both helpers.
        past_days: Look-back argument forwarded to both helpers.

    Returns:
        Tuple ``(since_min, since_max)`` of stacked results, labelled
        ``days_since_min_{past_days}d`` and ``days_since_max_{past_days}d``.
    """
    since_min = hf.stack(
        hf.get_days_since_min(df, past_days), f'days_since_min_{past_days}d')
    since_max = hf.stack(
        hf.get_days_since_max(df, past_days), f'days_since_max_{past_days}d')

    return since_min, since_max

def get_volatility(df, past_days):
    """Compute volatility with ``hf.calculate_volatility`` and stack it.

    Args:
        df: Price table forwarded to ``hf.calculate_volatility``.
        past_days: Look-back argument forwarded to the helper.

    Returns:
        The result of ``hf.stack`` applied to the volatility, labelled
        ``volatility_{past_days}d``.
    """
    return hf.stack(hf.calculate_volatility(df, past_days), f'volatility_{past_days}d')

def classify_var(df_var, thresholds, col_name):
    """Classify variations against ``thresholds`` and return them stacked.

    Args:
        df_var: Variations forwarded to ``hf.classify_var``.
        thresholds: Threshold values forwarded to ``hf.classify_var``.
        col_name: Label given to ``hf.stack`` for the resulting column.

    Returns:
        The stacked classification with its innermost index level dropped.
    """
    stacked_classes = hf.stack(hf.classify_var(df_var, thresholds), col_name)
    # Stacking leaves one index level too many here; remove the innermost one.
    return stacked_classes.droplevel(level=-1)

In [4]:
# Past-only variations of the buy price over several look-back windows.
var_30, var_10, var_5, var_2, var_1 = (
    calculate_var(df_buy, past_days=days, future_days=0)
    for days in (30, 10, 5, 2, 1)
)

# Buy price relative to its rolling minimum / maximum.
(
    (min_var_30, max_var_30),
    (min_var_10, max_var_10),
    (min_var_5, max_var_5),
    (min_var_2, max_var_2),
) = (min_max_var(df_buy, past_days=days) for days in (30, 10, 5, 2))

(
    (days_since_min_30, days_since_max_30),
    (days_since_min_10, days_since_max_10),
) = (days_since_min_max(df_buy, past_days=days) for days in (30, 10))

volatility_30, volatility_10, volatility_2 = (
    get_volatility(df_buy, past_days=days) for days in (30, 10, 2)
)

# Label: forward variation of the sell-side price, bucketed by `thresholds`.
# NOTE(review): the label is built from df_sell only; confirm that
# hf.calculate_variations yields the intended buy-price-to-future-high ratio.
buy_var = calculate_var(df_sell, past_days=0, future_days=target_future_days)
buy_class = classify_var(buy_var, thresholds, 'buy_class')

# Align all features and the label side by side, then discard every row
# that has a missing value in any column.
input_output_df = pd.concat(
    [
        var_30, var_10, var_5, var_2, var_1,
        min_var_30, min_var_10, min_var_5, min_var_2,
        max_var_30, max_var_10, max_var_5, max_var_2,
        days_since_min_30, days_since_min_10,
        days_since_max_30, days_since_max_10,
        volatility_30, volatility_10, volatility_2,
        buy_class,
    ],
    axis='columns',
).dropna()

# Every column whose name does not start with 'buy' is a model input.
input_columns = [name for name in input_output_df.columns if not name.startswith('buy')]
input_df = input_output_df[input_columns]
output_df = input_output_df.loc[:, ['buy_class']]

input_output_df.tail(5)

KeyboardInterrupt: 

In [None]:
# Share of samples per class in percent, ordered by class label, plus the
# running total across classes.
value_counts = output_df['buy_class'].value_counts()
percentages = (value_counts / len(output_df) * 100).sort_index()
cumulative_percentages = percentages.cumsum()

print(f'''
Percentage of each class:
{percentages}

Cumulative percentages:
{cumulative_percentages}''')

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from collections import Counter

# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# mean/variance of the held-out rows leak into the training features.
# NOTE(review): rows are shuffled across dates and tickers; if neighbouring
# days share information, a chronological split may give a more honest score.
X_raw = input_df.to_numpy()
y_all = output_df.values.ravel()

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y_all, train_size=0.8, test_size=0.2, random_state=42)
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# Scaling statistics come from the training rows only and are then applied
# unchanged to every other subset.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so that any later cell relying on the full scaled matrix still works.
X_all = scaler.transform(X_raw)

# One output unit per class: len(thresholds) buckets plus the remainder.
last_layers_size = len(thresholds) + 1

model = Sequential()

model.add(Dense(128, input_shape=(X_train.shape[1],), activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(256, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(last_layers_size, activation='softmax'))

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Weight each class inversely to its frequency in the training labels so the
# most frequent class gets weight 1 and rarer classes proportionally more.
counter = Counter(y_train)
max_count = max(counter.values())
class_weights = {cls: max_count / count for cls, count in counter.items()}

# The held-out split doubles as validation data during fitting.
model.fit(X_train, y_train, epochs=2, batch_size=32, validation_data=(X_test, y_test), class_weight=class_weights)

In [None]:
# Class probabilities for the held-out samples, one column per class.
y_prediction = model.predict(X_test)

# Derive the column names from the prediction width instead of hard-coding
# five of them, so a different number of thresholds does not break the frame.
prob_columns = [f'prob_{class_idx}' for class_idx in range(y_prediction.shape[1])]
df_prediction = pd.DataFrame(y_prediction, columns=prob_columns)

# Put the true class next to the predicted probabilities (rows align by position).
df_test = pd.DataFrame({'real_class': y_test})
df_comparison = pd.concat([df_prediction, df_test], axis=1)

In [None]:
# A sample is a predicted buy when the combined probability of classes 0 and 1
# exceeds the configured target; it is a real buy when its true class is 0 or 1.
df_comparison['cumulated_probs'] = df_comparison['prob_0'] + df_comparison['prob_1']
df_comparison['predicted_true'] = df_comparison['cumulated_probs'] > cumulated_probs_target
df_comparison['real_true'] = df_comparison['real_class'] <= 1

predicted_true = df_comparison['predicted_true']
real_true = df_comparison['real_true']

# Confusion-matrix counts, computed directly from the boolean masks.
tp = (real_true & predicted_true).sum()
tn = (~real_true & ~predicted_true).sum()
fp = (~real_true & predicted_true).sum()
fn = (real_true & ~predicted_true).sum()

print(f"True Positives (TP), Correctly bought, earned money: {tp}")
print(f"True Negatives (TN), Correctly not bought: {tn}")
print(f"False Positives (FP), Incorrectly bought, may have lost money : {fp}")
print(f"False Negatives (FN), Missed buying opportunity: {fn}")

# Precision of the buy signal. Guard against the case where the model never
# predicts a buy, which would otherwise divide by zero.
bought = tp + fp
if bought:
    winning_rate = tp / bought
    print(f'Winning rate: {round(winning_rate * 100, 2)} %')
else:
    winning_rate = float('nan')
    print('Winning rate: n/a (no sample was predicted as a buy)')

In [None]:
# buying_time = 'Open' # 'Open' or 'Close'
# target_future_days = 5
# thresholds = [1.1, 1.05, 1.01, 1]
# cumulated_probs_target = 0.5

# True Positives (TP), Correctly bought, earned money: 160
# True Negatives (TN), Correctly not bought: 222576
# False Positives (FP), Incorrectly bought, may have lost money : 251
# False Negatives (FN), Missed buying opportunity: 136939
# Winning rate: 38.93 %

# buying_time = 'Open'
# target_future_days = 10
# thresholds = [1.1, 1.05, 1.01, 1]
# cumulated_probs_target = 0.6

# True Positives (TP), Correctly bought, earned money: 6712
# True Negatives (TN), Correctly not bought: 273724
# False Positives (FP), Incorrectly bought, may have lost money : 14081
# False Negatives (FN), Missed buying opportunity: 63609
# Winning rate: 32.28 %