In [21]:
import pandas as pd

from StateMachines.TradingStateMachine import TradingStateMachine
from StrategyBuilders import getStrategyBuilder
from Strategies.AbstractTradingStrategy import AbstractTradingStrategy

In [22]:
folder = "./Data/bybit"
folder_lab1 = "./Data/lab1"
file = "ETH-USDT:USDT_1m.csv"
periods = [5, 10, 15, 20, 30, 60, 120]

In [23]:
folder = "./Data/gemini"
folder_lab1 = "./Data/lab1"
file = "gemini_1m_features.csv"


In [24]:

strategy_builder = getStrategyBuilder('SIMPLE')
trader: TradingStateMachine = TradingStateMachine(strategy_builder("gemini_btcusd_1m"))

df = pd.read_csv(f"{folder}/{file}")
len(df)

3680586

In [25]:
period = 30
periods = [period]
first_year = 2017

In [26]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['year_'] = df['timestamp'].dt.year

features_base = ['timestamp', 'close', 'year_']
columns = features_base + AbstractTradingStrategy.get_features_list(df)
features_stochrsi = [col for col in columns if 'stoch_rsi' in col and 'change_of_trend' not in col and col[-1] == '_']
features_hma = [col for col in columns if 'hma' in col and 'change_of_trend' not in col and col[-1] == '_']
features_macd = [col for col in columns if 'macd' in col and 'change_of_trend' not in col and col[-1] == '_']


In [27]:
def get_best_group(df, features, period, sort='mean', max=5):
    
    # aggregations = {
    #     'count': (column, 'count'),
    #     'min': (column, 'min'),
    #     'max': (column, 'max')
    # }

# Perform the aggregations
# result = df.agg(**aggregations)    
    column = f'close_p_in_{period}'
    # grouped_df = df.groupby(features)[f'close_p_in_{period}'].agg(['count', 'mean', 'min', 'max'])
    grouped_df = df.groupby(features)[column].agg(['count', 'mean', 'min', 'max'])
    grouped_df = grouped_df.sort_values(by=sort, ascending=False)
    return grouped_df.head(max)


def convert_grouped_to_dataframe(grouped_df):
    df = grouped_df.reset_index()
    return df

# import pandas as pd

def append_dataframes(df1, df2):
    # Concatenate the dataframes
    df = pd.concat([df1, df2], ignore_index=True)
    return df

we want to work on the history 1m from bitcoin (gemini)

Dataframe must be indexed by timestamp

In [28]:
df = df[columns]
df = df.set_index('timestamp')

df.tail(5)

Unnamed: 0_level_0,close,year_,stoch_rsi_overbought_,stoch_rsi_oversold_,stoch_rsi_bullish_crossover_,stoch_rsi_bearish_crossover_,stoch_rsi_trend_,stoch_rsi_change_of_trend_,hma200_trend_,hma200_change_of_trend_,hma200_above_price_,macd_bullish_crossover_,macd_bearish_crossover_,macd_positive_,macd_trend_,macd_change_of_trend_,year_
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2024-04-07 00:24:00,69046.77,2024,False,False,False,False,Downtrend,,Downtrend,,True,False,False,True,Uptrend,,2024
2024-04-07 00:25:00,68944.63,2024,False,True,False,False,Downtrend,,Downtrend,,True,False,True,False,Uptrend,,2024
2024-04-07 00:26:00,68942.84,2024,False,True,False,False,Downtrend,,Downtrend,,True,False,False,False,Downtrend,Uptrend->Downtrend,2024
2024-04-07 00:27:00,68947.39,2024,False,True,False,False,Downtrend,,Downtrend,,True,False,False,False,Downtrend,,2024
2024-04-07 00:28:00,68890.02,2024,False,True,True,False,Downtrend,,Downtrend,,True,False,False,False,Downtrend,,2024


Now we calculate the result for different number of intervals

In [29]:
# adding shifted periods results
features_close = [f'close_in_{period}' for period in periods]
features_close_perc = [f'close_p_in_{period}' for period in periods]
for period in periods:
    column_name = f'close_in_{period}'
    df[column_name] = df['close'].shift(-period)
    column_perc = f'close_p_in_{period}'
    df[column_perc] = ((df['close'].shift(-period) - df['close']) / df['close']) * 100
    
df.to_csv(f"{folder_lab1}/{file}_periods.csv")
df.columns

Index(['close', 'year_', 'stoch_rsi_overbought_', 'stoch_rsi_oversold_',
       'stoch_rsi_bullish_crossover_', 'stoch_rsi_bearish_crossover_',
       'stoch_rsi_trend_', 'stoch_rsi_change_of_trend_', 'hma200_trend_',
       'hma200_change_of_trend_', 'hma200_above_price_',
       'macd_bullish_crossover_', 'macd_bearish_crossover_', 'macd_positive_',
       'macd_trend_', 'macd_change_of_trend_', 'year_', 'close_in_30',
       'close_p_in_30'],
      dtype='object')

we classify the features based on the result for the given number of intervals

In [30]:
# period = 30
x = df[df[f'close_p_in_{period}'] > 1]
x[['year_', 'close', f'close_in_{period}', f'close_p_in_{period}']].head(10)

Unnamed: 0_level_0,year_,year_,close,close_in_30,close_p_in_30
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-01 16:11:00,2017,2017,982.99,994.95,1.216696
2017-01-01 16:12:00,2017,2017,982.99,996.32,1.356067
2017-01-01 16:13:00,2017,2017,982.99,996.32,1.356067
2017-01-01 16:14:00,2017,2017,982.99,1000.01,1.731452
2017-01-01 16:15:00,2017,2017,982.99,1000.0,1.730435
2017-01-01 16:16:00,2017,2017,982.97,999.99,1.731487
2017-01-01 16:17:00,2017,2017,982.51,996.71,1.445278
2017-01-01 16:18:00,2017,2017,979.34,998.7,1.976842
2017-01-01 16:19:00,2017,2017,979.34,998.7,1.976842
2017-01-01 16:20:00,2017,2017,982.5,998.7,1.648855


In [31]:
# Create a column variable that is the combination of all the features_ lists
all_features = features_stochrsi + features_hma + features_macd

# removing likely not valid rows
features = all_features + [f'close_p_in_{period}']
new_df = df[features].iloc[200:-period]

len(new_df)


3680356

We reduce the dataframe to the last 4 years

In [32]:

new_df = new_df[new_df.index.year >= first_year]
len(new_df)

3680356

In [33]:
# import pandas as pd

# # Assuming `df` is your DataFrame with OHLC data, features, and a 'future_return' column 
# # that represents the percent increase after N periods.

# # Calculate Pearson correlation coefficients
# correlation_matrix = new_df.corr()
# performance_correlations = correlation_matrix[f'close_p_in_{period}'].sort_values(ascending=False)

# print(performance_correlations)

1. Chi-Square Test for Independence
The Chi-Square test is useful for testing the relationship between two categorical variables. It can tell you whether the distribution of sample categorical data matches an expected distribution.

Use Case: To determine if there is a significant association between two categorical features.

In [54]:
import numpy as np
from scipy.stats import chi2_contingency

# Assuming df is your DataFrame and 'feature' and 'target' are your categorical columns

results = []
print(new_df.columns)

new_df['target'] = np.where(new_df[f'close_p_in_{period}'] > 0, 'Increase', 'Decrease')
for feature in all_features:
    contingency_table = pd.crosstab(new_df[feature], new_df['target'])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    results.append({'feature': feature, 'chi2': chi2, 'p-value': p})

# Sort the results by chi2 in descending order
results = sorted(results, key=lambda x: x['chi2'], reverse=True)

# Display the sorted results
for result in results:
    print(f"Feature: {result['feature']}, Chi-Square Value: {result['chi2']}, P-Value: {result['p-value']}")



Index(['stoch_rsi_overbought_', 'stoch_rsi_oversold_',
       'stoch_rsi_bullish_crossover_', 'stoch_rsi_bearish_crossover_',
       'stoch_rsi_trend_', 'hma200_trend_', 'hma200_above_price_',
       'macd_bullish_crossover_', 'macd_bearish_crossover_', 'macd_positive_',
       'macd_trend_', 'close_p_in_30'],
      dtype='object')
Feature: hma200_trend_, Chi-Square Value: 12341.981646970511, P-Value: 0.0
Feature: macd_trend_, Chi-Square Value: 6317.035989819508, P-Value: 0.0
Feature: hma200_above_price_, Chi-Square Value: 4403.817489622915, P-Value: 0.0
Feature: macd_positive_, Chi-Square Value: 935.2414216253367, P-Value: 2.142499522495479e-205
Feature: stoch_rsi_oversold_, Chi-Square Value: 591.6792414129565, P-Value: 1.0805656720444698e-130
Feature: stoch_rsi_overbought_, Chi-Square Value: 309.20155481951434, P-Value: 3.259577502263205e-69
Feature: stoch_rsi_trend_, Chi-Square Value: 18.357771462497553, P-Value: 0.00010319545658255706
Feature: macd_bearish_crossover_, Chi-Square Va

In [61]:
cluster1 = ['target','hma200_trend_','hma200_above_price_','macd_trend_','macd_positive_']
best_df = get_best_group(new_df, cluster1, period)

save_model = True
if save_model:
    print("saving model")
    x = convert_grouped_to_dataframe(best_df)
    x = x[cluster1 ]
    x['signal'] = 'Buy'
    x.to_csv(f"./simple_model_buy.csv")

best_df

saving model


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,count,mean,min,max
target,hma200_trend_,hma200_above_price_,macd_trend_,macd_positive_,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Increase,Not trending,True,Not trending,False,65,1.288211,0.000792,3.21199
Increase,Not trending,False,Not trending,False,195,1.079443,0.000239,3.898413
Increase,Not trending,False,Not trending,True,81,0.713434,0.000781,1.397711
Increase,Downtrend,False,Not trending,False,42,0.423919,0.000785,1.061973
Increase,Uptrend,False,Uptrend,True,268179,0.373992,1.9e-05,12.471277


In [49]:
cluster2 = cluster1 + ['stoch_rsi_oversold_','stoch_rsi_overbought_']
# cluster3 = cluster2 + ['stoch_rsi_overbought','macd_bullish_crossover_positive']
get_best_group(new_df, cluster2, period)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,count,mean,min,max
hma200_trend_,hma200_above_price_,macd_trend_,macd_positive_,stoch_rsi_oversold_,stoch_rsi_overbought_,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Not trending,False,Not trending,False,False,True,125,0.32984,-3.299736,3.890222
Not trending,True,Not trending,False,False,True,121,0.251054,-1.555125,2.996078
Not trending,False,Downtrend,True,True,False,3,0.240472,0.002172,0.490865
Not trending,True,Uptrend,False,False,False,8,0.213927,-0.130851,1.570665
Not trending,True,Not trending,False,True,False,100,0.206022,-0.62586,2.463981


In [52]:
column = f'close_p_in_{period}'

# Aggregations for count, min, and max
aggregations = {
    'count': (column, 'count'),
    'min': (column, 'min'),
    'max': (column, 'max')
}

# Perform the aggregations
result = df.agg(**aggregations)

# Count values greater than 0
greater_than_zero_count = (df[column] > 0).sum()

# Count values less than 0
less_than_zero_count = (df[column] < 0).sum()

# Adding these counts to the result
result['greater_than_zero'] = greater_than_zero_count
result['less_than_zero'] = less_than_zero_count

result


Unnamed: 0,close_p_in_30,greater_than_zero,less_than_zero
count,3680556.0,1859027,1778840
min,-38.86764,1859027,1778840
max,35.30435,1859027,1778840


In [50]:

get_best_group(new_df, features_stochrsi, period)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,count,mean,min,max
stoch_rsi_overbought_,stoch_rsi_oversold_,stoch_rsi_bullish_crossover_,stoch_rsi_bearish_crossover_,stoch_rsi_trend_,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,False,True,False,Not trending,189,0.085468,-2.182746,6.968734
True,False,False,False,Not trending,56546,0.054255,-9.628426,8.9701
True,False,True,False,Not trending,682,0.026602,-5.158465,4.82753
True,False,False,True,Not trending,2101,0.025256,-6.834781,5.143171
False,True,True,False,Uptrend,46025,0.013112,-38.732548,12.385252


In [38]:

df_result = get_best_group(new_df, all_features, period, max=10, sort='mean')
df_result

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,count,mean,min,max
stoch_rsi_overbought_,stoch_rsi_oversold_,stoch_rsi_bullish_crossover_,stoch_rsi_bearish_crossover_,stoch_rsi_trend_,hma200_trend_,hma200_above_price_,macd_bullish_crossover_,macd_bearish_crossover_,macd_positive_,macd_trend_,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
False,True,False,True,Uptrend,Uptrend,False,True,False,True,Downtrend,2,1.193243,0.106865,2.279621
False,True,True,False,Not trending,Not trending,False,False,False,False,Not trending,1,1.079951,1.079951,1.079951
True,False,False,False,Not trending,Not trending,False,False,False,False,Not trending,28,0.785207,0.0,3.71822
True,False,False,False,Uptrend,Not trending,False,True,False,True,Downtrend,1,0.78096,0.78096,0.78096
False,True,True,False,Uptrend,Not trending,False,False,False,False,Not trending,5,0.774616,0.0,3.873082
False,False,False,False,Downtrend,Not trending,True,False,True,False,Uptrend,2,0.758099,-0.054467,1.570665
True,False,True,False,Not trending,Downtrend,False,False,False,False,Downtrend,1,0.711628,0.711628,0.711628
True,False,False,True,Downtrend,Uptrend,False,True,False,True,Uptrend,1,0.654911,0.654911,0.654911
False,False,True,False,Downtrend,Not trending,False,False,True,False,Uptrend,1,0.629819,0.629819,0.629819
False,False,True,False,Uptrend,Uptrend,False,False,False,False,Not trending,6,0.610113,-0.220254,2.672373


In [39]:

get_best_group(new_df, features_macd, period)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count,mean,min,max
macd_bullish_crossover_,macd_bearish_crossover_,macd_positive_,macd_trend_,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
False,True,False,Not trending,249,0.010716,-1.762366,2.398996
False,False,False,Downtrend,1616852,0.009558,-38.81211,22.190704
True,False,True,Downtrend,102042,0.007519,-38.86764,28.299313
False,True,False,Downtrend,36203,0.007215,-10.844406,14.257855
False,False,True,Downtrend,95154,0.006953,-17.231202,21.89007


In [40]:

get_best_group(new_df, features_stochrsi+features_macd, period)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,count,mean,min,max
stoch_rsi_overbought_,stoch_rsi_oversold_,stoch_rsi_bullish_crossover_,stoch_rsi_bearish_crossover_,stoch_rsi_trend_,macd_bullish_crossover_,macd_bearish_crossover_,macd_positive_,macd_trend_,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
True,False,False,True,Downtrend,True,False,True,Uptrend,2,0.479425,0.30394,0.654911
False,True,True,False,Not trending,False,False,True,Downtrend,3,0.434968,0.111293,0.762364
False,True,True,False,Not trending,False,False,False,Not trending,4,0.356938,-0.087663,1.079951
False,True,False,True,Not trending,True,False,True,Uptrend,1,0.349005,0.349005,0.349005
False,True,True,False,Downtrend,False,True,False,Not trending,1,0.314345,0.314345,0.314345


In [41]:
folder_lab1 = "./Data/lab1"
features_to_test = features_stochrsi
for period in periods:
    print(f"Period: {period}")
    features = all_features + [f'close_p_in_{period}']
    new_df = df[features].iloc[50:-period]
    result = get_best_group(new_df, features_to_test, period, 'count')
    result.to_csv(f"{folder_lab1}/{file}_stochrsi_{period}.csv")
    print(result)
    
    # print(get_best_group(new_df, features_hma, period))
    # print(get_best_group(new_df, features_macd, period, 'count'))
    # print(get_best_group(new_df, features_stochrsi+features_macd, period))
    print("")

Period: 30
                                                                                                                       count  \
stoch_rsi_overbought_ stoch_rsi_oversold_ stoch_rsi_bullish_crossover_ stoch_rsi_bearish_crossover_ stoch_rsi_trend_           
True                  False               False                        False                        Uptrend           692148   
False                 True                False                        False                        Downtrend         675265   
                      False               False                        False                        Downtrend         570458   
                                                                                                    Uptrend           532496   
True                  False               False                        False                        Downtrend         173534   

                                                                                            

i want to find the best combination of features, maximising the results

now we find the combo of features that are performing better

In [42]:
import itertools

folder_lab1 = "./Data/lab1"
file = "gemini_1m_features.csv"
# features_to_test = features_stochrsi 
features_to_test = features_stochrsi 

combinations_stochrsi = []
for r in range(1, len(features_stochrsi) + 1):
    combinations_stochrsi.extend(itertools.combinations(features_stochrsi, r))
print(len(combinations_stochrsi)) 

combinations_hma = []
for r in range(1, len(features_hma) + 1):
    combinations_hma.extend(itertools.combinations(features_hma, r))
print(len(combinations_hma)) 

combinations_macd = []
for r in range(1, len(features_macd) + 1):
    combinations_macd.extend(itertools.combinations(features_macd, r))
print(len(combinations_macd)) 

cartesian_product = list(itertools.product(combinations_stochrsi, combinations_hma, combinations_macd))
print(len(cartesian_product))


31
3
15
1395


In [43]:

folder_lab1 = "./Data/lab1"
file = "gemini_1m_features.csv"

features_to_test = features_stochrsi 



for period in periods:
    analysis_df = pd.DataFrame()
    print(f"Period: {period}")
    features = all_features + [f'close_p_in_{period}']
    new_df = df[features].iloc[50:-period]
    for combination in combinations_stochrsi:
        list_of_combination = list(combination)
        if len(list_of_combination) < 3:
            continue
        result = get_best_group(new_df, list_of_combination, period, 'count')
        tmp_df = convert_grouped_to_dataframe(result)
        tmp_df['period'] = period
        analysis_df = append_dataframes(tmp_df,analysis_df)
        # print(result)
        print("")
    analysis_df.to_csv(f"{folder_lab1}/{file}_stochrsi_analysis_{period}.csv")

Period: 30


















In [44]:

analysis_df = pd.read_csv(f"{folder_lab1}/{file}_stochrsi_analysis_{period}.csv")

# Filter the dataframe to include only rows where the count is greater than 1
filtered_df = analysis_df[analysis_df['count'] > 1]

# I want to minimise the losses
# filtered_df = filtered_df[filtered_df['mean'] > filtered_df['min'].abs()]

# Sort the dataframe by mean in descending order and by the difference between min and max in ascending order
sorted_df = filtered_df.sort_values(by=['mean', 'max', 'min'], ascending=[False, True, True])

sorted_df.head(5)


Unnamed: 0.1,Unnamed: 0,stoch_rsi_overbought_,stoch_rsi_oversold_,stoch_rsi_bullish_crossover_,stoch_rsi_bearish_crossover_,stoch_rsi_trend_,count,mean,min,max,period
29,29,False,True,True,False,,180565,0.011432,-38.732548,12.385252,30
49,49,,True,True,False,,180565,0.011432,-38.732548,12.385252,30
78,78,False,True,True,,,180565,0.011432,-38.732548,12.385252,30
44,44,,True,True,,Downtrend,132996,0.010983,-38.071665,11.32281,30
66,66,False,True,,,Downtrend,846836,0.010363,-38.81211,29.606152,30
