In [50]:
import pandas as pd

from StateMachines.TradingStateMachine import TradingStateMachine
from StrategyBuilders import getStrategyBuilder
from Strategies.AbstractTradingStrategy import AbstractTradingStrategy

In [51]:
folder = "./Data/bybit"
folder_lab1 = "./Data/lab1"
file = "ETH-USDT:USDT_1m.csv"
periods = [5, 10, 15, 20, 30, 60, 120]

In [52]:
folder = "./Data/gemini"
folder_lab1 = "./Data/lab1"
file = "gemini_1m_features.csv"
period = 10
periods = [period]


In [53]:

strategy_builder = getStrategyBuilder('SIMPLE')
trader: TradingStateMachine = TradingStateMachine(strategy_builder("gemini_btcusd_1m"))

df = pd.read_csv(f"{folder}/{file}")
len(df)

3680586

In [54]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['year_'] = df['timestamp'].dt.year

features_base = ['timestamp', 'close', 'year_']
columns = features_base + AbstractTradingStrategy.get_features_list(df)
features_stochrsi = [col for col in columns if 'stoch_rsi' in col and 'change_of_trend' not in col and col[-1] == '_']
features_hma = [col for col in columns if 'hma' in col and 'change_of_trend' not in col and col[-1] == '_']
features_macd = [col for col in columns if 'macd' in col and 'change_of_trend' not in col and col[-1] == '_']


In [55]:
def get_best_group(df, features, period, sort='mean', max=5):
    grouped_df = df.groupby(features)[f'close_p_in_{period}'].agg(['count', 'mean', 'min', 'max'])
    grouped_df = grouped_df.sort_values(by=sort, ascending=False)
    return grouped_df.head(max)


def convert_grouped_to_dataframe(grouped_df):
    df = grouped_df.reset_index()
    return df

# import pandas as pd

def append_dataframes(df1, df2):
    # Concatenate the dataframes
    df = pd.concat([df1, df2], ignore_index=True)
    return df

we want to work on the history 1m from bitcoin (gemini)

Dataframe must be indexed by timestamp

In [56]:
df = df[columns]
df = df.set_index('timestamp')

df.tail(5)

Unnamed: 0_level_0,close,year_,stoch_rsi_overbought_,stoch_rsi_oversold_,stoch_rsi_bullish_crossover_,stoch_rsi_bearish_crossover_,stoch_rsi_trend_,stoch_rsi_change_of_trend_,hma200_trend_,hma200_change_of_trend_,hma200_above_price_,macd_bullish_crossover_,macd_bearish_crossover_,macd_positive_,macd_trend_,macd_change_of_trend_,year_
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2024-04-07 00:24:00,69046.77,2024,False,False,False,False,Downtrend,,Downtrend,,True,False,False,True,Uptrend,,2024
2024-04-07 00:25:00,68944.63,2024,False,True,False,False,Downtrend,,Downtrend,,True,False,True,False,Uptrend,,2024
2024-04-07 00:26:00,68942.84,2024,False,True,False,False,Downtrend,,Downtrend,,True,False,False,False,Downtrend,Uptrend->Downtrend,2024
2024-04-07 00:27:00,68947.39,2024,False,True,False,False,Downtrend,,Downtrend,,True,False,False,False,Downtrend,,2024
2024-04-07 00:28:00,68890.02,2024,False,True,True,False,Downtrend,,Downtrend,,True,False,False,False,Downtrend,,2024


Now we calculate the result for different number of intervals

In [57]:
# adding shifted periods results
features_close = [f'close_in_{period}' for period in periods]
features_close_perc = [f'close_p_in_{period}' for period in periods]
for period in periods:
    column_name = f'close_in_{period}'
    df[column_name] = df['close'].shift(-period)
    column_perc = f'close_p_in_{period}'
    df[column_perc] = ((df['close'].shift(-period) - df['close']) / df['close']) * 100
    
df.to_csv(f"{folder_lab1}/{file}_periods.csv")
df.columns

Index(['close', 'year_', 'stoch_rsi_overbought_', 'stoch_rsi_oversold_',
       'stoch_rsi_bullish_crossover_', 'stoch_rsi_bearish_crossover_',
       'stoch_rsi_trend_', 'stoch_rsi_change_of_trend_', 'hma200_trend_',
       'hma200_change_of_trend_', 'hma200_above_price_',
       'macd_bullish_crossover_', 'macd_bearish_crossover_', 'macd_positive_',
       'macd_trend_', 'macd_change_of_trend_', 'year_', 'close_in_10',
       'close_p_in_10'],
      dtype='object')

we classify the features based on the result for the given number of intervals

In [73]:
period = 10
x = df[df[f'close_p_in_{period}'] > 1]
x[['year_', 'close', f'close_in_{period}', f'close_p_in_{period}']].head(10)

Unnamed: 0_level_0,year_,year_,close,close_in_10,close_p_in_10
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-01 16:31:00,2017,2017,984.95,994.95,1.01528
2017-01-01 16:34:00,2017,2017,985.54,1000.01,1.468231
2017-01-01 16:35:00,2017,2017,985.54,1000.0,1.467216
2017-01-01 16:36:00,2017,2017,985.54,999.99,1.466201
2017-01-01 16:37:00,2017,2017,985.54,996.71,1.133389
2017-01-02 13:33:00,2017,2017,1005.21,1017.54,1.226609
2017-01-02 13:34:00,2017,2017,1005.21,1017.54,1.226609
2017-01-02 13:37:00,2017,2017,1011.41,1022.82,1.128128
2017-01-02 20:15:00,2017,2017,995.1,1015.56,2.056075
2017-01-04 17:03:00,2017,2017,1101.55,1113.7,1.102991


In [59]:
# Create a column variable that is the combination of all the features_ lists
all_features = features_stochrsi + features_hma + features_macd

# removing likely not valid rows
features = all_features + [f'close_p_in_{period}']
new_df = df[features].iloc[50:-period]

len(new_df)


3680526

We reduce the dataframe to the last 4 years

In [60]:

new_df = new_df[new_df.index.year >= 2020]
len(new_df)

2181043

In [61]:
# import pandas as pd

# # Assuming `df` is your DataFrame with OHLC data, features, and a 'future_return' column 
# # that represents the percent increase after N periods.

# # Calculate Pearson correlation coefficients
# correlation_matrix = new_df.corr()
# performance_correlations = correlation_matrix[f'close_p_in_{period}'].sort_values(ascending=False)

# print(performance_correlations)

1. Chi-Square Test for Independence
The Chi-Square test is useful for testing the relationship between two categorical variables. It can tell you whether the distribution of sample categorical data matches an expected distribution.

Use Case: To determine if there is a significant association between two categorical features.

In [62]:
import numpy as np
from scipy.stats import chi2_contingency

# Assuming df is your DataFrame and 'feature' and 'target' are your categorical columns

results = []
print(new_df.columns)

new_df['target'] = np.where(new_df[f'close_p_in_{period}'] > 0, 'Increase', 'Decrease')
for feature in all_features:
    contingency_table = pd.crosstab(new_df[feature], new_df['target'])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    results.append({'feature': feature, 'chi2': chi2, 'p-value': p})

# Sort the results by chi2 in descending order
results = sorted(results, key=lambda x: x['chi2'], reverse=True)

# Display the sorted results
for result in results:
    print(f"Feature: {result['feature']}, Chi-Square Value: {result['chi2']}, P-Value: {result['p-value']}")



Index(['stoch_rsi_overbought_', 'stoch_rsi_oversold_',
       'stoch_rsi_bullish_crossover_', 'stoch_rsi_bearish_crossover_',
       'stoch_rsi_trend_', 'hma200_trend_', 'hma200_above_price_',
       'macd_bullish_crossover_', 'macd_bearish_crossover_', 'macd_positive_',
       'macd_trend_', 'close_p_in_10'],
      dtype='object')
Feature: hma200_trend_, Chi-Square Value: 5294.457175366875, P-Value: 0.0
Feature: hma200_above_price_, Chi-Square Value: 3886.291890838492, P-Value: 0.0
Feature: macd_trend_, Chi-Square Value: 1799.7493716834656, P-Value: 0.0
Feature: macd_positive_, Chi-Square Value: 606.8595198837755, P-Value: 5.392782078070306e-134
Feature: stoch_rsi_oversold_, Chi-Square Value: 350.7773952768926, P-Value: 2.8697978132677026e-78
Feature: stoch_rsi_overbought_, Chi-Square Value: 301.60143252015706, P-Value: 1.4752859496938196e-67
Feature: stoch_rsi_trend_, Chi-Square Value: 66.70263716384488, P-Value: 3.278735444970216e-15
Feature: stoch_rsi_bearish_crossover_, Chi-Square

In [85]:
cluster1 = ['macd_trend_','hma200_trend_','hma200_above_price_','macd_positive_']
best_df = get_best_group(new_df, cluster1, period)

x = convert_grouped_to_dataframe(best_df)
x = x[cluster1 ]
x['signal'] = 'Buy'
x.to_csv(f"./simple_model_buy.csv")


In [75]:
cluster2 = cluster1 + ['macd_positive_']
# cluster3 = cluster2 + ['stoch_rsi_overbought','macd_bullish_crossover_positive']
get_best_group(new_df, cluster2, period)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count,mean,min,max
macd_trend_,hma200_trend_,hma200_above_price_,macd_positive_,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Uptrend,Not trending,True,False,16,0.050221,-0.296504,0.926901
Not trending,Not trending,True,False,475,0.04862,-0.262448,2.463981
Not trending,Downtrend,False,False,174,0.029657,-0.600225,1.14332
Uptrend,Not trending,False,True,248,0.022914,-0.84319,0.975514
Downtrend,Not trending,True,False,254,0.009517,-0.848389,0.812574


In [76]:

get_best_group(new_df, features_stochrsi, period)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,count,mean,min,max
stoch_rsi_overbought_,stoch_rsi_oversold_,stoch_rsi_bullish_crossover_,stoch_rsi_bearish_crossover_,stoch_rsi_trend_,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
True,False,False,False,Not trending,56546,0.035783,-6.779029,9.678757
False,False,True,False,Not trending,189,0.028147,-2.340636,4.742192
True,False,True,False,Not trending,682,0.024492,-4.081744,1.740407
False,False,False,False,Not trending,1827,0.0122,-2.364631,2.111386
False,True,True,False,Not trending,1544,0.009445,-3.406666,2.023358


In [77]:

df_result = get_best_group(new_df, all_features, period, max=10, sort='mean')
df_result

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,count,mean,min,max
stoch_rsi_overbought_,stoch_rsi_oversold_,stoch_rsi_bullish_crossover_,stoch_rsi_bearish_crossover_,stoch_rsi_trend_,hma200_trend_,hma200_above_price_,macd_bullish_crossover_,macd_bearish_crossover_,macd_positive_,macd_trend_,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
True,False,True,False,Not trending,Uptrend,True,False,False,False,Downtrend,2,0.868422,0.790426,0.946417
False,True,False,False,Not trending,Uptrend,True,False,False,True,Downtrend,1,0.77839,0.77839,0.77839
True,False,False,False,Not trending,Not trending,False,False,False,False,Not trending,28,0.68985,0.0,3.898413
False,True,False,False,Not trending,Uptrend,False,False,False,True,Downtrend,1,0.613569,0.613569,0.613569
False,True,True,False,Uptrend,Downtrend,True,False,True,False,Downtrend,1,0.58872,0.58872,0.58872
True,False,False,False,Uptrend,Not trending,False,True,False,True,Downtrend,1,0.520928,0.520928,0.520928
False,False,True,False,Uptrend,Uptrend,False,False,True,False,Not trending,1,0.513956,0.513956,0.513956
True,False,True,False,Downtrend,Uptrend,True,False,False,True,Downtrend,4,0.440718,-0.153301,1.371781
False,False,False,False,Downtrend,Not trending,True,False,True,False,Uptrend,2,0.434615,-0.057671,0.926901
True,False,False,False,Uptrend,Not trending,False,True,False,True,Uptrend,2,0.38425,-0.05653,0.82503


In [67]:

get_best_group(new_df, features_macd, period)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count,mean,min,max
macd_bullish_crossover_,macd_bearish_crossover_,macd_positive_,macd_trend_,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
False,True,False,Not trending,18,0.014926,-0.332016,0.27192
True,False,True,Not trending,20,0.012828,-0.463038,0.334836
False,True,False,Downtrend,21220,0.005145,-6.387041,3.442899
False,False,False,Downtrend,950318,0.003751,-13.738419,14.28355
False,False,False,Uptrend,58633,0.003006,-8.68676,7.430837


In [68]:

get_best_group(new_df, features_stochrsi+features_macd, period)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,count,mean,min,max
stoch_rsi_overbought_,stoch_rsi_oversold_,stoch_rsi_bullish_crossover_,stoch_rsi_bearish_crossover_,stoch_rsi_trend_,macd_bullish_crossover_,macd_bearish_crossover_,macd_positive_,macd_trend_,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
True,False,False,True,Not trending,False,True,False,Downtrend,4,0.427003,-0.0687,1.745556
False,False,True,False,Uptrend,False,False,False,Not trending,4,0.346397,-0.162935,1.554688
False,True,True,False,Uptrend,True,False,True,Not trending,1,0.3026,0.3026,0.3026
False,False,True,False,Downtrend,False,False,True,Not trending,10,0.272657,-0.115768,1.262354
False,False,False,True,Downtrend,False,False,True,Not trending,5,0.230649,0.0,1.153243


In [69]:
folder_lab1 = "./Data/lab1"
features_to_test = features_stochrsi
for period in periods:
    print(f"Period: {period}")
    features = all_features + [f'close_p_in_{period}']
    new_df = df[features].iloc[50:-period]
    result = get_best_group(new_df, features_to_test, period, 'count')
    result.to_csv(f"{folder_lab1}/{file}_stochrsi_{period}.csv")
    print(result)
    
    # print(get_best_group(new_df, features_hma, period))
    # print(get_best_group(new_df, features_macd, period, 'count'))
    # print(get_best_group(new_df, features_stochrsi+features_macd, period))
    print("")

Period: 10
                                                                                                                       count  \
stoch_rsi_overbought_ stoch_rsi_oversold_ stoch_rsi_bullish_crossover_ stoch_rsi_bearish_crossover_ stoch_rsi_trend_           
True                  False               False                        False                        Uptrend           692157   
False                 True                False                        False                        Downtrend         675265   
                      False               False                        False                        Downtrend         570462   
                                                                                                    Uptrend           532496   
True                  False               False                        False                        Downtrend         173537   

                                                                                            

i want to find the best combination of features, maximising the results

now we find the combo of features that are performing better

In [70]:
import itertools

folder_lab1 = "./Data/lab1"
file = "gemini_1m_features.csv"
# features_to_test = features_stochrsi 
features_to_test = features_stochrsi 

combinations_stochrsi = []
for r in range(1, len(features_stochrsi) + 1):
    combinations_stochrsi.extend(itertools.combinations(features_stochrsi, r))
print(len(combinations_stochrsi)) 

combinations_hma = []
for r in range(1, len(features_hma) + 1):
    combinations_hma.extend(itertools.combinations(features_hma, r))
print(len(combinations_hma)) 

combinations_macd = []
for r in range(1, len(features_macd) + 1):
    combinations_macd.extend(itertools.combinations(features_macd, r))
print(len(combinations_macd)) 

cartesian_product = list(itertools.product(combinations_stochrsi, combinations_hma, combinations_macd))
print(len(cartesian_product))


31
3
15
1395


In [71]:

folder_lab1 = "./Data/lab1"
file = "gemini_1m_features.csv"

features_to_test = features_stochrsi 



for period in periods:
    analysis_df = pd.DataFrame()
    print(f"Period: {period}")
    features = all_features + [f'close_p_in_{period}']
    new_df = df[features].iloc[50:-period]
    for combination in combinations_stochrsi:
        list_of_combination = list(combination)
        if len(list_of_combination) < 3:
            continue
        result = get_best_group(new_df, list_of_combination, period, 'count')
        tmp_df = convert_grouped_to_dataframe(result)
        tmp_df['period'] = period
        analysis_df = append_dataframes(tmp_df,analysis_df)
        # print(result)
        print("")
    analysis_df.to_csv(f"{folder_lab1}/{file}_stochrsi_analysis_{period}.csv")

Period: 10


















In [72]:

analysis_df = pd.read_csv(f"{folder_lab1}/{file}_stochrsi_analysis_{period}.csv")

# Filter the dataframe to include only rows where the count is greater than 1
filtered_df = analysis_df[analysis_df['count'] > 1]

# I want to minimise the losses
# filtered_df = filtered_df[filtered_df['mean'] > filtered_df['min'].abs()]

# Sort the dataframe by mean in descending order and by the difference between min and max in ascending order
sorted_df = filtered_df.sort_values(by=['mean', 'max', 'min'], ascending=[False, True, True])

sorted_df.head(5)


Unnamed: 0.1,Unnamed: 0,stoch_rsi_overbought_,stoch_rsi_oversold_,stoch_rsi_bullish_crossover_,stoch_rsi_bearish_crossover_,stoch_rsi_trend_,count,mean,min,max,period
19,19,False,True,,False,Uptrend,203530,0.007003,-9.767953,8.921122,10
39,39,,True,,False,Uptrend,203530,0.007003,-9.767953,8.921122,10
9,9,,True,False,False,Uptrend,157505,0.006701,-9.196208,8.921122,10
43,43,,True,False,,Uptrend,174996,0.006311,-9.196208,8.921122,10
29,29,False,True,True,False,,180565,0.004603,-38.614407,6.961575,10
