In [None]:
from openfe import OpenFE, tree_to_formula
import pandas as pd
import matplotlib.pyplot as plt

In [None]:

def label_dataset(path="./data/train/year/2022-1m.pkl", interval=pd.Timedelta(hours=12)):
    """Load pickled price data and label the extreme closes of every time window.

    The frame is split into consecutive windows of length ``interval``. Within
    each window the row with the highest ``close`` is labelled ``-1`` (short)
    and the row with the lowest ``close`` is labelled ``1`` (long); every other
    row keeps the label ``0``.

    Args:
        path: Pickle file holding a DataFrame with a ``close`` column. The
            frame must have a datetime-like index, because it is resampled.
        interval: Window length passed to ``DataFrame.resample``. A
            ``pd.Timedelta`` is used for the default instead of the ``'12H'``
            string because the upper-case hour alias is deprecated in recent
            pandas releases, while the lower-case one is rejected by older ones.

    Returns:
        The loaded DataFrame with an added integer ``position`` column.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    df = pd.read_pickle(path)
    df['position'] = 0

    for _, group in df.resample(interval):
        # Windows without any usable close (no rows, or only NaN) have no
        # extreme to label; idxmax/idxmin would fail or warn on them.
        closes = group['close'].dropna()
        if closes.empty:
            continue

        # The short label is written first, so when both extremes fall on the
        # same row (single-row or flat window) the long label wins.
        df.at[closes.idxmax(), 'position'] = -1
        df.at[closes.idxmin(), 'position'] = 1

    return df


In [None]:
def plot_data(df):
    """Plot the close price and overlay the labelled long/short rows.

    Args:
        df: DataFrame with a ``close`` column and a ``position`` column in
            which ``1`` marks a long entry and ``-1`` marks a short entry.
            The index is used as the x axis.
    """
    _, ax = plt.subplots(figsize=(14, 7))
    ax.plot(df.index, df['close'], label='Close', color='gray', alpha=0.7)

    # (position value, colour, legend label, marker). Shorts are stored as -1;
    # filtering on any other value silently draws no short markers at all.
    marker_specs = (
        (1, 'green', 'Long', '^'),
        (-1, 'red', 'Short', 'v'),
    )
    for value, color, label, marker in marker_specs:
        rows = df[df['position'] == value]
        ax.scatter(rows.index, rows['close'], color=color, label=label, marker=marker, s=100)

    ax.set_title('Price Data with Long and Short Positions')
    ax.set_xlabel('Date')
    ax.set_ylabel('Price')
    ax.legend()

    plt.show()

In [None]:
def find_features(df, n_jobs=10, top_k=10):
    """Run an OpenFE feature search against ``position`` and print the top formulas.

    Args:
        df: DataFrame containing the candidate base columns plus the
            ``position`` label column.
        n_jobs: Number of parallel workers handed to ``OpenFE.fit``.
        top_k: How many of the leading generated features to print.
    """
    label_col = 'position'

    ofe = OpenFE()
    ofe.fit(
        task="regression",
        # The label must not be part of the feature frame: otherwise the search
        # builds features from the target itself (target leakage).
        data=df.drop(columns=[label_col]),
        # One-column DataFrame rather than a Series, matching the label type
        # that OpenFE documents for fit().
        label=df[[label_col]],
        verbose=False,
        n_jobs=n_jobs,
    )

    for feature in ofe.new_features_list[:top_k]:
        print(tree_to_formula(feature))

In [None]:
# Build the labelled price frame once; the plotting and feature-search cells below both read it.
df = label_dataset()

In [None]:
# Visual sanity check of the labels: close price with the long/short rows overlaid.
plot_data(df)

In [None]:
# Run the OpenFE search on the labelled frame and print the leading feature formulas.
find_features(df)

"""
Recorded find_features output: the first 10 formulas printed for each run.

[month]
GroupByThenMin(open,position)
GroupByThenRank(volume,position)
min(low,volume)
round(volume)
(high*volume)
(close*volume)
(low*volume)
(close/volume)
(high-low)
(open*volume)

[year]
GroupByThenStd(high,position)
GroupByThenMin(open,position)
GroupByThenRank(volume,position)
(close*volume)
(low*volume)
(open*volume)
sigmoid(volume)
log(volume)
(high/low)
sqrt(volume)

[unknown run: dataset/window not recorded]
round(open)
(close+volume)
(high+position)
min(high,low)
(high+low)
(high/close)
(open-high)
(open*low)
(open+low)
freq(low)
"""