In [408]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pandas_ta as ta

# sklearn imports
from sklearn.model_selection import (train_test_split, RandomizedSearchCV, TimeSeriesSplit)
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline

# metrics
from sklearn.metrics import RocCurveDisplay, ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, roc_curve, auc, precision_score, recall_score, f1_score

# import base models
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# import meta model
from xgboost import XGBClassifier

from boruta import BorutaPy
import quantstats as qs
from pprint import pprint

import warnings
warnings.filterwarnings('ignore')

In [409]:
def load_and_prepare_data(data):
    """
    Loads BTC 1m data initializes additional columns for analysis.
    
    """
    df = pd.read_csv(data, index_col=0)

    if not isinstance(df.index, pd.DatetimeIndex):
        df.index = pd.to_datetime(df.index)

    df['avg_price'] = (df['high'] + df['low'] + df['close']) / 3
    df['avg_price_vol'] = df['avg_price'] * df['volume']

    return df

def momentum(df):
    """
    Calculates momentum for price and volume from 1m data, and their respective changes over specified periods.
    
    """
    for period in [5, 10, 15, 30, 45]:

        # price momentum
        df[f'pm{period}'] = df['close'].pct_change(periods=period)
        df[f'delta_pm{period}'] = df[f'pm{period}'] - df[f'pm{period}'].shift(1)

        # log-volume momentum
        df[f'lv{period}'] = np.log(df['volume'] + 1).diff(periods=period)
        df[f'delta_lv{period}'] = df[f'lv{period}'].diff(1)

def resample_data(df):
    """
    Resamples the dataframe from 1m intervals to 1hour intervals.
    
    """
    ohlc_dict = {'open': 'first', 'high': 'max', 'low': 'min', 'close': 'last'}
    df_1h = df.resample('1h').agg(ohlc_dict)
    df_1h['volume'] = df['avg_price_vol'].resample('1h').sum()
    
    return df_1h

def generate_features(df, df_1h):
    """
    Generate features using Pandas TA & expand more features computed from 1m data
    
    """
    try:
        df_1h.ta.strategy('All')
    except Exception as e:
        pass

    momentum(df)

    df_1h['VWAP1m_1h'] = df.resample('1h').apply(
        lambda x: (x['avg_price'] * x['avg_price_vol']).sum() / x['avg_price_vol'].sum()
    )

    for col in ['delta_pm', 'delta_lv']:
        for period in [5, 10, 15, 30, 45]:
            for metric in ['mean', 'max', 'min', 'std']:
                df_1h[f'{col}{period}_{metric}'] = df[col + str(period)].resample('1h').agg(metric)

    return df_1h

def check_missing_intervals(df):
    """
    Checking if there's any missing rows between hour
    
    """
    full_range = pd.date_range(start=df.index.min(), end=df.index.max(), freq='H')
    
    missing = full_range.difference(df.index)
    
    if len(missing) > 0:
        print(f"There are {len(missing)} missing intervals:")
        print(missing)
    else:
        print("No missing intervals found.")

def find_edge_nan_columns(df):
    """
    NaN Values at the beginning or end of rows (no NaN in the middle)
    
    """
    edge_nan_columns = []
    
    for column in df.columns:
        series = df[column]
        nan_indices = series.index[series.isna()]
        
        if len(nan_indices) > 0:
            first_non_nan = series.first_valid_index()
            last_non_nan = series.last_valid_index()
            
            if (nan_indices < first_non_nan).all() or (nan_indices > last_non_nan).all():
                edge_nan_columns.append(column)
    
    return edge_nan_columns

In [410]:
df_1m = load_and_prepare_data('BTCUSDT_PERP_1min_29May2020-30May2024.csv')

df_1h = resample_data(df_1m)

df_1h

Unnamed: 0,open,high,low,close,volume
2020-05-29 00:00:00,9572.0,9600.0,9502.5,9512.0,9.507407e+06
2020-05-29 01:00:00,9512.0,9533.0,9462.0,9509.5,5.964113e+06
2020-05-29 02:00:00,9509.5,9535.5,9492.5,9501.0,2.078164e+06
2020-05-29 03:00:00,9501.0,9528.0,9489.5,9516.5,1.998929e+06
2020-05-29 04:00:00,9516.5,9540.0,9492.0,9496.5,1.758524e+06
...,...,...,...,...,...
2024-05-30 19:00:00,69401.9,69404.0,68451.7,68677.5,4.238648e+08
2024-05-30 20:00:00,68677.5,68687.7,68023.8,68477.8,3.894684e+08
2024-05-30 21:00:00,68477.8,68555.2,68335.3,68438.0,1.053803e+08
2024-05-30 22:00:00,68438.0,68450.0,68239.2,68325.2,6.592580e+07


In [411]:
generate_features(df_1m, df_1h)

0it [00:00, ?it/s]

[X] Install TA-Lib to use 2crows. (pip install TA-Lib)
[X] Install TA-Lib to use 3blackcrows. (pip install TA-Lib)
[X] Install TA-Lib to use 3inside. (pip install TA-Lib)
[X] Install TA-Lib to use 3linestrike. (pip install TA-Lib)
[X] Install TA-Lib to use 3outside. (pip install TA-Lib)
[X] Install TA-Lib to use 3starsinsouth. (pip install TA-Lib)
[X] Install TA-Lib to use 3whitesoldiers. (pip install TA-Lib)
[X] Install TA-Lib to use abandonedbaby. (pip install TA-Lib)
[X] Install TA-Lib to use advanceblock. (pip install TA-Lib)
[X] Install TA-Lib to use belthold. (pip install TA-Lib)
[X] Install TA-Lib to use breakaway. (pip install TA-Lib)
[X] Install TA-Lib to use closingmarubozu. (pip install TA-Lib)
[X] Install TA-Lib to use concealbabyswall. (pip install TA-Lib)
[X] Install TA-Lib to use counterattack. (pip install TA-Lib)
[X] Install TA-Lib to use darkcloudcover. (pip install TA-Lib)
[X] Install TA-Lib to use dojistar. (pip install TA-Lib)
[X] Install TA-Lib to use dragonflydoj

117it [00:03, 29.80it/s]


Unnamed: 0,open,high,low,close,volume,ABER_ZG_5_15,ABER_SG_5_15,ABER_XG_5_15,ABER_ATR_5_15,ACCBL_20,...,delta_lv15_min,delta_lv15_std,delta_lv30_mean,delta_lv30_max,delta_lv30_min,delta_lv30_std,delta_lv45_mean,delta_lv45_max,delta_lv45_min,delta_lv45_std
2020-05-29 00:00:00,9572.0,9600.0,9502.5,9512.0,9.507407e+06,,,,,,...,-4.611929,2.142892,0.043208,3.308568,-4.669919,1.846258,0.230571,3.309715,-4.931050,2.228605
2020-05-29 01:00:00,9512.0,9533.0,9462.0,9509.5,5.964113e+06,,,,,,...,-4.822806,1.846413,0.013772,3.748332,-4.301250,2.084037,0.000065,3.704239,-3.826581,1.900640
2020-05-29 02:00:00,9509.5,9535.5,9492.5,9501.0,2.078164e+06,,,,,,...,-3.754168,1.627820,-0.000783,3.443400,-3.302462,1.603103,0.009588,3.592094,-4.946686,1.639747
2020-05-29 03:00:00,9501.0,9528.0,9489.5,9516.5,1.998929e+06,,,,,,...,-4.491781,1.541653,-0.012743,3.737606,-3.515634,1.820774,0.006277,3.507535,-4.009781,1.618520
2020-05-29 04:00:00,9516.5,9540.0,9492.0,9496.5,1.758524e+06,9514.033333,,,,,...,-4.673797,1.772791,0.010037,4.076960,-3.647096,1.677442,-0.036807,3.238884,-4.161870,1.652544
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-30 19:00:00,69401.9,69404.0,68451.7,68677.5,4.238648e+08,68895.706667,69398.313104,68393.100229,502.606438,66977.756551,...,-2.634938,1.212307,0.013080,3.086077,-3.241488,1.292181,0.058989,3.427352,-1.865859,1.094077
2024-05-30 20:00:00,68677.5,68687.7,68023.8,68477.8,3.894684e+08,68882.313333,69395.672675,68368.953991,513.359342,66966.645869,...,-2.124182,1.138521,-0.006608,2.413344,-3.425366,1.128146,-0.038935,1.949207,-2.595920,1.097861
2024-05-30 21:00:00,68477.8,68555.2,68335.3,68438.0,1.053803e+08,68844.880000,69338.675386,68351.084614,493.795386,67029.995582,...,-3.627930,1.402951,-0.009643,2.929229,-3.710615,1.511719,-0.046878,3.475506,-3.887843,1.495318
2024-05-30 22:00:00,68438.0,68450.0,68239.2,68325.2,6.592580e+07,68670.846667,69145.775693,68195.917640,474.929027,67073.053258,...,-2.296750,1.149026,0.019356,2.468059,-2.897658,1.050957,0.047627,3.232306,-3.097758,1.194005


In [412]:
df_1h.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
open,35112.0,3.398342e+04,1.608584e+04,8946.000000,2.096250e+04,3.043865e+04,4.450112e+04,7.365620e+04
high,35112.0,3.414101e+04,1.616930e+04,8975.000000,2.109650e+04,3.054950e+04,4.476100e+04,7.394900e+04
low,35112.0,3.381829e+04,1.599662e+04,8840.000000,2.084388e+04,3.033295e+04,4.426450e+04,7.327670e+04
close,35112.0,3.398511e+04,1.608637e+04,8946.000000,2.096325e+04,3.043895e+04,4.450455e+04,7.365620e+04
volume,35112.0,1.326168e+08,2.026057e+08,113600.261167,3.457233e+07,7.428505e+07,1.525884e+08,7.184480e+09
...,...,...,...,...,...,...,...,...
delta_lv30_std,35112.0,1.378355e+00,2.774402e-01,0.422880,1.183646e+00,1.348933e+00,1.541844e+00,3.109201e+00
delta_lv45_mean,35112.0,6.960616e-06,3.214599e-02,-0.145999,-2.112958e-02,1.527288e-04,2.123073e-02,2.305714e-01
delta_lv45_max,35112.0,3.286965e+00,8.546039e-01,1.195560,2.678245e+00,3.171862e+00,3.773366e+00,9.429325e+00
delta_lv45_min,35112.0,-3.292741e+00,8.514557e-01,-8.323391,-3.778737e+00,-3.177881e+00,-2.688623e+00,-1.273630e+00


In [413]:
check_missing_intervals(df_1h)

No missing intervals found.


In [414]:
# Drop unwanted columns
df_1h = df_1h.drop(columns=['open', 'high', 'low', 'volume'])

# Move 'VWAP1m_1h' next to 'close'
col_list = df_1h.columns.tolist()
vwap_index = col_list.index('VWAP1m_1h')
close_index = col_list.index('close')
col_list.insert(close_index + 1, col_list.pop(vwap_index))
df_1h = df_1h[col_list]

df_1h.head()

Unnamed: 0,close,VWAP1m_1h,ABER_ZG_5_15,ABER_SG_5_15,ABER_XG_5_15,ABER_ATR_5_15,ACCBL_20,ACCBM_20,ACCBU_20,AD,...,delta_lv15_min,delta_lv15_std,delta_lv30_mean,delta_lv30_max,delta_lv30_min,delta_lv30_std,delta_lv45_mean,delta_lv45_max,delta_lv45_min,delta_lv45_std
2020-05-29 00:00:00,9512.0,9551.934218,,,,,,,,-7654682.0,...,-4.611929,2.142892,0.043208,3.308568,-4.669919,1.846258,0.230571,3.309715,-4.93105,2.228605
2020-05-29 01:00:00,9509.5,9488.966795,,,,,,,,-5638643.0,...,-4.822806,1.846413,0.013772,3.748332,-4.30125,2.084037,6.5e-05,3.704239,-3.826581,1.90064
2020-05-29 02:00:00,9501.0,9513.10001,,,,,,,,-6895207.0,...,-3.754168,1.62782,-0.000783,3.4434,-3.302462,1.603103,0.009588,3.592094,-4.946686,1.639747
2020-05-29 03:00:00,9516.5,9513.690202,,,,,,,,-6090444.0,...,-4.491781,1.541653,-0.012743,3.737606,-3.515634,1.820774,0.006277,3.507535,-4.009781,1.61852
2020-05-29 04:00:00,9496.5,9514.799447,9514.033333,,,,,,,-7519245.0,...,-4.673797,1.772791,0.010037,4.07696,-3.647096,1.677442,-0.036807,3.238884,-4.16187,1.652544


In [415]:
total_nan_values = df_1h.isna().sum().sum()
print(f"Total number of NaN values in the DataFrame: {total_nan_values}")

Total number of NaN values in the DataFrame: 132100


In [416]:
print("Columns with NaN values:")
cols_with_nan = df_1h.columns[df_1h.isna().any()].tolist()
print(cols_with_nan)

Columns with NaN values:
['ABER_ZG_5_15', 'ABER_SG_5_15', 'ABER_XG_5_15', 'ABER_ATR_5_15', 'ACCBL_20', 'ACCBM_20', 'ACCBU_20', 'ADOSC_3_10', 'ADX_14', 'ADXR_14_2', 'DMP_14', 'DMN_14', 'ALMA_9_6.0_0.85', 'AO_5_34', 'OBV', 'OBV_min_2', 'OBV_max_2', 'OBVe_4', 'OBVe_12', 'APO_12_26', 'AROOND_14', 'AROONU_14', 'AROONOSC_14', 'ATRr_14', 'BBL_5_2.0', 'BBM_5_2.0', 'BBU_5_2.0', 'BBB_5_2.0', 'BBP_5_2.0', 'BIAS_SMA_26', 'AR_26', 'BR_26', 'CCI_14_0.015', 'open_Z_30_1', 'high_Z_30_1', 'low_Z_30_1', 'close_Z_30_1', 'CFO_9', 'CG_10', 'CHOP_14_1_100.0', 'CKSPl_10_3_20', 'CKSPs_10_3_20', 'CMF_20', 'CMO_14', 'COPC_11_14_10', 'CTI_12', 'DEMA_10', 'DCL_20_20', 'DCM_20_20', 'DCU_20_20', 'DPO_20', 'EBSW_40_10', 'EFI_13', 'EMA_10', 'ENTP_10', 'EOM_14_100000000', 'ER_10', 'BULLP_13', 'BEARP_13', 'FISHERT_9_1', 'FISHERTs_9_1', 'FWMA_10', 'HILO_13_21', 'HILOl_13_21', 'HILOs_13_21', 'HMA_10', 'ISA_9', 'ISB_26', 'ITS_9', 'IKS_26', 'ICS_26', 'INERTIA_20_14', 'JMA_7_0.0', 'KAMA_10_2_30', 'KCLe_20_2', 'KCBe_20_2', '

In [417]:
print("Distribution of NaN values:")
nan_distribution = df_1h.isna().sum().value_counts().sort_index()
print(nan_distribution)

Distribution of NaN values:
0        90
1        14
2         2
3         1
4         7
6         2
7         1
8         4
9        13
10        5
11        4
12        5
13       10
14        8
15        2
16        2
17        4
18        2
19       16
21        1
23        1
24        2
25       15
26        2
28        2
29       10
30        1
32        2
33        3
38        1
39        1
44        1
50        1
52        1
55        1
67        1
76        1
254       1
11800     1
12115     1
16998     1
17308     1
17412     1
17726     1
17810     1
18115     1
Name: count, dtype: int64


In [418]:
edge_nan_cols = find_edge_nan_columns(df_1h)

print("Columns with NaN values only at the beginning or end:")
print(edge_nan_cols)

Columns with NaN values only at the beginning or end:
['ABER_ZG_5_15', 'ABER_SG_5_15', 'ABER_XG_5_15', 'ABER_ATR_5_15', 'ACCBL_20', 'ACCBM_20', 'ACCBU_20', 'ADOSC_3_10', 'ADX_14', 'ADXR_14_2', 'DMP_14', 'DMN_14', 'ALMA_9_6.0_0.85', 'AO_5_34', 'OBV', 'OBV_min_2', 'OBV_max_2', 'OBVe_4', 'OBVe_12', 'APO_12_26', 'AROOND_14', 'AROONU_14', 'AROONOSC_14', 'ATRr_14', 'BBL_5_2.0', 'BBM_5_2.0', 'BBU_5_2.0', 'BBB_5_2.0', 'BBP_5_2.0', 'BIAS_SMA_26', 'AR_26', 'BR_26', 'CCI_14_0.015', 'open_Z_30_1', 'high_Z_30_1', 'low_Z_30_1', 'close_Z_30_1', 'CFO_9', 'CG_10', 'CHOP_14_1_100.0', 'CKSPl_10_3_20', 'CKSPs_10_3_20', 'CMF_20', 'CMO_14', 'COPC_11_14_10', 'CTI_12', 'DEMA_10', 'DCL_20_20', 'DCM_20_20', 'DCU_20_20', 'EBSW_40_10', 'EFI_13', 'EMA_10', 'ENTP_10', 'EOM_14_100000000', 'ER_10', 'BULLP_13', 'BEARP_13', 'FISHERT_9_1', 'FISHERTs_9_1', 'FWMA_10', 'HILO_13_21', 'HMA_10', 'ISA_9', 'ISB_26', 'ITS_9', 'IKS_26', 'ICS_26', 'INERTIA_20_14', 'JMA_7_0.0', 'KAMA_10_2_30', 'KCLe_20_2', 'KCBe_20_2', 'KCUe_20_2',

In [419]:
max_nan_beginning_count = 0
max_nan_beginning_column = ''
max_nan_beginning_first_valid = None

max_nan_end_count = 0
max_nan_end_column = ''
max_nan_end_last_valid = None

for col in edge_nan_cols:
    series = df_1h[col]
    first_valid = series.first_valid_index()
    last_valid = series.last_valid_index()
    nan_count = series.isna().sum()
    total_count = len(series)
    
    nans_at_beginning = series.isna().head(1).bool()
    nans_at_end = series.isna().tail(1).bool()
    
    if nans_at_beginning and nans_at_end:
        beginning_nans = series.index.get_loc(first_valid)
        end_nans = total_count - series.index.get_loc(last_valid) - 1
        # print(f"{col}: {beginning_nans} NaNs at the beginning, {end_nans} NaNs at the end.")
        # print(f"First valid index: {first_valid}, Last valid index: {last_valid}")
        if beginning_nans > max_nan_beginning_count:
            max_nan_beginning_count = beginning_nans
            max_nan_beginning_column = col
            max_nan_beginning_first_valid = first_valid
        if end_nans > max_nan_end_count:
            max_nan_end_count = end_nans
            max_nan_end_column = col
            max_nan_end_last_valid = last_valid
    elif nans_at_beginning:
        beginning_nans = series.index.get_loc(first_valid)
        # print(f"{col}: {nan_count} NaNs at the beginning. First valid index: {first_valid}")
        if beginning_nans > max_nan_beginning_count:
            max_nan_beginning_count = beginning_nans
            max_nan_beginning_column = col
            max_nan_beginning_first_valid = first_valid
    elif nans_at_end:
        end_nans = total_count - series.index.get_loc(last_valid) - 1
        # print(f"{col}: {nan_count} NaNs at the end. Last valid index: {last_valid}")
        if end_nans > max_nan_end_count:
            max_nan_end_count = end_nans
            max_nan_end_column = col
            max_nan_end_last_valid = last_valid

# After the loop, print the results
print(f"\nColumn with most NaNs at the beginning:")
print(f"Column: {max_nan_beginning_column}")
print(f"Number of NaNs at beginning: {max_nan_beginning_count}")
print(f"First valid index: {max_nan_beginning_first_valid}")

print(f"\nColumn with most NaNs at the end:")
print(f"Column: {max_nan_end_column}")
print(f"Number of NaNs at end: {max_nan_end_count}")
print(f"Last valid index: {max_nan_end_last_valid}")


Column with most NaNs at the beginning:
Column: PVIe_255
Number of NaNs at beginning: 254
First valid index: 2020-06-08 14:00:00

Column with most NaNs at the end:
Column: ICS_26
Number of NaNs at end: 25
Last valid index: 2024-05-29 22:00:00


In [420]:
# Drop rows before the 'First valid index' of the column with most NaNs at the beginning
df_1h = df_1h.loc[max_nan_beginning_first_valid:]

# Drop rows after the 'Last valid index' of the column with most NaNs at the end
df_1h = df_1h.loc[:max_nan_end_last_valid]

df_1h.shape

(34833, 248)

In [421]:
total_nan_values = df_1h.isna().sum().sum()
print(f"Total number of NaN values in the DataFrame: {total_nan_values}")

Total number of NaN values in the DataFrame: 128193


In [422]:
print("Columns with NaN values:")
cols_with_nan = df_1h.columns[df_1h.isna().any()].tolist()
pprint(cols_with_nan)

Columns with NaN values:
['HILOl_13_21',
 'HILOs_13_21',
 'PSARl_0.02_0.2',
 'PSARs_0.02_0.2',
 'QQEl_14_5_4.236',
 'QQEs_14_5_4.236',
 'SUPERTl_7_3.0',
 'SUPERTs_7_3.0']


In [423]:
print("Distribution of NaN values:")
nan_distribution = df_1h.isna().sum().value_counts().sort_index()
print(nan_distribution)

Distribution of NaN values:
0        240
11707      1
11987      1
16884      1
17195      1
17223      1
17610      1
17638      1
17949      1
Name: count, dtype: int64


In [424]:
df_1h = df_1h.drop(columns=cols_with_nan)

In [425]:
df_1h.shape

(34833, 240)

In [426]:
total_nan_values = df_1h.isna().sum().sum()
print(f"Total number of NaN values in the DataFrame: {total_nan_values}")

Total number of NaN values in the DataFrame: 0


In [427]:
check_missing_intervals(df_1h)

No missing intervals found.


In [428]:
# Definition of the Target Variable

df_1h['Trend'] = np.where((df_1h['VWAP1m_1h'].shift(-3) > df_1h['close']) & 
                          (df_1h['close'].shift(-3) > df_1h['close']), 1, 0)

df_1h

Unnamed: 0,close,VWAP1m_1h,ABER_ZG_5_15,ABER_SG_5_15,ABER_XG_5_15,ABER_ATR_5_15,ACCBL_20,ACCBM_20,ACCBU_20,AD,...,delta_lv15_std,delta_lv30_mean,delta_lv30_max,delta_lv30_min,delta_lv30_std,delta_lv45_mean,delta_lv45_max,delta_lv45_min,delta_lv45_std,Trend
2020-06-08 14:00:00,9671.0,9679.383516,9713.433333,9767.342803,9659.523863,53.909470,9573.526770,9726.575,9876.401770,9.885630e+07,...,1.653113,0.020297,3.930902,-4.534776,1.702648,0.011128,3.945219,-2.925543,1.413177,1
2020-06-08 15:00:00,9704.5,9683.193162,9704.600000,9757.515505,9651.684495,52.915505,9605.697921,9723.575,9845.697921,1.005255e+08,...,1.574295,-0.028121,3.167943,-4.007485,1.595530,0.026700,3.919837,-3.722418,1.675839,0
2020-06-08 16:00:00,9710.5,9683.181948,9697.700000,9751.387805,9644.012195,53.687805,9610.987882,9722.525,9835.612882,1.020476e+08,...,1.518580,0.037134,4.169462,-2.645048,1.488451,-0.001739,5.019359,-4.879053,1.730621,0
2020-06-08 17:00:00,9718.5,9721.911049,9696.633333,9749.075285,9644.191382,52.441951,9611.587324,9723.300,9835.837324,1.016252e+08,...,1.459548,-0.034227,2.992733,-3.351601,1.346757,-0.062129,3.412916,-3.829416,1.680962,0
2020-06-08 18:00:00,9700.5,9714.128153,9699.300000,9750.645821,9647.954179,51.345821,9614.320599,9720.925,9830.695599,1.009972e+08,...,1.384836,-0.000515,2.539230,-3.657212,1.168252,0.010364,4.373495,-2.272352,1.366392,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-29 18:00:00,67425.1,67430.416801,67556.266667,68017.137631,67095.395702,460.870964,67124.805027,68124.460,69156.855027,1.258931e+11,...,1.667223,0.022957,3.660536,-3.428235,1.575367,-0.022157,2.829299,-3.272219,1.338053,1
2024-05-29 19:00:00,67297.6,67488.086765,67496.420000,67962.839567,67030.000433,466.419567,67043.212369,68067.160,69138.462369,1.257985e+11,...,1.163025,-0.005053,3.022257,-2.639231,1.249990,0.012229,3.639103,-3.287896,1.290702,1
2024-05-29 20:00:00,67504.3,67329.209714,67469.240000,67928.144929,67010.335071,458.904929,66990.025087,68015.470,69078.825087,1.258853e+11,...,1.341830,-0.020726,2.046208,-2.777921,1.031009,-0.002537,2.860018,-3.489744,1.263421,0
2024-05-29 21:00:00,67723.9,67623.172845,67499.080000,67960.371267,67037.788733,461.291267,66932.943837,67971.205,69040.993837,1.259465e+11,...,1.206207,0.009595,2.149816,-3.046755,1.117112,-0.000489,2.920149,-3.001259,1.119997,0


In [429]:
y = df_1h['Trend']

pd.Series(y).value_counts()

Trend
0    18693
1    16140
Name: count, dtype: int64

In [430]:
all_features = df_1h.drop(['close', 'VWAP1m_1h', 'Trend'], axis=1)

all_features.shape

(34833, 238)