In [3]:
import pandas as pd
import numpy as np
import glob
from ta import add_all_ta_features
from ta.utils import dropna
from pathlib import Path
from tqdm.auto import tqdm  


from gym_trading_env.downloader import download
import datetime

In [4]:
# --- Paths ---------------------------------------------------------------
# download_dir receives the raw candle pickles; processed_dir is the
# directory meant for the processed datasets.
download_dir, processed_dir = "data/raw", "data/processed/"

# --- Timeframes ----------------------------------------------------------
# Candles are downloaded at the fine resolution; TARGET_TIMEFRAME is the
# coarser resolution the processing step works towards.
download_timeframe = "5m"
TARGET_TIMEFRAME = pd.Timedelta(minutes=30)

# Download

In [4]:
# Fetch BTC/USDT candles from Binance at `download_timeframe` resolution,
# starting 2017-01-01, and store them under `download_dir`.
# No `until` is passed; to cap the range add e.g.
#   until=datetime.datetime(year=2023, month=12, day=31)
download(
    exchange_names=["binance"],
    symbols=["BTC/USDT"],
    timeframe=download_timeframe,
    dir=download_dir,
    since=datetime.datetime(2017, 1, 1),
)

BTC/USDT downloaded from binance and stored at data/raw/binance-BTCUSDT-5m.pkl


# Process

In [5]:
# Raw candle columns that are kept for reference only. The processing step
# leaves their names untouched, and only columns whose name starts with
# 'feature' are used as inputs.
not_features_but_keep_for_info = ["open", "high", "low", "close", "date_close", "volume"]

# Columns retained once the TA indicators have been added: the raw candle
# columns followed by a small indicator subset (selection suggested by ChatGPT).
important_columns = [
    *not_features_but_keep_for_info,
    "volume_adi",
    "volatility_atr",
    "momentum_rsi",
    "trend_macd",
]

In [24]:
# Turn every downloaded candle file into TARGET_TIMEFRAME datasets, one per
# possible start offset, and pickle them into processed_dir.

# Date window kept from each file (partial-string slice, both ends inclusive)
# and the label embedded in the output file names -- change them together.
# Windows used previously: df.loc[:'2023'] (until 2024), df.loc['2024':] (from 2024).
PERIOD_START, PERIOD_END = '2017-12', '2018-11'
# NOTE(review): the label reads "dez2018" while the slice ends in 2018-11; it is
# kept as-is so the generated file names do not change.
PERIOD_LABEL = "dez2017-dez2018"

# Aggregation for raw candle columns when merging fine candles into a coarse one.
# Every column not listed here (close, date_close, feature_*) takes the value of
# the last fine candle in the window.
CANDLE_AGGREGATION = {"open": "first", "high": "max", "low": "min", "volume": "sum"}


def _resample_candles(candles, rule, offset):
    """Resample candle rows to `rule`, with the window edges shifted by `offset`.

    Taking `.last()` of every column would label the final fine candle's
    open/high/low/volume as those of the whole window, so the raw candle
    columns are aggregated according to CANDLE_AGGREGATION instead.

    Parameters
    ----------
    candles : pandas.DataFrame
        Datetime-indexed rows; must contain a 'close' column.
    rule : str or pandas.Timedelta
        Target window length, passed to ``DataFrame.resample``.
    offset : pandas.Timedelta
        Shift of the window edges, passed to ``DataFrame.resample``.

    Returns
    -------
    pandas.DataFrame
        One row per window that contains at least one source candle.
    """
    how = {column: CANDLE_AGGREGATION.get(column, "last") for column in candles.columns}
    resampled = candles.resample(rule, offset=offset).agg(how)
    # Windows without any source candle only hold NaN prices: drop them.
    return resampled.dropna(subset=["close"])


# Make sure the target directory exists before the first to_pickle call.
output_dir = Path(processed_dir)
output_dir.mkdir(parents=True, exist_ok=True)

pathes = glob.glob(f"{download_dir}/*.pkl")
for path in tqdm(pathes):
    name = Path(path).name.split(".")[0]
    df = pd.read_pickle(path)

    # Indicators are computed on the fine-grained candles, before resampling.
    df = add_all_ta_features(df, open="open", high="high", low="low", close="close", volume="volume", fillna=False)

    # Keep the selected columns and prefix everything that is a model input.
    df = df[important_columns].rename(
        columns=lambda col: str(col) if col in not_features_but_keep_for_info else 'feature_' + str(col)
    )

    # Drop rows with any NaN (fillna=False above leaves them unfilled); a new
    # frame is built instead of mutating a column slice in place.
    df = df.dropna()

    df = df.loc[PERIOD_START:PERIOD_END]
    if len(df) < 2:
        # Nothing (or a single row) inside the window: the candle spacing
        # cannot be inferred, so there is nothing to resample for this file.
        continue

    # Most frequent spacing between consecutive rows = native candle length.
    timeframe = df.index.to_series().diff().value_counts().idxmax()

    # One dataset per phase shift of the coarse window.
    for offset in tqdm(range(TARGET_TIMEFRAME // timeframe)):
        process_df = _resample_candles(df, TARGET_TIMEFRAME, offset * timeframe)
        process_df.to_pickle(output_dir / f"{name}-{offset}-{PERIOD_LABEL}.pkl")

  0%|          | 0/1 [00:00<?, ?it/s]

  self._psar[i] = high2


  0%|          | 0/6 [00:00<?, ?it/s]

# Read Data

In [8]:
# Load a pickled object from render_logs into `df`.
# NOTE(review): the cell below also assigns `df`; which one is live depends on
# the order the cells were executed in.
df = pd.read_pickle("render_logs/Stock_2024-05-17_14-09-45.pkl")

In [4]:
# Load a pickled dataset from data/processed/training/ into `df`.
df = pd.read_pickle("data/processed/training/binance-BTCUSDT-5m-0-2019-2023.pkl")

In [5]:
# Display the current `df` (pandas truncates the middle rows in the output).
df

Unnamed: 0_level_0,open,high,low,close,volume,date_close
date_open,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-12-31 23:00:00,3702.00,3702.70,3697.69,3698.09,97.856818,2018-12-31 23:30:00
2018-12-31 23:30:00,3691.65,3707.50,3691.61,3702.90,204.466547,2019-01-01 00:00:00
2019-01-01 00:00:00,3692.94,3695.20,3690.56,3694.20,35.485194,2019-01-01 00:30:00
2019-01-01 00:30:00,3706.59,3707.46,3695.02,3700.31,45.608623,2019-01-01 01:00:00
2019-01-01 01:00:00,3696.89,3700.00,3695.50,3699.25,57.169151,2019-01-01 01:30:00
...,...,...,...,...,...,...
2023-12-30 20:30:00,42288.71,42296.00,42261.10,42283.95,51.826740,2023-12-30 21:00:00
2023-12-30 21:00:00,42276.33,42310.00,42271.10,42309.96,47.172610,2023-12-30 21:30:00
2023-12-30 21:30:00,42271.10,42305.08,42264.81,42305.08,54.102640,2023-12-30 22:00:00
2023-12-30 22:00:00,42250.01,42269.76,42218.87,42265.83,50.369660,2023-12-30 22:30:00


In [19]:
# Preview the rows from December 2017 through November 2018 (partial-string
# slice); the result is only displayed, `df` itself is not changed.
df.loc['2017-12':'2018-11']

Unnamed: 0_level_0,open,high,low,close,volume,date_close
date_open,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-12-01 00:00:00,9828.89,9830.00,9799.00,9800.00,32.337984,2017-12-01 00:30:00
2017-12-01 00:30:00,9668.51,9720.00,9646.13,9685.00,24.739673,2017-12-01 01:00:00
2017-12-01 01:00:00,9677.05,9698.96,9552.22,9555.00,40.431815,2017-12-01 01:30:00
2017-12-01 01:30:00,9649.99,9650.00,9633.00,9635.00,17.253340,2017-12-01 02:00:00
2017-12-01 02:00:00,9518.60,9549.97,9510.00,9535.04,11.428189,2017-12-01 02:30:00
...,...,...,...,...,...,...
2018-11-30 20:30:00,4029.35,4031.85,4023.18,4025.10,88.339186,2018-11-30 21:00:00
2018-11-30 21:00:00,4036.63,4036.99,4019.87,4026.00,134.286321,2018-11-30 21:30:00
2018-11-30 21:30:00,3983.43,4014.51,3973.09,3997.76,292.030301,2018-11-30 22:00:00
2018-11-30 22:00:00,3976.80,3985.00,3976.80,3980.24,77.194871,2018-11-30 22:30:00


In [7]:
# Add all TA indicator columns to `df`. fillna=False means NaN values are NOT
# filled, so they have to be handled afterwards.
df = add_all_ta_features(df, open="open", high="high", low="low", close="close", volume="volume", fillna=False)

  self._psar[i] = high2


In [10]:
# Keep only the columns listed in `important_columns` (overwrites `df`).
df = df[important_columns]

In [11]:
# Show every row that has a NaN in at least one column.
df[df.isna().any(axis=1)]

Unnamed: 0_level_0,open,high,low,close,date_close,volume,volume_adi,volume_obv,volume_cmf,volume_mfi,...,volatility_bbm,volatility_kcc,volatility_dcl,volatility_atr,momentum_uo,momentum_rsi,momentum_stoch_rsi,trend_macd,trend_ema_fast,trend_ichimoku_a
date_open,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-08-17 04:00:00,4261.48,4280.56,4261.48,4261.48,2017-08-17 04:05:00,2.189061,-2.189061e+00,2.189061,,,...,,,,0.000000,,,,,,
2017-08-17 04:05:00,4261.48,4261.48,4261.48,4261.48,2017-08-17 04:10:00,0.000000,-2.189061e+00,2.189061,,,...,,,,0.000000,,,,,,
2017-08-17 04:10:00,4261.48,4261.48,4261.48,4261.48,2017-08-17 04:15:00,0.000000,-2.189061e+00,2.189061,,,...,,,,0.000000,,,,,,
2017-08-17 04:15:00,4261.48,4264.88,4261.48,4261.48,2017-08-17 04:20:00,0.484666,-2.673727e+00,2.673727,,,...,,,,0.000000,,,,,,
2017-08-17 04:20:00,4264.88,4266.29,4264.88,4266.29,2017-08-17 04:25:00,2.328570,-3.451570e-01,5.002297,,,...,,,,0.000000,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-03-24 12:15:00,28080.00,28080.00,28080.00,28080.00,2023-03-24 12:20:00,0.000000,2.102762e+06,-542615.735574,0.440987,100.0,...,28058.5990,28080.0,27963.84,19.936465,,52.786593,0.99254,5.033717,28072.872215,28048.0075
2023-03-24 12:20:00,28080.00,28080.00,28080.00,28080.00,2023-03-24 12:25:00,0.000000,2.102762e+06,-542615.735574,0.427035,100.0,...,28061.5105,28080.0,27963.84,17.942819,,52.786593,0.99254,5.229447,28073.968797,28052.7575
2023-03-24 12:25:00,28080.00,28080.00,28080.00,28080.00,2023-03-24 12:30:00,0.000000,2.102762e+06,-542615.735574,0.533010,100.0,...,28065.0305,28080.0,27963.84,16.148537,,52.786593,0.00000,5.323203,28074.896675,28052.7575
2023-03-24 12:30:00,28080.00,28080.00,28080.00,28080.00,2023-03-24 12:35:00,0.000000,2.102762e+06,-542615.735574,0.469722,100.0,...,28066.6600,28080.0,27963.84,14.533683,,52.786593,0.00000,5.335994,28075.681802,28052.7575


In [18]:
# Count the NaN values in each column.
df.isna().sum()

open                  0
high                  0
low                   0
close                 0
date_close            0
volume                0
volume_adi            0
volume_obv            0
volume_cmf            0
volume_mfi            0
volume_nvi            0
volatility_bbm        0
volatility_kcc        0
volatility_dcl        0
volatility_atr        0
momentum_uo           0
momentum_rsi          0
momentum_stoch_rsi    0
trend_macd            0
trend_ema_fast        0
trend_ichimoku_a      0
dtype: int64

In [12]:
# Display the current `df` (pandas truncates the middle rows in the output).
df

Unnamed: 0_level_0,open,high,low,close,volume,date_close
date_open,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-11-30 23:05:00,9787.90,9849.00,9770.77,9803.95,17.134057,2017-11-30 23:10:00
2017-11-30 23:10:00,9786.09,9840.36,9786.09,9826.15,22.362661,2017-11-30 23:15:00
2017-11-30 23:15:00,9841.95,9891.00,9826.17,9885.51,17.654021,2017-11-30 23:20:00
2017-11-30 23:20:00,9880.02,9970.00,9880.01,9970.00,38.797065,2017-11-30 23:25:00
2017-11-30 23:25:00,9969.99,9999.00,9910.15,9998.99,23.830377,2017-11-30 23:30:00
...,...,...,...,...,...,...
2018-11-30 22:35:00,3965.65,3977.02,3953.03,3969.00,164.352279,2018-11-30 22:40:00
2018-11-30 22:40:00,3967.07,3973.44,3960.00,3968.74,108.125426,2018-11-30 22:45:00
2018-11-30 22:45:00,3969.80,3972.20,3963.02,3972.19,90.554172,2018-11-30 22:50:00
2018-11-30 22:50:00,3972.00,4017.41,3970.22,4003.51,324.810132,2018-11-30 22:55:00


In [9]:
# Load a pickled dataset from data/raw into `df`
# (file name suggests 5m BTC/USDT candles for 2017-2018 -- TODO confirm).
df = pd.read_pickle("data/raw/binance-BTCUSDT-5m-2017-2018.pkl")

In [47]:
# Preview all rows up to and including 2023 (partial-string slice); the result
# is only displayed, `df` itself is not changed.
df.loc[:'2023']

Unnamed: 0_level_0,open,high,low,close,date_close,feature_volume,feature_volume_adi,feature_volume_obv,feature_volume_cmf,feature_volume_mfi,...,feature_volatility_bbm,feature_volatility_kcc,feature_volatility_dcl,feature_volatility_atr,feature_momentum_uo,feature_momentum_rsi,feature_momentum_stoch_rsi,feature_trend_macd,feature_trend_ema_fast,feature_trend_ichimoku_a
date_open,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [38]:
# Keep only the rows from 2024 onward; this overwrites `df`, discarding the
# earlier rows from the in-memory frame.
df = df.loc['2024':]

In [49]:
# List the column names of the current `df`.
df.columns

Index(['open', 'high', 'low', 'close', 'date_close', 'feature_volume',
       'feature_volume_adi', 'feature_volume_obv', 'feature_volume_cmf',
       'feature_volume_mfi', 'feature_volume_nvi', 'feature_volatility_bbm',
       'feature_volatility_kcc', 'feature_volatility_dcl',
       'feature_volatility_atr', 'feature_momentum_uo', 'feature_momentum_rsi',
       'feature_momentum_stoch_rsi', 'feature_trend_macd',
       'feature_trend_ema_fast', 'feature_trend_ichimoku_a'],
      dtype='object')