In [6]:
import pandas as pd
from datetime import timedelta
from tqdm.auto import tqdm

def get_file(ticker):
    """Load the pickled 1h and 4h candle dataframes for ``ticker``.

    The files can be stored under several naming schemes, tried in this
    order: ``<ticker>``, ``<base>-<quote>`` and ``<base>-<quote>-SWAP``,
    where ``<quote>`` is the last four characters of ``ticker`` and
    ``<base>`` is everything before them. A scheme is accepted only when
    both its ``_1h`` and ``_4h`` files can be read.

    Returns a ``(df_1h, df_4h)`` tuple, or ``(None, None)`` when no scheme
    provides both files.
    """
    base, quote = ticker[:-4], ticker[-4:]
    candidate_stems = (ticker, f'{base}-{quote}', f'{base}-{quote}-SWAP')

    for stem in candidate_stems:
        try:
            frames = tuple(
                pd.read_pickle(f'../optimizer/ticker_dataframes/{stem}_{timeframe}.pkl')
                for timeframe in ('1h', '4h')
            )
        except FileNotFoundError:
            # At least one of the two files is missing: try the next scheme.
            continue
        return frames

    return None, None


def create_train_df(df, type, target_offset, span):
    """Build a training dataset from signal statistics and 1h candle data.

    For every (ticker, time) signal in ``df`` the 1h candle at that time is
    taken and widened with the ``span - 1`` preceding hourly candles (their
    columns get a ``_prev_<i>`` suffix and their first column is dropped).
    A ``target`` column is then appended: the ``high`` (when
    ``type == 'buy'``) or ``low`` (otherwise) of the candle found
    ``target_offset`` hours after the signal.

    Args:
        df: signal statistics; read through its ``ticker`` and ``time`` columns.
        type: ``'buy'`` selects the future ``high`` as target, any other
            value selects the future ``low``. The name shadows the builtin
            but is kept so existing callers keep working.
        target_offset: distance in hours from the signal candle to the
            target candle.
        span: number of hourly candles per sample, signal candle included.

    Returns:
        A DataFrame with one row per usable signal and a fresh 0..n-1 index
        (empty when nothing is usable). Tickers whose candle files cannot be
        found are skipped, as are signals whose own candle or any of the
        preceding candles is missing. ``target`` is NaN when the candle
        ``target_offset`` hours ahead is not present.
    """
    target_column = 'high' if type == 'buy' else 'low'
    # Samples are collected in a list and concatenated once at the end;
    # concatenating inside the loop copies the whole frame on every signal.
    samples = []

    for ticker in tqdm(df['ticker'].unique()):
        candles, _ = get_file(ticker)
        if candles is None:
            # No candle files for this ticker under any naming scheme.
            continue

        signal_times = df.loc[df['ticker'] == ticker, 'time']
        for t in signal_times.to_list():
            row = candles.loc[candles['time'] == t, :].reset_index(drop=True)
            if row.empty:
                # The signal time has no matching candle.
                continue

            history = []
            for i in range(1, span):
                time_prev = t - timedelta(hours=i)
                prev = candles.loc[candles['time'] == time_prev, :]
                if prev.empty:
                    # A boolean-mask lookup never raises for a missing time,
                    # it returns an empty frame, so test for that explicitly.
                    break
                prev = prev.reset_index(drop=True)
                prev.columns = [f'{c}_prev_{i}' for c in prev.columns]
                # Drop the first column of the previous candle.
                history.append(prev.iloc[:, 1:])

            if len(history) < span - 1:
                # Incomplete history window: skip this signal.
                continue

            time_next = t + timedelta(hours=target_offset)
            target = (
                candles.loc[candles['time'] == time_next, target_column]
                .reset_index(drop=True)
                .rename('target')
            )
            samples.append(pd.concat([row, *history, target], axis=1))

    train_df = pd.concat(samples) if samples else pd.DataFrame()
    return train_df.reset_index(drop=True)

# Prediction horizon: how many hours after the signal candle the target value is read
target_offset = 24
# Window size in hourly candles: the signal candle plus the (span - 1) candles before it
span = 10
# Dataset with the signal statistics (create_train_df reads its 'ticker' and 'time' columns)
df = pd.read_pickle('signal_stat/buy_stat_1h.pkl')
# Build the model training dataset for 'buy' signals and preview its first rows
train_df = create_train_df(df, 'buy', target_offset, span)
train_df.head()


100%|██████████| 302/302 [00:16<00:00, 18.01it/s]


Unnamed: 0,time,open,high,low,close,volume,open_prev_1,high_prev_1,low_prev_1,close_prev_1,...,high_prev_8,low_prev_8,close_prev_8,volume_prev_8,open_prev_9,high_prev_9,low_prev_9,close_prev_9,volume_prev_9,target
0,2023-07-15 05:00:00,30336.45,30348.0,30288.7,30288.7,831.44144,30328.77,30390.9,30286.0,30336.45,...,30770.95,30050.0,30091.83,13453.45116,31213.99,31214.0,30648.0,30688.64,6225.48626,30335.0
1,2023-07-24 22:00:00,29124.25,29157.15,29102.0,29117.99,852.34591,29039.75,29137.65,29000.0,29124.26,...,29330.69,29222.34,29269.99,1486.61624,29178.0,29383.11,29116.16,29324.01,4638.01236,29277.95
2,2023-07-15 05:00:00,30336.45,30348.0,30288.7,30288.7,831.44144,30328.77,30390.9,30286.0,30336.45,...,30770.95,30050.0,30091.83,13453.45116,31213.99,31214.0,30648.0,30688.64,6225.48626,30335.0
3,2023-08-02 20:00:00,1829.11,1839.3,1825.65,1831.43,17174.0285,1842.42,1847.53,1821.21,1829.12,...,1860.0,1854.42,1854.71,7972.0558,1857.08,1858.23,1853.66,1857.38,12243.3514,1859.12
4,2023-08-07 21:00:00,1815.12,1831.33,1813.63,1821.9,16893.7539,1815.59,1818.83,1805.95,1815.12,...,1837.39,1833.5,1837.38,4040.7355,1833.32,1836.13,1832.55,1835.63,3994.0966,1871.43
