In [86]:
import pandas as pd

# Read the buy- and sell-signal statistics and stack them into a single frame.
# NOTE: pd.read_pickle can execute arbitrary code on load; only read trusted files.
x, y = map(pd.read_pickle, (
    '../ml/signal_stat/buy_stat_1h.pkl',
    '../ml/signal_stat/sell_stat_1h.pkl',
))
x = pd.concat([x, y])

In [93]:
import pandas as pd
from datetime import timedelta
from tqdm.auto import tqdm

def get_file(ticker, data_dir='../optimizer/ticker_dataframes'):
    ''' Find candle files by ticker name; file names can be in different formats.

    The last four characters of `ticker` are split off as a suffix and three
    naming schemes are tried in order:
        {ticker}, {prefix}-{suffix}, {prefix}-{suffix}-SWAP
    e.g. 'BTCUSDT' -> 'BTCUSDT', 'BTC-USDT', 'BTC-USDT-SWAP'.

    Args:
        ticker: ticker name used to build the file names.
        data_dir: directory that holds the `<name>_1h.pkl` / `<name>_4h.pkl` files.

    Returns:
        Tuple (df_1h, df_4h) for the first scheme where BOTH files can be read,
        or (None, None) when no scheme matches.
    '''
    prefix, suffix = ticker[:-4], ticker[-4:]
    candidates = (ticker, f'{prefix}-{suffix}', f'{prefix}-{suffix}-SWAP')

    for name in candidates:
        try:
            # NOTE: pickle can execute arbitrary code on load; only read trusted files.
            tmp_df_1h = pd.read_pickle(f'{data_dir}/{name}_1h.pkl')
            tmp_df_4h = pd.read_pickle(f'{data_dir}/{name}_4h.pkl')
        except FileNotFoundError:
            # either timeframe is missing for this scheme -> try the next one
            continue
        return tmp_df_1h, tmp_df_4h

    return None, None


def _build_sample(candles, t, target_col, target_offset, first, last, step):
    ''' Build one training row for signal time `t`; return None if a needed candle is missing. '''
    candle_times = candles['time']

    row = candles.loc[candle_times == t, :].reset_index(drop=True)
    if row.empty:
        # the signal candle itself is absent from the candle data
        return None

    parts = [row]
    for i in range(first, last + step, step):
        time_prev = t + timedelta(hours=-i)
        row_prev = candles.loc[candle_times == time_prev, :].reset_index(drop=True)
        if row_prev.empty:
            # A boolean-mask lookup never raises on a miss, it just returns no rows,
            # so the incomplete history has to be detected explicitly.
            return None
        row_prev.columns = [c + f'_prev_{i}' for c in row_prev.columns]
        # drop the first column of the previous candle (it is not repeated per lag)
        parts.append(row_prev.iloc[:, 1:])

    time_next = t + timedelta(hours=target_offset)
    target = candles.loc[candle_times == time_next, target_col].reset_index(drop=True)
    target.name = 'target'
    parts.append(target)

    return pd.concat(parts, axis=1)


def create_train_df(df, type, target_offset, first, last, step):
    ''' Create train dataset from signal statistics and ticker candle data.

    For every (ticker, time) pair in `df` one sample is built from the ticker's 1h
    candles: the candle at the signal time, the candles `i` hours before it for
    `i in range(first, last + step, step)` (columns suffixed `_prev_{i}`), and a
    `target` column read `target_offset` hours after the signal.

    Args:
        df: signal statistics; must contain `ticker` and `time` columns.
        type: 'buy' -> target is the candle `high`, anything else -> the candle `low`.
            (The name shadows the builtin; it is kept for backward compatibility.)
        target_offset: hours after the signal at which the target is read.
        first, last, step: range (in hours before the signal) of previous candles to collect.

    Returns:
        DataFrame with one row per usable signal. Tickers without candle files and
        signals with a missing signal/previous candle are skipped. A missing target
        candle leaves NaN in `target`, so callers can still `dropna()`.
        An empty DataFrame is returned when nothing could be built.
    '''
    target_col = 'high' if type == 'buy' else 'low'
    samples = []

    for ticker in tqdm(df['ticker'].unique()):
        tmp_df_1h, _ = get_file(ticker)
        if tmp_df_1h is None:
            # no candle files in any known naming scheme for this ticker
            continue

        signal_times = df.loc[df['ticker'] == ticker, 'time'].to_list()
        for t in signal_times:
            sample = _build_sample(tmp_df_1h, t, target_col, target_offset, first, last, step)
            if sample is not None:
                samples.append(sample)

    # concatenate once instead of growing the frame inside the loop
    return pd.concat(samples) if samples else pd.DataFrame()

# prediction horizon: how many hours after the signal the target price is taken
target_offset = 24
# first previous data point to collect for model training (number of hours before the signal point)
first = 1
# last previous data point to collect for model training (number of hours before the signal point)
last = 10
# step between collected previous data points (total number of points is (last - first + step) / step)
step = 1
# dataset with the signal statistics; both files are read from the same directory
# (the buy path used to be 'signal_stat/...', inconsistent with the other loads in this notebook)
df = pd.read_pickle('../ml/signal_stat/buy_stat_1h.pkl')
tmp = pd.read_pickle('../ml/signal_stat/sell_stat_1h.pkl')
df = pd.concat([df, tmp])
# NOTE(review): sell signals are included here, but targets are built with type='buy'
# (candle high) for every row — confirm this is intended.
# dataset for model training: drop incomplete samples and order them chronologically
train_df = (
    create_train_df(df, 'buy', target_offset, first, last, step)
    .dropna()
    .sort_values('time')
    .reset_index(drop=True)
)
train_df.head()


100%|██████████| 374/374 [01:00<00:00,  6.18it/s]


Unnamed: 0,time,open,high,low,close,volume,open_prev_1,high_prev_1,low_prev_1,close_prev_1,...,high_prev_9,low_prev_9,close_prev_9,volume_prev_9,open_prev_10,high_prev_10,low_prev_10,close_prev_10,volume_prev_10,target
0,2022-12-21 23:00:00,0.0422,0.0423,0.042,0.0422,916271.0,0.0426,0.0426,0.0422,0.0423,...,0.043,0.0428,0.0429,751674.0,0.0427,0.0429,0.0427,0.0429,604272.0,0.0432
1,2022-12-22 03:00:00,246.2,248.1,245.8,248.0,8174.0,246.6,247.7,245.2,246.2,...,249.2,247.8,249.1,11987.502,249.0,249.9,247.6,248.3,12723.229,246.2
2,2022-12-23 07:00:00,0.693,0.777,0.691,0.768,516397.7,0.693,0.696,0.688,0.695,...,0.656,0.653,0.656,20277.6,0.663,0.667,0.653,0.653,94242.3,0.73
3,2022-12-23 07:00:00,1.074,1.1,1.073,1.085,310312.721,1.082,1.084,1.074,1.074,...,1.038,1.027,1.035,24770.684,1.022,1.029,1.018,1.029,26867.02,1.086
4,2022-12-23 19:00:00,0.05791,0.058277,0.05768,0.058167,395965.0,0.058276,0.058441,0.057649,0.057916,...,0.059364,0.059013,0.05923,566941.0,0.059387,0.059476,0.059013,0.059064,846843.0,0.057341


In [95]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# every column except the timestamp and the label is used as a feature
train_cols = [c for c in train_df.columns if c not in ['time', 'target']]

X, y = train_df[train_cols], train_df['target']

# NOTE(review): train_df is sorted by time and this split is random, so validation rows
# can be older than training rows — consider a chronological split (shuffle=False).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training part only, so validation statistics do not leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# positive=True constrains all coefficients to be non-negative
lr = LinearRegression(positive=True)
lr.fit(X_train, y_train)

preds = lr.predict(X_val)

# RMSE as sqrt(MSE): the `squared` argument of mean_squared_error is deprecated
# in recent scikit-learn releases, so it is not relied upon here.
mean_squared_error(y_val, preds) ** 0.5

24.694775778796973

In [85]:
# Show only the feature columns whose fitted linear-regression coefficient is strictly positive.
train_df.loc[:, [col for col, weight in zip(train_cols, lr.coef_) if weight > 0]]

Unnamed: 0,high,close,volume_prev_1
0,1.100000,1.085000,76595.593
1,0.777000,0.768000,35608.300
2,0.916400,0.911600,146720.670
3,1.292000,1.290000,43202.800
4,146.100000,146.000000,1443.995
...,...,...,...
3397,0.004374,0.004337,3448154.000
3398,0.446000,0.444000,7844.400
3399,4.520000,4.520000,14085.430
3400,0.446000,0.444000,7844.400


In [67]:
import lightgbm as lgb

X, y = train_df[train_cols], train_df['target']

# same random 80/20 split as the other model cells (random_state fixed for comparability)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training part only, so validation statistics do not leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

lgbr = lgb.LGBMRegressor()
lgbr.fit(X_train, y_train)

preds = lgbr.predict(X_val)
# RMSE as sqrt(MSE): the `squared` argument of mean_squared_error is deprecated
# in recent scikit-learn releases, so it is not relied upon here.
mean_squared_error(y_val, preds) ** 0.5

  y = column_or_1d(y, warn=True)


0.18503467367774978

In [110]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# binary label: 1 when the target price is above the open of the signal candle, else 0
X, y = train_df[train_cols], (train_df['target'] > train_df['open']).astype(int)

# same random 80/20 split as the other model cells (random_state fixed for comparability)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training part only, so validation statistics do not leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# class_weight='balanced' re-weights the classes inversely to their frequency
logr = LogisticRegression(C=0.1, max_iter=100000, class_weight='balanced')
logr.fit(X_train, y_train)

preds = logr.predict(X_val)

accuracy_score(y_val, preds)

0.44990892531876137

In [97]:
# Binary label: 1 when the target price is above the open of the signal candle, else 0.
y = (train_df['target'] > train_df['open']).map({True: 1, False: 0})
# Raw (unscaled) feature columns are passed to the gradient-boosting classifier.
X = train_df[train_cols]

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training are chained
lgbc = lgb.LGBMClassifier().fit(X_train, y_train)

preds = lgbc.predict(X_val)
accuracy_score(y_val, preds)

0.6530054644808743