## Feature engineering

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [8]:
import sys
import os

# Put the parent directory of the notebook's working directory on the import
# path so the sibling `feature_engineering` package can be imported below.
sys.path.append(os.path.abspath('..'))

from feature_engineering.indicators import (
    simple_moving_average,
    momentum,
    exponential_moving_average,
    bollinger_bands,
    macd,
)

In [9]:
# Load the BTC 1h candle export (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer='../data/binance/BTC/1h.csv')

In [10]:
# Enrich `data` with technical indicators derived from the 'Close' column.
# Each helper takes the frame and returns the frame that is carried forward.

# Simple Moving Average (SMA) for the 5- and 10-row windows
for window in (5, 10):
    data = simple_moving_average(data, 'Close', window=window)

# Momentum (MTM) for the same two windows
for window in (5, 10):
    data = momentum(data, 'Close', window=window)

# Exponential Moving Average (EMA) with spans 5 and 10
for span in (5, 10):
    data = exponential_moving_average(data, 'Close', span=span)

# Bollinger Bands (BB) over a 20-row window
data = bollinger_bands(data, 'Close', window=20)

# Moving Average Convergence Divergence (MACD) with the helper's default settings
data = macd(data, 'Close')

In [15]:
# Drop every row that contains a NaN in any column (mutates `data` in place).
# NOTE(review): presumably this removes the indicator warm-up rows; the preview
# that follows starts at index 19 -- confirm no other columns carry NaNs.
data.dropna(inplace=True)

In [16]:
# Preview the first five remaining rows.
data.head(n=5)

Unnamed: 0,Open Time,Open,High,Low,Close,Volume,Close Time,Quote Asset Volume,Number of Trades,Taker Buy Base Asset Volume,...,EMA_5,EMA_10,MA,STD,UB,LB,EMA_short,EMA_long,MACD,Signal_Line
19,2024-09-18 14:00:00,59508.5,59776.0,59340.0,59487.6,1376.1409,2024-09-18 14:59:59.999,81907440.0,269176,699.97346,...,59708.750622,59875.94743,60116.288,307.360626,60731.009252,59501.566748,59915.197065,60014.784935,-99.58787,-6.340387
20,2024-09-18 15:00:00,59487.61,59757.06,59320.0,59429.18,991.84446,2024-09-18 15:59:59.999,59034820.0,194340,465.73532,...,59615.560415,59794.716988,60089.3475,342.632981,60774.613462,59404.081538,59840.425209,59971.406791,-130.981582,-31.268626
21,2024-09-18 16:00:00,59429.19,59954.0,59174.8,59915.26,1247.00154,2024-09-18 16:59:59.999,74311200.0,232644,643.03586,...,59715.460276,59816.6339,60079.1105,344.721153,60768.552805,59389.668195,59851.938254,59967.24777,-115.309516,-48.076804
22,2024-09-18 17:00:00,59915.26,60247.69,59485.45,60013.01,1472.53323,2024-09-18 17:59:59.999,88200860.0,225803,816.54712,...,59814.643518,59852.338645,60063.0575,339.661731,60742.380963,59383.734037,59876.718522,59970.637564,-93.919042,-57.245252
23,2024-09-18 18:00:00,60013.02,61318.6,59759.84,60629.79,8768.74537,2024-09-18 18:59:59.999,531606400.0,1046947,4692.90527,...,60086.359012,59993.693437,60082.5465,360.869481,60804.285461,59360.807539,59992.575673,60019.463671,-26.887998,-51.173801


## Create Lagged Features

Use past data points as features to predict the next data point: the five previous `Close` values are added as lagged columns (`Close_lag_1` … `Close_lag_5`).

In [19]:
# Add the previous five 'Close' values as columns Close_lag_1 ... Close_lag_5.
for lag in range(1, 6):
    data[f'Close_lag_{lag}'] = data['Close'].shift(periods=lag)

# Shifting leaves NaNs in the leading rows; remove them in place, then preview.
data.dropna(inplace=True)
data.head()

Unnamed: 0,Open Time,Open,High,Low,Close,Volume,Close Time,Quote Asset Volume,Number of Trades,Taker Buy Base Asset Volume,...,LB,EMA_short,EMA_long,MACD,Signal_Line,Close_lag_1,Close_lag_2,Close_lag_3,Close_lag_4,Close_lag_5
24,2024-09-18 19:00:00,60629.79,60745.99,59987.25,60057.99,3156.74947,2024-09-18 19:59:59.999,190557000.0,753066,1532.69879,...,59356.257078,60002.639415,60022.317473,-19.678058,-44.874652,60629.79,60013.01,59915.26,59429.18,59487.6
25,2024-09-18 20:00:00,60057.99,60320.0,59473.68,60230.01,2083.48272,2024-09-18 20:59:59.999,124733900.0,286619,995.87495,...,59356.245615,60037.619505,60037.702105,-0.082599,-35.916242,60057.99,60629.79,60013.01,59915.26,59429.18
26,2024-09-18 21:00:00,60230.01,60496.95,60168.04,60199.46,808.6144,2024-09-18 21:59:59.999,48775470.0,104108,372.10512,...,59358.36826,60062.518043,60049.684171,12.833872,-26.166219,60230.01,60057.99,60629.79,60013.01,59915.26
27,2024-09-18 22:00:00,60199.46,60700.0,60194.0,60684.78,732.26197,2024-09-18 22:59:59.999,44293550.0,105072,401.74056,...,59333.448653,60158.250652,60096.728306,61.522345,-8.628506,60199.46,60230.01,60057.99,60629.79,60013.01
28,2024-09-18 23:00:00,60684.78,61786.24,60680.0,61759.99,2346.43342,2024-09-18 23:59:59.999,143730200.0,252632,1465.86349,...,59100.958361,60404.67209,60219.932876,184.739214,30.045038,60684.78,60199.46,60230.01,60057.99,60629.79


### Split the data into features and target

In [20]:
# Lagged closes come from earlier rows only, so they are safe to use as-is.
lag_columns = ['Close_lag_1', 'Close_lag_2', 'Close_lag_3', 'Close_lag_4', 'Close_lag_5']

# Indicator columns are derived from 'Close'. The EMA_5 values displayed above are
# consistent with an EMA that includes the current row's Close, i.e. the very value
# used as the target; the remaining indicators are assumed to behave the same way
# (TODO confirm in feature_engineering.indicators). Using them unshifted would leak
# the target into the features.
indicator_columns = ['SMA_5', 'SMA_10', 'MTM_5', 'MTM_10', 'EMA_5', 'EMA_10', 'MA', 'UB', 'LB', 'MACD']

# Column order is unchanged: lags first, then indicators.
features = data[lag_columns + indicator_columns].copy()

# Shift indicators down one row so each row only sees values available before
# that row's close (assumes rows are consecutive and in chronological order).
features[indicator_columns] = features[indicator_columns].shift(1)

# The shift leaves the first row without indicator values; drop it and select the
# target by the surviving index so features and target stay aligned row for row.
features = features.dropna()
target = data.loc[features.index, 'Close']

In [23]:
# Split the data into training and testing sets.
# The rows form an ordered time series, so the split must be chronological:
# a shuffled split would place later rows in the training set and earlier rows
# in the test set, letting the model see the future and inflating test scores.
# With shuffle=False the first 80% of rows train and the last 20% test.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, shuffle=False)