## Feature engineering

In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [32]:
import sys
import os

sys.path.append(os.path.abspath(os.path.join('..')))

from feature_engineering.indicators import simple_moving_average, momentum, exponential_moving_average, bollinger_bands, macd

In [33]:
data = pd.read_csv('../data/binance/BTC/1h.csv')

In [34]:
data.head()

Unnamed: 0,Open Time,Open,High,Low,Close,Volume,Close Time,Quote Asset Volume,Number of Trades,Taker Buy Base Asset Volume,Taker Buy Quote Asset Volume,Ignore
0,2024-09-17 19:00:00,60540.17,60696.48,59618.26,59967.99,2112.0532,2024-09-17 19:59:59.999,127061200.0,322417,881.13029,52996740.0,0
1,2024-09-17 20:00:00,59967.98,60342.85,59715.42,60120.0,1665.06151,2024-09-17 20:59:59.999,99969200.0,208296,828.74091,49757090.0,0
2,2024-09-17 21:00:00,60119.99,60424.25,60105.01,60334.07,662.68788,2024-09-17 21:59:59.999,39973260.0,85064,356.50218,21502870.0,0
3,2024-09-17 22:00:00,60334.07,60343.99,60122.5,60240.01,424.37655,2024-09-17 22:59:59.999,25561030.0,94677,173.20763,10431710.0,0
4,2024-09-17 23:00:00,60240.01,60324.0,60005.31,60313.99,489.52738,2024-09-17 23:59:59.999,29457380.0,73556,267.6605,16109520.0,0


In [43]:
# Ensure data is sorted by date
data['Date'] = pd.to_datetime(data['Open Time'])
data.sort_values('Date', inplace=True)

In [None]:
# Apply indicators on data

# Apply Simple Moving Average (SMA)
data = simple_moving_average(data, 'Close', window=5)
data = simple_moving_average(data, 'Close', window=10)

# Apply Momentum (MTM)
data = momentum(data, 'Close', window=5)
data = momentum(data, 'Close', window=10)

# Apply Exponential Moving Average (EMA)
data = exponential_moving_average(data, 'Close', span=5)
data = exponential_moving_average(data, 'Close', span=10)

# Apply Bollinger Bands (BB)
data = bollinger_bands(data, 'Close', window=20)

# Apply Moving Average Convergence Divergence (MACD)
data = macd(data, 'Close')

In [36]:
data.dropna(inplace=True)

## Create Lagged Features

Use past data points as features to predict the next data points

In [37]:
for lag in range(1, 6):
    data[f'Close_lag_{lag}'] = data['Close'].shift(lag)
                                                   
data.dropna(inplace=True)
data.head()

Unnamed: 0,Open Time,Open,High,Low,Close,Volume,Close Time,Quote Asset Volume,Number of Trades,Taker Buy Base Asset Volume,...,LB,EMA_short,EMA_long,MACD,Signal_Line,Close_lag_1,Close_lag_2,Close_lag_3,Close_lag_4,Close_lag_5
24,2024-09-18 19:00:00,60629.79,60745.99,59987.25,60057.99,3156.74947,2024-09-18 19:59:59.999,190557000.0,753066,1532.69879,...,59356.257078,60002.639415,60022.317473,-19.678058,-44.874652,60629.79,60013.01,59915.26,59429.18,59487.6
25,2024-09-18 20:00:00,60057.99,60320.0,59473.68,60230.01,2083.48272,2024-09-18 20:59:59.999,124733900.0,286619,995.87495,...,59356.245615,60037.619505,60037.702105,-0.082599,-35.916242,60057.99,60629.79,60013.01,59915.26,59429.18
26,2024-09-18 21:00:00,60230.01,60496.95,60168.04,60199.46,808.6144,2024-09-18 21:59:59.999,48775470.0,104108,372.10512,...,59358.36826,60062.518043,60049.684171,12.833872,-26.166219,60230.01,60057.99,60629.79,60013.01,59915.26
27,2024-09-18 22:00:00,60199.46,60700.0,60194.0,60684.78,732.26197,2024-09-18 22:59:59.999,44293550.0,105072,401.74056,...,59333.448653,60158.250652,60096.728306,61.522345,-8.628506,60199.46,60230.01,60057.99,60629.79,60013.01
28,2024-09-18 23:00:00,60684.78,61786.24,60680.0,61759.99,2346.43342,2024-09-18 23:59:59.999,143730200.0,252632,1465.86349,...,59100.958361,60404.67209,60219.932876,184.739214,30.045038,60684.78,60199.46,60230.01,60057.99,60629.79


### Split the data into features and target

In [54]:
features = data[['Close_lag_1', 'Close_lag_2', 'Close_lag_3', 'Close_lag_4', 'Close_lag_5',
                 'SMA_5', 'SMA_10', 'MTM_5', 'MTM_10', 'EMA_5', 'EMA_10', 'MA', 'UB', 'LB', 'MACD']]

target = data['Close']

In [55]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

### Preprocessing

In [66]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [60]:
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4]
}

gbr = GradientBoostingRegressor()

grid_search = GridSearchCV(estimator=gbr,
                           param_grid=param_grid,
                           cv=3,
                           n_jobs=-1,
                           scoring='neg_mean_squared_error'
                           )

grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
print(f'Best parameters found: {best_params}')

(556, 15) (556,)
Best parameters found: {'learning_rate': 0.2, 'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 15, 'n_estimators': 200}


In [62]:
best_gbr = GradientBoostingRegressor(**best_params)
best_gbr.fit(X_train_scaled, y_train)

y_pred = best_gbr.predict(X_test_scaled)
print(f'Mean Squared Error: {mean_squared_error(y_test, y_pred)}')
print(f'R^2 Score: {r2_score(y_test, y_pred)}')

Mean Squared Error: 18792.268562718138
R^2 Score: 0.9952920878944008


In [65]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation
cv_scores = cross_val_score(best_gbr, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')

# Convert negative MSE to positive
cv_scores = -cv_scores

print(f'Cross-Validation MSE: {cv_scores}')
print(f'Mean Cross-Validation MSE: {cv_scores.mean()}')
print(f'Standard Deviation of Cross-Validation MSE: {cv_scores.std()}')

Cross-Validation MSE: [44008.60169453 21686.59308828 16541.40055999 38186.55819007
 29709.27571262]
Mean Cross-Validation MSE: 30026.48584909903
Standard Deviation of Cross-Validation MSE: 10134.972154991401


### Try Randomized Search for better performance

In [68]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Define the parameter distribution for Random Search
param_dist = {
    'n_estimators': randint(100, 500),
    'learning_rate': uniform(0.01, 0.1),
    'max_depth': randint(3, 10),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10)
}

# Initialize the model
gbr = GradientBoostingRegressor()

# Initialize Random Search
random_search = RandomizedSearchCV(estimator=gbr, param_distributions=param_dist, n_iter=100, cv=5, n_jobs=-1, scoring='neg_mean_squared_error', random_state=42)

# Perform Random Search
random_search.fit(X_train_scaled, y_train)

# Get the best parameters
best_params = random_search.best_params_
print(f'Best parameters found: {best_params}')

# Train the model with the best parameters
best_gbr = GradientBoostingRegressor(**best_params)
best_gbr.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = best_gbr.predict(X_test_scaled)
print(f'Mean Squared Error: {mean_squared_error(y_test, y_pred)}')
print(f'R^2 Score: {r2_score(y_test, y_pred)}')

# Perform cross-validation with the best model
cv_scores = cross_val_score(best_gbr, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
cv_scores = -cv_scores

print(f'Cross-Validation MSE: {cv_scores}')
print(f'Mean Cross-Validation MSE: {cv_scores.mean()}')
print(f'Standard Deviation of Cross-Validation MSE: {cv_scores.std()}')

Best parameters found: {'learning_rate': 0.10832308858067882, 'max_depth': 3, 'min_samples_leaf': 3, 'min_samples_split': 6, 'n_estimators': 406}
Mean Squared Error: 18403.032458151818
R^2 Score: 0.995389600835029
Cross-Validation MSE: [38859.88436478 24576.33867179 17831.85558992 31109.52345374
 32349.91562338]
Mean Cross-Validation MSE: 28945.503540720685
Standard Deviation of Cross-Validation MSE: 7171.760221358816


### Try TimeSeriesSplit Cross Validation

more approriate because data is time-series

In [69]:
from sklearn.model_selection import TimeSeriesSplit

# Initialize TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

# Perform cross-validation with TimeSeriesSplit
cv_scores = cross_val_score(best_gbr, X_train_scaled, y_train, cv=tscv, scoring='neg_mean_squared_error')
cv_scores = -cv_scores

print(f'TimeSeriesSplit Cross-Validation MSE: {cv_scores}')
print(f'Mean TimeSeriesSplit Cross-Validation MSE: {cv_scores.mean()}')
print(f'Standard Deviation of TimeSeriesSplit Cross-Validation MSE: {cv_scores.std()}')

TimeSeriesSplit Cross-Validation MSE: [67569.66155861 35309.27523852 25446.70918679 31415.55393405
 36317.60239105]
Mean TimeSeriesSplit Cross-Validation MSE: 39211.76046180444
Standard Deviation of TimeSeriesSplit Cross-Validation MSE: 14683.552494543404


### Experiment with Xgboost

### Makes it worse!!

In [72]:
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

# Initialize the XGBoost model
xgbr = xgb.XGBRegressor(objective='reg:squarederror')

# Perform cross-validation with TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)
cv_scores = cross_val_score(xgbr, X_train_scaled, y_train, cv=tscv, scoring='neg_mean_squared_error')
cv_scores = -cv_scores

print(f'TimeSeriesSplit Cross-Validation MSE with XGBoost: {cv_scores}')
print(f'Mean TimeSeriesSplit Cross-Validation MSE with XGBoost: {cv_scores.mean()}')
print(f'Standard Deviation of TimeSeriesSplit Cross-Validation MSE with XGBoost: {cv_scores.std()}')

# Train the XGBoost model with the entire training set
xgbr.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = xgbr.predict(X_test_scaled)
print(f'Mean Squared Error with XGBoost: {mean_squared_error(y_test, y_pred)}')
print(f'R^2 Score with XGBoost: {r2_score(y_test, y_pred)}')

TimeSeriesSplit Cross-Validation MSE with XGBoost: [95311.65321882 47442.60540423 29833.59240377 35519.57954936
 44852.0497799 ]
Mean TimeSeriesSplit Cross-Validation MSE with XGBoost: 50591.896071217256
Standard Deviation of TimeSeriesSplit Cross-Validation MSE with XGBoost: 23241.35298547837
Mean Squared Error with XGBoost: 23399.201432899863
R^2 Score with XGBoost: 0.9941379411793928
