## IMPORTS

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.callbacks import EarlyStopping

In [28]:
import sys
import os

sys.path.append(os.path.abspath(os.path.join('..')))

from feature_engineering.indicators import (
    simple_moving_average,
    momentum,
    cci,
    williams_r,
    exponential_moving_average,
    bollinger_bands,
    macd,
    atr,
    obv
)
from feature_engineering.time_based_features import add_time_based_features

In [29]:
# Read data
data = pd.read_csv('../data/binance/BTC/1h.csv')

In [30]:
# Ensure data is sorted by date
data['Date'] = pd.to_datetime(data['Open Time'])
data.sort_values('Date', inplace=True)

### Add engineered features

In [31]:
# Apply Simple Moving Average (SMA)
data = simple_moving_average(data, 'Close', window=5)
data = simple_moving_average(data, 'Close', window=10)

# Apply Momentum (MTM)
data = momentum(data, 'Close', window=5)
data = momentum(data, 'Close', window=10)

# Apply Exponential Moving Average (EMA)
data = exponential_moving_average(data, 'Close', span=5)
data = exponential_moving_average(data, 'Close', span=10)

# Apply Bollinger Bands (BB)
data = bollinger_bands(data, 'Close', window=20)

# Apply Moving Average Convergence Divergence (MACD)
data = macd(data, 'Close')

# Apply Average True Range (ATR)
data = atr(data, window=14)

# Apply On-Balance Volume (OBV)
data = obv(data)

# Apply Commodity Channel Index (CCI)
data = cci(data, window=20)

# Apply Williams %R
data = williams_r(data, window=14)

# Apply time-based features
data = add_time_based_features(data, 'Date')

In [32]:
data.dropna(inplace=True)

### Create lagged features

In [33]:
for lag in range(1, 6):
    data[f'Close_lag_{lag}'] = data['Close'].shift(lag)
                                                   
data.dropna(inplace=True)
data.head()

Unnamed: 0,Open Time,Open,High,Low,Close,Volume,Close Time,Quote Asset Volume,Number of Trades,Taker Buy Base Asset Volume,...,CCI_20,Williams_%R_14,Day_of_Week,Hour_of_Day,Month_of_Year,Close_lag_1,Close_lag_2,Close_lag_3,Close_lag_4,Close_lag_5
24,2024-09-18 19:00:00,60629.79,60745.99,59987.25,60057.99,3156.74947,2024-09-18 19:59:59.999,190557000.0,753066,1532.69879,...,43.420706,-58.802594,2,19,9,60629.79,60013.01,59915.26,59429.18,59487.6
25,2024-09-18 20:00:00,60057.99,60320.0,59473.68,60230.01,2083.48272,2024-09-18 20:59:59.999,124733900.0,286619,995.87495,...,-12.175255,-50.778524,2,20,9,60057.99,60629.79,60013.01,59915.26,59429.18
26,2024-09-18 21:00:00,60230.01,60496.95,60168.04,60199.46,808.6144,2024-09-18 21:59:59.999,48775470.0,104108,372.10512,...,47.715405,-52.203564,2,21,9,60230.01,60057.99,60629.79,60013.01,59915.26
27,2024-09-18 22:00:00,60199.46,60700.0,60194.0,60684.78,732.26197,2024-09-18 22:59:59.999,44293550.0,105072,401.74056,...,92.793425,-29.565258,2,22,9,60199.46,60230.01,60057.99,60629.79,60013.01
28,2024-09-18 23:00:00,60684.78,61786.24,60680.0,61759.99,2346.43342,2024-09-18 23:59:59.999,143730200.0,252632,1465.86349,...,236.260632,-1.005193,2,23,9,60684.78,60199.46,60230.01,60057.99,60629.79


### Get features and split data

In [34]:
features = data[['Close_lag_1', 'Close_lag_2', 'Close_lag_3', 'Close_lag_4', 'Close_lag_5',
                 'SMA_5', 'SMA_10', 'MTM_5', 'MTM_10', 'EMA_5', 'EMA_10', 'MA', 'UB', 'LB', 'MACD',
                 'Signal_Line', 'ATR_14', 'OBV', 'CCI_20', 'Williams_%R_14', 
                 'Day_of_Week', 'Hour_of_Day', 'Month_of_Year']]
target = data['Close']

In [71]:
scaler_features = StandardScaler()
features_scaled = scaler_features.fit_transform(features)

scaler_target = StandardScaler()
target_scaled = scaler_target.fit_transform(target.values.reshape(-1, 1))

### Create sequences for lstm


In [64]:
def create_sequences(features, target, time_steps=10):
    X, y= [], []
    for i in range(len(features) - time_steps):
        X.append(features[i:i + time_steps])
        y.append(target[i + time_steps])
    return np.array(X), np.array(y)

time_steps = 10
X, y = create_sequences(features_scaled, target_scaled, time_steps)

In [65]:
print(f'X shape: {X.shape}')  # Should be (number_of_samples, time_steps, number_of_features)

X shape: (685, 10, 23)


In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Model creation

In [67]:
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(50, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mean_squared_error')

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

  super().__init__(**kwargs)


In [72]:
# Train and evaluate the model
history = model.fit(X_train, y_train, validation_split=0.2, epochs=100, batch_size=32, callbacks=[early_stopping])

# Evaluate the model
y_pred_scaled = model.predict(X_test)
y_pred = scaler_target.inverse_transform(y_pred_scaled)
y_test_original = scaler_target.inverse_transform(y_test)
mse = mean_squared_error(y_test_original, y_pred)
r2 = r2_score(y_test_original, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Epoch 1/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0272 - val_loss: 0.0213
Epoch 2/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0308 - val_loss: 0.0200
Epoch 3/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0274 - val_loss: 0.0213
Epoch 4/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0293 - val_loss: 0.0200
Epoch 5/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0266 - val_loss: 0.0211
Epoch 6/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0272 - val_loss: 0.0195
Epoch 7/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0285 - val_loss: 0.0271
Epoch 8/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0317 - val_loss: 0.0218
Epoch 9/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━

In [73]:

# Perform cross-validation with TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)
cv_mse = []
for train_index, test_index in tscv.split(X_train):
    X_train_cv, X_test_cv = X_train[train_index], X_train[test_index]
    y_train_cv, y_test_cv = y_train.iloc[train_index], y_train.iloc[test_index]
    
    model_cv = Sequential()
    model_cv.add(Dense(64, input_dim=X_train_cv.shape[1], activation='relu'))
    model_cv.add(Dropout(0.2))
    model_cv.add(Dense(32, activation='relu'))
    model_cv.add(Dropout(0.2))
    model_cv.add(Dense(1))
    
    model_cv.compile(optimizer='adam', loss='mean_squared_error')
    model_cv.fit(X_train_cv, y_train_cv, epochs=100, batch_size=32, verbose=0)
    
    y_pred_cv = model_cv.predict(X_test_cv)
    mse_cv = mean_squared_error(y_test_cv, y_pred_cv)
    cv_mse.append(mse_cv)

print(f'TimeSeriesSplit Cross-Validation MSE: {cv_mse}')
print(f'Mean TimeSeriesSplit Cross-Validation MSE: {np.mean(cv_mse)}')
print(f'Standard Deviation of TimeSeriesSplit Cross-Validation MSE: {np.std(cv_mse)}')

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

### Hyperparameter tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

# Define the model creation function
def create_model(units=50, dropout_rate=0.2, optimizer='adam'):
    model = Sequential()
    model.add(LSTM(units, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(units, return_sequences=False))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1))
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

# Wrap the model using KerasRegressor
model = KerasRegressor(build_fn=create_model, epochs=100, batch_size=32, verbose=0)

# Define the hyperparameter grid
param_dist = {
    'units': [50, 100, 150],
    'dropout_rate': [0.2, 0.3, 0.4],
    'optimizer': ['adam', 'rmsprop'],
    'epochs': [50, 100],
    'batch_size': [32, 64]
}

# Perform Randomized Search
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=10, cv=3, verbose=1, n_jobs=-1)
random_search_result = random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search_result.best_params_
print(f'Best parameters found: {best_params}')

# Train the model with the best parameters
best_model = create_model(units=best_params['units'], dropout_rate=best_params['dropout_rate'], optimizer=best_params['optimizer'])
history = best_model.fit(X_train, y_train, validation_split=0.2, epochs=best_params['epochs'], batch_size=best_params['batch_size'], callbacks=[early_stopping])

# Evaluate the model
y_pred_scaled = best_model.predict(X_test)
y_pred = scaler_target.inverse_transform(y_pred_scaled)
y_test_original = scaler_target.inverse_transform(y_test)

mse = mean_squared_error(y_test_original, y_pred)
r2 = r2_score(y_test_original, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

ModuleNotFoundError: No module named 'keras.wrappers'