In [1]:
import pandas as pd
from talib import ADX, ATR, EMA, MA, MACD, ROC, RSI, SAR, SMA, TEMA, WMA

# Unix timestamp (seconds) that separates the training rows (< split) from the
# hold-out rows (>= split).
SPLIT_TIMESTAMP = 1675814400

data = pd.read_csv('data_sample_three_years.csv')

# Keep the first five columns and relabel them.
# NOTE(review): the mapping suggests the CSV header labels are shifted by one
# relative to the values they hold - confirm against the raw file.
data = data.iloc[:, [0, 1, 2, 3, 4]]
data = data.rename(columns={'date': 'open', 'low': 'close', 'open': 'high', 'high': 'low'})

# Human-readable datetime derived from the epoch-seconds column, moved to
# second position in the column order.
data['date'] = pd.to_datetime(data['timestamp'], unit='s')
data = data.iloc[:, [0, 5, 1, 2, 3, 4]]

# Take explicit copies: both frames get new columns assigned below, and writing
# into a bare .loc slice triggers pandas' SettingWithCopyWarning.
data_backup = data.loc[data.timestamp >= SPLIT_TIMESTAMP].copy()
data = data.loc[data.timestamp < SPLIT_TIMESTAMP].copy()

# Technical indicators computed from the training rows only.
data['rsi_14'] = RSI(data['close'], timeperiod=14)

# NOTE(review): 'ma_9' and 'sma_9' may be identical if talib.MA defaults to a
# simple moving average - confirm.
data['ma_9'] = MA(data['close'], timeperiod=9)
data['sma_9'] = SMA(data['close'], timeperiod=9)
data['wma_9'] = WMA(data['close'], timeperiod=9)

data['macd'], data['signal'], data['hist'] = MACD(data['close'])

data['adx'] = ADX(data['high'], data['low'], data['close'])
data['atr'] = ATR(high=data['high'], low=data['low'], close=data['close'], timeperiod=14)
data['sar'] = SAR(high=data['high'], low=data['low'], acceleration=0.02, maximum=0.2)
data['tema'] = TEMA(data['close'], timeperiod=14)
data['roc'] = ROC(data['close'], timeperiod=14)

# Lag every indicator by one row so that row t only carries indicator values
# computed from rows up to t-1.
indicator_columns = ['rsi_14', 'ma_9', 'sma_9', 'wma_9', 'macd', 'signal',
                     'hist', 'adx', 'atr', 'sar', 'tema', 'roc']
data[indicator_columns] = data[indicator_columns].shift(1)

# Drop the warm-up rows where at least one indicator is still NaN.
data = data.dropna(axis=0)

In [2]:
from talib import ADX, ATR, EMA, MA, MACD, ROC, RSI, SAR, SMA, TEMA, WMA


def add_lagged_indicators(frame):
    """Return a copy of ``frame`` with one-row-lagged technical indicators.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must provide 'high', 'low' and 'close' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns 'rsi_14', 'ma_9', 'sma_9', 'wma_9',
        'macd', 'signal', 'hist', 'adx', 'atr', 'sar', 'tema' and 'roc'
        appended, each shifted down by one row, and with every row that
        contains a NaN removed. The input frame is not modified.
    """
    out = frame.copy()

    out['rsi_14'] = RSI(out['close'], timeperiod=14)

    out['ma_9'] = MA(out['close'], timeperiod=9)
    out['sma_9'] = SMA(out['close'], timeperiod=9)
    out['wma_9'] = WMA(out['close'], timeperiod=9)

    out['macd'], out['signal'], out['hist'] = MACD(out['close'])

    out['adx'] = ADX(out['high'], out['low'], out['close'])
    out['atr'] = ATR(high=out['high'], low=out['low'], close=out['close'], timeperiod=14)
    out['sar'] = SAR(high=out['high'], low=out['low'], acceleration=0.02, maximum=0.2)
    out['tema'] = TEMA(out['close'], timeperiod=14)
    out['roc'] = ROC(out['close'], timeperiod=14)

    # Lag every indicator by one row so that row t only carries values computed
    # from rows up to t-1.
    indicator_columns = ['rsi_14', 'ma_9', 'sma_9', 'wma_9', 'macd', 'signal',
                         'hist', 'adx', 'atr', 'sar', 'tema', 'roc']
    out[indicator_columns] = out[indicator_columns].shift(1)

    # Remove the warm-up rows where at least one indicator is still NaN.
    return out.dropna(axis=0)


# Indicators for the hold-out rows are computed from the hold-out rows alone,
# working on an explicit copy instead of writing into a slice of the original
# frame (which triggers pandas' SettingWithCopyWarning).
data_backup = add_lagged_indicators(data_backup)

In [3]:
# Column names used as model inputs, in the order they are fed to the scaler.
best_features_subset = (
    'close',
    'ma_9', 'sma_9',
    'macd', 'signal', 'hist',
    'adx', 'atr', 'sar', 'tema', 'roc',
)

In [5]:
# Feature matrix: the selected input columns of the training frame.
data_input = data.loc[:, list(best_features_subset)]
# Target kept as a one-column DataFrame (not a Series) so it stays 2-D.
data_target = data.loc[:, ['close']]

In [6]:
from sklearn.preprocessing import MinMaxScaler

# One scaler for the feature matrix ...
scaler = MinMaxScaler().fit(data_input)
data_scaled = scaler.transform(data_input)

# ... and a separate one for the target, so predictions can be mapped back
# with scaler_target.inverse_transform on their own.
scaler_target = MinMaxScaler().fit(data_target)
target_scaled = scaler_target.transform(data_target)

In [7]:
# Window length: how many consecutive rows make up one input sequence.
seq_length = 10
# Width of the feature matrix (one entry per selected input column).
num_features = len(data_input.columns)
# Aliases for the scaled arrays consumed by the sequence builder.
X_features, y_target = data_scaled, target_scaled

In [8]:
import numpy as np

# Create input sequences and targets
def create_sequences(features, target, seq_length):
    """Slice aligned feature/target arrays into sliding windows.

    Window ``i`` holds rows ``i .. i + seq_length - 1`` of ``features``; its
    label is row ``i + seq_length`` of ``target`` (the row right after the
    window).

    Parameters
    ----------
    features : array-like
        Rows to be windowed along the first axis.
    target : array-like
        Label rows, indexed with the same row positions as ``features``.
    seq_length : int
        Number of rows per window; must be at least 1.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_seq, y_seq)`` with ``len(features) - seq_length`` windows. When
        there are not enough rows for a single window, both arrays are empty
        but keep their trailing dimensions, i.e. ``X_seq`` has shape
        ``(0, seq_length, ...)`` rather than ``(0,)``.

    Raises
    ------
    ValueError
        If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be a positive integer, got {seq_length}")

    features = np.asarray(features)
    target = np.asarray(target)

    n_windows = len(features) - seq_length
    if n_windows <= 0:
        # Too few rows: return empty arrays whose rank matches the non-empty
        # case so that downstream shape handling keeps working.
        empty_X = np.empty((0, seq_length) + features.shape[1:], dtype=features.dtype)
        empty_y = np.empty((0,) + target.shape[1:], dtype=target.dtype)
        return empty_X, empty_y

    X_seq = np.stack([features[start:start + seq_length] for start in range(n_windows)])
    y_seq = np.array([target[start + seq_length] for start in range(n_windows)])
    return X_seq, y_seq

In [9]:
# Cut the scaled training arrays into overlapping windows plus next-row labels.
X_seq, y_seq = create_sequences(X_features, y_target, seq_length)

# Enforce the (samples, seq_length, num_features) layout declared as the
# LSTM layer's input_shape.
X_seq = np.reshape(X_seq, (X_seq.shape[0], seq_length, num_features))

In [10]:
# Hyper-parameter grids explored by the search below.
NEURONS = [
    25,
    50,
    100,
]  # LSTM layer sizes
OPTIMIZERS = [
    'adam',
    'rmsprop',
]  # optimizer names passed to model.compile
BATCH_SIZES = [
    25,
    32,
    45,
]  # batch sizes passed to model.fit

In [11]:
import itertools

# Cartesian product of the three grids; each entry is a
# (neurons, optimizer, batch_size) tuple.
param_combinations = [*itertools.product(NEURONS, OPTIMIZERS, BATCH_SIZES)]
total_combinations = len(param_combinations)
print(f"param_combinations: {total_combinations} different options.")

param_combinations: 18 different options.


In [13]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error, accuracy_score

# Best hyper-parameter tuple found so far and its hold-out MSE.
# (The name `best_features` is kept for compatibility; it stores a
# (neurons, optimizer, batch_size) tuple, not a feature list.)
best_features = None
best_score = float('inf')

# The hold-out preprocessing does not depend on the hyper-parameters, so it is
# done once here instead of once per grid point.
db = data_backup.copy()
data_back_scaled = scaler.transform(db[list(best_features_subset)])
target_back_scaled = scaler_target.transform(db[['close']])
X_b, y_b = create_sequences(data_back_scaled, target_back_scaled, seq_length)
X_b = X_b.reshape(X_b.shape[0], seq_length, num_features)

# The first `seq_length` hold-out rows only serve as input context and have no
# prediction, so the ground truth starts at row `seq_length` (previously a
# hard-coded 10 that would silently misalign if seq_length changed).
y_true = db[['close']].iloc[seq_length:]

# NOTE(review): models are ranked on the hold-out set itself, so the winning
# score is an optimistic estimate - consider a separate validation split.
for i, params in enumerate(param_combinations):

    print(i)

    neurons, optimizer, batch_size = params

    # Build LSTM model
    model = Sequential([
        LSTM(neurons, input_shape=(seq_length, num_features)),
        Dense(1)
    ])

    # Compile the model
    model.compile(optimizer=optimizer, loss='mean_squared_error')

    # Adding early stopping to prevent overfitting
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Training the model with early stopping
    model.fit(X_seq, y_seq, epochs=100, batch_size=batch_size, validation_split=0.2, callbacks=[early_stopping])

    # Predict on the hold-out windows and map back to price units.
    y_pred = model.predict(X_b)
    y_pred = scaler_target.inverse_transform(y_pred)

    # scikit-learn's documented argument order is (y_true, y_pred).
    score = mean_squared_error(y_true, y_pred)
    print(f'MSE for {params} is {score}')
    with open('lstm-5_2-log.txt', 'a+') as file:
        file.write(f'MSE for {params} is {score}\n')

    # Keep the hyper-parameter combination with the lowest hold-out MSE
    if score < best_score:
        best_score = score
        best_features = params

print("Best feature combination:", best_features)
print("Best MSE:", best_score)

0
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
MSE for (25, 'adam', 25) is 108.66778513941834
1
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch