In [None]:
pip install tensorflow scikit-learn numpy scipy scikeras pandas 


In [None]:
# pandas is not imported anywhere else in the notebook, so bring it into scope
# here; without it this cell raises NameError on a fresh kernel.
import pandas as pd

# Load the feature-engineered dataset; later cells read it through `df`.
df = pd.read_parquet("../data/full_dataset_feature_engineering_v2.parquet")

In [None]:
N_TIMESTEPS = 12  # Number of time steps in each input sequence

# N_FEATURES is read from the training tensor, which is only created by the
# data-preparation cell (the one that calls train_test_split). On a fresh
# kernel that cell has not run yet, so fail with an actionable message instead
# of a bare NameError.
try:
    N_FEATURES = X_train.shape[2]  # Number of features in the input data
except NameError as exc:
    raise NameError(
        "X_train is not defined yet: run the data-preparation cell that builds "
        "X_train before this cell (or move this cell below it)."
    ) from exc


In [None]:
# Display-only preview of the frame without the "target" column. drop() returns
# a new DataFrame and the result is not assigned, so `df` itself is unchanged.
df.drop(columns="target")


In [None]:
import pickle
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load the scalers and feature list
feature_scaler = _load_pickle('./pickles/stat_feature_scaler.pkl')
target_scaler = _load_pickle('./pickles/stat_target_scaler.pkl')
selected_features = _load_pickle('./pickles/stat_selected_features.pkl')

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# --- Settings for this cell ---
# Define sequence length (time steps to look back)
sequence_length = 12  # Adjust based on your specific problem
TEST_SIZE = 0.25062   # Fraction of sequences held out; the split is chronological (no shuffle)

X_df = df[selected_features].values
# NOTE(review): the model defined in this notebook is a sigmoid / binary-crossentropy
# classifier wrapped in KerasClassifier. Confirm that 'return_forward' really holds
# binary class labels; a continuous return would need a regressor or a binarised target.
y_df = df['return_forward'].values  # Replace with your target column name
original_indexes = df.index.tolist()

n_sequences = len(X_df) - sequence_length
if n_sequences < 2:
    raise ValueError(
        f"Need at least {sequence_length + 2} rows to build a train and a test "
        f"sequence of length {sequence_length}; got {len(X_df)}."
    )

# Decide the chronological split on sequence positions BEFORE any scaling, so the
# scalers below can be fitted without looking at the test period.
train_positions, _ = train_test_split(
    np.arange(n_sequences), test_size=TEST_SIZE, shuffle=False
)
n_train = len(train_positions)

# The last training sequence (position n_train - 1) reads feature rows up to
# n_train + sequence_length - 2 and its target sits on row n_train + sequence_length - 1,
# so these are exactly the rows the training set is allowed to see.
n_fit_rows = n_train + sequence_length

# Fit on training-period rows only, then apply the same transform to every row.
# Test-period values may therefore fall outside [0, 1]; that is expected.
scaler = MinMaxScaler()
scaler.fit(X_df[:n_fit_rows])
X_scaled = scaler.transform(X_df)

# If your target needs scaling too (for regression problems)
y_scaler = MinMaxScaler()
y_scaler.fit(y_df[:n_fit_rows].reshape(-1, 1))
y_scaled = y_scaler.transform(y_df.reshape(-1, 1))

# Create sequences for LSTM: sample i is rows [i, i + sequence_length) and its
# target / index label come from row i + sequence_length.
X_sequences = np.stack(
    [X_scaled[start:start + sequence_length] for start in range(n_sequences)]
)
y_sequences = y_scaled[sequence_length:]
sequence_indexes = original_indexes[sequence_length:]

# Check the resulting shapes
print(f"X shape: {X_sequences.shape}")  # Should be (samples, sequence_length, num_features)
print(f"y shape: {y_sequences.shape}")  # Should be (samples, 1) or (samples,)

# Split into training and testing sets (first n_train sequences train, the rest test;
# identical to train_test_split(..., shuffle=False) with the same test_size).
X_train, X_test = X_sequences[:n_train], X_sequences[n_train:]
y_train, y_test = y_sequences[:n_train], y_sequences[n_train:]
train_idx, test_idx = sequence_indexes[:n_train], sequence_indexes[n_train:]

print(f"Total sequences: {len(sequence_indexes)}")
print(f"Training sequences: {len(train_idx)}")
print(f"Testing sequences: {len(test_idx)}")


In [None]:
def create_lstm_model(lstm_units=50, dropout_rate=0.2, recurrent_dropout_rate=0.2,
                      learning_rate=0.001, optimizer_name='adam',
                      n_timesteps=N_TIMESTEPS, n_features=N_FEATURES):
    """
    Build and compile a single-layer LSTM network with a sigmoid output unit.

    Args:
        lstm_units: Number of units in the LSTM layer.
        dropout_rate: Dropout applied to the LSTM inputs and again after the LSTM.
        recurrent_dropout_rate: Dropout applied to the LSTM recurrent state.
        learning_rate: Learning rate handed to the optimizer.
        optimizer_name: One of 'adam', 'rmsprop' or 'sgd'.
        n_timesteps: Length of each input sequence.
        n_features: Number of features per time step.

    Returns:
        The compiled Keras model (binary cross-entropy loss, accuracy metric).

    Raises:
        ValueError: If `optimizer_name` is not one of the supported names.
    """
    layers = [
        Input(shape=(n_timesteps, n_features), name="Input_Layer"),
        LSTM(units=lstm_units,
             dropout=dropout_rate,
             recurrent_dropout=recurrent_dropout_rate,
             name="LSTM_Layer"),
        # Optional extra dropout between the recurrent layer and the output
        Dropout(dropout_rate, name="Post_LSTM_Dropout"),
        # Single sigmoid unit for binary classification
        Dense(1, activation='sigmoid', name="Output_Layer"),
    ]
    network = Sequential(layers, name="LSTM_Classifier")

    # Map the optimizer name to its class; the learning rate is applied on construction
    optimizer_classes = {'adam': Adam, 'rmsprop': RMSprop, 'sgd': SGD}
    optimizer_cls = optimizer_classes.get(optimizer_name)
    if optimizer_cls is None:
        raise ValueError(f"Unsupported optimizer: {optimizer_name}")

    # Binary cross-entropy pairs with the sigmoid output above
    network.compile(loss='binary_crossentropy',
                    optimizer=optimizer_cls(learning_rate=learning_rate),
                    metrics=['accuracy'])
    return network


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from tensorflow.keras.callbacks import EarlyStopping

# Use scikeras wrapper
# from tensorflow.keras.wrappers.scikit_learn import KerasClassifier # Old way
from scikeras.wrappers import KerasClassifier

from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import make_scorer, accuracy_score # Or other relevant metrics
from scipy.stats import randint, uniform # For sampling distributions

# Set random seeds for reproducibility. Seed every generator that may be used
# during the search: Python's own `random` module as well as NumPy and TensorFlow.
import random

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)


In [None]:
from scipy.stats import loguniform

# Search space for RandomizedSearchCV. Keys prefixed with `model__` are routed to
# create_lstm_model; `batch_size` and `epochs` are parameters of the wrapper itself.
# scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_distributions = {
    'model__lstm_units': randint(16, 128),              # Number of LSTM units (integer in [16, 127])
    'model__dropout_rate': uniform(0.0, 0.5),           # Dropout rate (float in [0, 0.5])
    'model__recurrent_dropout_rate': uniform(0.0, 0.5), # Recurrent dropout rate (float in [0, 0.5])
    # Learning rates span orders of magnitude, so sample them on a log scale.
    # uniform(0.0001, 0.01) covered [0.0001, 0.0101] linearly, which puts about
    # 90% of the draws above 1e-3 and almost never explores small rates.
    'model__learning_rate': loguniform(1e-4, 1e-2),     # Learning rate (float in [1e-4, 1e-2])
    'batch_size': [32, 64, 128],                        # Batch size for training (categorical/integer)
    'epochs': [20, 50, 80]                              # Number of training epochs (categorical/integer)
}

In [None]:
from sklearn.model_selection import TimeSeriesSplit

# --- Create KerasClassifier wrapper ---
# Pass parameters that are *fixed* during the search but needed by the model function
# Use verbose=0 inside the wrapper to avoid excessive Keras logs during search
keras_estimator = KerasClassifier(
    model=create_lstm_model,
    # Pass fixed params needed by create_lstm_model NOT being tuned here
    model__n_timesteps=N_TIMESTEPS,
    model__n_features=N_FEATURES,
    verbose=0
)

# --- Define Callbacks (Optional but Recommended) ---
# Early stopping prevents overfitting and speeds up search if models converge early
early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitor validation loss
    patience=10,         # Stop after 10 epochs with no improvement
    restore_best_weights=True, # Restore model weights from the epoch with the best value
    verbose=0            # Set to 1 to see when stopping occurs
)

# --- Setup Randomized Search CV ---
N_ITER_SEARCH = 15  # How many parameter combinations to try
CV_FOLDS = 3       # Number of cross-validation folds

# The samples are time-ordered windows and the hold-out split is made without
# shuffling, so cross-validation must respect time order as well. A plain integer
# `cv` yields (Stratified)KFold, which validates on windows that are older than
# part of the training data; TimeSeriesSplit always validates on the block that
# follows the training block.
time_series_cv = TimeSeriesSplit(n_splits=CV_FOLDS)

random_search = RandomizedSearchCV(
    estimator=keras_estimator,
    param_distributions=param_distributions,
    n_iter=N_ITER_SEARCH,                 # Number of parameter settings that are sampled
    cv=time_series_cv,                    # Forward-chaining cross-validation with CV_FOLDS splits
    scoring='accuracy',                   # Metric to optimize (can use custom scorer)
    verbose=2,                            # Higher verbose level shows more info
    n_jobs=1,                             # Use 1 job to avoid potential GPU memory issues
                                          # Set to -1 to use all CPUs, but be careful with GPUs
    random_state=SEED,                    # For reproducible sampling
    error_score='raise'                   # Raise errors during model fitting
)


In [None]:
print("Starting Randomized Search...")

# Keyword arguments given to RandomizedSearchCV.fit are forwarded to the fit call
# of every candidate model: the early-stopping callback, plus a slice of the
# training data held back as the validation set that early stopping monitors.
fit_options = {
    "callbacks": [early_stopping],
    "validation_split": 0.2,
}
search_result = random_search.fit(X_train, y_train, **fit_options)

print("Randomized Search Finished.")


In [None]:
best_accuracy = search_result.best_score_
print(f"\nBest Score (Accuracy): {best_accuracy:.4f}")
print("Best Parameters Found:")
# Strip the 'model__' prefix so the printed names match the model function's arguments
for name in search_result.best_params_:
    shown_name = name.replace('model__', '')
    print(f"- {shown_name}: {search_result.best_params_[name]}")
