## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import joblib
import warnings
warnings.filterwarnings('ignore')

print('Libraries loaded successfully!')
print(f'TensorFlow version: {tf.__version__}')
print(f"GPU Available: {tf.config.list_physical_devices('GPU')}")

2025-12-12 02:02:01.802247: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-12 02:02:02.564348: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX512_FP16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-12 02:02:04.567017: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


Libraries loaded successfully!
TensorFlow version: 2.20.0
GPU Available: []


2025-12-12 02:02:05.507895: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


## 2. Load Data

In [2]:
# Load Heiken Ashi data
df = pd.read_csv('BTCUSD_15m_HA_data.csv')

print(f'Data loaded: {len(df)} rows')
print(f'Columns: {df.columns.tolist()}')
print(f"Date range: {df['Time'].min()} to {df['Time'].max()}")
print(f'\nFirst 5 rows:')
print(df.head())
print(f'\nData info:')
print(df.info())

Data loaded: 95764 rows
Columns: ['Time', 'HA_Open', 'HA_High', 'HA_Low', 'HA_Close', 'Volume']
Date range: 2023-01-01 00:00:00 to 2025-09-24 18:15:00

First 5 rows:
                  Time       HA_Open   HA_High    HA_Low    HA_Close  Volume
0  2023-01-01 00:00:00  16520.510000  16529.87  16508.58  16519.8675       0
1  2023-01-01 00:15:00  16520.188750  16530.87  16506.51  16519.8000       0
2  2023-01-01 00:30:00  16519.994375  16533.31  16502.55  16518.7100       0
3  2023-01-01 00:45:00  16519.352188  16521.36  16505.79  16512.3350       0
4  2023-01-01 01:00:00  16515.843594  16530.62  16511.78  16520.6325       0

Data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95764 entries, 0 to 95763
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Time      95764 non-null  object 
 1   HA_Open   95764 non-null  float64
 2   HA_High   95764 non-null  float64
 3   HA_Low    95764 non-null  float64
 4   HA_Close  957

## 3. Calculate Heiken Ashi Metrics

In [3]:
# HA body size
df['HA_Body'] = abs(df['HA_Close'] - df['HA_Open'])

# HA range
df['HA_Range'] = df['HA_High'] - df['HA_Low']

# HA close change
df['HA_Close_Change'] = df['HA_Close'].diff()

# HA momentum
df['HA_Momentum'] = df['HA_Close'].diff(3)

# HA volatility
df['HA_Volatility'] = df['HA_Range'].rolling(5).std()

print('HA metrics calculated:')
print(df[['HA_Body', 'HA_Range', 'HA_Close_Change', 'HA_Momentum', 'HA_Volatility']].head())

HA metrics calculated:
    HA_Body  HA_Range  HA_Close_Change  HA_Momentum  HA_Volatility
0  0.642500     21.29              NaN          NaN            NaN
1  0.388750     24.36          -0.0675          NaN            NaN
2  1.284375     30.76          -1.0900          NaN            NaN
3  7.017188     15.57          -6.3750      -7.5325            NaN
4  4.788906     18.84           8.2975       0.8325       5.788077


## 4. K-Means Clustering (Python Implementation)

In [4]:
from sklearn.cluster import KMeans

# Use 252 bars for clustering (like in MT5 EA)
window = 252
df['Cluster'] = np.nan
df['Cluster_Density'] = np.nan

for i in range(window, len(df)):
    close_prices = df['HA_Close'].iloc[i-window:i].values.reshape(-1, 1)
    
    # Fit K-means with 3 clusters
    kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
    clusters = kmeans.fit_predict(close_prices)
    
    # Get current bar cluster
    current_cluster = clusters[-1]
    df.loc[i, 'Cluster'] = current_cluster
    
    # Calculate cluster density
    cluster_count = np.sum(clusters == current_cluster)
    density = (cluster_count / window) * 100
    df.loc[i, 'Cluster_Density'] = density

print('K-Means clustering complete')
print(df[['Cluster', 'Cluster_Density']].tail(10))

K-Means clustering complete
       Cluster  Cluster_Density
95754      2.0        60.714286
95755      2.0        60.317460
95756      1.0        60.714286
95757      2.0         8.730159
95758      2.0         8.730159
95759      2.0         9.126984
95760      2.0         9.126984
95761      2.0         9.126984
95762      2.0         9.126984
95763      2.0         9.126984


## 5. Consecutive Bars Pattern Detection

In [5]:
# Track consecutive up/down bars
df['Consecutive_Up'] = 0
df['Consecutive_Down'] = 0

up_count = 0
down_count = 0

for i in range(1, len(df)):
    if df['HA_Close'].iloc[i] > df['HA_Close'].iloc[i-1]:
        up_count += 1
        down_count = 0
    elif df['HA_Close'].iloc[i] < df['HA_Close'].iloc[i-1]:
        down_count += 1
        up_count = 0
    else:
        up_count = 0
        down_count = 0
    
    df.loc[i, 'Consecutive_Up'] = up_count
    df.loc[i, 'Consecutive_Down'] = down_count

print('Consecutive bars pattern calculated')
print(df[['HA_Close', 'Consecutive_Up', 'Consecutive_Down']].tail(10))

Consecutive bars pattern calculated
          HA_Close  Consecutive_Up  Consecutive_Down
95754  113351.3650               0                 2
95755  113469.1875               1                 0
95756  113655.6350               2                 0
95757  113728.4325               3                 0
95758  113818.1150               4                 0
95759  113836.1325               5                 0
95760  113878.8700               6                 0
95761  113815.7675               0                 1
95762  113728.6775               0                 2
95763  113647.4925               0                 3


## 6. Volume Analysis

In [6]:
# Volume change
df['Volume_Change'] = df['Volume'].pct_change()

# Volume MA
df['Volume_MA_5'] = df['Volume'].rolling(5).mean()

# Volume ratio
df['Volume_Ratio'] = df['Volume'] / df['Volume_MA_5']

print('Volume features calculated')
print(df[['Volume', 'Volume_Change', 'Volume_Ratio']].tail(10))

Volume features calculated
       Volume  Volume_Change  Volume_Ratio
95754       0            NaN           NaN
95755       0            NaN           NaN
95756       0            NaN           NaN
95757       0            NaN           NaN
95758       0            NaN           NaN
95759       0            NaN           NaN
95760       0            NaN           NaN
95761       0            NaN           NaN
95762       0            NaN           NaN
95763       0            NaN           NaN


## 7. Create Target Label (±1 Direction)

In [None]:
# Target: Predict price direction 10 bars ahead (~2.5 hours on M15)
# This converts from scalping (1-bar) to day trading (2.5+ hours)
shift_bars = 10  # 10 bars * 15min = 2.5 hours

df['Target'] = 0
for i in range(len(df) - shift_bars):
    if df['HA_Close'].iloc[i + shift_bars] > df['HA_Close'].iloc[i]:
        df.loc[i, 'Target'] = 1  # Bullish (price higher in 2.5 hours)
    else:
        df.loc[i, 'Target'] = -1  # Bearish (price lower in 2.5 hours)

# Remove last 10 rows (no target, not enough bars ahead)
df = df[:-shift_bars]

print(f'Target distribution (predicting {shift_bars} bars / 2.5 hours ahead):')
print(df['Target'].value_counts())
print(f"Bullish: {(df['Target'] == 1).sum()} ({(df['Target'] == 1).sum()/len(df)*100:.1f}%)")
print(f"Bearish: {(df['Target'] == -1).sum()} ({(df['Target'] == -1).sum()/len(df)*100:.1f}%)")


NameError: name 'df' is not defined

## 8. Feature Engineering

In [8]:
# Select features for LSTM
feature_columns = [
    'HA_Open', 'HA_High', 'HA_Low', 'HA_Close',  # Price
    'HA_Body', 'HA_Range', 'HA_Close_Change',    # HA metrics
    'HA_Momentum', 'HA_Volatility',              # Momentum
    'Cluster_Density',                           # Clustering
    'Consecutive_Up', 'Consecutive_Down',       # Patterns
    'Volume', 'Volume_Change', 'Volume_Ratio'   # Volume
]

# Handle NaN values
df = df.fillna(0)

X = df[feature_columns].values
y = df['Target'].values

print(f'Features shape: {X.shape}')
print(f'Target shape: {y.shape}')
print(f'Feature columns: {feature_columns}')

Features shape: (95763, 15)
Target shape: (95763,)
Feature columns: ['HA_Open', 'HA_High', 'HA_Low', 'HA_Close', 'HA_Body', 'HA_Range', 'HA_Close_Change', 'HA_Momentum', 'HA_Volatility', 'Cluster_Density', 'Consecutive_Up', 'Consecutive_Down', 'Volume', 'Volume_Change', 'Volume_Ratio']


## 9. Standardize Features

In [9]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f'Scaled X shape: {X_scaled.shape}')
print(f'Mean of scaled features: {X_scaled.mean(axis=0)[:5]}')
print(f'Std of scaled features: {X_scaled.std(axis=0)[:5]}')

# Save scaler
joblib.dump(scaler, 'scaler_lstm_ha15m.save')
print('\nScaler saved: scaler_lstm_ha15m.save')

Scaled X shape: (95763, 15)
Mean of scaled features: [ 1.89946995e-17  1.89946995e-17  1.89946995e-17  2.08941694e-16
 -1.06845184e-16]
Std of scaled features: [1. 1. 1. 1. 1.]

Scaler saved: scaler_lstm_ha15m.save


## 10. Create Sequences for LSTM

In [10]:
# Create sequences: use past 5 bars to predict next bar
sequence_length = 5

X_seq = []
y_seq = []

for i in range(len(X_scaled) - sequence_length):
    X_seq.append(X_scaled[i:i+sequence_length])
    y_seq.append(y[i+sequence_length])

X_seq = np.array(X_seq)
y_seq = np.array(y_seq)

print(f'Sequence X shape: {X_seq.shape}')  # (samples, 5, features)
print(f'Sequence y shape: {y_seq.shape}')
print(f'Total sequences: {len(X_seq)}')

Sequence X shape: (95758, 5, 15)
Sequence y shape: (95758,)
Total sequences: 95758


## 11. Train-Test Split

In [19]:
# Split 80/20
X_train, X_test, y_train, y_test = train_test_split(
    X_seq, y_seq, test_size=0.2, random_state=42, shuffle=False
)

print(f'Training set: {X_train.shape} with {len(y_train)} samples')
print(f'Test set: {X_test.shape} with {len(y_test)} samples')
print(f'\nTraining target distribution:')
print(f'  Bullish: {(y_train == 1).sum()}')
print(f'  Bearish: {(y_train == -1).sum()}')

Training set: (76606, 5, 15) with 76606 samples
Test set: (19152, 5, 15) with 19152 samples

Training target distribution:
  Bullish: 38579
  Bearish: 38027


## 12. Build LSTM Model

In [20]:
model = Sequential([
    LSTM(64, activation='relu', return_sequences=True, input_shape=(sequence_length, len(feature_columns))),
    Dropout(0.2),
    LSTM(32, activation='relu'),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dense(1, activation='tanh')  # Output: -1 to 1
])

model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae']
)

print('LSTM Model Architecture:')
model.summary()

LSTM Model Architecture:


## 13. Train Model

In [21]:
# Early stopping to prevent overfitting
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

# Train
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)

print(f"Training completed in {len(history.history['loss'])} epochs")

Epoch 1/100


[1m1916/1916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 5ms/step - loss: 0.9992 - mae: 0.9983 - val_loss: 1.0006 - val_mae: 0.9950
Epoch 2/100
[1m1916/1916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - loss: 0.9972 - mae: 0.9966 - val_loss: 0.9981 - val_mae: 0.9960
Epoch 3/100
[1m1916/1916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - loss: 0.9960 - mae: 0.9957 - val_loss: 1.0027 - val_mae: 0.9948
Epoch 4/100
[1m1916/1916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - loss: 0.9954 - mae: 0.9950 - val_loss: 1.0221 - val_mae: 0.9946
Epoch 5/100
[1m1916/1916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - loss: 0.9952 - mae: 0.9944 - val_loss: 1.0036 - val_mae: 0.9965
Epoch 6/100
[1m1916/1916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - loss: 0.9947 - mae: 0.9943 - val_loss: 1.0070 - val_mae: 0.9967
Epoch 7/100
[1m1916/1916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s

## 14. Evaluate Model

In [22]:
# Get predictions
y_pred_prob = model.predict(X_test, verbose=0)
y_pred = np.where(y_pred_prob > 0, 1, -1).flatten()

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

print(f'LSTM Model Performance:')
print(f'  Accuracy:  {accuracy:.4f}')
print(f'  Precision: {precision:.4f}')
print(f'  Recall:    {recall:.4f}')
print(f'  F1-Score:  {f1:.4f}')
print(f'\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))

LSTM Model Performance:
  Accuracy:  0.5082
  Precision: 0.5094
  Recall:    0.6016
  F1-Score:  0.5517

Confusion Matrix:
[[3938 5581]
 [3838 5795]]


## 15. Save Model

In [23]:
model.save('lstm_ha15m_trend_model.h5')
print('Model saved: lstm_ha15m_trend_model.h5')



Model saved: lstm_ha15m_trend_model.h5


## 16. Generate Forecast CSV

In [26]:
# Generate predictions for entire dataset (vectorized - much faster!)
all_predictions = np.zeros(len(df))

# First sequence_length samples cannot be predicted
for i in range(sequence_length):
    all_predictions[i] = 0

# Batch predict all sequences at once (100x faster than loop)
print('Generating batch predictions...')
batch_preds = model.predict(X_seq, verbose=0)
all_predictions[sequence_length:] = np.where(batch_preds.flatten() > 0, 1, -1)

print(f'Total predictions: {len(all_predictions)}')
print(f'DataFrame length: {len(df)}')
print(f'Bullish: {(all_predictions == 1).sum()}')
print(f'Bearish: {(all_predictions == -1).sum()}')
print(f'Neutral: {(all_predictions == 0).sum()}')

Generating batch predictions...


Total predictions: 95763
DataFrame length: 95763
Bullish: 55501
Bearish: 40257
Neutral: 5


## 17. Save Forecast CSV

In [27]:
# Create forecast dataframe
forecast_df = pd.DataFrame({
    'Time': df['Time'].values,
    'LSTM_Prediction': all_predictions
})

# Save
forecast_df.to_csv('lstm_ha15m_forecast.csv', index=False)

print(f'Forecast saved: lstm_ha15m_forecast.csv')
print(f'Shape: {forecast_df.shape}')
print(f'\nFirst 10 rows:')
print(forecast_df.head(10))
print(f'\nLast 10 rows:')
print(forecast_df.tail(10))

Forecast saved: lstm_ha15m_forecast.csv
Shape: (95763, 2)

First 10 rows:
                  Time  LSTM_Prediction
0  2023-01-01 00:00:00              0.0
1  2023-01-01 00:15:00              0.0
2  2023-01-01 00:30:00              0.0
3  2023-01-01 00:45:00              0.0
4  2023-01-01 01:00:00              0.0
5  2023-01-01 01:15:00             -1.0
6  2023-01-01 01:30:00             -1.0
7  2023-01-01 01:45:00              1.0
8  2023-01-01 02:00:00             -1.0
9  2023-01-01 02:15:00             -1.0

Last 10 rows:
                      Time  LSTM_Prediction
95753  2025-09-24 15:45:00             -1.0
95754  2025-09-24 16:00:00              1.0
95755  2025-09-24 16:15:00              1.0
95756  2025-09-24 16:30:00              1.0
95757  2025-09-24 16:45:00             -1.0
95758  2025-09-24 17:00:00             -1.0
95759  2025-09-24 17:15:00             -1.0
95760  2025-09-24 17:30:00             -1.0
95761  2025-09-24 17:45:00             -1.0
95762  2025-09-24 18:00:00     

## 18. Training Summary

In [28]:
print('='*50)
print('LSTM TRAINING SUMMARY')
print('='*50)
print(f'\nModel Type: LSTM (2-layer with Dropout)')
print(f'Input Sequence Length: {sequence_length} bars')
print(f'Total Features: {len(feature_columns)}')
print(f'Total Samples: {len(df)}')
print(f'Training Samples: {len(X_train)}')
print(f'Test Samples: {len(X_test)}')
print(f'\nTest Set Performance:')
print(f'  Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)')
print(f'  Precision: {precision:.4f}')
print(f'  Recall:    {recall:.4f}')
print(f'  F1-Score:  {f1:.4f}')
print(f'\nOutput Files:')
print(f'  1. lstm_ha15m_trend_model.h5')
print(f'  2. scaler_lstm_ha15m.save')
print(f'  3. lstm_ha15m_forecast.csv')
print('='*50)

LSTM TRAINING SUMMARY

Model Type: LSTM (2-layer with Dropout)
Input Sequence Length: 5 bars
Total Features: 15
Total Samples: 95763
Training Samples: 76606
Test Samples: 19152

Test Set Performance:
  Accuracy:  0.5082 (50.82%)
  Precision: 0.5094
  Recall:    0.6016
  F1-Score:  0.5517

Output Files:
  1. lstm_ha15m_trend_model.h5
  2. scaler_lstm_ha15m.save
  3. lstm_ha15m_forecast.csv
