In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import joblib
import warnings
warnings.filterwarnings('ignore')

print('Libraries loaded successfully!')

Libraries loaded successfully!


## 2. Load Data

In [3]:
# Read the Heiken Ashi candles from the CSV in the current working directory
df = pd.read_csv('BTCUSD_15m_HA_data.csv')

# Summarise what was loaded: size, schema, covered time span and a preview.
# 'Time' is not parsed to datetime here, so min/max operate on the values as read.
row_count = len(df)
column_names = df.columns.tolist()
time_column = df['Time']

print(f'Data loaded: {row_count} rows')
print(f'Columns: {column_names}')
print(f"Date range: {time_column.min()} to {time_column.max()}")
print(f'\nFirst 5 rows:')
print(df.head())

Data loaded: 95764 rows
Columns: ['Time', 'HA_Open', 'HA_High', 'HA_Low', 'HA_Close', 'Volume']
Date range: 2023-01-01 00:00:00 to 2025-09-24 18:15:00

First 5 rows:
                  Time       HA_Open   HA_High    HA_Low    HA_Close  Volume
0  2023-01-01 00:00:00  16520.510000  16529.87  16508.58  16519.8675       0
1  2023-01-01 00:15:00  16520.188750  16530.87  16506.51  16519.8000       0
2  2023-01-01 00:30:00  16519.994375  16533.31  16502.55  16518.7100       0
3  2023-01-01 00:45:00  16519.352188  16521.36  16505.79  16512.3350       0
4  2023-01-01 01:00:00  16515.843594  16530.62  16511.78  16520.6325       0


## 3. Calculate Heiken Ashi Metrics

In [4]:
# Derive per-candle descriptors from the Heiken Ashi OHLC columns.
ha_close = df['HA_Close']

# Candle body: absolute distance between close and open
df['HA_Body'] = (ha_close - df['HA_Open']).abs()

# Candle range: high minus low
df['HA_Range'] = df['HA_High'].sub(df['HA_Low'])

# Change in close versus the previous bar (first row is NaN)
df['HA_Close_Change'] = ha_close.diff(periods=1)

# Change in close versus three bars earlier (first three rows are NaN)
df['HA_Momentum'] = ha_close.diff(periods=3)

# Standard deviation of the candle range over the trailing 5 bars
df['HA_Volatility'] = df['HA_Range'].rolling(window=5).std()

print('HA metrics calculated')

HA metrics calculated


## 4. K-Means Clustering

In [None]:
from sklearn.cluster import KMeans

# Rolling 1-D K-Means on HA_Close.
#
# For every bar i (from `window` onwards) the model is fitted on the `window`
# closes *preceding* bar i (rows i-window .. i-1), so no information from bar i
# or later is used. Two values are stored at position i:
#   * Cluster         - price rank of the cluster holding the most recent close
#                       in the window (0 = lowest centroid, n_clusters-1 = highest)
#   * Cluster_Density - percentage of the window that falls in that same cluster
#
# NOTE: this performs one K-Means fit per bar (n_bars - window fits in total),
# which is by far the most expensive cell in the notebook.
window = 252
n_clusters = 3
n_bars = len(df)

# Pre-allocate numpy arrays (much faster than .loc in a loop); rows before
# `window` keep NaN because there is not enough history for them.
clusters = np.full(n_bars, np.nan)
densities = np.full(n_bars, np.nan)

# Cache close prices as an (n_bars, 1) array, the 2-D shape KMeans expects
close_prices = df['HA_Close'].values.reshape(-1, 1)

for i in range(window, n_bars):
    # Trailing window, excluding bar i itself
    window_prices = close_prices[i-window:i]

    # Reduced n_init / max_iter and relaxed tol keep each fit cheap;
    # random_state makes every fit reproducible.
    kmeans = KMeans(
        n_clusters=n_clusters,
        n_init=2,
        max_iter=100,
        random_state=42,
        tol=1e-3
    )
    cluster_labels = kmeans.fit_predict(window_prices)

    # Raw label of the most recent close in the window
    current_cluster = cluster_labels[-1]

    # Raw K-Means label ids are arbitrary for each independent fit, so they
    # cannot be compared between windows. Store the rank of the cluster's
    # centroid instead, which has the same meaning in every window.
    centroid_order = np.argsort(kmeans.cluster_centers_.ravel())
    clusters[i] = np.flatnonzero(centroid_order == current_cluster)[0]

    # Share of the window assigned to the same cluster, in percent
    cluster_count = np.sum(cluster_labels == current_cluster)
    densities[i] = (cluster_count / window) * 100

# Assign back to dataframe
df['Cluster'] = clusters
df['Cluster_Density'] = densities

print('K-Means clustering complete')
print(f'Processed {n_bars - window} rolling windows')

KeyboardInterrupt: 

## 5. Consecutive Bars Pattern

In [None]:
def _streak_lengths(flags):
    """Return, for each row, the length of the current run of True values.

    Parameters
    ----------
    flags : pandas.Series of bool
        True where the bar belongs to the streak being counted.

    Returns
    -------
    pandas.Series of int64
        Running count within each unbroken run of True values; 0 wherever
        `flags` is False. Shares the index of `flags`.
    """
    # Every False starts a new group, so the cumulative sum inside a group
    # counts the True values seen since the last reset.
    run_id = (~flags).cumsum()
    return flags.astype('int64').groupby(run_id).cumsum()


# Bar-over-bar change in HA close. The first row is NaN and compares False on
# both sides, so both counters start at 0 there.
close_change = df['HA_Close'].diff()

# Number of consecutive higher closes / lower closes ending at each bar.
# An unchanged close resets both counters. Vectorised, so it does not depend
# on the frame having a default 0..n-1 index and avoids a per-row Python loop.
df['Consecutive_Up'] = _streak_lengths(close_change > 0)
df['Consecutive_Down'] = _streak_lengths(close_change < 0)

print('Consecutive bars calculated')

## 6. Volume Analysis

In [None]:
# Volume-derived features.
# Where a denominator is zero (previous volume for the change, 5-bar mean for
# the ratio) the result is NaN or +/-inf rather than an error.
volume = df['Volume']
trailing_mean_5 = volume.rolling(window=5).mean()

# Relative change in volume versus the previous bar
df['Volume_Change'] = volume.pct_change()
# Mean volume over the trailing 5 bars (NaN for the first 4 rows)
df['Volume_MA_5'] = trailing_mean_5
# Current volume relative to its trailing 5-bar mean
df['Volume_Ratio'] = volume.div(trailing_mean_5)

print('Volume features calculated')

## 7. Create Target Label (10 bars ahead for day trading)

In [None]:
# Target: Predict price direction 10 bars ahead (~2.5 hours on M15)
# This converts from scalping (1-bar) to day trading (2.5+ hours)
shift_bars = 10  # 10 bars * 15min = 2.5 hours

# HA close `shift_bars` rows later, aligned to the current row.
# The final `shift_bars` rows have no future bar and become NaN.
future_close = df['HA_Close'].shift(-shift_bars)

# +1 when the future close is strictly higher than the current close,
# otherwise -1. An unchanged price is therefore labelled bearish.
# Vectorised replacement for a per-row loop; the labels are identical.
df['Target'] = np.where(future_close > df['HA_Close'], 1, -1).astype(np.int64)

# Remove the last rows, which have no valid target (not enough bars ahead).
# .copy() makes the trimmed frame independent of the original.
# NOTE: re-running this cell without re-running the earlier cells trims
# another `shift_bars` rows.
df = df.iloc[:-shift_bars].copy()

print(f'Target distribution (predicting {shift_bars} bars / 2.5 hours ahead):')
print(df['Target'].value_counts())
print(f"\nBullish: {(df['Target'] == 1).sum()} ({(df['Target'] == 1).sum()/len(df)*100:.1f}%)")
print(f"Bearish: {(df['Target'] == -1).sum()} ({(df['Target'] == -1).sum()/len(df)*100:.1f}%)")
print(f"\nThis means:")
print(f"  - Model predicts price direction 2.5 hours into the future")
print(f"  - Average trade holding time will be ~2.5 hours (not 1 bar)")
print(f"  - Day trading style, not scalping")

## 8. Feature Engineering

In [None]:
# Columns fed to the model, in the order they appear in the feature matrix
feature_columns = [
    'HA_Open', 'HA_High', 'HA_Low', 'HA_Close',  # Price
    'HA_Body', 'HA_Range', 'HA_Close_Change',    # HA metrics
    'HA_Momentum', 'HA_Volatility',              # Momentum
    'Cluster_Density',                           # Clustering
    'Consecutive_Up', 'Consecutive_Down',       # Patterns
    'Volume', 'Volume_Change', 'Volume_Ratio'   # Volume
]

# Missing values (rolling/diff warm-up rows, 0/0 volume ratios) become 0.
# Infinities are mapped to NaN first: pct_change after a zero-volume bar
# yields +/-inf, which fillna leaves untouched and StandardScaler rejects.
df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

# Plain numpy arrays for scikit-learn: X is (n_rows, n_features), y is (n_rows,)
X = df[feature_columns].values
y = df['Target'].values

print(f'Features shape: {X.shape}')
print(f'Target shape: {y.shape}')

## 9. Standardize Features

In [None]:
# Fit the scaler on the training portion only, so that the mean/variance of
# the later (test) period does not leak into the features or the saved scaler.
# The row count mirrors what train_test_split(test_size=0.2, shuffle=False)
# produces in the split cell: n_test = ceil(0.2 * n), and the training set is
# the leading n - n_test rows. Keep `test_fraction` in sync with that cell.
test_fraction = 0.2
n_train_rows = len(X) - int(np.ceil(test_fraction * len(X)))

scaler = StandardScaler()
scaler.fit(X[:n_train_rows])

# Apply the training-period statistics to every row (train and test alike)
X_scaled = scaler.transform(X)

# Persist the fitted scaler so the same transform can be reused later
joblib.dump(scaler, 'scaler_randomforest_ha15m.save')
print('Scaler saved: scaler_randomforest_ha15m.save')

## 10. Train-Test Split

In [None]:
# Chronological hold-out: with shuffle=False the first 80% of rows form the
# training set and the final 20% the test set (random_state has no effect
# when shuffling is disabled).
split_parts = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=False,
)
X_train, X_test, y_train, y_test = split_parts

print(f'Training set: {X_train.shape}')
print(f'Test set: {X_test.shape}')

## 11. Train Random Forest Model

In [None]:
# Random Forest hyperparameters, collected in one place for easy tuning
rf_params = dict(
    n_estimators=200,
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features='sqrt',
    random_state=42,          # reproducible forest
    n_jobs=-1,                # use all available cores
    class_weight='balanced',  # reweight classes inversely to their frequency
)
model = RandomForestClassifier(**rf_params)

# Fit on the training split only
print('Training Random Forest...')
model.fit(X_train, y_train)
print('Training complete!')

## 12. Evaluate Model

In [None]:
# Predicted labels for the held-out test split
y_pred = model.predict(X_test)

# Accuracy plus precision / recall / F1; the latter three use the library's
# default positive label and return 0 instead of warning on a zero denominator.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)

print(f'Random Forest Model Performance:')
for metric_label, metric_value in (
    ('Accuracy:', accuracy),
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1-Score:', f1),
):
    # Left-align labels to 10 characters so the values line up in one column
    print(f'  {metric_label:<10} {metric_value:.4f}')
print(f'\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))

## 13. Feature Importance

In [None]:
# Importance score per feature, and feature positions sorted from most to least important
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

# Show at most the ten highest-ranked features
top_n = min(10, len(feature_columns))

print('Feature Importance Ranking:')
for rank, feature_idx in enumerate(indices[:top_n], start=1):
    print(f'{rank}. {feature_columns[feature_idx]}: {importances[feature_idx]:.4f}')

## 14. Save Model

In [None]:
# Serialise the fitted model to disk with joblib
model_path = 'randomforest_ha15m_trend_model.pkl'
joblib.dump(model, model_path)
print(f'Model saved: {model_path}')

## 15. Generate Predictions

In [None]:
# Predict for entire dataset
y_pred_all = model.predict(X_scaled)

print(f'Total predictions: {len(y_pred_all)}')
print(f'Bullish: {(y_pred_all == 1).sum()}')
print(f'Bearish: {(y_pred_all == -1).sum()}')

## 16. Save Forecast CSV

In [None]:
# Pair every timestamp with its predicted direction (1 or -1)
forecast_df = df[['Time']].assign(RF_Prediction=y_pred_all)

# Write the forecast without the index column
forecast_path = 'randomforest_ha15m_forecast.csv'
forecast_df.to_csv(forecast_path, index=False)

print(f'Forecast saved: {forecast_path}')
print(forecast_df.head(10))

## 17. Training Summary

In [None]:
# Final report: model configuration, dataset sizes, test metrics and output files.
# Tree count and depth are read from the fitted model so the report cannot
# drift from the hyperparameters actually used.
print('='*50)
print('RANDOM FOREST TRAINING SUMMARY')
print('='*50)
print(f'\nModel Type: Random Forest ({model.n_estimators} trees, max_depth={model.max_depth})')
print(f'Total Features: {len(feature_columns)}')
print(f'Total Samples: {len(df)}')
print(f'Training Samples: {len(X_train)}')
print(f'Test Samples: {len(X_test)}')
print(f'\nTest Set Performance:')
print(f'  Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)')
print(f'  Precision: {precision:.4f}')
print(f'  Recall:    {recall:.4f}')
print(f'  F1-Score:  {f1:.4f}')
print(f'\nOutput Files:')
print(f'  1. randomforest_ha15m_trend_model.pkl')
print(f'  2. scaler_randomforest_ha15m.save')
print(f'  3. randomforest_ha15m_forecast.csv')
print('='*50)