# Stage 2 Training - Download Stage 1 from HuggingFace

This notebook:
1. Downloads Stage 1 classification model from HuggingFace
2. Prepares Stage 2 training data with correct input shape
3. Trains Stage 2 model (HH/LH/HL/LL classification)
4. Evaluates performance

Working with: **BTCUSDT 15m** (single-symbol test).
It can be extended to all 22 symbols.

## Step 0: Setup & Configuration

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import pickle
import json
from pathlib import Path
from typing import Tuple
import warnings
warnings.filterwarnings('ignore')

# Put the parent directory on sys.path so the `src.*` imports in later cells resolve
sys.path.insert(0, os.path.abspath('../'))

# ---- Target market ----------------------------------------------------------
SYMBOL = 'BTCUSDT'
TIMEFRAME = '15m'
SYMBOL_SHORT = 'BTC'

# ---- HuggingFace locations (the dataset repo also hosts the Stage 1 model) ---
HF_DATASET_ID = 'zongowo111/v2-crypto-ohlcv-data'
HF_MODEL_PATH = f'v1_model/{SYMBOL}/{TIMEFRAME}'
HF_DATA_PATH = f'klines/{SYMBOL}/{SYMBOL_SHORT}_{TIMEFRAME}.parquet'

# ---- Stage 1 input window ----------------------------------------------------
# Defaults only: Step 2 overwrites both from the loaded model's input shape.
STAGE1_SEQUENCE_LENGTH = 10  # bars per window
STAGE1_NUM_FEATURES = 20     # features per bar

# ---- Local directory layout --------------------------------------------------
DATA_DIR = Path('../data')
MODEL_DIR = Path('../models')
STAGE2_DATA_DIR = DATA_DIR.joinpath('stage2', f'{SYMBOL.lower()}_{TIMEFRAME}')
STAGE1_MODEL_DIR = MODEL_DIR.joinpath('stage1', f'{SYMBOL.lower()}_{TIMEFRAME}')
STAGE2_MODEL_DIR = MODEL_DIR.joinpath('stage2', f'{SYMBOL.lower()}_{TIMEFRAME}')

# Make sure every output directory exists before later cells write into it
for _output_dir in (STAGE2_DATA_DIR, STAGE1_MODEL_DIR, STAGE2_MODEL_DIR):
    _output_dir.mkdir(parents=True, exist_ok=True)

# Echo the effective configuration, one setting per line
print(
    f'Symbol: {SYMBOL}',
    f'Timeframe: {TIMEFRAME}',
    f'Stage 1 Input Shape: ({STAGE1_SEQUENCE_LENGTH}, {STAGE1_NUM_FEATURES})',
    f'Stage 1 Model Dir: {STAGE1_MODEL_DIR}',
    f'Stage 2 Data Dir: {STAGE2_DATA_DIR}',
    f'Stage 2 Model Dir: {STAGE2_MODEL_DIR}',
    sep='\n',
)

## Step 1: Download Stage 1 Model from HuggingFace

In [None]:
from huggingface_hub import hf_hub_download

print(f'Downloading Stage 1 model from HuggingFace...')
print(f'Dataset: {HF_DATASET_ID}')
print(f'Path: {HF_MODEL_PATH}')


def _download_stage1_artifact(artifact_name):
    """Fetch one file from the Stage 1 model folder of the HuggingFace dataset repo.

    Args:
        artifact_name: File name inside ``HF_MODEL_PATH`` (e.g. 'classification.h5').

    Returns:
        The local path returned by ``hf_hub_download``; ``STAGE1_MODEL_DIR`` is
        used as the cache directory.
    """
    return hf_hub_download(
        repo_id=HF_DATASET_ID,
        filename=f'{HF_MODEL_PATH}/{artifact_name}',
        repo_type='dataset',
        cache_dir=str(STAGE1_MODEL_DIR)
    )


# classification.h5 is required by Step 2, so a failed download is fatal here.
# Re-raising (as Step 3 does for the data file) surfaces the real cause instead
# of a later, less informative FileNotFoundError.
try:
    classification_path = _download_stage1_artifact('classification.h5')
    print(f'✓ Downloaded classification.h5: {classification_path}')
except Exception as e:
    print(f'Error downloading classification.h5: {e}')
    raise

# params.json is not read anywhere in the visible notebook, so a failure is
# reported but does not stop the run. params_path is always defined afterwards.
params_path = None
try:
    params_path = _download_stage1_artifact('params.json')
    print(f'✓ Downloaded params.json: {params_path}')
except Exception as e:
    print(f'Error downloading params.json: {e}')

# List everything now present under the Stage 1 model directory (sizes in MB)
print(f'\nDownloaded files:')
for file in STAGE1_MODEL_DIR.rglob('*'):
    if file.is_file():
        size = file.stat().st_size / 1024 / 1024
        print(f'  - {file.name} ({size:.2f} MB)')

## Step 2: Load and Verify Stage 1 Model

In [None]:
from tensorflow import keras

# Locate classification.h5 anywhere under the Stage 1 model directory.
# Sorting makes the choice deterministic if more than one copy is found.
classification_files = sorted(STAGE1_MODEL_DIR.rglob('classification.h5'))
if not classification_files:
    raise FileNotFoundError(f'No classification.h5 found in {STAGE1_MODEL_DIR}')

stage1_model_path = classification_files[0]
print(f'Loading Stage 1 model from: {stage1_model_path}')

# The model is only used for inference (predict) in this notebook, so skip
# restoring the optimizer/loss: compile=False avoids failures when the saved
# training configuration cannot be deserialized by the installed Keras version.
stage1_model = keras.models.load_model(str(stage1_model_path), compile=False)
print(f'✓ Model loaded successfully')

# Get model input shape
input_shape = stage1_model.input_shape
print(f'\nModel Input Shape: {input_shape}')
print(f'Model Summary:')
stage1_model.summary()

# Verify expected shape: a single (batch, seq_len, features) input.
# A list means several inputs; without this check a 3-input model would pass
# the length test below and assign whole shapes to the sequence/feature sizes.
if isinstance(input_shape, list):
    raise ValueError(
        f'Expected a single-input model, got {len(input_shape)} inputs: {input_shape}'
    )
if len(input_shape) != 3:
    raise ValueError(f'Expected 3D input (batch, seq, features), got {len(input_shape)}D')

_, model_seq_len, model_num_features = input_shape

# A dynamic dimension is reported as None; keep the configured value in that
# case so later cells never receive seq_length=None.
if model_seq_len is not None:
    STAGE1_SEQUENCE_LENGTH = model_seq_len
else:
    print(f'Model accepts variable-length sequences; keeping configured sequence_length={STAGE1_SEQUENCE_LENGTH}')
if model_num_features is not None:
    STAGE1_NUM_FEATURES = model_num_features

print(f'\n✓ Detected 3D input: sequence_length={STAGE1_SEQUENCE_LENGTH}, num_features={STAGE1_NUM_FEATURES}')

## Step 3: Download Training Data

In [None]:
print(f'Downloading training data from HuggingFace...')
print(f'Path: {HF_DATA_PATH}')

# Fetch the klines parquet into DATA_DIR (used as the HuggingFace cache dir).
# Any failure is reported and then propagated: nothing below works without data.
try:
    data_file = hf_hub_download(
        cache_dir=str(DATA_DIR),
        repo_type='dataset',
        repo_id=HF_DATASET_ID,
        filename=HF_DATA_PATH,
    )
except Exception as e:
    print(f'Error downloading data: {e}')
    raise
else:
    print(f'✓ Downloaded data: {data_file}')
    data_file = Path(data_file)

# Read the parquet file and show a quick overview of its contents
df = pd.read_parquet(data_file)
print(f'\nData loaded: {len(df)} rows, {len(df.columns)} columns')
print(f'Columns: {list(df.columns)}')
print(f'\nFirst 5 rows:')
print(df.head(5))

## Step 4: Feature Engineering & ZigZag Labeling

In [None]:
# Import from src
from src.zigzag_indicator import ZigZagIndicator
from src.features import FeatureEngineer
from src.utils import time_series_split

# Label every bar with the ZigZag indicator; adds the 'zigzag_label' column
# that the rest of the notebook reads.
print('Applying ZigZag indicator...')
zigzag = ZigZagIndicator(depth=12, deviation=5, backstep=2)
df = zigzag.label_kbars(df)
print(f'✓ ZigZag labels applied')

# Check label distribution
print(f'\nLabel distribution:')
print(df['zigzag_label'].value_counts().sort_index())

# Feature engineering
print(f'\nCalculating technical indicators...')
fe = FeatureEngineer(lookback_periods=[5, 10, 20, 50, 200])
df = fe.calculate_all_features(df)
print(f'✓ Features calculated')

# Get feature columns, dropping the non-numeric 'symbol' column if present.
# Building a new list works whatever sequence type get_feature_columns returns
# and does not mutate an object owned by the FeatureEngineer.
# NOTE(review): confirm get_feature_columns does not include 'zigzag_label',
# otherwise the label would leak into the model inputs.
feature_cols = [col for col in fe.get_feature_columns(df) if col != 'symbol']

print(f'Total features: {len(feature_cols)}')
print(f'Feature columns: {feature_cols}')

# Handle missing values: carry the last known value forward, then zero-fill
# whatever remains at the start of the series. DataFrame.ffill() replaces the
# deprecated fillna(method='ffill') spelling.
df[feature_cols] = df[feature_cols].ffill().fillna(0)
print(f'✓ Missing values handled')

print(f'\nFinal data shape: {df.shape}')

## Step 5: Create 3D Time Series Features for Stage 1 Model

In [None]:
def create_3d_sequences(X, y, seq_length=10):
    """
    Convert 2D features to 3D sliding-window sequences for LSTM/CNN models.

    Window ``k`` covers rows ``k .. k + seq_length - 1`` of ``X`` and is paired
    with ``y[k + seq_length]``, i.e. the label of the row immediately AFTER the
    window (not the label of the window's last row).

    Args:
        X: Features, array-like of shape (n_samples, n_features)
        y: Labels, array-like of shape (n_samples,)
        seq_length: Window length for sequences (must be >= 1)

    Returns:
        X_seq: 3D sequences (n_sequences, seq_length, n_features)
        y_seq: Corresponding labels (n_sequences,)
        where n_sequences = max(n_samples - seq_length, 0). When there are not
        enough samples for a single window, correctly shaped empty arrays are
        returned so downstream slicing such as ``X_seq[:, -1, :]`` still works.

    Raises:
        ValueError: If seq_length < 1 or X and y differ in length.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    if seq_length < 1:
        raise ValueError(f'seq_length must be >= 1, got {seq_length}')
    if len(X) != len(y):
        raise ValueError(f'X and y must have the same length, got {len(X)} and {len(y)}')

    n_sequences = len(X) - seq_length
    if n_sequences <= 0:
        # Keep the rank of the outputs stable even when no window fits
        return (
            np.empty((0, seq_length) + X.shape[1:], dtype=X.dtype),
            np.empty((0,) + y.shape[1:], dtype=y.dtype),
        )

    # Row k of window_idx is [k, k+1, ..., k+seq_length-1]; fancy indexing with
    # it builds every window in one vectorised step instead of a Python loop.
    window_idx = np.arange(n_sequences)[:, None] + np.arange(seq_length)[None, :]
    X_seq = X[window_idx]
    y_seq = y[seq_length:].copy()  # label of the row following each window

    return X_seq, y_seq


print(f'Creating 3D sequences with sequence_length={STAGE1_SEQUENCE_LENGTH}...')

# Fail early, with an actionable message, if the engineered feature count does
# not match what the Stage 1 model expects on its last input axis. Without this
# check the mismatch only surfaces later, inside stage1_model.predict.
# A dynamic (None) feature axis is not checked.
expected_num_features = stage1_model.input_shape[-1]
if isinstance(expected_num_features, int) and len(feature_cols) != expected_num_features:
    raise ValueError(
        f'Stage 1 model expects {expected_num_features} features per bar, '
        f'but feature engineering produced {len(feature_cols)}. '
        f'Select the feature columns the Stage 1 model was trained on.'
    )

# Split before creating sequences (to prevent data leakage): windows are built
# per split, so no window spans a train/val/test boundary.
train_df, val_df, test_df = time_series_split(df, train_ratio=0.7, validation_ratio=0.15)

# === Train Set ===
X_train_2d = train_df[feature_cols].values
y_train_2d = train_df['zigzag_label'].values
X_train_3d, y_train_3d = create_3d_sequences(X_train_2d, y_train_2d, seq_length=STAGE1_SEQUENCE_LENGTH)
print(f'Train: {X_train_3d.shape}')

# === Validation Set ===
X_val_2d = val_df[feature_cols].values
y_val_2d = val_df['zigzag_label'].values
X_val_3d, y_val_3d = create_3d_sequences(X_val_2d, y_val_2d, seq_length=STAGE1_SEQUENCE_LENGTH)
print(f'Val: {X_val_3d.shape}')

# === Test Set ===
X_test_2d = test_df[feature_cols].values
y_test_2d = test_df['zigzag_label'].values
X_test_3d, y_test_3d = create_3d_sequences(X_test_2d, y_test_2d, seq_length=STAGE1_SEQUENCE_LENGTH)
print(f'Test: {X_test_3d.shape}')

print(f'\n✓ All sets converted to 3D format')

## Step 6: Apply Stage 1 Model with Correct Input Shape

In [None]:
print(f'Applying Stage 1 model to filtered datasets...')


def _build_stage2_split(title, tag, X_3d, y_3d, threshold=0.5):
    """Run Stage 1 on one split and keep only the samples Stage 2 should see.

    Steps: predict with ``stage1_model``, keep windows whose signal probability
    exceeds ``threshold``, reduce each kept window to its last timestep, then
    drop samples whose label is 0 (0 = no zigzag).

    Args:
        title: Heading printed for the split (e.g. 'TRAIN SET').
        tag: Short split name used in the sample-count message (e.g. 'train').
        X_3d: Windows of shape (n_sequences, seq_length, n_features).
        y_3d: Labels of shape (n_sequences,).
        threshold: Probability above which Stage 1 counts as a signal.

    Returns:
        Tuple ``(probs, preds, signal_mask, X_signal_3d, X_stage2, y_stage2, valid_mask)``.

    Raises:
        ValueError: If the Stage 1 output is neither 1D nor 2D.
    """
    print(f'\n=== {title} ===')
    print(f'Input shape: {X_3d.shape}')

    probs = np.asarray(stage1_model.predict(X_3d, verbose=0))

    # Two-or-more-column output: column 1 is the signal probability (original
    # behaviour). A single-unit sigmoid head (n, 1) or a flat (n,) output is
    # also accepted; the original indexing raised IndexError for those.
    if probs.ndim == 2 and probs.shape[1] >= 2:
        positive_prob = probs[:, 1]
    elif probs.ndim == 2 and probs.shape[1] == 1:
        positive_prob = probs[:, 0]
    elif probs.ndim == 1:
        positive_prob = probs
    else:
        raise ValueError(f'Unexpected Stage 1 output shape: {probs.shape}')

    preds = (positive_prob > threshold).astype(int)
    signal = preds == 1

    n_total = len(X_3d)
    signal_pct = 100 * signal.sum() / n_total if n_total else 0.0  # no ZeroDivisionError on empty splits
    print(f'Signals detected: {signal.sum()} / {n_total} ({signal_pct:.2f}%)')

    # Filter to Stage 2 data
    X_signal_3d = X_3d[signal]
    y_signal = y_3d[signal]

    # Convert back to 2D for the Stage 2 model (use last timestep only)
    X_last_bar = X_signal_3d[:, -1, :]

    # Remove invalid labels (0 = no zigzag)
    valid = y_signal > 0
    X_stage2 = X_last_bar[valid]
    y_stage2 = y_signal[valid]
    print(f'Valid Stage 2 samples ({tag}): {len(X_stage2)}')

    return probs, preds, signal, X_signal_3d, X_stage2, y_stage2, valid


# === Train Set ===
(stage1_probs_train, stage1_preds_train, signal_mask,
 X_stage2_train_3d, X_stage2_train, y_stage2_train,
 valid_mask) = _build_stage2_split('TRAIN SET', 'train', X_train_3d, y_train_3d)

# === Validation Set ===
(stage1_probs_val, stage1_preds_val, signal_mask_val,
 X_stage2_val_3d, X_stage2_val, y_stage2_val,
 valid_mask_val) = _build_stage2_split('VALIDATION SET', 'val', X_val_3d, y_val_3d)

# === Test Set ===
(stage1_probs_test, stage1_preds_test, signal_mask_test,
 X_stage2_test_3d, X_stage2_test, y_stage2_test,
 valid_mask_test) = _build_stage2_split('TEST SET', 'test', X_test_3d, y_test_3d)

## Step 7: Save Stage 2 Training Data

In [None]:
print(f'Saving Stage 2 data to {STAGE2_DATA_DIR}...')

# One pickle per array, written in train -> val -> test order (X before y)
stage2_arrays = (
    ('X_stage2_train.pkl', X_stage2_train),
    ('y_stage2_train.pkl', y_stage2_train),
    ('X_stage2_val.pkl', X_stage2_val),
    ('y_stage2_val.pkl', y_stage2_val),
    ('X_stage2_test.pkl', X_stage2_test),
    ('y_stage2_test.pkl', y_stage2_test),
)
for pkl_name, payload in stage2_arrays:
    with (STAGE2_DATA_DIR / pkl_name).open('wb') as f:
        pickle.dump(payload, f)

print('✓ All data saved')

# Report every pickle now present in the Stage 2 data directory (sizes in KB)
print(f'\nSaved files:')
for pkl_path in sorted(STAGE2_DATA_DIR.glob('*.pkl')):
    size_kb = pkl_path.stat().st_size / 1024
    print(f'  - {pkl_path.name} ({size_kb:.0f} KB)')

## Step 8: Train Stage 2 Model

In [None]:
from src.stage2_trainer import Stage2Trainer

print(f'Initializing Stage 2 Trainer...')
trainer = Stage2Trainer(model_dir=str(STAGE2_MODEL_DIR))

print(f'\nTraining Stage 2 model...')
print(f'Train samples: {len(X_stage2_train)}')
print(f'Val samples: {len(X_stage2_val)}')
print(f'Test samples: {len(X_stage2_test)}')
print(f'Input shape: {X_stage2_train.shape}')

# Stage 1 filtering can leave nothing to train on (no signals, or only
# label-0 samples). Stop here with a clear message instead of failing inside
# the trainer.
if len(X_stage2_train) == 0:
    raise ValueError(
        'No Stage 2 training samples: Stage 1 produced no signals with a '
        'non-zero zigzag label on the train split.'
    )

train_results = trainer.train(
    X_stage2_train, y_stage2_train,
    X_stage2_val, y_stage2_val,
    normalize=True,
    cv_folds=5,
    save_model=True
)

# Numeric results keep the 4-decimal format; anything else (lists, strings,
# nested dicts) is printed as-is rather than crashing on the ':.4f' spec.
print(f'\nTraining Results:')
for key, value in train_results.items():
    if isinstance(value, (int, float, np.integer, np.floating)):
        print(f'  {key}: {value:.4f}')
    else:
        print(f'  {key}: {value}')

## Step 9: Evaluate on Test Set

In [None]:
# Score the trained Stage 2 model on the held-out test split.
print(f'Evaluating on test set...')
test_metrics = trainer.evaluate(X_stage2_test, y_stage2_test)

# Each value is formatted with ':.4f', so evaluate() is assumed to return a
# mapping of metric name -> number (Stage2Trainer is defined outside this
# notebook; TODO confirm).
print(f'\nTest Metrics:')
for metric, value in test_metrics.items():
    print(f'  {metric}: {value:.4f}')

# Compare which labels appear in the predictions with which appear in the
# ground truth (only the sets of unique labels are printed, not per-class scores).
print(f'\nTest Predictions:')
y_pred = trainer.predict(X_stage2_test)
print(f'Unique labels in predictions: {np.unique(y_pred)}')
print(f'Unique labels in ground truth: {np.unique(y_stage2_test)}')

## Step 10: Cross-Validation

In [None]:
print(f'Running 5-fold cross-validation...')

# Cross-validate on train + validation together; the test split stays untouched
cv_features = np.vstack([X_stage2_train, X_stage2_val])
cv_labels = np.hstack([y_stage2_train, y_stage2_val])
cv_results = trainer.cross_validate(cv_features, cv_labels, cv=5)

# Print the accuracy statistics in a fixed order: mean, std, min, max
print(f'\nCross-Validation Results:')
for stat_label, stat_key in (
    ('Mean', 'mean_accuracy'),
    ('Std', 'std_accuracy'),
    ('Min', 'min_accuracy'),
    ('Max', 'max_accuracy'),
):
    print(f'  {stat_label} Accuracy: {cv_results[stat_key]:.4f}')

## Step 11: Summary Report

In [None]:
# Final report: re-prints values computed by earlier cells; nothing is recomputed.
# Requires Steps 0-10 to have run in the current kernel.
print(f'\n' + '='*80)
print(f'STAGE 2 TRAINING SUMMARY - {SYMBOL} {TIMEFRAME}')
print(f'='*80)

# Stage 1 input window as detected from the loaded model in Step 2
print(f'Stage 1 Architecture:')
print(f'  Input shape: ({STAGE1_SEQUENCE_LENGTH}, {STAGE1_NUM_FEATURES})')
print(f'  Type: 3D time series (LSTM/CNN)')

# Sample counts at each stage of the funnel, summed over train + val + test
print(f'\nData Preparation:')
print(f'  Original K-bars: {len(df):,}')
print(f'  3D Sequences: {len(X_train_3d) + len(X_val_3d) + len(X_test_3d):,}')
print(f'  Stage 1 Signals: {signal_mask.sum() + signal_mask_val.sum() + signal_mask_test.sum():,}')
print(f'  Stage 2 Valid: {len(X_stage2_train) + len(X_stage2_val) + len(X_stage2_test):,}')

# Stage 2 sample counts per split (after Stage 1 filtering and label-0 removal)
print(f'Train/Val/Test Split:')
print(f'  Train: {len(X_stage2_train):,}')
print(f'  Val: {len(X_stage2_val):,}')
print(f'  Test: {len(X_stage2_test):,}')

# Reads the 'train_accuracy'/'val_accuracy' keys from the training results and
# the 'accuracy'/'f1_score' keys from the test metrics
print(f'Model Performance:')
print(f'  Train Accuracy: {train_results["train_accuracy"]:.4f}')
print(f'  Val Accuracy: {train_results["val_accuracy"]:.4f}')
print(f'  Test Accuracy: {test_metrics["accuracy"]:.4f}')
print(f'  Test F1-Score: {test_metrics["f1_score"]:.4f}')

print(f'Cross-Validation:')
print(f'  Mean Accuracy: {cv_results["mean_accuracy"]:.4f} +/- {cv_results["std_accuracy"]:.4f}')

# List the files sitting directly in the Stage 2 model directory (sizes in MB)
print(f'Models Saved:')
print(f'  Location: {STAGE2_MODEL_DIR}')
for file in sorted(STAGE2_MODEL_DIR.glob('*')):
    if file.is_file():
        size = file.stat().st_size / 1024 / 1024
        print(f'    - {file.name} ({size:.2f} MB)')

print(f'='*80)