# Stage 2 Data Preparation

This notebook prepares training data for Stage 2 (Signal Type Classification).

We will:
1. Load the training data and Stage 1 model
2. Use Stage 1 to identify bars with signals
3. Extract labels for those signal bars
4. Build the Stage 2 training, validation, and test sets and save them to disk


In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

project_root = Path('.').resolve().parent
sys.path.insert(0, str(project_root))

from data.fetch_data import CryptoDataFetcher
from src.zigzag_indicator import ZigZagIndicator
from src.features import FeatureEngineer
from src.utils import time_series_split, normalize_data
from tensorflow import keras

### Step 1: Load and Prepare Data

In [None]:
# Fetch data: BTCUSDT candles on the '15m' timeframe
fetcher = CryptoDataFetcher()
btc_15m = fetcher.fetch_symbol_timeframe('BTCUSDT', '15m')
print(f'Data shape: {btc_15m.shape}')

# Apply ZigZag labelling
# NOTE(review): later cells read a 'zigzag_label' column, presumably added
# by label_kbars -- confirm against src.zigzag_indicator.
zigzag = ZigZagIndicator(depth=12, deviation=5, backstep=2)
btc_15m = zigzag.label_kbars(btc_15m)

# Feature engineering
fe = FeatureEngineer(lookback_periods=[5, 10, 20, 50, 200])
btc_15m = fe.calculate_all_features(btc_15m)

# Drop 'symbol' from the feature list BEFORE the NaN fill so that a
# non-numeric 'symbol' column is never zero-filled. Building a new list also
# avoids mutating whatever object get_feature_columns handed back.
feature_cols = [col for col in fe.get_feature_columns(btc_15m) if col != 'symbol']

# Fill NaN: carry the last known value forward, then use 0 for any leading
# gaps that have nothing to carry. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling.
btc_15m[feature_cols] = btc_15m[feature_cols].ffill().fillna(0)

print(f'Data prepared. Shape: {btc_15m.shape}, Features: {len(feature_cols)}')

### Step 2: Time Series Split

In [None]:
# Split the prepared frame into train / validation / test subsets.
# NOTE(review): the test share is presumably whatever remains after the 0.7
# and 0.15 ratios -- confirm against src.utils.time_series_split.
split_ratios = {'train_ratio': 0.7, 'validation_ratio': 0.15}
train_df, val_df, test_df = time_series_split(btc_15m, **split_ratios)
print(f'Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}')

### Step 3: Load Stage 1 Model

In [None]:
# Restore the trained Stage 1 network from disk (path is relative to the
# notebook's working directory).
stage1_model_path = 'models/btcusdt_15m/classification.h5'
stage1_model = keras.models.load_model(stage1_model_path)
print('Stage 1 model loaded')

# Labels and feature matrix of the training split
y_train = train_df['zigzag_label'].values
X_train = train_df[feature_cols].values

# Score every training bar with Stage 1. A bar counts as a signal when
# column 1 of the model output is above 0.5.
# NOTE(review): assumes the model emits at least two columns with column 1
# being the "signal" probability -- confirm against the Stage 1 training code.
stage1_probs = stage1_model.predict(X_train)
signal_probability = stage1_probs[:, 1]
stage1_preds = (signal_probability > 0.5).astype(int)

print(f'Stage 1 predictions shape: {stage1_preds.shape}')
print(f'Signal bars: {(stage1_preds == 1).sum()}')

### Step 4: Filter Data for Stage 2

In [None]:
# Rows that Stage 1 flagged as carrying a signal
signal_mask = stage1_preds == 1
X_stage2_train, y_stage2_train = X_train[signal_mask], y_train[signal_mask]

# Among those rows, keep only labels greater than 0 (drops NO_SIGNAL)
valid_mask = y_stage2_train > 0
X_stage2_train, y_stage2_train = X_stage2_train[valid_mask], y_stage2_train[valid_mask]

# Headline counts for the training split
n_total = len(X_train)
n_stage2 = len(X_stage2_train)
print(f'\nStage 2 Training Data Summary:')
print(f'  Total samples: {n_total}')
print(f'  Samples with Stage 1 signal: {signal_mask.sum()}')
print(f'  Valid Stage 2 samples: {n_stage2}')
print(f'  Signal percentage: {(n_stage2 / n_total) * 100:.2f}%')

# Per-class breakdown of the surviving labels, ordered by label id
label_names = {1: 'HH', 2: 'LH', 3: 'HL', 4: 'LL'}
label_counts = pd.Series(y_stage2_train).value_counts().sort_index()
print(f'\n  Label distribution:')
for class_id, n_rows in label_counts.items():
    share = (n_rows / len(y_stage2_train)) * 100
    print(f'    {label_names.get(class_id, "Unknown")} ({class_id}): {n_rows} ({share:.1f}%)')

### Step 5: Prepare Validation and Test Sets

In [None]:
def _stage1_filter(features, labels):
    """Score one split with Stage 1 and keep its labelled signal rows.

    Uses the notebook-level ``stage1_model``. A row is kept when column 1 of
    the model output is above 0.5 and its label is greater than 0.

    Returns a tuple ``(probs, preds, signal_mask, valid_mask, X_kept,
    y_kept)``. ``valid_mask`` indexes the rows already selected by
    ``signal_mask``, not the full split.
    """
    probs = stage1_model.predict(features)
    preds = (probs[:, 1] > 0.5).astype(int)
    signal_mask = preds == 1
    signal_features = features[signal_mask]
    signal_labels = labels[signal_mask]
    valid_mask = signal_labels > 0
    return (
        probs,
        preds,
        signal_mask,
        valid_mask,
        signal_features[valid_mask],
        signal_labels[valid_mask],
    )


# Validation split
X_val = val_df[feature_cols].values
y_val = val_df['zigzag_label'].values
(
    stage1_probs_val,
    stage1_preds_val,
    signal_mask_val,
    valid_mask_val,
    X_stage2_val,
    y_stage2_val,
) = _stage1_filter(X_val, y_val)

print(f'Validation Stage 2: {len(X_stage2_val)} samples')

# Test split
X_test = test_df[feature_cols].values
y_test = test_df['zigzag_label'].values
(
    stage1_probs_test,
    stage1_preds_test,
    signal_mask_test,
    valid_mask_test,
    X_stage2_test,
    y_stage2_test,
) = _stage1_filter(X_test, y_test)

print(f'Test Stage 2: {len(X_stage2_test)} samples')

### Step 6: Save Stage 2 Data

In [None]:
import pickle

# Output folder, relative to the notebook's working directory; created on
# demand together with any missing parents.
data_dir = Path('data/stage2')
data_dir.mkdir(parents=True, exist_ok=True)

# File name -> array to pickle, listed in train / validation / test order
stage2_arrays = {
    'X_stage2_train.pkl': X_stage2_train,
    'y_stage2_train.pkl': y_stage2_train,
    'X_stage2_val.pkl': X_stage2_val,
    'y_stage2_val.pkl': y_stage2_val,
    'X_stage2_test.pkl': X_stage2_test,
    'y_stage2_test.pkl': y_stage2_test,
}

# Write each array to its own pickle file
for file_name, array in stage2_arrays.items():
    with open(data_dir / file_name, 'wb') as f:
        pickle.dump(array, f)

print(f'Stage 2 data saved to {data_dir}')

### Step 7: Summary Statistics

In [None]:
# Sample counts of the three Stage 2 splits
n_train = len(X_stage2_train)
n_val = len(X_stage2_val)
n_test = len(X_stage2_test)
rule = '=' * 50

# Final report: per-split sizes, their total, and the feature count
print('\nStage 2 Data Summary')
print(rule)
print(f'Training:   {n_train} samples')
print(f'Validation: {n_val} samples')
print(f'Test:       {n_test} samples')
print(f'Total:      {n_train + n_val + n_test} samples')
print(f'\nFeatures: {len(feature_cols)}')
print(rule)