# Quick Start: Training Your First Stock Prediction Model

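This notebook walks through the complete workflow: loading engineered features, splitting the data chronologically, training an XGBoost model, evaluating its predictions, backtesting a trading strategy, and saving the trained model.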

In [None]:
import sys
from pathlib import Path

sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from preprocessing import temporal_train_test_split, scale_features
from models.xgboost_model import train_xgboost, predict_xgboost, get_feature_importance, save_model
from evaluation.metrics import regression_metrics_report, financial_metrics_report
from backtesting.strategy import run_backtest

%matplotlib inline

## Step 1: Load Engineered Features

In [None]:
# Read the engineered feature table, parse 'dt' as dates and use it as the index
df = pd.read_csv(
    '../data/processed/features_engineered.csv',
    parse_dates=['dt'],
).set_index('dt')

# Quick sanity summary: table size and the span covered by the index
print(f"Dataset shape: {df.shape}")
print(f"Date range: {df.index.min()} to {df.index.max()}")

## Step 2: Prepare Features and Target

In [None]:
# Column to predict
target_col = 'target_price_next_day'  # Change this for different prediction tasks

# Every column whose name contains 'target' is treated as a target and kept out
# of the feature set; 'sp500' is excluded from the features as well.
target_columns = [col for col in df.columns if 'target' in col]
excluded_cols = set(target_columns) | {'sp500'}
feature_cols = [col for col in df.columns if col not in excluded_cols]

print(f"Number of features: {len(feature_cols)}")
print(f"Target variable: {target_col}")

# Keep only the rows where the target is present, then slice features and
# target with the same row mask so they stay aligned.
valid_idx = df[target_col].notna()
X = df.loc[valid_idx, feature_cols]
y = df.loc[valid_idx, target_col]

print(f"\nFinal dataset shape: X={X.shape}, y={y.shape}")

## Step 3: Temporal Train/Val/Test Split

In [None]:
# Features and target side by side, so each split keeps its rows aligned
model_frame = pd.concat([X, y], axis=1)

# Split by date boundaries (NO SHUFFLING!)
train_data, val_data, test_data = temporal_train_test_split(
    model_frame,
    train_end='2018-12-31',
    val_end='2021-12-31',
)

# Pull the feature matrix and target vector back out of each split
X_train, y_train = train_data[feature_cols], train_data[target_col]
X_val, y_val = val_data[feature_cols], val_data[target_col]
X_test, y_test = test_data[feature_cols], test_data[target_col]

print(f"\nTrain: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

## Step 4: Train XGBoost Model

In [None]:
# Train XGBoost on the training split, passing the validation split alongside it;
# the call returns the fitted model and the per-round evaluation history.
model, evals_result = train_xgboost(
    X_train, y_train,
    X_val, y_val,
    objective='reg:squarederror',
    early_stopping_rounds=50,
)

# Learning curves: RMSE per boosting round for each evaluation set
curve_fig, curve_ax = plt.subplots(figsize=(10, 5))
for eval_key, curve_label in (('train', 'Train RMSE'), ('val', 'Val RMSE')):
    curve_ax.plot(evals_result[eval_key]['rmse'], label=curve_label)
curve_ax.set_xlabel('Boosting Round')
curve_ax.set_ylabel('RMSE')
curve_ax.set_title('XGBoost Training Progress')
curve_ax.legend()
curve_ax.grid(True, alpha=0.3)
plt.show()

## Step 5: Make Predictions

In [None]:
# Score the held-out test features with the trained model
predictions = predict_xgboost(model, X_test)

# Report the output shape and the first five values as a quick sanity check
print(
    f"Predictions shape: {predictions.shape}",
    f"Sample predictions: {predictions[:5]}",
    sep="\n",
)

## Step 6: Evaluate Model

In [None]:
# Compare test-set targets with the model's predictions
metrics = regression_metrics_report(y_test.values, predictions)

# Print one "name: value" row per metric between horizontal rules
banner = "=" * 60
print("\n" + banner)
print("MODEL PERFORMANCE METRICS")
print(banner)
for metric, value in metrics.items():
    print(f"{metric:25s}: {value:10.4f}")
print(banner)

In [None]:
# Two stacked panels: series over the test index on top, scatter below
fig, axes = plt.subplots(2, 1, figsize=(15, 10))
ax_series, ax_scatter = axes
actual_values = y_test.values

# Top panel: actual and predicted values drawn against the test-set index
for series, series_label in ((actual_values, 'Actual'), (predictions, 'Predicted')):
    ax_series.plot(test_data.index, series, label=series_label, linewidth=1.5, alpha=0.7)
ax_series.set_title('Actual vs Predicted S&P 500 Prices', fontweight='bold', fontsize=13)
ax_series.set_xlabel('Date')
ax_series.set_ylabel('Price')

# Bottom panel: predicted against actual, with the y = x line as the reference
ax_scatter.scatter(actual_values, predictions, alpha=0.5, s=20)
lowest, highest = y_test.min(), y_test.max()
ax_scatter.plot([lowest, highest], [lowest, highest], 'r--', linewidth=2, label='Perfect Prediction')
ax_scatter.set_title('Prediction Scatter Plot', fontweight='bold', fontsize=13)
ax_scatter.set_xlabel('Actual Price')
ax_scatter.set_ylabel('Predicted Price')

# Shared cosmetics for both panels
for panel in (ax_series, ax_scatter):
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Step 7: Feature Importance

In [None]:
# Get feature importance (requested with importance_type='gain')
importance_df = get_feature_importance(model, feature_cols, importance_type='gain')

# Plot up to `top_n` features. The importance table may contain fewer than
# `top_n` rows, so the bar positions are sized from the rows actually
# available; pairing a fixed range(top_n) with a shorter column makes
# barh/yticks fail on mismatched lengths.
# NOTE(review): assumes get_feature_importance returns rows already sorted by
# descending importance — confirm against its implementation.
top_n = 20
top_features = importance_df.head(top_n)
n_shown = len(top_features)
bar_positions = range(n_shown)

plt.figure(figsize=(10, 8))
plt.barh(bar_positions, top_features['importance'], color='steelblue')
plt.yticks(bar_positions, top_features['feature'])
plt.xlabel('Importance (Gain)', fontsize=12)
# The title reports how many bars are really drawn
plt.title(f'Top {n_shown} Most Important Features', fontweight='bold', fontsize=13)
plt.gca().invert_yaxis()  # first row of the table at the top of the chart
plt.tight_layout()
plt.show()

print("\nTop 10 Features:")
print(importance_df.head(10))

## Step 8: Backtest Trading Strategy

In [None]:
# Run backtest
price_column = 'sp500'

# `test_data` was assembled from the feature columns plus the target only, and
# 'sp500' was deliberately left out of the features, so the price series
# requested by the backtest is not in that frame. Re-attach it from `df` by
# index when it is missing.
if price_column in test_data.columns:
    backtest_data = test_data
else:
    backtest_data = test_data.join(df[[price_column]], how='left')

# A join on a non-unique index would multiply rows and break the positional
# pairing between `predictions` and the price rows, so fail loudly if it does.
assert len(backtest_data) == len(predictions), (
    "Backtest frame and predictions have different lengths; "
    "check the date index for duplicates"
)

backtest_results, trades_log = run_backtest(
    predictions,
    backtest_data,
    initial_capital=10000,
    price_column=price_column
)

# Print every result entry except the two non-scalar ones
print("\n" + "="*60)
print("BACKTESTING RESULTS")
print("="*60)
for metric, value in backtest_results.items():
    if metric not in ['trades_log', 'equity_curve']:
        print(f"{metric:25s}: {value:10.2f}")
print("="*60)

In [None]:
# Plot equity curve; nothing is drawn when the backtest produced no equity points
equity_curve = backtest_results['equity_curve']
if len(equity_curve) > 0:
    equity_fig, equity_ax = plt.subplots(figsize=(14, 6))
    equity_ax.plot(equity_curve, linewidth=2, color='green')
    # Horizontal reference at the 10000 starting capital used for the backtest
    equity_ax.axhline(y=10000, color='red', linestyle='--', label='Initial Capital')
    equity_ax.set_title('Portfolio Equity Curve', fontweight='bold', fontsize=14)
    equity_ax.set_xlabel('Trade Number')
    equity_ax.set_ylabel('Portfolio Value ($)')
    equity_ax.legend()
    equity_ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

## Step 9: Save Model

In [None]:
# Save trained model
# (`save_model` and `Path` come from the import cell at the top of the notebook.)
model_path = '../models/xgboost_next_day.json'

# Make sure the output directory exists before saving, so the cell also works
# on a fresh checkout. NOTE(review): whether save_model creates the directory
# itself is not visible here; mkdir with exist_ok=True is harmless either way.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

save_model(model, model_path)

print("Model saved successfully!")

## Summary

You've successfully:
1. ✅ Loaded engineered features
2. ✅ Split data temporally (no lookahead bias)
3. ✅ Trained XGBoost model
4. ✅ Evaluated on unseen test data
5. ✅ Analyzed feature importance
6. ✅ Backtested trading strategy
7. ✅ Saved model for production

**Next Steps:**
- Try different target variables (classification, multi-day predictions)
- Train LightGBM and LSTM models
- Hyperparameter tuning with Optuna
- Ensemble multiple models