# 05 — Feature Selection & Hyperparameter Tuning

This notebook walks through:
1. Feature selection (mutual info, importance, correlation filter)
2. Hyperparameter optimization with Optuna (Bayesian / TPE)
3. Comparing default vs tuned model performance
4. Visual analysis of tuning results

In [None]:
import sys, os
sys.path.insert(0, os.path.abspath('..'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)
%matplotlib inline

## 1. Prepare Data

In [None]:
from src.data.fetcher import YFinanceFetcher
from src.data.preprocessing import preprocess_ohlcv
from src.features.technical import compute_technical_indicators
from src.features.returns import compute_return_features
from src.features.labels import generate_labels, get_clean_features_and_labels
from src.data.market_config import load_market_config, load_strategy_config

# Setup: instrument plus the market / strategy configuration used throughout.
TICKER = 'AAPL'
market_config = load_market_config('stocks')
strategy_config = load_strategy_config('short_term')

# Date window: roughly five years ending today. The clock is read once so that
# `start` and `end` are derived from the same instant.
now = datetime.now()
end = now.strftime('%Y-%m-%d')
start = (now - timedelta(days=5*365)).strftime('%Y-%m-%d')

# Fetch raw prices, then build the feature frame step by step:
# preprocessing -> technical indicators -> return features -> labels.
fetcher = YFinanceFetcher()
raw = fetcher.fetch(TICKER, start=start, end=end)
df = preprocess_ohlcv(raw, market_config=market_config)
df = compute_technical_indicators(df)
df = compute_return_features(df)
# 3-class classification labels, horizon=1, threshold=0.01.
df = generate_labels(df, horizon=1, label_type='classification', num_classes=3, threshold=0.01)

# Split the labelled frame into the feature matrix X and the label vector y.
X, y = get_clean_features_and_labels(df)
print(f'Features: {X.shape[1]}, Samples: {len(X)}')
print(f'Label distribution:\n{y.value_counts().sort_index()}')

## 2. Feature Selection

In [None]:
from src.features.feature_selection import (
    select_by_importance,
    select_by_mutual_info,
    filter_correlated,
    select_features_pipeline,
)

# Full selection pipeline (correlation filter -> importance ranking), keeping
# at most 15 features.
# NOTE(review): selection is fitted on the complete X, y, which includes the
# periods later used as walk-forward test folds. Scores reported for the
# selected-feature runs may therefore be optimistic; consider selecting on
# data that precedes the evaluation folds.
selection = select_features_pipeline(
    X,
    y,
    max_features=15,
    corr_threshold=0.90,
    method='importance',
)
selected_features, scores_df = selection

# Numbered listing of the chosen feature names.
print(f'\nSelected {len(selected_features)} features:')
for rank, name in enumerate(selected_features, start=1):
    print(f'  {rank:2d}. {name}')

In [None]:
# Side-by-side horizontal bar charts of the 20 highest-scoring features under
# each score column. A bar is coloured when the feature is flagged in the
# `selected` column and grey otherwise.
fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# (score column, highlight colour, panel title) for each panel, left to right.
panels = [
    ('xgb_importance', '#2ecc71', 'XGBoost Gain Importance (green = selected)'),
    ('mutual_info', '#3498db', 'Mutual Information (blue = selected)'),
]

for ax, (score_col, highlight, title) in zip(axes, panels):
    top = scores_df.nlargest(20, score_col)
    positions = range(len(top))
    bar_colors = [highlight if chosen else '#95a5a6' for chosen in top['selected']]

    ax.barh(positions, top[score_col].values, color=bar_colors)
    ax.set_yticks(positions)
    ax.set_yticklabels(top['feature'].values)
    ax.set_title(title)
    ax.invert_yaxis()  # highest score at the top

plt.tight_layout()
plt.show()

In [None]:
# Re-run the pipeline ranking by mutual information instead of importance.
# Use the same corr_threshold as the importance-based run so that the ranking
# method is the only difference between the two selections.
mi_features, mi_scores = select_features_pipeline(
    X, y, max_features=15, corr_threshold=0.90, method='mutual_info'
)

# Overlap between methods: features chosen by both rankings.
overlap = set(selected_features) & set(mi_features)
print(f'Importance-selected: {len(selected_features)}')
print(f'MI-selected:         {len(mi_features)}')
print(f'Overlap:             {len(overlap)}')
print(f'\nCommon features: {sorted(overlap)}')

## 3. Baseline: Default Hyperparameters

In [None]:
from src.models.xgboost_classifier import MarketPulseXGBClassifier
from src.utils.validation import WalkForwardValidator
from sklearn.metrics import f1_score, accuracy_score

# Baseline run: default hyperparameters, every feature, walk-forward folds
# built from the strategy configuration.
validator = WalkForwardValidator.from_strategy_config(strategy_config)
folds = validator.split(X)

default_scores = []
for fold in folds[-10:]:  # restrict to the 10 most recent folds for speed
    X_tr, y_tr, X_te, y_te = validator.get_fold_data(X, y, fold)

    model = MarketPulseXGBClassifier(num_classes=3)
    model.fit(X_tr, y_tr, balance_classes=True)
    y_pred = model.predict(X_te)

    # Cast the labels to int once and reuse them for both metrics.
    y_true = y_te.astype(int)
    default_scores.append(dict(
        fold=fold.fold_number,
        accuracy=accuracy_score(y_true, y_pred),
        f1_macro=f1_score(y_true, y_pred, average='macro', zero_division=0),
    ))

# One row per fold; report the averages across folds.
default_df = pd.DataFrame(default_scores)
default_mean_acc = default_df['accuracy'].mean()
default_mean_f1 = default_df['f1_macro'].mean()
print(f'Default params (full {X.shape[1]} features):')
print(f'  Mean accuracy: {default_mean_acc:.4f}')
print(f'  Mean F1 macro: {default_mean_f1:.4f}')

In [None]:
# Same walk-forward loop and default hyperparameters as the baseline, but the
# model only sees the columns in `selected_features`.
X_sel = X[selected_features]

selected_scores = []
for fold in folds[-10:]:
    X_tr, y_tr, X_te, y_te = validator.get_fold_data(X_sel, y, fold)

    model = MarketPulseXGBClassifier(num_classes=3)
    model.fit(X_tr, y_tr, balance_classes=True)
    y_pred = model.predict(X_te)

    # Cast the labels to int once and reuse them for both metrics.
    y_true = y_te.astype(int)
    selected_scores.append(dict(
        fold=fold.fold_number,
        accuracy=accuracy_score(y_true, y_pred),
        f1_macro=f1_score(y_true, y_pred, average='macro', zero_division=0),
    ))

selected_df = pd.DataFrame(selected_scores)

# Fold-averaged metrics for this run and for the all-features baseline.
sel_acc = selected_df['accuracy'].mean()
sel_f1 = selected_df['f1_macro'].mean()
base_acc = default_df['accuracy'].mean()
base_f1 = default_df['f1_macro'].mean()

print(f'Default params (selected {len(selected_features)} features):')
print(f'  Mean accuracy: {sel_acc:.4f}')
print(f'  Mean F1 macro: {sel_f1:.4f}')
# Signed difference relative to the all-features baseline.
print(f'\n  Accuracy change: {sel_acc - base_acc:+.4f}')
print(f'  F1 change:       {sel_f1 - base_f1:+.4f}')

## 4. Optuna Hyperparameter Tuning

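We tune with Optuna's Bayesian optimization using the TPE (Tree-structured Parzen Estimator) sampler.
The search covers all 9 hyperparameter dimensions of `XGBOOST_SEARCH_SPACE` simultaneously, using only the selected features and scoring each trial by macro F1.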

In [None]:
from src.models.tuner import MarketPulseTuner, XGBOOST_SEARCH_SPACE

# Configure the tuner: optimise macro F1 over XGBOOST_SEARCH_SPACE using only
# the selected features, with max_folds=10.
tuner_kwargs = dict(
    X=X,
    y=y,
    strategy_config=strategy_config,
    search_space=XGBOOST_SEARCH_SPACE,
    metric='f1_macro',
    max_folds=10,
    selected_features=selected_features,
)
tuner = MarketPulseTuner(**tuner_kwargs)

# 30 trials (takes ~5-15 min depending on machine)
best_params = tuner.tune_optuna(n_trials=30, show_progress=True)

print(f'\nBest F1 macro: {tuner.best_score:.4f}')
print('\nBest hyperparameters:')

# List the parameters alphabetically, leaving out the non-tuning settings.
hidden_keys = ('random_state', 'n_jobs', 'verbosity')
for name in sorted(best_params):
    if name in hidden_keys:
        continue
    print(f'  {name}: {best_params[name]}')

In [None]:
# Trial history: score per trial (left) and the distribution of scores (right).
trials_df = tuner.get_results_df()
by_trial = trials_df.sort_values('trial')

# Reference levels drawn on both panels. "Default" is the all-features
# baseline with default hyperparameters.
best_f1 = tuner.best_score
default_f1 = default_df['f1_macro'].mean()

fig, axes = plt.subplots(1, 2, figsize=(16, 5))
progress_ax, dist_ax = axes

# Left panel: score over trials, in trial order.
progress_ax.plot(by_trial['trial'], by_trial['score'], 'o-', alpha=0.6, markersize=4)
progress_ax.axhline(y=best_f1, color='r', linestyle='--', label=f'Best: {best_f1:.4f}')
progress_ax.axhline(y=default_f1, color='gray', linestyle=':', label=f'Default: {default_f1:.4f}')
progress_ax.set_xlabel('Trial')
progress_ax.set_ylabel('F1 Macro')
progress_ax.set_title('Optimization Progress')
progress_ax.legend()

# Right panel: histogram of trial scores (missing scores dropped).
dist_ax.hist(trials_df['score'].dropna(), bins=20, edgecolor='black', alpha=0.7)
dist_ax.axvline(x=best_f1, color='r', linestyle='--', label='Best')
dist_ax.axvline(x=default_f1, color='gray', linestyle=':', label='Default')
dist_ax.set_xlabel('F1 Macro')
dist_ax.set_ylabel('Count')
dist_ax.set_title('Score Distribution Across Trials')
dist_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Which hyperparameters mattered most? The chart is drawn only when the tuner
# returns importances (it may return None).
param_imp = tuner.get_param_importance()
has_importance = param_imp is not None

if has_importance:
    fig, ax = plt.subplots(figsize=(10, 6))
    param_imp.plot(kind='barh', ax=ax, color='#3498db')
    ax.set_title('Hyperparameter Importance (Optuna)')
    ax.set_xlabel('Importance')
    ax.invert_yaxis()  # flip the vertical axis so the first entry is on top
    plt.tight_layout()
    plt.show()

## 5. Compare Default vs Tuned

In [None]:
# Walk-forward evaluation of the tuned hyperparameters on the selected features.
# NOTE(review): these are the same last-10 folds the other runs use; if the
# tuner also scored trials on them, the tuned numbers are optimistic — confirm
# against MarketPulseTuner.
X_sel = X[selected_features]

tuned_scores = []
for fold in folds[-10:]:
    X_tr, y_tr, X_te, y_te = validator.get_fold_data(X_sel, y, fold)

    model = MarketPulseXGBClassifier(hyperparameters=best_params, num_classes=3)
    model.fit(X_tr, y_tr, balance_classes=True)
    y_pred = model.predict(X_te)

    # Cast the labels to int once and reuse them for both metrics.
    y_true = y_te.astype(int)
    tuned_scores.append(dict(
        fold=fold.fold_number,
        accuracy=accuracy_score(y_true, y_pred),
        f1_macro=f1_score(y_true, y_pred, average='macro', zero_division=0),
    ))

tuned_df = pd.DataFrame(tuned_scores)

# Comparison table: one row per configuration, fold-averaged metrics.
n_selected = len(selected_features)
runs = [
    ('Default (all features)', default_df),
    (f'Default ({n_selected} features)', selected_df),
    (f'Tuned ({n_selected} features)', tuned_df),
]
comparison = pd.DataFrame({
    'Configuration': [name for name, scores in runs],
    'Mean Accuracy': [scores['accuracy'].mean() for name, scores in runs],
    'Mean F1 Macro': [scores['f1_macro'].mean() for name, scores in runs],
})

# F1 difference relative to the first row (default params, all features).
reference_f1 = comparison['Mean F1 Macro'].iloc[0]
comparison['Δ F1 vs Default'] = comparison['Mean F1 Macro'] - reference_f1
print(comparison.to_string(index=False))

In [None]:
# Per-fold macro F1 for the three configurations on a shared axis.
fig, ax = plt.subplots(figsize=(14, 5))

# All three runs are plotted against the baseline run's fold numbers.
fold_nums = default_df['fold']
n_selected = len(selected_features)

# (scores frame, line/marker format, legend label) in plotting order.
curves = [
    (default_df, 'o-', 'Default (all features)'),
    (selected_df, 's-', f'Default ({n_selected} features)'),
    (tuned_df, '^-', f'Tuned ({n_selected} features)'),
]
for scores, fmt, label in curves:
    ax.plot(fold_nums, scores['f1_macro'], fmt, label=label, alpha=0.7)

ax.set_xlabel('Fold')
ax.set_ylabel('F1 Macro')
ax.set_title('Per-Fold F1 Macro: Default vs Feature-Selected vs Tuned')
ax.legend()
plt.tight_layout()
plt.show()

## 6. Save Best Configuration

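The cell below prints the best hyperparameters as a YAML snippet, followed by the selected feature list; nothing is written to disk. Copy the hyperparameters into your `config/strategies/short_term.yaml`.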

In [None]:
import yaml

# Print a YAML snippet of the tuned hyperparameters, then the selected features.
print('Best hyperparameters for short_term.yaml:\n')

# Leave out the non-tuning settings and round floats to 6 decimals so the
# snippet stays readable; every other value is passed through unchanged.
excluded_keys = ('random_state', 'n_jobs', 'verbosity')
clean_params = {}
for key, value in best_params.items():
    if key in excluded_keys:
        continue
    clean_params[key] = round(value, 6) if isinstance(value, float) else value

yaml_snippet = yaml.dump({'hyperparameters': clean_params}, default_flow_style=False)
print(yaml_snippet)

print(f'Selected features ({len(selected_features)}):')
print(selected_features)