# 02 — Feature Engineering

**MarketPulse Phase 1**

This notebook demonstrates and visualizes all 60 features we compute:
1. **Technical indicators** — trend, momentum, volatility, volume (33 features)
2. **Return-based features** — lagged returns, rolling stats, risk metrics (27 features)
3. Feature correlations and redundancy analysis
4. Feature coverage and NaN (warmup-period) analysis

In [None]:
import sys, os
sys.path.insert(0, os.path.abspath('..'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

from src.data.market_config import load_market_config
from src.data.fetcher import YFinanceFetcher
from src.data.preprocessing import preprocess_ohlcv
from src.features.technical import compute_technical_indicators, get_feature_names
from src.features.returns import compute_return_features, get_return_feature_names

# Apply seaborn's 'whitegrid' theme to the figures drawn in this notebook.
sns.set_theme(style='whitegrid')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
# Fetch and preprocess AAPL
# Tunable inputs for this notebook, named so they are not buried in call sites.
TICKER = 'AAPL'
LOOKBACK_DAYS = 5 * 365  # roughly five calendar years of history

config = load_market_config('stocks')
fetcher = YFinanceFetcher(market_config=config)

# Read the clock once so start and end are derived from the same instant;
# two separate datetime.now() calls could straddle midnight.
now = datetime.now()
end_date = now.strftime('%Y-%m-%d')
start_date = (now - timedelta(days=LOOKBACK_DAYS)).strftime('%Y-%m-%d')

# `raw` keeps the fetcher output untouched; `df` is the working frame that the
# following cells extend with feature columns.
raw = fetcher.fetch(TICKER, start=start_date, end=end_date)
df = preprocess_ohlcv(raw, market_config=config)
print(f"Preprocessed: {len(df)} rows, columns: {list(df.columns)}")

## 1. Technical Indicators

We compute 5 groups of indicators using pandas-ta: **trend**, **momentum**, **volatility**, **volume**, and **custom** cross-indicator features.

In [None]:
# Extend the working frame with the technical-indicator columns.
df = compute_technical_indicators(df)

# Partition the expected feature names by whether a matching column exists.
tech_features = get_feature_names()
present, missing = [], []
for feature_name in tech_features:
    bucket = present if feature_name in df.columns else missing
    bucket.append(feature_name)

print(f"Technical features computed: {len(present)}/{len(tech_features)}")
if len(missing) > 0:
    print(f"Missing (data-dependent): {missing}")
print(f"\nAll technical features: {present}")

### 1.1 Trend Indicators — SMA, EMA, MACD

Moving averages smooth out price noise. The MACD captures the momentum of the trend.

In [None]:
# Plotting window: the most recent 252 rows (the last year of data).
df_plot = df.tail(252)
dates = df_plot.index

fig, (ax_price, ax_macd) = plt.subplots(
    2, 1,
    figsize=(14, 10),
    sharex=True,
    gridspec_kw={'height_ratios': [3, 1]},
)

# --- Top panel: close price, moving averages and the Bollinger envelope ---
ax_price.plot(dates, df_plot['close'], label='Close', color='black', linewidth=1.5)
sma_specs = [
    ('sma_20', 'SMA 20', {}),
    ('sma_50', 'SMA 50', {}),
    ('sma_200', 'SMA 200', {'linestyle': '--'}),
]
for column, legend_label, extra_style in sma_specs:
    ax_price.plot(dates, df_plot[column], label=legend_label, alpha=0.8, **extra_style)
ax_price.fill_between(
    dates, df_plot['bb_lower'], df_plot['bb_upper'],
    alpha=0.1, color='blue', label='Bollinger Bands',
)
ax_price.set_ylabel('Price ($)')
ax_price.set_title('AAPL — Trend Indicators')
ax_price.legend(loc='upper left', fontsize=9)
ax_price.grid(True, alpha=0.3)

# --- Bottom panel: MACD line, signal line and signed histogram ---
ax_macd.plot(dates, df_plot['macd'], label='MACD', color='blue')
ax_macd.plot(dates, df_plot['macd_signal'], label='Signal', color='orange')
# Green for non-negative bars, red otherwise (NaN compares False, so it is red).
bar_colors = df_plot['macd_hist'].ge(0).map({True: 'green', False: 'red'}).tolist()
ax_macd.bar(dates, df_plot['macd_hist'], color=bar_colors, alpha=0.5, width=1)
ax_macd.axhline(0, color='gray', linewidth=0.5)
ax_macd.set_ylabel('MACD')
ax_macd.legend(fontsize=9)
ax_macd.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### 1.2 Momentum Indicators — RSI, Stochastic, CCI

In [None]:
# Three stacked panels sharing the date axis: RSI, Stochastic, CCI.
fig, (ax_rsi, ax_stoch, ax_cci) = plt.subplots(3, 1, figsize=(14, 10), sharex=True)
x_dates = df_plot.index

# --- RSI with labelled overbought / oversold thresholds ---
ax_rsi.plot(x_dates, df_plot['rsi_14'], color='purple', linewidth=1)
for level, line_color, legend_label in [
    (70, 'red', 'Overbought (70)'),
    (30, 'green', 'Oversold (30)'),
]:
    ax_rsi.axhline(level, color=line_color, linestyle='--', alpha=0.5, label=legend_label)
ax_rsi.fill_between(x_dates, 30, 70, alpha=0.05, color='gray')
ax_rsi.set_ylabel('RSI (14)')
ax_rsi.set_title('Momentum Indicators')
ax_rsi.legend(fontsize=8)
ax_rsi.set_ylim(0, 100)
ax_rsi.grid(True, alpha=0.3)

# --- Stochastic oscillator: %K and %D with 80 / 20 guide lines ---
for column, legend_label, line_color in [
    ('stoch_k', '%K', 'blue'),
    ('stoch_d', '%D', 'orange'),
]:
    ax_stoch.plot(x_dates, df_plot[column], label=legend_label, color=line_color)
for level, line_color in [(80, 'red'), (20, 'green')]:
    ax_stoch.axhline(level, color=line_color, linestyle='--', alpha=0.5)
ax_stoch.set_ylabel('Stochastic')
ax_stoch.legend(fontsize=8)
ax_stoch.grid(True, alpha=0.3)

# --- CCI with +100 / -100 guide lines ---
ax_cci.plot(x_dates, df_plot['cci_20'], color='teal', linewidth=1)
for level, line_color in [(100, 'red'), (-100, 'green')]:
    ax_cci.axhline(level, color=line_color, linestyle='--', alpha=0.5)
ax_cci.set_ylabel('CCI (20)')
ax_cci.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### 1.3 Volatility — Bollinger Bands, ATR

In [None]:
fig, (ax_bbw, ax_atr) = plt.subplots(2, 1, figsize=(14, 8), sharex=True)
bb_width = df_plot['bb_width']
atr_pct = df_plot['atr_pct']

# Top panel: Bollinger Band width (volatility expansion/contraction),
# with a dashed line at its mean over the plotted window.
ax_bbw.plot(df_plot.index, bb_width, color='orange', linewidth=1)
ax_bbw.set_ylabel('BB Width')
ax_bbw.set_title('Volatility Indicators')
ax_bbw.axhline(bb_width.mean(), color='gray', linestyle='--', alpha=0.5)
ax_bbw.grid(True, alpha=0.3)

# Bottom panel: ATR as a fraction of price, scaled by 100 for display as a
# percentage so it is comparable across different price levels.
ax_atr.plot(df_plot.index, atr_pct * 100, color='red', linewidth=1)
ax_atr.set_ylabel('ATR (% of Price)')
ax_atr.set_xlabel('Date')
ax_atr.axhline(atr_pct.mean() * 100, color='gray', linestyle='--', alpha=0.5)
ax_atr.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 2. Return-Based Features

In [None]:
# Extend the working frame with the return-based feature columns.
df = compute_return_features(df)

# Keep only the expected return-feature names that exist as columns.
ret_features = get_return_feature_names()
present_ret = []
for feature_name in ret_features:
    if feature_name in df.columns:
        present_ret.append(feature_name)

total_feature_count = len(present) + len(present_ret)
print(f"Return features computed: {len(present_ret)}/{len(ret_features)}")
print(f"\nAll return features: {present_ret}")
print(f"\nTotal feature count: {total_feature_count}")

In [None]:
# 2x2 overview of the return-based features:
#   [0, 0] return distributions   [0, 1] rolling volatility
#   [1, 0] 20d z-score            [1, 1] rolling max drawdown
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Lagged returns (1d vs 5d vs 20d), drawn over the full history.
# The per-horizon colour is passed through to hist(); previously it was
# unpacked but never used, so matplotlib's default colour cycle was applied.
for col, label, color in [('ret_1d', '1-day', 'blue'), ('ret_5d', '5-day', 'green'), ('ret_20d', '20-day', 'red')]:
    axes[0, 0].hist(df[col].dropna(), bins=60, alpha=0.5, label=label,
                    color=color, density=True)
axes[0, 0].set_title('Lagged Return Distributions')
axes[0, 0].legend()
axes[0, 0].set_xlabel('Return')

# The time-series panels use only the most recent 504 rows.
df_tail = df.tail(504)

# Rolling volatility
for col, label in [('vol_5d', '5-day'), ('vol_10d', '10-day'), ('vol_20d', '20-day')]:
    axes[0, 1].plot(df_tail.index, df_tail[col], label=label, alpha=0.8)
axes[0, 1].set_title('Rolling Volatility')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Z-score (mean reversion signal) with +2 / -2 guide lines and a zero line
axes[1, 0].plot(df_tail.index, df_tail['zscore_20d'], label='20d z-score', color='teal')
axes[1, 0].axhline(2, color='red', linestyle='--', alpha=0.5)
axes[1, 0].axhline(-2, color='green', linestyle='--', alpha=0.5)
axes[1, 0].axhline(0, color='gray', linewidth=0.5)
axes[1, 0].set_title('Z-Score (Mean Reversion Signal)')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Rolling max drawdown
axes[1, 1].plot(df_tail.index, df_tail['max_dd_20d'], label='20d MDD', color='red')
axes[1, 1].plot(df_tail.index, df_tail['max_dd_60d'], label='60d MDD', color='darkred')
axes[1, 1].set_title('Rolling Maximum Drawdown')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.suptitle('AAPL — Return-Based Features', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 3. Feature Correlation Analysis

Highly correlated features provide redundant information. We check for multicollinearity.

In [None]:
# Get all feature columns
all_features = present + present_ret + ['returns', 'log_returns', 'volume_norm']
all_features = [f for f in all_features if f in df.columns]

corr_matrix = df[all_features].dropna().corr()

# Find highly correlated pairs (|r| > 0.9)
high_corr = []
for i in range(len(corr_matrix)):
    for j in range(i+1, len(corr_matrix)):
        if abs(corr_matrix.iloc[i, j]) > 0.9:
            high_corr.append((
                corr_matrix.index[i], 
                corr_matrix.columns[j], 
                corr_matrix.iloc[i, j]
            ))

print(f"Feature pairs with |correlation| > 0.9: {len(high_corr)}")
print("\nThese are candidates for removal during feature selection:")
for f1, f2, r in sorted(high_corr, key=lambda x: abs(x[2]), reverse=True)[:15]:
    print(f"  {f1:25s} ↔ {f2:25s}  r = {r:+.3f}")

In [None]:
# Full correlation heatmap (clustered)
fig, ax = plt.subplots(figsize=(18, 16))
sns.heatmap(corr_matrix, cmap='RdBu_r', center=0, vmin=-1, vmax=1,
            xticklabels=True, yticklabels=True, ax=ax,
            cbar_kws={'shrink': 0.8})
ax.set_title('Feature Correlation Matrix (All 60 Features)', fontsize=14)
plt.xticks(fontsize=7, rotation=90)
plt.yticks(fontsize=7)
plt.tight_layout()
plt.show()

## 4. Feature Coverage & NaN Analysis

Some features need a warmup period (e.g., SMA 200 needs 200 days). We check how many valid rows each feature has.

In [None]:
# NaN count per feature (sorted by most NaN)
nan_counts = df[all_features].isnull().sum().sort_values(ascending=False)
nan_pct = nan_counts / len(df) * 100

fig, ax = plt.subplots(figsize=(14, 8))
nan_pct.plot(kind='barh', ax=ax, color='coral')
ax.set_xlabel('% Missing (NaN)')
ax.set_title('Feature Missing Values — Warmup Period Impact')
ax.axvline(5, color='red', linestyle='--', label='5% threshold')
ax.legend()
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

print(f"\nTotal rows: {len(df)}")
print(f"Rows with ALL features valid: {df[all_features].dropna().shape[0]}")
print(f"Data lost to warmup: {len(df) - df[all_features].dropna().shape[0]} rows "
      f"({(1 - df[all_features].dropna().shape[0]/len(df)):.1%})")

## Key Takeaways

1. **60 features** are computed per ticker — 33 technical + 27 return-based.
2. Some features are **highly correlated** (e.g., SMA 20 ≈ BB mid, ret_1d ≈ returns). Feature selection in the modelling phase will prune these.
3. **Warmup period**: The longest indicator (SMA 200) needs 200 trading days of history before it produces a value. After dropping NaN rows, we lose ~16% of data (roughly 200 of the ~1,260 trading days in 5 years) — this is expected and acceptable with 5 years of history.
4. **Volatility features** (ATR, rolling vol, BB width) cluster together, as do **momentum features** (RSI, stochastic, MACD). This suggests natural feature groupings.

Next: Notebook 03 — Modelling (walk-forward training, evaluation, SHAP analysis).