# XGBoost – Next-Day Direction Classifier

Gradient-boosted trees typically outperform single trees and logistic regression on tabular financial data.  
Key advantages for this dataset:
- **Handles missing values natively** — no imputation required for sparse macro / sentiment columns.
- **Non-linear feature interactions** captured automatically.
- **SHAP values** provide per-prediction explanations.

**Time-based split:** train `< 2023-01-01`, test `≥ 2023-01-01`

> Install requirements if needed:  
> `pip install xgboost shap`

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, classification_report

# SHAP is an optional dependency: remember whether the import succeeded so the
# SHAP section further down can be skipped instead of failing.
try:
    import shap
except ImportError:
    SHAP_AVAILABLE = False
    print('SHAP not installed – skipping SHAP section. Run: pip install shap')
else:
    SHAP_AVAILABLE = True
    print('SHAP available – full feature explanation will run.')

# Notebook-wide plotting defaults (style sheet + default figure size).
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (12, 5)})

## 1. Load Data

In [None]:
# Read the merged dataset; the 'date' column is parsed into datetimes on load.
df = pd.read_csv('../data/merged_dataset.csv', parse_dates=['date'])

# Sanity summary: frame dimensions, then the first and last calendar day present.
print(f'Shape: {df.shape}')
first_day = df['date'].min().date()
last_day = df['date'].max().date()
print(f'Date range: {first_day} – {last_day}')

# Last expression of the cell: show the first three rows.
df.head(3)

## 2. Feature Engineering

XGBoost handles `NaN` natively, so sparse macro and sentiment columns are **not imputed** here — they're passed as-is, letting XGBoost learn the missing-value splits.

In [None]:
# Order rows chronologically within each ticker so that the per-ticker
# shift / rolling / pct_change calls below act on consecutive observations.
df = df.sort_values(['ticker', 'date']).reset_index(drop=True)

# ── Target: next-day direction ────────────────────────────────────────────────
# next_return is the following row's daily_return within the same ticker. It is
# NaN on each ticker's last row and wherever the following return is missing.
df['next_return'] = df.groupby('ticker')['daily_return'].shift(-1)
# `NaN > 0` evaluates to False, so a bare comparison would silently label rows
# with an unknown outcome as 0 ("Down"). Mask those rows to NaN instead so that
# dropna(subset=['target']) can actually remove them. The column is therefore
# float (1.0 / 0.0 / NaN); cast to int only after the NaN rows are dropped.
next_return_known = df['next_return'].notna()
df['target'] = (df['next_return'] > 0).astype(float).where(next_return_known)

# ── Lagged returns ────────────────────────────────────────────────────────────
# lag_return_k is the k-th most recent daily return as of the current row:
# lag 1 is the current row's own return, lag 2 is one row back, lag 5 is four
# rows back (all within the same ticker).
df['lag_return_1'] = df['daily_return']
df['lag_return_2'] = df.groupby('ticker')['daily_return'].shift(1)
df['lag_return_5'] = df.groupby('ticker')['daily_return'].shift(4)

# ── Cumulative multi-day returns ──────────────────────────────────────────────
# Percentage change of adj_close over the previous 5 / 10 rows of the ticker.
df['cum_return_5']  = df.groupby('ticker')['adj_close'].transform(
    lambda x: x.pct_change(5)
)
df['cum_return_10'] = df.groupby('ticker')['adj_close'].transform(
    lambda x: x.pct_change(10)
)

# ── Price vs 20-day MA ────────────────────────────────────────────────────────
# Relative distance of close from rolling_mean_20; a zero denominator becomes
# NaN (and any remaining ±inf is cleared) rather than propagating infinities.
df['price_to_ma20'] = (
    df['close'] / df['rolling_mean_20'].replace(0, np.nan)
).replace([np.inf, -np.inf], np.nan) - 1

# ── Intraday range ────────────────────────────────────────────────────────────
# High-low spread as a fraction of close (NaN when close is 0).
df['hl_range'] = (df['high'] - df['low']) / df['close'].replace(0, np.nan)

# ── Overnight gap ─────────────────────────────────────────────────────────────
# Open relative to the previous row's close of the same ticker.
df['prev_close'] = df.groupby('ticker')['close'].shift(1)
df['oc_gap'] = (
    (df['open'] - df['prev_close']) / df['prev_close'].replace(0, np.nan)
).replace([np.inf, -np.inf], np.nan)

# ── Abnormal volume ───────────────────────────────────────────────────────────
# Volume relative to its trailing 20-row mean (window includes the current row;
# min_periods=1 so early rows still get a value), clipped to [0, 10].
df['vol_20ma'] = df.groupby('ticker')['volume'].transform(
    lambda x: x.rolling(20, min_periods=1).mean()
)
df['vol_norm'] = (df['volume'] / df['vol_20ma'].replace(0, np.nan)).clip(0, 10)

# ── Realised volatility (rolling 20d std of log-return) ──────────────────────
# Needs at least 5 observations; scaled by sqrt(252) (presumably annualisation
# over 252 trading days).
df['rv_20'] = df.groupby('ticker')['log_return'].transform(
    lambda x: x.rolling(20, min_periods=5).std() * np.sqrt(252)
)

# ── News flag + log news count ────────────────────────────────────────────────
# A missing news_count is treated as zero articles for both features.
df['has_news']       = (df['news_count'].fillna(0) > 0).astype(float)
df['log_news_count'] = np.log1p(df['news_count'].fillna(0))

# ── Log market cap ────────────────────────────────────────────────────────────
# NOTE(review): a missing Marketcap is mapped to 0 here (log1p(0) == 0) rather
# than left as NaN, unlike the other sparse columns — confirm this is intended.
df['log_marketcap'] = np.log1p(df['Marketcap'].fillna(0))

# ── Sector encoding ───────────────────────────────────────────────────────────
# Integer category codes; pandas assigns -1 to rows whose Sector is missing.
df['Sector_encoded'] = df['Sector'].astype('category').cat.codes

# NOTE: VIX, Yield_Spread, Regime_GMM, sentiment columns are left as-is (NaN);
# XGBoost handles missing values natively via learned split directions.

print(f'Shape after feature engineering: {df.shape}')

## 3. Feature Selection & Train / Test Split

In [None]:
# Model inputs, grouped by theme. The concatenation order is the column order
# of every feature matrix built from FEATURE_COLS.
FEATURE_COLS = (
    # Price / return signals
    ['lag_return_1', 'lag_return_2', 'lag_return_5',
     'cum_return_5', 'cum_return_10',
     'rolling_std_20', 'rv_20',
     'price_to_ma20']
    # Intraday / volume
    + ['hl_range', 'oc_gap', 'vol_norm']
    # Macro
    + ['VIX', 'Yield_Spread', 'Regime_GMM']
    # Sentiment / news
    + ['sentiment_mean', 'sentiment_ratio', 'has_news', 'log_news_count']
    # Fundamentals
    + ['log_marketcap', 'Revenuegrowth', 'Weight']
    # Categorical
    + ['Sector_encoded']
)

# Keep features, label and date. Rows are removed only when the target or one
# of the two core lagged returns is missing; NaNs in other features are kept.
model_df = (
    df[FEATURE_COLS + ['target', 'date']]
    .dropna(subset=['target'])
    .dropna(subset=['lag_return_1', 'lag_return_2'])
)

print(f'Rows: {len(model_df):,}')
print(f'Class balance — Up: {model_df["target"].mean():.2%}')

# Chronological split: strictly before SPLIT_DATE is train, the rest is test.
SPLIT_DATE = '2023-01-01'
train = model_df.loc[model_df['date'] < SPLIT_DATE]
test = model_df.loc[model_df['date'] >= SPLIT_DATE]

X_train = train[FEATURE_COLS]
y_train = train['target'].astype(int)
X_test = test[FEATURE_COLS]
y_test = test['target'].astype(int)

# Accuracy of always predicting the majority class of the test labels.
up_share = y_test.mean()
naive = max(up_share, 1 - up_share)
print(f'Train: {len(train):,}  |  Test: {len(test):,}')
print(f'Naive baseline accuracy: {naive:.4f}')

## 4. Train XGBoost

Early stopping on the validation log-loss (`eval_metric='logloss'`, `early_stopping_rounds=30`) limits overfitting. The validation slice is the last ~10% of the training rows by date; the model is fit on the earlier ~90% only.

In [None]:
# Hold out the most recent slice of the training period for early stopping:
# rows dated on/after the 0.9 quantile of the training dates form `val`, the
# earlier rows form `tr`.
val_cutoff = train['date'].quantile(0.9)
tr = train.loc[train['date'] < val_cutoff]
val = train.loc[train['date'] >= val_cutoff]

X_tr = tr[FEATURE_COLS]
y_tr = tr['target'].astype(int)
X_val = val[FEATURE_COLS]
y_val = val['target'].astype(int)

# Hyper-parameters collected in one place, then unpacked into the classifier.
xgb_params = {
    'n_estimators': 1000,          # ceiling only; early stopping picks the actual count
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 50,        # large minimum leaf weight as regularisation
    'reg_alpha': 0.1,              # L1 penalty
    'reg_lambda': 1.0,             # L2 penalty
    'scale_pos_weight': 1,
    'eval_metric': 'logloss',
    'early_stopping_rounds': 30,   # stop after 30 rounds without validation improvement
    'random_state': 42,
    'n_jobs': -1,
    'verbosity': 0,
}
model = XGBClassifier(**xgb_params)

# Fit on the earlier slice; the held-out slice drives early stopping and the
# evaluation metric is reported every 100 rounds.
model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=100)
print(f'\nBest iteration: {model.best_iteration}')

## 5. Evaluation

In [None]:
# Hard class predictions for the held-out test period.
y_pred = model.predict(X_test)

# Headline accuracy next to the majority-class baseline, followed by the
# per-class precision / recall / F1 report.
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy : {test_accuracy:.4f}  (naive baseline: {naive:.4f})')
print()
class_report = classification_report(y_test, y_pred, target_names=['Down', 'Up'])
print(class_report)

## 6. Built-in Feature Importance

Three XGBoost importance types:
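- **weight** – number of times a feature is used to split the data across all trees
- **gain** – average gain (loss reduction) across the splits that use the feature (usually the most informative)
- **cover** – average coverage across the splits that use the feature, where coverage is the sum of second-order gradients (hessians) of the samples reaching the split — roughly, how many samples the split affects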

In [None]:
# One horizontal bar chart per importance type, side by side.
importance_types = ['weight', 'gain', 'cover']
fig, axes = plt.subplots(1, 3, figsize=(20, 7))

for ax, imp_type in zip(axes, importance_types):
    # get_score returns {feature name: score} for the requested importance type.
    scores = model.get_booster().get_score(importance_type=imp_type)
    fi_df = pd.DataFrame.from_dict(scores, orient='index', columns=['importance'])
    # Ascending sort so the largest bar is drawn at the top of the barh chart.
    fi_df = fi_df.sort_values('importance', ascending=True)

    ax.barh(fi_df.index, fi_df['importance'], color='steelblue')
    ax.set_title(f'Feature Importance ({imp_type})')
    ax.set_xlabel('Importance')

plt.tight_layout()
plt.show()

## 7. SHAP Values – Explainability

SHAP (SHapley Additive exPlanations) gives each feature a per-prediction contribution.  
The **beeswarm plot** shows both the magnitude and direction of impact across a random sample of up to 10,000 test rows (sampled for speed).

In [None]:
if not SHAP_AVAILABLE:
    print('SHAP not available. Install with: pip install shap')
else:
    # Explain a fixed-seed random subset of the test rows (at most 10,000)
    # rather than the whole test set, to keep the computation fast.
    n_explain = min(10_000, len(X_test))
    shap_sample = X_test.sample(n_explain, random_state=42)

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(shap_sample)

    # (figure size, extra summary_plot kwargs, title) for each chart:
    # the default summary plot first, then the bar chart of mean |SHAP|.
    shap_plots = [
        ((10, 8), {}, 'SHAP Beeswarm – Feature Impact on "Up" Prediction'),
        ((10, 6), {'plot_type': 'bar'}, 'Mean |SHAP| – Global Feature Importance'),
    ]
    for fig_size, plot_kwargs, plot_title in shap_plots:
        plt.figure(figsize=fig_size)
        # show=False so the title and layout can be applied before rendering.
        shap.summary_plot(shap_values, shap_sample, show=False, **plot_kwargs)
        plt.title(plot_title)
        plt.tight_layout()
        plt.show()

## 8. Accuracy by Sector

In [None]:
# Overall test accuracy, drawn later as a reference line.
acc = accuracy_score(y_test, y_pred)

# One row per test observation: true label plus a 0/1 "prediction was right" flag.
results = test[['date', 'Sector_encoded']].assign(y_true=y_test.values)
results['correct'] = (results['y_true'] == y_pred).astype(int)

# Translate the integer codes back to sector names using the categories of the
# same categorical conversion of df['Sector'] that produced the codes.
sector_cat = df['Sector'].astype('category').cat
code_to_sector = {code: name for code, name in enumerate(sector_cat.categories)}
results['Sector'] = results['Sector_encoded'].map(code_to_sector)

# Mean of the correctness flag per sector = per-sector accuracy, best first.
sector_acc = (
    results.groupby('Sector')['correct']
    .mean()
    .sort_values(ascending=False)
)

plt.figure(figsize=(12, 5))
# Green bars meet or beat the naive baseline, red bars fall short of it.
colors = ['seagreen' if beats else 'salmon' for beats in sector_acc.ge(naive)]
sector_acc.plot.bar(color=colors)

# Dashed reference lines: naive baseline and overall model accuracy.
reference_lines = [
    (naive, 'black', f'Naive baseline ({naive:.3f})'),
    (acc, 'blue', f'Overall accuracy ({acc:.3f})'),
]
for level, line_color, line_label in reference_lines:
    plt.axhline(level, color=line_color, ls='--', lw=0.8, label=line_label)

plt.title('XGBoost Accuracy by Sector (Test Set)')
plt.ylabel('Accuracy')
plt.xticks(rotation=45, ha='right')
plt.legend()
plt.tight_layout()
plt.show()