# Thermal Coal Price Forecasting
## ML Capstone Project - Demo Notebook
**Author:** My-Linh To  
**Date:** January 2026

---
## 1. Setup & Data Loading

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, accuracy_score
import warnings
warnings.filterwarnings('ignore')

print("Libraries loaded successfully!")

In [None]:
# Load the engineered feature table, parsing the `date` column as datetimes.
# Rows are put in chronological order (stable sort, so same-day rows keep
# their file order) because the walk-forward split used later slices by row
# position and is only meaningful on time-ordered data.
df = (
    pd.read_csv('../data/processed/coal_features.csv', parse_dates=['date'])
    .sort_values('date', kind='mergesort')
    .reset_index(drop=True)
)

# Every column except the date and the target return is a model input, so the
# count excludes both (subtracting only the date column overstates it by one).
n_feature_cols = sum(col not in ('date', 'coal_china_yzcm_ret') for col in df.columns)

print(f"Dataset shape: {df.shape}")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
print(f"\nNumber of features: {n_feature_cols}")

In [None]:
# Show the first five rows as a rich table to sanity-check the load
df.head(n=5)

In [None]:
# Peek at the first 20 column names to get a feel for the feature set
print("Sample features:")
first_columns = list(df.columns[:20])
print(first_columns)

---
## 2. Feature Engineering Examples

In [None]:
# Example: lag features
# Shifting the return series down by k rows puts the value from k rows earlier
# on the current row, e.g. yesterday's return next to today's.

example_df = df[['date', 'coal_china_yzcm_ret']].copy()

# lag_1 = previous row's return, lag_5 = return from five rows back
for lag in (1, 5):
    example_df[f'lag_{lag}'] = example_df['coal_china_yzcm_ret'].shift(lag)

print("Lag features example:")
example_df.head(10)

In [None]:
# Example: rolling-window statistics on the daily return
# Moving averages capture momentum/trend; the rolling std tracks volatility.
# NOTE(review): each window ends on (and includes) the current row, so these
# values contain the same-day return — confirm the production features are
# lagged before they are used to predict that day's return.

daily_ret = example_df['coal_china_yzcm_ret']
window_21 = daily_ret.rolling(21)

example_df['ma_21'] = window_21.mean()
example_df['ma_63'] = daily_ret.rolling(63).mean()  # ~3 months
example_df['volatility_21'] = window_21.std()

print("Rolling features example:")
example_df.dropna().head(10)

In [None]:
# The 63-day moving average ended up among the most important features.
# NOTE(review): 151 is a hard-coded figure, not derived from the loaded data —
# confirm it still matches the feature table.
print("Total engineered features in our model: 151")

---
## 3. Model Training with Walk-Forward Validation

In [None]:
# Prepare features and target.
# Every column other than the date and the target return is a model input.
# NOTE(review): the target is the same-row return, so each feature column must
# only use information available before that day — confirm against the
# feature-building pipeline.
feature_cols = [col for col in df.columns if col not in ['date', 'coal_china_yzcm_ret']]

# Drop rows with a missing value in any feature OR in the target. Filtering on
# the features alone would let NaN targets through into y, which scikit-learn
# rejects at fit time.
model_rows = df.dropna(subset=feature_cols + ['coal_china_yzcm_ret'])
X = model_rows[feature_cols]
y = model_rows['coal_china_yzcm_ret']

print(f"Features: {X.shape[1]}")
print(f"Samples: {X.shape[0]}")

In [None]:
# Walk-forward cross-validation
# Each fold trains on earlier rows and tests on the rows that follow them, so
# the model is never evaluated on data that precedes its training window.

tscv = TimeSeriesSplit(n_splits=5)

print("Walk-Forward CV Splits:")
print("=" * 50)
for fold_no, (fold_train, fold_test) in enumerate(tscv.split(X), start=1):
    print(f"Fold {fold_no}: Train on {len(fold_train)} samples, Test on {len(fold_test)} samples")

In [None]:
# Gradient Boosting regressor; random_state is fixed so reruns give the same fit
gb_params = dict(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42)
gb_model = GradientBoostingRegressor(**gb_params)

# For the demo, keep only the final walk-forward fold
all_folds = list(tscv.split(X))
train_idx, test_idx = all_folds[-1]
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

print(f"Training on {len(X_train)} samples...")
gb_model.fit(X_train, y_train)
print("Model trained!")

---
## 4. Model Evaluation

In [None]:
# Predict returns for the held-out fold
y_pred = gb_model.predict(X_test)

# Root-mean-squared error, expressed in the same units as the target
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse:.4f}")

In [None]:
# Directional accuracy: how often the predicted sign matches the realised
# sign — the metric that matters most for a trading signal.
# A value of exactly zero is labelled 0 ("not up") on both sides.
y_true_direction = (y_test > 0).astype(int)
y_pred_direction = (y_pred > 0).astype(int)
directional_accuracy = accuracy_score(y_true_direction, y_pred_direction)

# Gap to a 50% coin flip, in percentage points
edge_pp = (directional_accuracy - 0.5) * 100
print(f"Directional Accuracy: {directional_accuracy:.1%}")
print(f"\nThis is {edge_pp:.1f} percentage points above random guessing!")

In [None]:
# Feature importance: pair each input column with the importance score the
# fitted model assigned to it, then rank from most to least important.
importance_unsorted = pd.DataFrame({
    'feature': feature_cols,
    'importance': gb_model.feature_importances_,
})
importance_df = importance_unsorted.sort_values('importance', ascending=False)

print("Top 10 Most Important Features:")
print("=" * 50)
importance_df.head(10)

In [None]:
# Horizontal bar chart of the 20 highest-ranked features
top_20 = importance_df.head(20)
bar_positions = range(len(top_20))

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bar_positions, top_20['importance'].values)
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_20['feature'].values)
ax.set_xlabel('Importance')
ax.set_title('Top 20 Features - Gradient Boosting Model')
ax.invert_yaxis()  # put the most important feature at the top
fig.tight_layout()
plt.show()

---
## 5. Trading Strategy Backtest

In [None]:
# Long/flat rule driven by the sign of the predicted return:
#   prediction > 0  -> hold the asset for that day (signal = 1)
#   otherwise       -> stay in cash (signal = 0)
# NOTE(review): compounding with (1 + r) treats the target as a simple (not
# log) return, and no transaction costs are charged — confirm both.

signals = (y_pred > 0).astype(int)  # 1 = long, 0 = cash

# The strategy earns the realised return only on days it is long
realised = y_test.values
strategy_returns = signals * realised

# Growth of one unit of capital under each approach
cumulative_strategy = (1 + strategy_returns).cumprod()
cumulative_buyhold = (1 + realised).cumprod()

strategy_total_pct = (cumulative_strategy[-1] - 1) * 100
buyhold_total_pct = (cumulative_buyhold[-1] - 1) * 100
print(f"Strategy Total Return: {strategy_total_pct:.1f}%")
print(f"Buy & Hold Return: {buyhold_total_pct:.1f}%")

In [None]:
# Annualised Sharpe ratio of the strategy's daily returns:
# sqrt(252) * mean / std, with no risk-free rate subtracted.
# NOTE(review): if the strategy never trades, std is 0 and this yields nan.
annualisation = np.sqrt(252)
mean_daily = strategy_returns.mean()
vol_daily = strategy_returns.std()
sharpe_ratio = annualisation * mean_daily / vol_daily

print(f"Sharpe Ratio: {sharpe_ratio:.2f}")
print("\n(Above 1.0 is considered good, above 2.0 is excellent)")

In [None]:
# Equity curves: both cumulative-growth series scaled to a $10,000 start,
# with a dashed reference line at the starting capital
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(cumulative_strategy * 10000, label='ML Strategy', linewidth=2)
ax.plot(cumulative_buyhold * 10000, label='Buy & Hold', linewidth=2, alpha=0.7)
ax.axhline(y=10000, color='gray', linestyle='--', alpha=0.5)
ax.set_xlabel('Trading Days')
ax.set_ylabel('Portfolio Value ($)')
ax.set_title('Trading Strategy Backtest (Starting Capital: $10,000)')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

---
## 6. Summary

In [None]:
print("="*60)
print("         COAL PRICE FORECASTING - RESULTS SUMMARY")
print("="*60)
print(f"\nüìä Data: {df.shape[0]} observations, {X.shape[1]} features")
print(f"\nüéØ Directional Accuracy: {directional_accuracy:.1%}")
print(f"   (vs 50% random baseline)")
print(f"\nüìà Sharpe Ratio: {sharpe_ratio:.2f}")
print(f"\nüèÜ Best Model: Gradient Boosting")
print(f"\nüîë Top Predictive Features:")
for i, row in importance_df.head(5).iterrows():
    print(f"   ‚Ä¢ {row['feature']}: {row['importance']:.1%}")
print("\n" + "="*60)

---
### Thank you!
**GitHub:** github.com/apocalip2001/coal-price-forecasting