# Machine Learning for CDS Spread Prediction

This notebook implements ML models for spread prediction and regime detection.

In [1]:
# Setup
import sys
import os
# Prepend the parent of the current working directory to the import path so
# the `src` package imported below can be resolved.
# NOTE(review): assumes the kernel's working directory sits one level below
# the project root — confirm for other launch locations.
sys.path.insert(0, os.path.abspath('..'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# ML libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import warnings
# NOTE(review): this filter suppresses every warning category for the whole
# kernel session; consider narrowing it to specific categories/modules.
warnings.filterwarnings('ignore')

# Our modules
from src.data import BloombergCDSConnector, CDSDataManager
from src.models import Region, Market, Tenor

print('Modules imported successfully')

Modules imported successfully


## 1. Load Historical Data for ML

In [3]:
# Initialize the data connector and the manager that wraps it
connector = BloombergCDSConnector()
manager = CDSDataManager(connector)

# Request a one-year lookback window ending at the moment the cell runs.
# Because the window is anchored to datetime.now(), results differ between
# runs; the number of rows actually returned may also be smaller than the
# window implies.
end_date = datetime.now()
start_date = end_date - timedelta(days=365)
current_series = 43

# Resolve the ticker for the index/series/tenor, then fetch its history
ticker = connector.get_index_ticker('EU', 'IG', current_series, '5Y')
print(f'Loading data for {ticker}...')

hist_data = connector.get_historical_spreads(
    ticker,
    start_date=start_date,
    end_date=end_date,
    fields=['px_last', 'volume']
)

if hist_data.empty:
    # The feature-engineering cell is guarded by `hist_data.empty` and would
    # otherwise skip without any message, so report the empty result here.
    print(f'No data returned for {ticker} between '
          f'{start_date:%Y-%m-%d} and {end_date:%Y-%m-%d}; '
          'feature engineering and training will be skipped')
else:
    print(f'Loaded {len(hist_data)} days of data')
    print(hist_data.head())

Loading data for ITRX EUR CDSI S43 5Y...
Loaded 129 days of data
            px_last
2025-03-20   58.645
2025-03-21   60.163
2025-03-24   58.253
2025-03-25   57.523
2025-03-26   58.714


## 2. Feature Engineering

In [4]:
def create_features(df, window_sizes=(5, 10, 20), lags=(1, 2, 3, 5)):
    '''Create technical features for ML from a spread history.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a 'px_last' column. It is copied, so the
        caller's frame is never modified.
    window_sizes : iterable of int, default (5, 10, 20)
        Rolling-window lengths. Each one adds a 'ma_<w>' column (rolling mean
        of 'px_last') and a 'volatility_<w>' column (rolling standard
        deviation of 'returns').
    lags : iterable of int, default (1, 2, 3, 5)
        Shift distances. Each one adds a 'lag_<k>' column holding 'px_last'
        shifted by k rows.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with the added columns. Rows containing NaN in ANY
        column are dropped, which removes the warm-up rows of the longest
        window/lag and also any row where an input column is missing.

    Raises
    ------
    KeyError
        If `df` has no 'px_last' column.
    '''
    features = df.copy()

    # Price features: simple row-over-row percentage change of the spread
    features['returns'] = features['px_last'].pct_change()

    # Moving averages of the level and rolling volatility of the returns
    for window in window_sizes:
        features[f'ma_{window}'] = features['px_last'].rolling(window).mean()
        features[f'volatility_{window}'] = features['returns'].rolling(window).std()

    # Lag features: the spread level k rows earlier
    for lag in lags:
        features[f'lag_{lag}'] = features['px_last'].shift(lag)

    # Drop rows made incomplete by the rolling windows, shifts or input gaps
    features = features.dropna()

    return features

# Run the feature pipeline only when the historical download returned rows.
# The reported count covers every column of the returned frame, i.e. the
# input columns as well as the engineered ones.
if not hist_data.empty:
    ml_data = create_features(hist_data)
    column_names = ml_data.columns.tolist()
    print(f'Created {len(column_names)} features')
    print('Features:', column_names)

Created 12 features
Features: ['px_last', 'returns', 'ma_5', 'volatility_5', 'ma_10', 'volatility_10', 'ma_20', 'volatility_20', 'lag_1', 'lag_2', 'lag_3', 'lag_5']


## 3. Train Prediction Model

In [5]:
# Prepare data for ML
if 'ml_data' in globals() and not ml_data.empty:
    # Define target: next day spread (px_last shifted back by one row).
    # Attach it only once: recomputing the shift on an already-trimmed frame
    # would drop one more trailing row on every re-run of this cell.
    if 'target' not in ml_data.columns:
        ml_data = ml_data.assign(target=ml_data['px_last'].shift(-1)).dropna()

    # Select features: everything except the raw level, the target and volume
    feature_cols = [col for col in ml_data.columns
                    if col not in ['px_last', 'target', 'volume']]

    X = ml_data[feature_cols]
    y = ml_data['target']

    # Split data by position (first 80% train, last 20% test) without
    # shuffling, so the test rows always come after the training rows.
    split_point = int(len(X) * 0.8)
    X_train = X.iloc[:split_point]
    X_test = X.iloc[split_point:]
    y_train = y.iloc[:split_point]
    y_test = y.iloc[split_point:]

    # Scale features: fit on the training slice only, then apply the same
    # transform to the test slice so no test statistics leak into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train model (fixed random_state for repeatable results)
    model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
    model.fit(X_train_scaled, y_train)

    # Predictions on the held-out slice
    test_pred = model.predict(X_test_scaled)

    # Metrics
    mse = mean_squared_error(y_test, test_pred)
    mae = mean_absolute_error(y_test, test_pred)
    r2 = r2_score(y_test, test_pred)

    print('Model Performance:')
    print(f'RMSE: {np.sqrt(mse):.3f} bps')
    print(f'MAE: {mae:.3f} bps')
    print(f'R²: {r2:.3f}')

Model Performance:
RMSE: 1.610 bps
MAE: 1.244 bps
R²: 0.158
