# 7. Model Training
Train two XGBoost models on the QQQ feature view: a regressor that predicts the next-day return and a classifier that predicts the up/down direction.

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, roc_auc_score, accuracy_score, classification_report
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from utils.hopsworks_helpers import get_feature_store, get_model_registry
import yaml
import joblib

# Load the project configuration (model hyperparameters, test split size).
# The encoding is given explicitly: without it, open() falls back to the
# platform's locale encoding, which can mis-decode a UTF-8 YAML file
# (e.g. on Windows) as soon as it contains non-ASCII characters.
with open('../config/config.yaml', 'r', encoding='utf-8') as f:
    # safe_load only builds plain Python objects (dicts, lists, scalars).
    config = yaml.safe_load(f)

## Load Feature View

In [None]:
# Connect to the feature store and fetch version 1 of the prediction feature view.
fs = get_feature_store()
feature_view = fs.get_feature_view('qqq_prediction_fv', version=1)

# Build the train/test split; the held-out fraction is read from the config.
# NOTE(review): the targets are described elsewhere as next-day values, so this
# looks like time-series data -- confirm whether this split is chronological or
# random, since a random split could leak future rows into training.
split = feature_view.train_test_split(test_size=config['model']['test_size'])
X_train, X_test, y_train, y_test = split

# Quick sanity report on what came back.
for label, frame in (("Training", X_train), ("Test", X_test)):
    print(f"{label} set size: {frame.shape}")
print(f"Features: {X_train.columns.tolist()}")

## Prepare Data

In [None]:
# Each label frame carries both targets; pull them apart so the regressor
# gets the return column and the classifier gets the direction column.
y_train_return, y_train_direction = y_train['target_return'], y_train['target_direction']
y_test_return, y_test_direction = y_test['target_return'], y_test['target_direction']

# Show the class counts of the direction label in each split.
for split_name, direction in (("Train", y_train_direction), ("Test", y_test_direction)):
    print(f"{split_name} target distribution: {direction.value_counts()}")

## Train Regression Model (Return Prediction)

In [None]:
# Hyperparameters for the return regressor live under model.xgboost.regression.
reg_params = config['model']['xgboost']['regression']
xgb_regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    **{
        key: reg_params[key]
        for key in ('n_estimators', 'max_depth', 'learning_rate', 'random_state')
    },
)

# Fit on the training split. The test split is passed as eval_set so its
# metric is reported during training (verbose=10); no early-stopping
# argument is passed here.
xgb_regressor.fit(
    X_train,
    y_train_return,
    eval_set=[(X_test, y_test_return)],
    verbose=10,
)

print("\nRegression model trained!")

## Evaluate Regression Model

In [None]:
# Predict returns for the held-out split.
y_pred_return = xgb_regressor.predict(X_test)

# Error metrics on the raw return values.
mae = mean_absolute_error(y_test_return, y_pred_return)
mse = mean_squared_error(y_test_return, y_pred_return)
rmse = np.sqrt(mse)

# Directional accuracy: how often the sign of the prediction (> 0 or not)
# matches the sign of the realised return.
actual_up = (y_test_return > 0).astype(int)
predicted_up = (y_pred_return > 0).astype(int)
directional_accuracy = accuracy_score(actual_up, predicted_up)

print(
    "Regression Metrics:",
    f"MAE: {mae:.6f}",
    f"RMSE: {rmse:.6f}",
    f"Directional Accuracy: {directional_accuracy:.4f}",
    sep="\n",
)

# Scatter of predicted vs. actual returns, with a dashed y = x reference line
# spanning the range of the actual returns.
lowest, highest = y_test_return.min(), y_test_return.max()
plt.figure(figsize=(12, 6))
plt.scatter(y_test_return, y_pred_return, alpha=0.5)
plt.plot([lowest, highest], [lowest, highest], 'r--')
plt.xlabel('Actual Return')
plt.ylabel('Predicted Return')
plt.title('Regression Model: Predicted vs Actual Returns')
plt.grid(True)
plt.show()

## Train Classification Model (Up/Down Prediction)

In [None]:
# Hyperparameters for the direction classifier live under
# model.xgboost.classification.
clf_params = config['model']['xgboost']['classification']
xgb_classifier = xgb.XGBClassifier(
    objective='binary:logistic',
    **{
        key: clf_params[key]
        for key in ('n_estimators', 'max_depth', 'learning_rate', 'random_state')
    },
)

# Fit on the training split. The test split is passed as eval_set so its
# metric is reported during training (verbose=10); no early-stopping
# argument is passed here.
xgb_classifier.fit(
    X_train,
    y_train_direction,
    eval_set=[(X_test, y_test_direction)],
    verbose=10,
)

print("\nClassification model trained!")

## Evaluate Classification Model

In [None]:
# Class probabilities and hard labels for the held-out split. Column 1 of
# predict_proba is taken as the score for AUC -- presumably the "Up" class
# (label 1); confirm against the label encoding of target_direction.
y_pred_proba = xgb_classifier.predict_proba(X_test)[:, 1]
y_pred_direction = xgb_classifier.predict(X_test)

# Accuracy uses the hard labels; AUC uses the probability scores.
accuracy = accuracy_score(y_test_direction, y_pred_direction)
auc = roc_auc_score(y_test_direction, y_pred_proba)

report = classification_report(
    y_test_direction,
    y_pred_direction,
    target_names=['Down', 'Up'],
)
print(
    "Classification Metrics:",
    f"Accuracy: {accuracy:.4f}",
    f"AUC-ROC: {auc:.4f}",
    "\nClassification Report:",
    report,
    sep="\n",
)

## Feature Importance Analysis

In [None]:
# Pair each training column with the regressor's importance score, then rank
# the features from most to least important.
feature_importance = pd.DataFrame(
    {
        'feature': X_train.columns,
        'importance': xgb_regressor.feature_importances_,
    }
)
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

# Horizontal bar chart of the 20 highest-ranked features.
top_twenty = feature_importance.head(20)
plt.figure(figsize=(10, 8))
sns.barplot(x='importance', y='feature', data=top_twenty)
plt.title('Top 20 Most Important Features (Regression Model)')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()

print("\nTop 10 Features:")
print(feature_importance.head(10))

## Save Models to Hopsworks Model Registry

In [None]:
# Persist both fitted estimators to disk before uploading them to the registry.
import os

# Create the output directory if it is missing; an existing one is fine.
os.makedirs('../models', exist_ok=True)

for estimator, target_path in (
    (xgb_regressor, '../models/qqq_regressor.pkl'),
    (xgb_classifier, '../models/qqq_classifier.pkl'),
):
    joblib.dump(estimator, target_path)

print("Models saved locally")

In [None]:
# Publish both locally saved models to the Hopsworks Model Registry, attaching
# the evaluation metrics computed earlier in the notebook.
mr = get_model_registry()

# Regression model: error metrics plus sign-agreement accuracy.
regression_metrics = {
    "mae": mae,
    "rmse": rmse,
    "directional_accuracy": directional_accuracy,
}
regressor_model = mr.sklearn.create_model(
    name="qqq_return_regressor",
    description="XGBoost model for predicting next-day QQQ return",
    metrics=regression_metrics,
)
regressor_model.save('../models/qqq_regressor.pkl')

# Classification model: accuracy and ROC AUC.
classification_metrics = {
    "accuracy": accuracy,
    "auc": auc,
}
classifier_model = mr.sklearn.create_model(
    name="qqq_direction_classifier",
    description="XGBoost model for predicting QQQ up/down direction",
    metrics=classification_metrics,
)
classifier_model.save('../models/qqq_classifier.pkl')

print("Models registered in Hopsworks!")