# XGBoost Classification with Medical Claim Embeddings

This notebook demonstrates training XGBoost models using embeddings generated from medical claims data.

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
from datetime import datetime

# XGBoost imports
import xgboost as xgb
from xgboost import XGBClassifier

# Scikit-learn imports
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, classification_report
)
from sklearn.preprocessing import StandardScaler

# Local imports
import sys
sys.path.append('..')
from pipelines.embedding_pipeline import EmbeddingPipeline
from models.config_models import PipelineConfig
from utils.logging_utils import get_logger

# Set random seeds
# Seeds NumPy's global RNG. The split, CV folds and XGBoost estimators below
# additionally receive random_state=42 explicitly.
np.random.seed(42)

# Configure plotting
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# matplotlib release (3.6+) - confirm against the pinned environment.
plt.style.use('seaborn-v0_8')
# Render figures inline in the notebook output.
%matplotlib inline

## 1. Generate or Load Embeddings

In [None]:
# Configuration for embedding pipeline.
# Both paths are relative to the notebook's working directory.
data_path = Path('data/medical_claims_complete.csv')
embeddings_path = Path('outputs/xgboost_embeddings.csv')

# Reuse cached embeddings when they exist so the pipeline is not re-run on
# every execution of the notebook.
if embeddings_path.exists():
    print("Loading existing embeddings...")
    embeddings_df = pd.read_csv(embeddings_path)
else:
    print("Generating new embeddings...")

    # Fail fast with a clear message before any pipeline work starts.
    if not data_path.exists():
        raise FileNotFoundError(f"Claims data not found: {data_path.absolute()}")

    # Create the cache directory *before* running the pipeline so the result
    # cannot be lost to a failed write at the very end.
    embeddings_path.parent.mkdir(parents=True, exist_ok=True)

    config = {
        'pipeline': {
            'job_name': 'xgboost_embeddings',
            'log_level': 'INFO'
        },
        'data': {
            # The pipeline receives an absolute path resolved against the cwd.
            'data_path': str(data_path.absolute()),
            'claim_column': 'claim',
            'label_column': 'label',
            'mcid_column': 'mcid'
        },
        'llm': {
            'model_url': 'http://localhost:8000',
            'batch_size': 32,
            'max_retries': 3
        },
        'outputs': {
            'output_dir': 'outputs',
            'save_embeddings': True
        }
    }

    pipeline_config = PipelineConfig(**config)
    embedding_pipeline = EmbeddingPipeline(pipeline_config)
    # NOTE(review): assumes run() returns a DataFrame with 'embedding_*' and
    # 'label' columns, as the cells below require - confirm against the pipeline.
    embeddings_df = embedding_pipeline.run()
    embeddings_df.to_csv(embeddings_path, index=False)

print(f"Embeddings shape: {embeddings_df.shape}")
print(f"Columns: {embeddings_df.columns.tolist()[:10]}...")

## 2. Prepare Data for Training

In [None]:
# Extract features and labels.
# Feature columns are identified purely by the 'embedding_' name prefix.
embedding_cols = [col for col in embeddings_df.columns if col.startswith('embedding_')]
if not embedding_cols:
    raise ValueError("No columns starting with 'embedding_' found in embeddings_df")
if embeddings_df['label'].isna().any():
    raise ValueError("Column 'label' contains missing values")

X = embeddings_df[embedding_cols].values
# Cast to int: labels read back from CSV can arrive as floats, and
# np.bincount below only accepts non-negative integers.
y = embeddings_df['label'].astype(int).values

# Stratified 80/20 split so both partitions keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Class distribution - Train: {np.bincount(y_train)}")
print(f"Class distribution - Test: {np.bincount(y_test)}")

## 3. Train Basic XGBoost Model

In [None]:
# Create and train basic XGBoost model.
# eval_metric and early_stopping_rounds are estimator parameters: passing them
# to fit() was deprecated in XGBoost 1.6 and removed in 2.0. The deprecated
# use_label_encoder flag is dropped for the same reason.
xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    objective='binary:logistic',
    # Both metrics are recorded per round; the history plot reads 'logloss'
    # and 'auc' from evals_result().
    eval_metric=['logloss', 'auc'],
    early_stopping_rounds=10,
    random_state=42
)

# Train with evaluation.
# validation_0 = training split, validation_1 = test split.
# NOTE(review): early stopping monitors the last eval set, i.e. the test split,
# so the test metrics reported below are optimistic. Consider carving a
# separate validation split out of the training data.
eval_set = [(X_train, y_train), (X_test, y_test)]
xgb_model.fit(
    X_train, y_train,
    eval_set=eval_set,
    verbose=True
)

# Make predictions (hard labels and positive-class probabilities)
y_pred = xgb_model.predict(X_test)
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

# Calculate metrics
metrics = {
    'accuracy': accuracy_score(y_test, y_pred),
    'precision': precision_score(y_test, y_pred),
    'recall': recall_score(y_test, y_pred),
    'f1': f1_score(y_test, y_pred),
    # AUC is computed from probabilities, not thresholded labels.
    'auc_roc': roc_auc_score(y_test, y_pred_proba)
}

print("\nBasic Model Performance:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

## 4. Hyperparameter Tuning

In [None]:
# Define parameter grid.
# 3 * 3 * 3 * 2 * 2 = 108 combinations, each fitted on 5 folds (540 fits),
# plus one final refit of the best combination.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Create XGBoost classifier.
# The deprecated use_label_encoder flag is omitted: recent XGBoost releases
# ignore it and warn on every fit.
xgb_tune = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42
)

# Grid search with stratified, shuffled cross-validation on the training split
# only; model selection is by ROC AUC.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    xgb_tune,
    param_grid,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

print("Starting hyperparameter tuning...")
grid_search.fit(X_train, y_train)

# Best parameters
print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

# Evaluate the refitted best model on the held-out test split
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
y_pred_proba_best = best_model.predict_proba(X_test)[:, 1]

metrics_best = {
    'accuracy': accuracy_score(y_test, y_pred_best),
    'precision': precision_score(y_test, y_pred_best),
    'recall': recall_score(y_test, y_pred_best),
    'f1': f1_score(y_test, y_pred_best),
    'auc_roc': roc_auc_score(y_test, y_pred_proba_best)
}

print("\nTuned Model Performance:")
for metric, value in metrics_best.items():
    print(f"{metric}: {value:.4f}")

## 5. Feature Importance Analysis

In [None]:
# Get feature importance and rank features from most to least important.
importance = best_model.feature_importances_
indices = np.argsort(importance)[::-1]

# Plot the top features. Clamp to the number of available features so the bar
# positions and heights always have matching lengths.
top_n = min(20, len(importance))
top_indices = indices[:top_n]

fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(range(top_n), importance[top_indices])
# Label each bar with the actual feature index rather than its rank position.
ax.set_xticks(range(top_n))
ax.set_xticklabels(top_indices, rotation=90)
ax.set_xlabel('Feature Index')
ax.set_ylabel('Feature Importance')
ax.set_title(f'Top {top_n} Most Important Features')
fig.tight_layout()
plt.show()

# Feature importance distribution across all features
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(importance, bins=50, edgecolor='black')
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Count')
ax.set_title('Distribution of Feature Importances')
fig.tight_layout()
plt.show()

print(f"Top 10 feature indices: {indices[:10]}")
print(f"Top 10 importance scores: {importance[indices[:10]]}")

## 6. Model Evaluation and Visualization

In [None]:
# Confusion matrix of the tuned model's hard predictions on the test split.
cm = confusion_matrix(y_test, y_pred_best)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
fig.tight_layout()
plt.show()

# ROC curve from the tuned model's positive-class probabilities, drawn
# against the chance diagonal.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba_best)
roc_label = f'ROC curve (AUC = {metrics_best["auc_roc"]:.3f})'
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=roc_label)
ax.plot([0, 1], [0, 1], 'k--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
fig.tight_layout()
plt.show()

# Per-class precision / recall / F1 summary.
print("\nDetailed Classification Report:")
report_text = classification_report(y_test, y_pred_best)
print(report_text)

## 7. Training History Visualization

In [None]:
# Per-round evaluation history recorded while fitting the baseline model.
eval_history = xgb_model.evals_result()
rounds = range(len(eval_history['validation_0']['logloss']))

# validation_0 / validation_1 follow the order of the eval_set given to fit().
splits = (('validation_0', 'Train'), ('validation_1', 'Test'))
# (metric key, y-axis label, panel title) for each subplot, left to right.
panels = (
    ('logloss', 'Log Loss', 'XGBoost Log Loss'),
    ('auc', 'AUC', 'XGBoost AUC'),
)

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
for ax, (metric_key, y_label, panel_title) in zip(axes, panels):
    for split_key, split_label in splits:
        ax.plot(rounds, eval_history[split_key][metric_key], label=split_label)
    ax.set_xlabel('Boosting Round')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.legend()

plt.tight_layout()
plt.show()

## 8. Save Model and Results

In [None]:
# Create output directory
output_dir = Path('outputs/xgboost_model')
output_dir.mkdir(parents=True, exist_ok=True)

# Take a single timestamp for the whole run so the model file, the metrics
# file and the recorded 'timestamp' field always refer to the same instant
# and the two artifacts can be paired by name.
run_time = datetime.now()
run_stamp = run_time.strftime("%Y%m%d_%H%M%S")

# Save the tuned model
model_path = output_dir / f'xgboost_model_{run_stamp}.json'
best_model.save_model(model_path)
print(f"Model saved to: {model_path}")

# Save metrics.
# Array-derived values are converted with .tolist() / float() so the
# dictionary is JSON-serialisable.
results = {
    'model_type': 'XGBoost',
    'timestamp': run_time.isoformat(),
    'best_parameters': grid_search.best_params_,
    'cv_score': float(grid_search.best_score_),
    'test_metrics': metrics_best,
    'feature_importance': {
        'top_features': indices[:20].tolist(),
        'importance_scores': importance[indices[:20]].tolist()
    },
    'data_info': {
        'n_train': len(X_train),
        'n_test': len(X_test),
        'n_features': X_train.shape[1],
        'class_distribution': {
            'train': np.bincount(y_train).tolist(),
            'test': np.bincount(y_test).tolist()
        }
    }
}

metrics_path = output_dir / f'xgboost_metrics_{run_stamp}.json'
with open(metrics_path, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)
print(f"Metrics saved to: {metrics_path}")

# Display summary
print("\n=== XGBoost Model Summary ===")
print(f"Best AUC-ROC: {metrics_best['auc_roc']:.4f}")
print(f"Best F1 Score: {metrics_best['f1']:.4f}")
print(f"Number of trees: {best_model.n_estimators}")
print(f"Max depth: {best_model.max_depth}")
print(f"Learning rate: {best_model.learning_rate}")