# XGBoost Model Evaluation

This notebook loads the preprocessed data, retrains the XGBoost model (for self-containment of evaluation), and then evaluates its performance using various metrics, including ROC analysis and threshold tuning for specific class optimization.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

import utils # Import the new utils module

In [None]:
# Load the preprocessed dataset through the project's utils helper.
# No path is passed, so the helper's own default location is used
# (the original author notes it as 'data/preprocessed_data.parquet' — TODO confirm in utils).
df = utils.load_preprocessed_data() # Default path 'data/preprocessed_data.parquet'



In [None]:
# Let the utils helper carve the frame into train/test features and targets.
# (Per the original author's note, the helper also takes care of reporting the
# resulting shapes, so nothing is printed here.)
X_train_scaled, y_train, X_test_scaled, y_test = utils.split_data_features_target(df)

# Run both target vectors through the same conversion helper, train first, then test.
y_train, y_test = map(utils.convert_target_variable, (y_train, y_test))


In [None]:
# Derive scale_pos_weight from the training-set class balance.
# The value is the ratio of negative (0) to positive (1) samples, so that the
# minority positive class is up-weighted during training. It stays at the
# neutral default of 1 whenever the ratio cannot be computed.
scale_pos_weight_val = 1 # Default

# Accept any signed/unsigned integer dtype (NumPy kinds 'i' and 'u'), not just
# int64/int32: an int8, uint8 or nullable-integer target would otherwise skip the
# calculation and silently train with an unweighted positive class.
if y_train.dtype.kind in ('i', 'u'): # Make sure y_train is the converted version
    count_class_0 = (y_train == 0).sum()
    count_class_1 = (y_train == 1).sum()
    print(f"\nTraining data class distribution: Class 0 (n): {count_class_0}, Class 1 (y): {count_class_1}")
    if count_class_1 > 0:
        scale_pos_weight_val = count_class_0 / count_class_1
        print(f"Calculated scale_pos_weight: {scale_pos_weight_val:.4f}")
    else:
        # Guard against division by zero when the positive class is absent.
        print("Warning: No positive class (1) instances in y_train. scale_pos_weight set to 1.")
else:
    print(f"Warning: y_train is not numeric (dtype: {y_train.dtype}) after conversion attempt. scale_pos_weight set to 1.")



In [None]:
# Configure and fit the XGBoost classifier that the rest of the notebook evaluates.
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,                # kept from the original: recommended there to avoid warnings
    random_state=42,                        # fixed seed for reproducible training
    n_estimators=100,                       # default value; a candidate for tuning
    scale_pos_weight=scale_pos_weight_val,  # class-imbalance weight computed in the previous cell
)
xgb_clf = xgb.XGBClassifier(**xgb_params)

print("\nTraining XGBoost model for evaluation notebook...")
xgb_clf.fit(X_train_scaled, y_train)
print("Model training complete.")

In [None]:
# Make Predictions on the test set
# predict() yields hard class labels; the threshold analysis later in this
# notebook obtains probabilities separately via predict_proba().
y_pred_xgb = xgb_clf.predict(X_test_scaled)
print("Predictions made on the test set.")

In [None]:
# Evaluate the XGBoost model using the utility function.
# The helper is given the true test labels, the hard predictions, the test
# features, the fitted model and a display name. What it reports is defined in
# utils (presumably the metrics and ROC analysis mentioned in the notebook
# introduction — TODO confirm against utils.evaluate_model_performance).
utils.evaluate_model_performance(y_test, y_pred_xgb, X_test_scaled, xgb_clf, "XGBoost")


## Threshold Tuning Analysis for Class n (0)

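This section explores how changing the classification threshold affects precision, recall, and F1-score for 'Class n (0)'. The threshold is applied to the predicted probability of Class y (1): a sample is labelled Class n (0) when that probability falls below the threshold. Because the sweep is run on the test set, the "optimal" threshold reported here is an optimistic estimate; the final threshold should be chosen on a separate validation set.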

In [None]:
# Sweep the decision threshold and report precision / recall / F1 for Class n (0).
#
# NOTE: the sweep (and the "optimal" threshold picked at the end) is computed on
# the test set, so the reported optimum is an optimistic, exploratory figure.
# Select a production threshold on a separate validation split.

# Evenly spaced grid 0.05, 0.10, ..., 0.95 (19 points). np.linspace is used rather
# than np.arange because a fractional step accumulates floating-point error and
# can make the number of generated points unreliable.
threshold_values = np.linspace(0.05, 0.95, 19)

# Get predicted probabilities for the positive class (Class 1, 'y') from the trained xgb_clf
y_pred_proba_for_thresholding = xgb_clf.predict_proba(X_test_scaled)[:, 1]

precisions_class0 = []
recalls_class0 = []
f1s_class0 = []

print("\n--- Threshold Tuning for Class n (0) (Uncalibrated Model) ---")
print(f"{'Threshold':<10} | {'Precision (0)':<15} | {'Recall (0)':<12} | {'F1-score (0)':<12} | {'TP (0)':<7} | {'FP (0)':<7} | {'FN (0)':<7}")
print("-" * 80)

for thresh in threshold_values:
    # If prob_for_class_1 >= threshold, predict 1 (y), else 0 (n)
    y_pred_at_threshold = (y_pred_proba_for_thresholding >= thresh).astype(int)

    # Per-class metrics; index 0 of each array is Class 0. zero_division=0 keeps
    # the sweep quiet at thresholds where a class is never predicted.
    p, r, f, _support = precision_recall_fscore_support(
        y_test, y_pred_at_threshold, labels=[0, 1], zero_division=0
    )

    # labels=[0, 1] guarantees a 2x2 matrix (rows = true class, columns = predicted),
    # so the cells can be read directly without a shape check.
    cm_thresh = confusion_matrix(y_test, y_pred_at_threshold, labels=[0, 1])
    tp_c0 = cm_thresh[0, 0]  # true 0, predicted 0
    fn_c0 = cm_thresh[0, 1]  # true 0, predicted 1
    fp_c0 = cm_thresh[1, 0]  # true 1, predicted 0

    precisions_class0.append(p[0])
    recalls_class0.append(r[0])
    f1s_class0.append(f[0])

    print(f"{thresh:<10.2f} | {p[0]:<15.4f} | {r[0]:<12.4f} | {f[0]:<12.4f} | {tp_c0:<7} | {fp_c0:<7} | {fn_c0:<7}")

# Plotting the metrics
fig, ax = plt.subplots(figsize=(12, 7))
ax.plot(threshold_values, precisions_class0, label='Precision (Class 0)', marker='o')
ax.plot(threshold_values, recalls_class0, label='Recall (Class 0)', marker='x')
ax.plot(threshold_values, f1s_class0, label='F1-score (Class 0)', marker='s')
ax.set_title('Precision, Recall, and F1-score for Class n (0) vs. Threshold (Uncalibrated Model)')
ax.set_xlabel('Threshold (Probability for Class y (1))')
ax.set_ylabel('Score')
ax.set_xticks(np.round(threshold_values, 2))
ax.legend()
ax.grid(True)
plt.show()

# Find threshold that maximizes F1-score for Class 0.
# Skipped when every F1 is zero, because argmax would then just return the first threshold.
if any(v != 0 for v in f1s_class0):
    optimal_idx_f1_class0 = np.argmax(f1s_class0)
    optimal_threshold_f1_class0 = threshold_values[optimal_idx_f1_class0]
    print(f"\nOptimal threshold for maximizing F1-score for Class n (0) (Uncalibrated Model): {optimal_threshold_f1_class0:.2f}")
    print(f"  Precision (Class 0) at this threshold: {precisions_class0[optimal_idx_f1_class0]:.4f}")
    print(f"  Recall (Class 0) at this threshold: {recalls_class0[optimal_idx_f1_class0]:.4f}")
    print(f"  F1-score (Class 0) at this threshold: {f1s_class0[optimal_idx_f1_class0]:.4f}")
else:
    print("\nCould not determine optimal threshold for F1-score (Class 0) (Uncalibrated Model) as no valid F1 scores were calculated or all were zero.")

In [None]:
# ## Feature Importance Analysis
#
# Rank the features by the importance scores exposed by the fitted XGBoost
# model and chart the highest-ranked ones.

# Pair every training column with its importance score, highest score first
importances = xgb_clf.feature_importances_
feature_names = X_train_scaled.columns # column labels of the training features
feature_importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': importances}
).sort_values(by='importance', ascending=False)

print("\n--- Feature Importance Analysis ---")
print("Top 10 Feature Importances:")
print(feature_importance_df.head(10))
print("\n") # blank lines to separate the table from the chart

# Chart at most 20 rows (fewer when the model has fewer features)
num_features_to_plot = min(len(feature_importance_df), 20)
top_features = feature_importance_df.head(num_features_to_plot)

plt.figure(figsize=(12, 8))
sns.barplot(data=top_features, x='importance', y='feature', palette='viridis')
plt.title(f'Top {num_features_to_plot} Feature Importances - XGBoost')
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.tight_layout() # keeps long feature labels from being clipped
plt.show()

## SHAP (SHapley Additive exPlanations) Value Analysis

SHAP values provide a way to explain the output of machine learning models by quantifying the contribution of each feature to a particular prediction. This offers more granular insights than global feature importance.

- **Summary Plot:** Shows the distribution of SHAP values for each feature, indicating not only the importance but also the direction of the relationship (e.g., whether high values of a feature increase or decrease the prediction).
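- **Force Plot (for individual predictions):** Illustrates how features contributed to pushing a single prediction away from the base value (the model's average output over the training set). Note that for tree models SHAP explains the raw model output by default, which for a binary XGBoost classifier is expressed in log-odds rather than probabilities.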

In [None]:
# SHAP analysis of the fitted XGBoost model: a global summary (beeswarm) plot
# over the test set and a force plot for the first test instance.
# Note: shap is an optional dependency; if it is missing, install it with
# `%pip install shap` and re-run this cell.
try:
    import shap
    shap_available = True
    shap.initjs() # Initialize JavaScript visualization in the notebook
except ImportError:
    print("SHAP library not found. Please install it to run this section (e.g., pip install shap).")
    shap_available = False

if shap_available:
    print("\n--- SHAP Value Analysis ---")

    # TreeExplainer is the efficient, tree-specific explainer for models like XGBoost.
    explainer = shap.TreeExplainer(xgb_clf)

    # Calculate SHAP values for the test set (can take a moment for larger datasets)
    print("Calculating SHAP values for the test set...")
    shap_values = explainer.shap_values(X_test_scaled)
    print("SHAP values calculated.")

    # Normalise to a single (n_samples, n_features) matrix for the positive class.
    # A binary XGBoost model normally already yields that shape together with a
    # scalar expected_value, in which case nothing changes here. Per-class output
    # is handled in both layouts SHAP has used (presumably version dependent —
    # verify against the installed SHAP release):
    #   * a list [class_0_values, class_1_values]
    #   * one array of shape (n_samples, n_features, n_classes)
    expected_value_to_use = explainer.expected_value
    shap_values_to_use = shap_values
    if isinstance(shap_values, list) and len(shap_values) == 2:
        shap_values_to_use = shap_values[1]
        expected_value_to_use = explainer.expected_value[1]
    elif isinstance(shap_values, np.ndarray) and shap_values.ndim == 3 and shap_values.shape[-1] == 2:
        shap_values_to_use = shap_values[:, :, 1]
        expected_value_to_use = explainer.expected_value[1]

    # SHAP Summary Plot (Beeswarm)
    # summary_plot names the beeswarm chart plot_type="dot"; "beeswarm" is not one
    # of its plot types, so the points would not be drawn.
    print("\nSHAP Summary Plot (Beeswarm):")
    shap.summary_plot(shap_values_to_use, X_test_scaled, plot_type="dot")

    # SHAP Force Plot for the first instance in the test set
    if len(X_test_scaled) > 0:
        print("\nSHAP Force Plot for the first test instance:")
        shap.force_plot(expected_value_to_use,
                        shap_values_to_use[0, :],
                        X_test_scaled.iloc[0, :],
                        matplotlib=True) # Use matplotlib for better rendering in some environments
    else:
        print("Test set is empty, cannot generate force plot for an instance.")


## Cross-Validation Evaluation

To assess the model's generalization performance more robustly and ensure it's not overfitting to the specific train-test split, we perform k-fold cross-validation on the training data. We'll use Stratified K-Fold to maintain class proportions in each fold.


In [None]:

from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate

# Stratified k-fold cross-validation of the XGBoost configuration on the training data.
#
# A fresh, unfitted classifier with the same parameters as the main model is used;
# scikit-learn clones it for every fold, so no state leaks between folds.
cv_xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
    n_estimators=100, # Or use tuned n_estimators if available from a CV tuning step
    scale_pos_weight=scale_pos_weight_val # Use the calculated scale_pos_weight
)

# Define Stratified K-Fold (shuffled with a fixed seed so the folds are reproducible)
n_splits_cv = 5 # Number of folds
strat_k_fold = StratifiedKFold(n_splits=n_splits_cv, shuffle=True, random_state=42)

print(f"\n--- Cross-Validation Evaluation ({n_splits_cv}-fold) ---")

# Display name -> scikit-learn scorer name.
# Note: X_train_scaled and y_train are used here
scoring_metrics = {
    'accuracy': 'accuracy',
    'roc_auc': 'roc_auc',
    'f1': 'f1_weighted' # Using f1_weighted for potentially imbalanced classes
}

# Score every metric from ONE cross-validation run. Calling cross_val_score once
# per metric would refit the model n_splits * n_metrics times on identical folds;
# cross_validate fits each fold once and applies all scorers to it.
try:
    cv_output = cross_validate(
        cv_xgb_clf, X_train_scaled, y_train, cv=strat_k_fold, scoring=scoring_metrics
    )
    cv_error = None
except Exception as e:
    # Best-effort: keep the notebook running and report the failure per metric below.
    cv_output, cv_error = {}, e

# cv_results maps each metric name to its array of per-fold test scores.
cv_results = {}
for metric_name in scoring_metrics:
    if cv_error is None:
        scores = cv_output[f"test_{metric_name}"]
        cv_results[metric_name] = scores
        print(f"Cross-validation {metric_name.upper()} scores: {scores}")
        print(f"Mean {metric_name.upper()}: {np.mean(scores):.4f} (+/- {np.std(scores):.4f})")
    else:
        print(f"Could not calculate cross-validation {metric_name.upper()}. Error: {cv_error}")
    print("-" * 30)

# You can further analyze or store cv_results if needed
