# Task 4: Binary Classification - XGBoost Model

This notebook loads the preprocessed data saved by `1_consolidate_data.ipynb`, then trains and evaluates an XGBoost classifier on it.



XGBoost (Extreme Gradient Boosting) is a gradient-boosted decision-tree ensemble method that often performs strongly on classification tasks.

In [None]:
import xgboost as xgb
import utils

In [None]:
# Read the preprocessed dataset through the shared helper. No path is passed,
# so the helper's own default location applies (presumably
# 'data/preprocessed_data.parquet' — confirm in utils).
try:
    df = utils.load_preprocessed_data()
except Exception as load_error:
    # Report the failure in the cell output, then re-raise so the run halts here.
    print(f"Stopping notebook execution due to error in loading data: {load_error}")
    raise


In [None]:
# Unpack the helper's four results: train features/target, then test features/target.
X_train_scaled, y_train, X_test_scaled, y_test = utils.split_data_features_target(df)

# Pass both target vectors through the shared conversion helper (train first, then test).
y_train, y_test = (
    utils.convert_target_variable(target) for target in (y_train, y_test)
)


In [None]:
# Derive the value later passed to XGBoost as `scale_pos_weight`: the ratio
# (count of label 0) / (count of label 1) in the training targets.
# Falls back to the neutral value 1 whenever that ratio cannot be computed.
scale_pos_weight_val = 1  # Neutral default: no re-weighting

# Accept any boolean/integer/float label dtype (int8, uint8, nullable Int64, ...)
# rather than only int32/int64. Both NumPy dtypes and pandas extension dtypes
# expose `kind`; anything without it is treated as non-numeric.
label_dtype_kind = getattr(y_train.dtype, "kind", "O")
if label_dtype_kind in "biuf":
    count_class_0 = (y_train == 0).sum()
    count_class_1 = (y_train == 1).sum()
    print(f"\nTraining data class distribution: Class 0 (n): {count_class_0}, Class 1 (y): {count_class_1}")
    if count_class_1 == 0:
        # Division by zero is impossible to interpret as a weight; keep the default.
        print("Warning: No positive class (1) instances in y_train. scale_pos_weight set to 1.")
    elif count_class_0 == 0:
        # A ratio of 0 would give every positive sample zero weight; keep the default.
        print("Warning: No negative class (0) instances in y_train. scale_pos_weight set to 1.")
    else:
        # Cast to a plain Python float so the hyperparameter is not a NumPy scalar.
        scale_pos_weight_val = float(count_class_0 / count_class_1)
        print(f"Calculated scale_pos_weight: {scale_pos_weight_val:.4f}")
else:
    print(f"Warning: y_train is not numeric (dtype: {y_train.dtype}) after conversion attempt. scale_pos_weight set to 1.")


In [None]:
# Hyperparameters for the classifier, collected in one mapping so they can be
# read (and later tuned) in a single place.
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    # NOTE(review): `use_label_encoder` is reported as deprecated in newer
    # XGBoost releases — confirm against the installed version.
    "use_label_encoder": False,
    "random_state": 42,  # fixed seed for repeatable runs
    "n_estimators": 100,  # number of boosting rounds; candidate for tuning
    "scale_pos_weight": scale_pos_weight_val,  # class-ratio weight computed earlier
}
xgb_clf = xgb.XGBClassifier(**xgb_params)

# Fit on the training features and converted training targets.
print("\nTraining XGBoost model...")
xgb_clf.fit(X_train_scaled, y_train)
print("Model training complete.")

In [None]:
# Predict labels for the test features with the classifier fitted above.
y_pred_xgb = xgb_clf.predict(X_test_scaled)
# Optional alternative (disabled): column 1 of predict_proba, i.e. a
# probability per test row instead of a hard label.
# y_pred_proba_xgb = xgb_clf.predict_proba(X_test_scaled)[:, 1]

In [None]:
# Hand the true test targets, the predictions, the test features, the fitted
# model and a display name to the shared evaluation helper.
utils.evaluate_model_performance(
    y_test,
    y_pred_xgb,
    X_test_scaled,
    xgb_clf,
    "XGBoost",
)