# Task 4: Binary Classification - XGBoost Model

This notebook loads the preprocessed data saved by `1_consolidate_data.ipynb` and trains/evaluates an XGBoost classifier.



XGBoost (Extreme Gradient Boosting) is a gradient-boosted decision-tree algorithm that often performs strongly on classification tasks.

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc, roc_auc_score, precision_recall_fscore_support
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # For potential NaN handling if needed, though imputation should handle it

In [None]:
# Define input path
input_parquet_path = 'data/preprocessed_data.parquet'

# Load the preprocessed data.
# Each handler prints a hint and then re-raises: every later cell needs `df`,
# so continuing after a failed load would only surface as a confusing NameError.
print(f"Loading preprocessed data from {input_parquet_path}...")
try:
    df = pd.read_parquet(input_parquet_path)
except FileNotFoundError:
    print(f"Error: File not found at {input_parquet_path}. Please run notebook 1 first.")
    raise
except ImportError:
    print("\nError: 'pyarrow' or 'fastparquet' package is required to read Parquet format, or 'xgboost' is missing.")
    print("Please install required packages using: pip install pyarrow xgboost")
    raise
except Exception as e:
    print(f"\nAn error occurred while loading the Parquet file: {e}")
    raise
else:
    # Kept outside the try body so a failure while summarising the frame is
    # not misreported as a load error.
    print("Data loaded successfully.")
    print("\nLoaded DataFrame Info:")
    df.info()

In [None]:
# Row masks for the two partitions recorded in the 'split' column
is_train_row = df['split'] == 'train'
is_test_row = df['split'] == 'test'
train_df = df[is_train_row]
test_df = df[is_test_row]

# Everything except the target and the partition marker is treated as a feature
non_feature_columns = ['Class', 'split']

# Features (X) and target (y) for each partition
X_train_scaled, y_train = train_df.drop(columns=non_feature_columns), train_df['Class']
X_test_scaled, y_test = test_df.drop(columns=non_feature_columns), test_df['Class']

In [None]:
# Convert target variable 'Class' from labels ('n'/'y') to numeric (0/1) if necessary.
# The check is "not numeric" rather than "is object" so that category/string
# dtypes (which Parquet round-trips can produce) are converted as well.
class_label_map = {'n': 0, 'y': 1}
if not pd.api.types.is_numeric_dtype(y_train):
    print("\nConverting target variable 'Class' to numeric (n=0, y=1)...")
    # Cast to object first: mapping a categorical Series would otherwise
    # yield another categorical instead of plain integers.
    y_train_encoded = y_train.astype(object).map(class_label_map)
    y_test_encoded = y_test.astype(object).map(class_label_map)

    # .map() turns unknown or missing labels into NaN; fail loudly instead of
    # handing NaN targets to the model.
    unmapped_labels = pd.concat([
        y_train[y_train_encoded.isna()],
        y_test[y_test_encoded.isna()],
    ])
    if not unmapped_labels.empty:
        raise ValueError(
            f"Cannot encode 'Class': unexpected label(s) {unmapped_labels.unique().tolist()}; "
            f"expected one of {list(class_label_map)}."
        )

    y_train = y_train_encoded.astype('int64')
    y_test = y_test_encoded.astype('int64')
    print("Target variable converted.")

In [None]:
# Report the dimensions of each feature matrix / target vector as a sanity check
train_feature_dims, train_target_dims = X_train_scaled.shape, y_train.shape
test_feature_dims, test_target_dims = X_test_scaled.shape, y_test.shape

print(f"\nTraining features shape: {train_feature_dims}")
print(f"Training target shape: {train_target_dims}")
print(f"Test features shape: {test_feature_dims}")
print(f"Test target shape: {test_target_dims}")

In [None]:
# Calculate class distribution in the training set for scale_pos_weight
# (ratio of negative to positive samples, used to counter class imbalance).
# is_integer_dtype accepts every integer width (int8, uint8, int64, nullable
# Int64, ...) instead of only int64/int32, so weighting is not silently skipped.
if pd.api.types.is_integer_dtype(y_train):
    count_class_0 = int((y_train == 0).sum())
    count_class_1 = int((y_train == 1).sum())
    print(f"\nTraining data class distribution: Class 0 (n): {count_class_0}, Class 1 (y): {count_class_1}")
    if count_class_1 > 0: # Avoid division by zero
        scale_pos_weight_val = count_class_0 / count_class_1
        print(f"Calculated scale_pos_weight: {scale_pos_weight_val:.4f}")
    else:
        scale_pos_weight_val = 1 # Default if no positive class instances
        print("Warning: No positive class (1) instances in y_train. scale_pos_weight set to 1.")
else:
    scale_pos_weight_val = 1 # Default if y_train is not in expected numeric format
    print(f"Warning: y_train is not numeric. dtype: {y_train.dtype}. scale_pos_weight set to 1.")

In [None]:
# Configure the classifier.
# `use_label_encoder` is intentionally not passed: the option is deprecated and
# has been removed from current XGBoost releases, where supplying it only
# produces a "parameters are not used" warning. Labels are already 0/1 here.
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42, # Fixed seed for reproducible runs
    n_estimators=100, # Default, can be tuned
    scale_pos_weight=scale_pos_weight_val # Class-imbalance weight computed from y_train
)

# Train the model
print("\nTraining XGBoost model...")
xgb_clf.fit(X_train_scaled, y_train)
print("Model training complete.")

In [None]:
# Predict class labels for the test-set features
y_pred_xgb = xgb_clf.predict(X_test_scaled)
# Optional: positive-class probabilities for the same features
# y_pred_proba_xgb = xgb_clf.predict_proba(X_test_scaled)[:, 1] # Probabilities (optional)

In [None]:
def evaluate_model(y_true, y_pred, X_features_for_proba, model, model_name):
    """Print and plot evaluation metrics for a binary classifier.

    Reports accuracy, the classification report and the confusion matrix
    (printed and drawn as a heatmap). When the model exposes ``predict_proba``
    and features are supplied, the ROC AUC is printed and the ROC curve plotted.

    Args:
        y_true: True labels (pandas Series, NumPy array or list).
        y_pred: Predicted labels, in the same order as ``y_true``.
        X_features_for_proba: Features passed to ``model.predict_proba`` for the
            ROC curve; pass ``None`` to skip the ROC section.
        model: Fitted classifier. Its ``classes_`` attribute, when present,
            defines the confusion-matrix axes; otherwise ``[0, 1]`` is assumed.
        model_name: Name used in the printed heading and plot titles.

    Returns:
        None. Results are printed and shown as matplotlib figures.
    """
    print(f"\n--- {model_name} Evaluation ---")

    # Accuracy
    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {accuracy:.4f}")

    # Labels actually present in the truth or the predictions. np.union1d works
    # for Series, arrays and lists alike (unlike Series.unique()).
    observed_labels = np.union1d(y_true, y_pred)

    # Classification Report
    print("\nClassification Report:")
    report_kwargs = {}
    if np.all(np.isin(observed_labels, [0, 1])):
        # Pass `labels` together with `target_names`: without it sklearn raises
        # when only one of the two classes appears in the data.
        report_kwargs = {'labels': [0, 1], 'target_names': ['Class n (0)', 'Class y (1)']}
    print(classification_report(y_true, y_pred, zero_division=0, **report_kwargs))

    # Confusion Matrix
    # Prefer the model's class list so the matrix keeps one row/column per class
    # even if a class is missing from the data; fall back to the observed labels
    # when they do not match what the model reports.
    model_classes = list(getattr(model, 'classes_', [0, 1]))
    if set(observed_labels.tolist()) <= set(model_classes):
        cm_labels = model_classes
    else:
        cm_labels = observed_labels.tolist()
    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_true, y_pred, labels=cm_labels)
    print(cm)

    # Plot Confusion Matrix (tick labels are the same labels used to build it)
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Greens',
                xticklabels=cm_labels,
                yticklabels=cm_labels,
                ax=ax)
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    ax.set_title(f'Confusion Matrix - {model_name}')
    plt.show()

    # ROC Curve and AUC
    if not (hasattr(model, "predict_proba") and X_features_for_proba is not None):
        print("\nROC Curve not available: Model lacks predict_proba or features for probabilities not provided.")
        return
    if np.unique(y_true).size < 2:
        # A ROC curve needs both positives and negatives in the ground truth.
        print("\nROC Curve not available: y_true contains fewer than two classes.")
        return

    # Column 1 of predict_proba is taken as the positive-class score.
    y_pred_proba = model.predict_proba(X_features_for_proba)[:, 1]
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    print(f"\nROC AUC Score: {roc_auc:.4f}")

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'Receiver Operating Characteristic (ROC) - {model_name}')
    ax.legend(loc="lower right")
    plt.show()

In [None]:
# Evaluate the XGBoost model on the test split: prints accuracy, the
# classification report and the confusion matrix, then plots the confusion
# matrix and (since features are supplied) the ROC curve.
evaluate_model(y_test, y_pred_xgb, X_test_scaled, xgb_clf, "XGBoost")