In [1]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import torch
from torch.utils.data import DataLoader, TensorDataset

from get_dataset import X
from get_dataset import y

# Step 1: Hold out 20% of the samples as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 2: Fit an XGBoost classifier; reg_lambda sets the L2 regularization term.
print("Training XGBoost model with L2 regularization...")
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    # Boosting schedule and tree size
    n_estimators=200,
    learning_rate=0.2,
    max_depth=4,
    # Row / column sampling
    subsample=0.8,
    colsample_bytree=0.9,
    # L2 regularization
    reg_lambda=10,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Step 3: Score the held-out split and print the summary metrics.
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"XGBoost Model Accuracy: {accuracy:.4f}")
# Each heading and its payload are emitted by one print call, separated by a newline.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

# Step 4: PyTorch integration
# Copy each split into a float32 tensor. torch.tensor(..., dtype=...) is the
# documented replacement for the legacy torch.FloatTensor(...) constructor;
# np.asarray lets array-likes other than ndarrays through as well.
X_train_tensor = torch.tensor(np.asarray(X_train), dtype=torch.float32)
y_train_tensor = torch.tensor(np.asarray(y_train), dtype=torch.float32)
X_test_tensor = torch.tensor(np.asarray(X_test), dtype=torch.float32)
y_test_tensor = torch.tensor(np.asarray(y_test), dtype=torch.float32)

# Pair features with labels so the loaders yield (X_batch, y_batch) tuples.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# The training shuffle uses its own seeded generator so the batch order is
# reproducible across kernel restarts instead of depending on the global
# torch RNG state. The seed matches the random_state used elsewhere in the cell.
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)
# Test batches keep their original order.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Training XGBoost with PyTorch DataLoader
def train_xgb_with_pytorch_loader(train_loader, **xgb_params):
    """Fit an XGBoost classifier on all batches yielded by a loader.

    The loader is iterated once; its batches are stacked back into a single
    feature matrix and label vector, and ``fit`` is called once on the
    combined arrays.

    Args:
        train_loader: Iterable yielding ``(X_batch, y_batch)`` pairs of torch
            tensors, e.g. a ``torch.utils.data.DataLoader``.
        **xgb_params: Optional keyword overrides for the ``XGBClassifier``
            constructor. Omitted keys fall back to the defaults below, so
            calling with only ``train_loader`` behaves as before.

    Returns:
        The fitted ``xgb.XGBClassifier`` instance.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    feature_batches, label_batches = [], []
    for X_batch, y_batch in train_loader:
        # detach()/cpu() make the conversion safe for tensors that track
        # gradients or live on another device; both are no-ops otherwise.
        feature_batches.append(X_batch.detach().cpu().numpy())
        label_batches.append(y_batch.detach().cpu().numpy())

    if not feature_batches:
        # np.vstack([]) would raise a far less informative ValueError.
        raise ValueError("train_loader yielded no batches; nothing to train on")

    X_train_combined = np.vstack(feature_batches)
    y_train_combined = np.concatenate(label_batches)

    # Defaults reproduce the original hard-coded configuration.
    params = {
        "n_estimators": 100,
        "learning_rate": 0.1,
        "max_depth": 4,
        "reg_lambda": 10,
        "objective": 'binary:logistic',
        "random_state": 42,
    }
    params.update(xgb_params)

    xgb_model_from_loader = xgb.XGBClassifier(**params)
    xgb_model_from_loader.fit(X_train_combined, y_train_combined)
    return xgb_model_from_loader

# Fit a second XGBoost model on the data recovered by iterating train_loader;
# the function builds a new classifier with its own hyperparameters.
xgb_model_pytorch = train_xgb_with_pytorch_loader(train_loader)

def evaluate_models(xgb_model, test_loader):
    """Print and return the accuracy of ``xgb_model`` on the loader's data.

    The evaluation set is rebuilt from ``test_loader`` rather than read from
    notebook globals, so the function scores the model on whatever data the
    caller passes in.

    Args:
        xgb_model: Fitted model exposing ``predict(X)``.
        test_loader: Iterable yielding ``(X_batch, y_batch)`` pairs of torch
            tensors, e.g. a ``torch.utils.data.DataLoader``.

    Returns:
        The value returned by ``accuracy_score`` for the collected labels and
        the model's predictions.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    feature_batches, label_batches = [], []
    for X_batch, y_batch in test_loader:
        # detach()/cpu() keep the conversion safe for grad-tracking or
        # non-CPU tensors; both are no-ops otherwise.
        feature_batches.append(X_batch.detach().cpu().numpy())
        label_batches.append(y_batch.detach().cpu().numpy())

    if not feature_batches:
        raise ValueError("test_loader yielded no batches; nothing to evaluate")

    X_test_np = np.vstack(feature_batches)
    y_test_np = np.concatenate(label_batches)

    y_pred_xgb = xgb_model.predict(X_test_np)
    xgb_accuracy = accuracy_score(y_test_np, y_pred_xgb)
    print(f"\nXGBoost Final Test Accuracy: {xgb_accuracy:.4f}")
    return xgb_accuracy

# Report held-out accuracy for the first model (xgb_model). As the last
# expression in the cell, the returned accuracy is also echoed as cell output.
# NOTE(review): xgb_model_pytorch is trained above but never passed to
# evaluate_models; confirm whether it should be evaluated as well.
evaluate_models(xgb_model, test_loader)


Training XGBoost model with L2 regularization...
XGBoost Model Accuracy: 0.9070

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      7952
           1       0.66      0.47      0.55      1091

    accuracy                           0.91      9043
   macro avg       0.79      0.72      0.75      9043
weighted avg       0.90      0.91      0.90      9043


Confusion Matrix:
[[7684  268]
 [ 573  518]]

XGBoost Final Test Accuracy: 0.9070


0.9069998894172288