In [18]:
import pandas as pd
import numpy as np
import json

from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import recall_score

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit




### Load data and hyperparameters, run initial model

In [6]:
# Read the reduced dataset and separate the 'purchase' target from the features.
data = pd.read_csv('../Project_datasets/reduced_dataset.csv')

y = data['purchase']
X = data.drop(columns=['purchase'])


# Read the XGBoost hyperparameters stored as JSON; they are unpacked into
# XGBClassifier as keyword arguments when the model is built.
with open('../Project_datasets/best_xgb_params.json', 'r') as f:
    best_params = json.loads(f.read())


In [7]:
# Split the data into training and testing sets.
# test_size=0.2 keeps 80% of the rows for training and 20% for testing
# (scikit-learn's own default, when no size is given, is 75/25).
# random_state pins the shuffle so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [8]:
# Build the classifier from the hyperparameters loaded into best_params and
# train it on the hold-out training split.
model = xgb.XGBClassifier(**best_params)
# Kept as the cell's last expression so the notebook displays the fitted estimator.
model.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=1.0, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.2, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=8, max_leaves=None,
              min_child_weight=3, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=300, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)

In [9]:
# Score the fitted model on the held-out test rows.
y_pred = model.predict(X_test)

# Overall accuracy on the test split.
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy with XGBoost importance selected features: {accuracy:.4f}")

# Per-class precision / recall / F1 table.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

# Confusion matrix of true labels against predicted labels.
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")



Accuracy with XGBoost importance selected features: 0.9304

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.94      0.93      4507
           1       0.94      0.93      0.93      4553

    accuracy                           0.93      9060
   macro avg       0.93      0.93      0.93      9060
weighted avg       0.93      0.93      0.93      9060


Confusion Matrix:
[[4215  292]
 [ 339 4214]]


### Cross-validation

###### 10 stratified folds with a different train-test split ratio (90/10 per fold)

In [17]:
# Stratified 10-fold splitter: rows are shuffled once (seeded for
# reproducibility) and each iteration trains on 9 folds and tests on the
# remaining one, with the class balance of y preserved in every fold.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold recall on the positive class, collected for the summary below.
cv_train_recalls = []
cv_test_recalls = []

# 10-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    # Fold-local names so the hold-out split (X_train, X_test, y_train, y_test)
    # created earlier in the notebook is not overwritten.
    X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

    # Fresh estimator per fold, fitted exactly once. The shared `model` is left
    # untouched, so it still refers to the estimator trained on the hold-out split.
    fold_model = xgb.XGBClassifier(**best_params)
    fold_model.fit(X_fold_train, y_fold_train)

    # Recall on the data the fold was trained on vs. the unseen fold; a large
    # gap between the two indicates overfitting.
    fold_train_recall = recall_score(
        y_fold_train, fold_model.predict(X_fold_train), pos_label=1
    )
    fold_test_recall = recall_score(
        y_fold_test, fold_model.predict(X_fold_test), pos_label=1
    )
    cv_train_recalls.append(fold_train_recall)
    cv_test_recalls.append(fold_test_recall)

    # Evaluation
    print(f"\n--- Fold {fold} ---")
    print("Train recall:", fold_train_recall)
    print("Test recall :", fold_test_recall)

# Summary across folds, in the same format as the 70/30 resampling cell.
print(f"\nAverage Train Recall: {np.mean(cv_train_recalls):.4f}")
print(f"Average Test Recall : {np.mean(cv_test_recalls):.4f}")





--- Fold 1 ---
Train recall: 0.987981358842286
Test recall : 0.9240953221535746

--- Fold 2 ---
Train recall: 0.9872461493181596
Test recall : 0.928476821192053

--- Fold 3 ---
Train recall: 0.9870499362307465
Test recall : 0.9134657836644592

--- Fold 4 ---
Train recall: 0.9872461493181596
Test recall : 0.9249448123620309

--- Fold 5 ---
Train recall: 0.9876876287648386
Test recall : 0.9311258278145695

--- Fold 6 ---
Train recall: 0.9875404689492789
Test recall : 0.937748344370861

--- Fold 7 ---
Train recall: 0.9880800549396644
Test recall : 0.9222958057395143

--- Fold 8 ---
Train recall: 0.9878838418522515
Test recall : 0.9328918322295806

--- Fold 9 ---
Train recall: 0.9881291082115177
Test recall : 0.9368653421633554

--- Fold 10 ---
Train recall: 0.9886196409300501
Test recall : 0.9253863134657837


###### 10 random resamples of the data with a constant 70/30 train-test split

In [19]:
# Ten independent random 70/30 splits of the full dataset (seeded for
# reproducibility). Unlike K-fold, the test sets of different splits may overlap.
rs = ShuffleSplit(n_splits=10, test_size=0.3, train_size=0.7, random_state=42)

# Per-split recall on the positive class, averaged in the summary below.
train_recalls = []
test_recalls = []

for i, (train_idx, test_idx) in enumerate(rs.split(X, y), 1):
    # Split-local names so the hold-out split (X_train, X_test, y_train, y_test)
    # created earlier in the notebook is not overwritten.
    X_split_train, X_split_test = X.iloc[train_idx], X.iloc[test_idx]
    y_split_train, y_split_test = y.iloc[train_idx], y.iloc[test_idx]

    # Fresh estimator per split. Refitting the shared `model` here would leave it
    # trained on the last random 70% sample, and that is the object a later
    # model.save_model(...) call would persist.
    split_model = xgb.XGBClassifier(**best_params)
    split_model.fit(X_split_train, y_split_train)

    y_train_pred = split_model.predict(X_split_train)
    y_test_pred = split_model.predict(X_split_test)

    train_recall = recall_score(y_split_train, y_train_pred, pos_label=1)
    test_recall = recall_score(y_split_test, y_test_pred, pos_label=1)

    train_recalls.append(train_recall)
    test_recalls.append(test_recall)

    print(f"\n--- Bucket {i} ---")
    print(f"Train recall: {train_recall:.4f}")
    print(f"Test recall : {test_recall:.4f}")

# Summary
print(f"\nAverage Train Recall: {np.mean(train_recalls):.4f}")
print(f"Average Test Recall : {np.mean(test_recalls):.4f}")



--- Bucket 1 ---
Train recall: 0.9915
Test recall : 0.9214

--- Bucket 2 ---
Train recall: 0.9910
Test recall : 0.9213

--- Bucket 3 ---
Train recall: 0.9917
Test recall : 0.9204

--- Bucket 4 ---
Train recall: 0.9918
Test recall : 0.9217

--- Bucket 5 ---
Train recall: 0.9907
Test recall : 0.9265

--- Bucket 6 ---
Train recall: 0.9905
Test recall : 0.9273

--- Bucket 7 ---
Train recall: 0.9913
Test recall : 0.9244

--- Bucket 8 ---
Train recall: 0.9911
Test recall : 0.9253

--- Bucket 9 ---
Train recall: 0.9905
Test recall : 0.9249

--- Bucket 10 ---
Train recall: 0.9915
Test recall : 0.9193

Average Train Recall: 0.9912
Average Test Recall : 0.9232


In [21]:
# Save the model to file.
# This persists the current state of `model`, i.e. the result of its most
# recent fit() call in this kernel session.
# NOTE(review): assumes the ../Models directory already exists — confirm.
model.save_model("../Models/xgb_model.json")

