In [1]:
#  import packages

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif, RFE


### Download XGBoost baseline model for optimisation

In [2]:
#  import data
data = pd.read_csv('../Project_datasets/balanced_dataset.csv')

In [3]:
#  dataset preparation
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
X = data.drop(['Unnamed: 0', 'Unnamed: 0.1', 'sent_at', 'user_id', 'total_cost_pre_discount_pennies', 'purchase'], axis=1)
y = data['purchase']

In [4]:
# Split the data into training and testing sets
# This splits into 70% train, 30% test by default
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [5]:
# Create XGBoost classifier
model = xgb.XGBClassifier(random_state=42)

# Train model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9358

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      6806
           1       0.94      0.93      0.94      6784

    accuracy                           0.94     13590
   macro avg       0.94      0.94      0.94     13590
weighted avg       0.94      0.94      0.94     13590


Confusion Matrix:
[[6386  420]
 [ 452 6332]]


### Features selection

###### Statistical feature selection 

In [6]:
#Statistical feature selection (F-test)
selector_f = SelectKBest(score_func=f_classif, k=10)  # Select top 10 features
X_selected_f = selector_f.fit_transform(X, y)
selected_features_f = X.columns[selector_f.get_support()]

  f = msb / msw


In [7]:
# Statistical feature selection (F-test) accuracy

# Split data with selected features - use X_selected_f (the actual data)
X_train, X_test, y_train, y_test = train_test_split(X_selected_f, y, test_size=0.2, random_state=42)

# Create XGBoost classifier
model = xgb.XGBClassifier(random_state=42)

# Train model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy with F-test selected features: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy with F-test selected features: 0.9007

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.91      0.90      4507
           1       0.91      0.89      0.90      4553

    accuracy                           0.90      9060
   macro avg       0.90      0.90      0.90      9060
weighted avg       0.90      0.90      0.90      9060


Confusion Matrix:
[[4123  384]
 [ 516 4037]]


###### XGBoost feature importance

In [8]:
# XGBoost feature importance
model_importance = xgb.XGBClassifier(random_state=42)
model_importance.fit(X, y)
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model_importance.feature_importances_
}).sort_values('importance', ascending=False)

top_features = feature_importance.head(10)['feature'].tolist()
X_selected_importance = X[top_features]


In [9]:
# XGBoost feature importance accaracy

# Split data with selected features - use X_selected_importance (the actual data)
X_train, X_test, y_train, y_test = train_test_split(X_selected_importance, y, test_size=0.2, random_state=42)

# Create XGBoost classifier
model = xgb.XGBClassifier(random_state=42)

# Train model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy with XGBoost importance selected features: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))



Accuracy with XGBoost importance selected features: 0.9276

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.94      0.93      4507
           1       0.94      0.92      0.93      4553

    accuracy                           0.93      9060
   macro avg       0.93      0.93      0.93      9060
weighted avg       0.93      0.93      0.93      9060


Confusion Matrix:
[[4218  289]
 [ 367 4186]]


###### Recursive Feature Elimination (RFE)

In [54]:
#  Recursive Feature Elimination (RFE)
rfe = RFE(estimator=xgb.XGBClassifier(random_state=42), n_features_to_select=10)
X_selected_rfe = rfe.fit_transform(X, y)
selected_features_rfe = X.columns[rfe.support_]

In [55]:
#  Recursive Feature Elimination (RFE) accuracy 

# Split data with selected features - use X_selected_rfe (the actual data)
X_train, X_test, y_train, y_test = train_test_split(X_selected_rfe, y, test_size=0.2, random_state=42)

# Create XGBoost classifier
model = xgb.XGBClassifier(random_state=42)

# Train model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy with RFE selected features: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))



Accuracy with RFE selected features: 0.9232

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.93      0.92      4507
           1       0.93      0.91      0.92      4553

    accuracy                           0.92      9060
   macro avg       0.92      0.92      0.92      9060
weighted avg       0.92      0.92      0.92      9060


Confusion Matrix:
[[4205  302]
 [ 394 4159]]


#### XGBoost feature importance bcs higher recall

In [46]:
# Combine selected features and target
final_dataset = X_selected_importance.copy()
final_dataset['purchase'] = y

# Save to CSV
final_dataset.to_csv('../Project_datasets/reduced_dataset.csv', index=False)

