In [75]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.metrics import recall_score, make_scorer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

import json


### Load the selected-feature dataset and train a baseline XGBoost model for optimisation

In [65]:
# Read the reduced dataset; the path is relative to the notebook's working directory
dataset_path = '../Project_datasets/reduced_dataset.csv'
data = pd.read_csv(dataset_path)

In [66]:
# Target is the 'purchase' column; every remaining column is kept as a feature
y = data['purchase']
X = data.drop(columns=['purchase'])

In [67]:
# Hold out 20% of the rows for testing (80% train / 20% test, per test_size=0.2).
# random_state is fixed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [68]:
# Baseline: XGBoost classifier with only the random seed set (all other settings left at library defaults)
model = xgb.XGBClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the held-out test split and compute overall accuracy
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Report accuracy, per-class precision/recall/F1 and the confusion matrix
print(f"\nAccuracy with XGBoost importance selected features: {accuracy:.4f}")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")



Accuracy with XGBoost importance selected features: 0.9276

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.94      0.93      4507
           1       0.94      0.92      0.93      4553

    accuracy                           0.93      9060
   macro avg       0.93      0.93      0.93      9060
weighted avg       0.93      0.93      0.93      9060


Confusion Matrix:
[[4218  289]
 [ 367 4186]]


### Hyperparameter tuning

In [72]:
# Scorer that measures recall for the positive class only (label 1, the responders)
custom_recall_scorer = make_scorer(recall_score, pos_label=1)

# Search space: fixed, hand-listed candidate values for each hyperparameter
# (3 * 3 * 3 * 2 * 2 * 2 * 3 = 648 combinations in total)
param = dict(
    max_depth=[3, 6, 8],
    learning_rate=[0.05, 0.1, 0.2],
    n_estimators=[100, 200, 300],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    gamma=[0, 1],
    min_child_weight=[1, 3, 5],
)

###### GridSearchCV

In [73]:
# Exhaustive search over every combination in `param`, scored by class-1 recall with 3-fold CV
grid_search = GridSearchCV(
    model,
    param,
    scoring=custom_recall_scorer,
    cv=3,
    n_jobs=-1,  # use all available cores
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Best estimator found by the search
best_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test split
y_pred = best_model.predict(X_test)
print("Best Parameters:", grid_search.best_params_)
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Fitting 3 folds for each of 648 candidates, totalling 1944 fits




Best Parameters: {'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': 8, 'min_child_weight': 3, 'n_estimators': 300, 'subsample': 0.8}

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.94      0.93      4507
           1       0.94      0.93      0.93      4553

    accuracy                           0.93      9060
   macro avg       0.93      0.93      0.93      9060
weighted avg       0.93      0.93      0.93      9060


Confusion Matrix:
[[4220  287]
 [ 339 4214]]


###### RandomizedSearchCV

In [74]:
# RandomizedSearchCV with the custom recall scorer:
# samples 30 parameter combinations from `param` instead of trying all of them
random_search = RandomizedSearchCV(
    model,
    param,
    n_iter=30,
    scoring=custom_recall_scorer,
    cv=3,
    n_jobs=-1,  # use all available cores
    random_state=42,  # fixed seed so the sampled combinations are reproducible
    verbose=1,
)
random_search.fit(X_train, y_train)

# Best estimator found by the randomized search.
# NOTE: this rebinds `best_model` and `y_pred` from the grid-search cell.
best_model = random_search.best_estimator_

# Evaluate the tuned model on the held-out test split
y_pred = best_model.predict(X_test)
print("Best Parameters:", random_search.best_params_)
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best Parameters: {'subsample': 0.8, 'n_estimators': 300, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.2, 'gamma': 1, 'colsample_bytree': 0.8}

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.93      0.93      4507
           1       0.93      0.92      0.93      4553

    accuracy                           0.93      9060
   macro avg       0.93      0.93      0.93      9060
weighted avg       0.93      0.93      0.93      9060


Confusion Matrix:
[[4203  304]
 [ 355 4198]]


#### Choose the search and save its best parameters to a JSON file

In [76]:
# Persist the best hyperparameters found by the grid search as a JSON file
with open('../Project_datasets/best_xgb_params.json', 'w') as params_file:
    params_file.write(json.dumps(grid_search.best_params_))