In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import optuna

# sklearn imports
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Other libraries
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from pytorch_tabnet.tab_model import TabNetClassifier

In [None]:
# Load the raw clickstream sessions; expanduser() allows file_path to start with "~".
file_path = "Clickstream_data.csv"
expanded_file_path = os.path.expanduser(file_path)
data = pd.read_csv(expanded_file_path)

# How many of the largest values to discard per feature (chosen from the
# exploratory analysis). The features are processed in this order, and each
# removal is applied before the next maximum is looked up.
features_to_clean = {
    'ProductRelated_Duration': 2,
    'Informational': 1,
    'Administrative_Duration': 1,
    'ProductRelated': 2,
    'PageValues': 2
}

# Work on a copy so the raw frame `data` stays untouched and the cell is re-runnable.
data_modified = data.copy()

# Repeatedly drop the row that currently holds the maximum of each feature.
# The frame is rebound instead of mutated in place.
for feature, count in features_to_clean.items():
    for _ in range(count):
        max_value_index = data_modified[feature].idxmax()
        data_modified = data_modified.drop(index=max_value_index)

# Online_csd (short for "Online Clickstream data") is the cleaned frame used from here on.
Online_csd = data_modified

In [None]:
# ---- Data preprocessing ----
# Dummy-encode the multi-level categoricals; the first level of each column is
# dropped and serves as the reference category.
Online_csd_coded = pd.get_dummies(
    Online_csd,
    columns=['Month', 'VisitorType'],
    drop_first=True,
)

# Integer-encode Weekend and the Revenue target. The encoder is re-fitted per
# column, so after this step `le` holds the classes of Revenue.
le = LabelEncoder()
for _col in ('Weekend', 'Revenue'):
    Online_csd_coded[_col] = le.fit_transform(Online_csd_coded[_col])

# Separate the target from the predictors.
target = Online_csd_coded['Revenue']
features = Online_csd_coded.drop(columns='Revenue')

# Hold out 20% for testing; stratify so both parts keep the Revenue class ratio.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=target,
    random_state=42,
)

# Oversample with SMOTE on the training part only; the test set is never resampled.
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# Standardise: statistics are learned from the resampled training data and the
# same transform is then applied to the test set.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_resampled)
x_test_scaled = scaler.transform(x_test)


In [None]:
# Bernoulli Naive Bayes model
# Stratified, shuffled and seeded 5-fold splitter over the resampled training data.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

bnb_model = BernoulliNB()

# Per-fold F1 of the positive class. cross_val_score fits a fresh clone of
# bnb_model on every fold, so no fold temporaries leak into the notebook
# namespace. error_score='raise' surfaces a failing fold instead of recording NaN.
f1_scores = cross_val_score(
    bnb_model,
    x_train_resampled,
    y_train_resampled,
    scoring='f1',
    cv=skf,
    error_score='raise',
).tolist()

# Average F1 across the folds
avg_f1_score = np.mean(f1_scores)
print("Average F1 Score (Cross-Validated):", avg_f1_score)

# Fit on the whole resampled training set, then predict the held-out test set.
# Both are the unscaled feature frames.
bnb_model.fit(x_train_resampled, y_train_resampled)
y_pred_naive = bnb_model.predict(x_test)

In [None]:
# Test-set evaluation of the Bernoulli Naive Bayes predictions.
print("Bernoulli Naive Bayes Test Metrics:")

# Per-class precision / recall / F1, shown with three decimals.
report = classification_report(y_test, y_pred_naive, digits=3)
# Counts of true labels versus predicted labels.
cm = confusion_matrix(y_test, y_pred_naive)

print('Classification Report:\n', report)
print('Confusion Matrix:\n', cm)

In [None]:
# Heatmap of the Naive Bayes confusion matrix on the test set.
# The matrix is recomputed here so the cell does not depend on the previous one.
cm = confusion_matrix(y_test, y_pred_naive)
ax = sns.heatmap(cm, annot=True, fmt="d", cmap='Blues')
ax.set_title("Confusion Matrix - Naive Bayes")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
plt.show()

In [None]:
## SVM Model
def objective_svm(trial):
    """Optuna objective for the SVM: mean stratified 5-fold F1.

    Samples one SVC configuration from `trial`, cross-validates it on the
    scaled, resampled training data (`x_train_scaled`, `y_train_resampled`)
    and returns the mean F1 of the positive class over the five folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean F1 score across the folds (the study maximises this value).
    """
    # Regularisation strength, sampled on a logarithmic scale.
    C = trial.suggest_float('C', 0.01, 20, log=True)
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])
    kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'sigmoid'])
    # degree / coef0 are only sampled for the kernels that use them; otherwise
    # the fixed fallback values below are passed.
    degree = trial.suggest_int('degree', 2, 5) if kernel == 'poly' else 3
    coef0 = trial.suggest_float('coef0', 0.0, 10.0) if kernel in ['poly', 'sigmoid'] else 0.0
    class_weight = trial.suggest_categorical('class_weight', [None, 'balanced'])

    # probability=True is intentionally not set during tuning: only predict()
    # is scored here, and enabling probability estimates makes every fit run
    # an additional internal 5-fold calibration without changing predict().
    svm_model = SVC(
        C=C,
        gamma=gamma,
        kernel=kernel,
        degree=degree,
        coef0=coef0,
        class_weight=class_weight,
        random_state=42,
    )

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # cross_val_score selects the fold rows by position, so this does not
    # depend on y_train_resampled carrying a default 0..n-1 index (indexing the
    # Series with `y[train_index]` would be a label lookup).
    # error_score='raise' keeps a failing fit an exception rather than a NaN score.
    fold_scores = cross_val_score(
        svm_model,
        x_train_scaled,
        y_train_resampled,
        scoring='f1',
        cv=skf,
        error_score='raise',
    )

    # Average F1 across all folds
    return np.mean(fold_scores)

# Create and optimize an Optuna study. The TPE sampler is seeded so that a
# fresh Restart & Run All proposes the same sequence of trials.
study_svm = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_svm.optimize(objective_svm, n_trials=40)

# Best hyperparameters found by the study
best_params = study_svm.best_trial.params
print("Best Hyperparameters:", best_params)

# Refit on the full scaled training set with the winning configuration.
# best_params only contains 'degree' / 'coef0' when the winning kernel sampled
# them; otherwise SVC's own defaults apply. probability=True is needed here
# because predict_proba() is called below.
best_svm_model = SVC(**best_params, probability=True, random_state=42)
best_svm_model.fit(x_train_scaled, y_train_resampled)

# Test-set predictions: hard labels and positive-class probabilities.
y_pred_svm = best_svm_model.predict(x_test_scaled)
y_prob = best_svm_model.predict_proba(x_test_scaled)[:, 1]

In [None]:
# Test-set evaluation of the tuned SVM.
print("SVM Test Metrics:")

# Build both summaries first, then print them in the same order as before.
report = classification_report(y_test, y_pred_svm, digits=3)
cm = confusion_matrix(y_test, y_pred_svm)

print('Classification Report:\n', report)
print('Confusion Matrix:\n', cm)

In [None]:
# Heatmap of the SVM confusion matrix on the test set.
cm = confusion_matrix(y_test, y_pred_svm)
ax = sns.heatmap(cm, annot=True, fmt="d", cmap='Blues')
ax.set_title("Confusion Matrix - SVM")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
plt.show()

In [None]:
## XGBoost
def objective_xgb(trial):
    """Optuna objective for XGBoost: mean stratified 5-fold binary F1.

    Samples one XGBClassifier configuration from `trial`, trains a fresh model
    on each training fold of the resampled (unscaled) training data and returns
    the mean F1 of the positive class over the validation folds.
    """
    # Search space; the keys are passed straight to XGBClassifier.
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300, step=20),
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'gamma': trial.suggest_float('gamma', 0, 1, step=0.01),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 3)
    }

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_f1 = []

    for fit_idx, holdout_idx in splitter.split(x_train_resampled, y_train_resampled):
        # Positional selection of this fold's rows.
        x_fit, y_fit = x_train_resampled.iloc[fit_idx], y_train_resampled.iloc[fit_idx]
        x_holdout, y_holdout = x_train_resampled.iloc[holdout_idx], y_train_resampled.iloc[holdout_idx]

        # A new booster per fold, always with the same seed.
        booster = XGBClassifier(**search_space, random_state=42)
        booster.fit(x_fit, y_fit)

        holdout_pred = booster.predict(x_holdout)
        fold_f1.append(f1_score(y_holdout, holdout_pred, average='binary'))

    # Average F1 over the five folds
    return np.mean(fold_f1)

# Create and optimize an Optuna study. The TPE sampler is seeded so that a
# fresh Restart & Run All proposes the same sequence of trials.
study_xgb = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xgb.optimize(objective_xgb, n_trials=40)

# Best hyperparameters found by the study
best_params = study_xgb.best_trial.params
print("Best Hyperparameters:", best_params)

# Train the final model on the full resampled training set with those settings.
final_xgb_model = XGBClassifier(**best_params, random_state=42)
final_xgb_model.fit(x_train_resampled, y_train_resampled)

# Predict the held-out test set (unscaled features, like the training data).
y_pred_xgb = final_xgb_model.predict(x_test)

In [None]:
# Test-set evaluation of the tuned XGBoost model.
print("XGBoost Test Metrics:")

# Build both summaries first, then print them in the same order as before.
report = classification_report(y_test, y_pred_xgb, digits=3)
cm = confusion_matrix(y_test, y_pred_xgb)

print('Classification Report:\n', report)
print('Confusion Matrix:\n', cm)

In [None]:
# Confusion matrix for XGBoost on the test set
cm = confusion_matrix(y_test, y_pred_xgb)

# Draw it as an annotated heatmap on an explicit 8x6 figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Confusion Matrix - XGBoost")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
fig.tight_layout()

plt.show()


In [None]:
## Random Forest
def objective_rf(trial):
    """Optuna objective for the Random Forest: mean stratified 5-fold weighted F1.

    Samples one RandomForestClassifier configuration from `trial`,
    cross-validates it on the resampled (unscaled) training data and returns
    the mean class-weighted F1 over the five folds.

    NOTE(review): this objective scores with average='weighted', whereas the
    SVM and XGBoost objectives score the positive class only; confirm that the
    difference is intended.
    """
    # Search space; the keys are passed straight to RandomForestClassifier.
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300, step=20),
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10, step=2),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 10, step=2),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
    }

    forest = RandomForestClassifier(**search_space, random_state=42)
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # One weighted-F1 score per fold; every fold is fitted from scratch with
    # the same seed. error_score='raise' keeps a failing fit an exception.
    fold_f1 = cross_val_score(
        forest,
        x_train_resampled,
        y_train_resampled,
        scoring='f1_weighted',
        cv=splitter,
        error_score='raise',
    )

    # Average F1 over the five folds
    return np.mean(fold_f1)

# Create and optimize an Optuna study. The TPE sampler is seeded so that a
# fresh Restart & Run All proposes the same sequence of trials.
study_rf = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_rf.optimize(objective_rf, n_trials=40)

# Best hyperparameters found by the study
best_params = study_rf.best_trial.params
print("Best Hyperparameters:", best_params)

# Train the final model on the full resampled training set with those settings.
final_rf_model = RandomForestClassifier(**best_params, random_state=42)
final_rf_model.fit(x_train_resampled, y_train_resampled)

# Predict the held-out test set (unscaled features, like the training data).
y_pred_rf = final_rf_model.predict(x_test)

In [None]:
# Test-set evaluation of the tuned Random Forest.
print("Random Forest Test Metrics:")

# Build both summaries first, then print them in the same order as before.
report = classification_report(y_test, y_pred_rf, digits=3)
cm = confusion_matrix(y_test, y_pred_rf)

print('Classification Report:\n', report)
print('Confusion Matrix:\n', cm)

In [None]:
# Confusion matrix for Random Forest on the test set
cm = confusion_matrix(y_test, y_pred_rf)

# Draw it as an annotated heatmap on an explicit 8x6 figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Confusion Matrix - Random Forest")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
fig.tight_layout()

plt.show()


In [None]:
## Tabnet
# Stratified, shuffled and seeded 5-fold splitter shared by every TabNet trial.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Adam learning rate; held constant rather than tuned.
fixed_learning_rate = 0.001

def objective_tabnet(trial):
    """Optuna objective for TabNet: mean stratified 5-fold weighted F1.

    Samples the mask type, layer widths and batch sizes from `trial`, trains a
    fresh TabNetClassifier on each training fold of the scaled, resampled
    training data and returns the mean class-weighted F1 over the validation
    folds.

    NOTE(review): this objective scores with average='weighted', whereas the
    SVM and XGBoost objectives score the positive class only; confirm that the
    difference is intended.
    """
    # Hyperparameters to be tuned by Optuna
    mask_type = trial.suggest_categorical("mask_type", ["sparsemax", "entmax"])
    n_d = trial.suggest_int("n_d", 8, 64)
    n_a = trial.suggest_int("n_a", 8, 64)
    batch_size = trial.suggest_categorical("batch_size", [64, 128, 256])
    virtual_batch_size = trial.suggest_categorical("virtual_batch_size", [32, 64, 128])

    # Plain NumPy labels: fold indices are positions, and indexing a pandas
    # Series with `y[train_index]` would be a label lookup that only works
    # while the Series carries a default 0..n-1 index.
    y_all = np.asarray(y_train_resampled)

    f1_scores = []

    for train_index, val_index in skf.split(x_train_scaled, y_all):
        x_train_fold, x_val_fold = x_train_scaled[train_index], x_train_scaled[val_index]
        y_train_fold, y_val_fold = y_all[train_index], y_all[val_index]

        # New model per fold, trained with the fixed learning rate.
        model = TabNetClassifier(optimizer_fn=torch.optim.Adam,
                                 optimizer_params=dict(lr=fixed_learning_rate),
                                 mask_type=mask_type,
                                 n_d=n_d,
                                 n_a=n_a)

        # Early stopping is monitored on the validation fold's AUC.
        model.fit(X_train=x_train_fold, y_train=y_train_fold,
                  eval_set=[(x_val_fold, y_val_fold)],
                  eval_metric=['auc'],
                  max_epochs=50,
                  patience=10,
                  batch_size=batch_size,
                  virtual_batch_size=virtual_batch_size,
                  num_workers=0,
                  drop_last=False)

        # Score this fold
        preds = model.predict(x_val_fold)
        f1 = f1_score(y_val_fold, preds, average='weighted')
        f1_scores.append(f1)

    return np.mean(f1_scores)


# Create and optimize an Optuna study. The TPE sampler is seeded so that a
# fresh Restart & Run All proposes the same sequence of trials.
study_tabnet = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_tabnet.optimize(objective_tabnet, n_trials=40)

# Best hyperparameters
best_params = study_tabnet.best_trial.params
print("Best Hyperparameters:", best_params)

# Train the final model with the best hyperparameters. The learning rate is
# taken from fixed_learning_rate so it cannot drift from the value used while
# tuning.
final_model = TabNetClassifier(optimizer_fn=torch.optim.Adam,
                               optimizer_params=dict(lr=fixed_learning_rate),
                               mask_type=best_params['mask_type'],
                               n_d=best_params['n_d'],
                               n_a=best_params['n_a'])

# NOTE(review): the eval_set here is the training data itself, so the
# patience-based early stopping is monitored on training AUC, not on held-out
# data. Confirm this is intended.
final_model.fit(X_train=x_train_scaled, y_train=y_train_resampled,
                eval_set=[(x_train_scaled, y_train_resampled)],
                eval_metric=['auc'],
                max_epochs=50,
                patience=10,
                batch_size=best_params['batch_size'],
                virtual_batch_size=best_params['virtual_batch_size'],
                num_workers=0,
                drop_last=False)

# Predict the scaled test set
y_pred_tabnet = final_model.predict(x_test_scaled)

In [None]:
# Test-set evaluation of the tuned TabNet model.
print("TabNet Test Metrics:")

# Build both summaries first, then print them in the same order as before.
report = classification_report(y_test, y_pred_tabnet, digits=3)
cm = confusion_matrix(y_test, y_pred_tabnet)

print('Classification Report:\n', report)
print('Confusion Matrix:\n', cm)


In [None]:
# Confusion matrix for TabNet on the test set
cm = confusion_matrix(y_test, y_pred_tabnet)

# Draw it as an annotated heatmap on an explicit 8x6 figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Confusion Matrix - TabNet")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
fig.tight_layout()

plt.show()


In [None]:
# Feature importance from a Random Forest.
# NOTE(review): this fits a fresh default-parameter forest on the original
# (non-resampled, unscaled) training split rather than reusing the tuned
# final_rf_model. Confirm this is intended.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(x_train, y_train)

# Importances and the order that sorts them from largest to smallest.
importances = rf_classifier.feature_importances_
feature_names = features.columns
sorted_indices = np.argsort(importances)[::-1]

# One bar per feature, coloured along the viridis colormap.
n_features = x_train.shape[1]
bar_positions = range(n_features)
colors = plt.cm.viridis(np.linspace(0, 1, n_features))

plt.figure(figsize=(10, 6))
plt.bar(bar_positions, importances[sorted_indices], color=colors, align='center')
plt.xticks(bar_positions, feature_names[sorted_indices], rotation=90)
plt.title("Feature Importances")
plt.tight_layout()

plt.show()