In [34]:
import pandas as pd
import sys
sys.path.append('./src') 
import engineer_features as ef
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    roc_auc_score,
    ConfusionMatrixDisplay
)
import optuna

In [32]:
df = pd.read_csv('Fraudulent_E-Commerce_Transaction_Data_2.csv')

In [35]:
X_train, X_test, y_train, y_test = ef.engineer_features(df, use_linear_model=False, use_smote=False)

Strategy: Using ORIGINAL amount. Dropping 'log_transaction_amount'.
Feature engineering complete.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['customer_avg_spend_before_tx'].fillna(0, inplace=True)


ValueError: too many values to unpack (expected 4)

In [None]:
def objective(trial):
    # Define the hyperparameters to search
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 10, 30, log=True),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
    }
    
    # Create the model with the suggested params
    model = RandomForestClassifier(
        **params,
        class_weight='balanced',
        random_state=ef.RANDOM_SEED,
        n_jobs=-1
    )
    
    # Get the cross-validation score (on the training data)
    score = cross_val_score(model, X_train, y_train, cv=3, scoring='roc_auc', n_jobs=-1)
    # Return the mean score for this trial
    return score.mean()

In [None]:
# Maximise ROC-AUC Score
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20) # Run 20 trials

print(f"Best ROC-AUC: {study.best_value:.4f}")
print("Best Parameters:")
print(study.best_params)

best_rf_model = RandomForestClassifier(
    **study.best_params,  # Use the best params found by Optuna
    random_state=ef.RANDOM_SEED,
    n_jobs=-1
)
best_rf_model.fit(X_train, y_train)

In [None]:
print("\n--- Evaluation of Best Model (Test Set) ---")
y_pred = best_rf_model.predict(X_test)
y_pred_proba = best_rf_model.predict_proba(X_test)[:, 1]

# Metrics
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"Accuracy (Test Set): {accuracy:.4f}")
print(f"ROC-AUC Score (Test Set): {roc_auc:.4f}")

# Classification Report
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_pred, target_names=['Not Fraud (0)', 'Fraud (1)']))

# Confusion Matrix
print("\nConfusion Matrix (Test Set):")
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Not Fraud (0)', 'Fraud (1)'])
disp.plot(cmap='Blues')
plt.title("Optuna-Tuned Random Forest - Confusion Matrix (Test Set)")
plt.show()

In [None]:
print("\n--- Top 10 Feature Importances (Tuned Model) ---")
importances = best_rf_model.feature_importances_
feature_names = X_train.columns
feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print(feature_importance_df.head(10))