In [14]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

# Machine Learning - Optuna
import optuna
from optuna.samplers import TPESampler
from sklearn.metrics import f1_score, roc_auc_score

In [None]:
train = pd.read_csv('data/df_train_1.csv')
test = pd.read_csv('data/df_test_1.csv')

X_train = train.drop(columns=['FraudFound_P'])
y_train = train['FraudFound_P']
X_test = test.drop(columns=['FraudFound_P'])
y_test = test['FraudFound_P']

In [15]:
def objective(trial):
    classifier_name = trial.suggest_categorical('classifier', [
        'LogisticRegression', 'SVM', 'KNeighbors', 'RandomForest', 'LightGBM', 'CatBoost'
    ])
    
    if classifier_name == 'LogisticRegression':
        C = trial.suggest_float('lr_C', 0.1, 10)
        solver = trial.suggest_categorical('lr_solver', ['liblinear'])
        classifier_obj = LogisticRegression(C=C, solver=solver)
    
    elif classifier_name == 'SVM':
        C = trial.suggest_float('svm_C', 0.1, 10)
        kernel = trial.suggest_categorical('svm_kernel', ['linear', 'rbf'])
        classifier_obj = SVC(C=C, kernel=kernel, probability=True)
    
    elif classifier_name == 'KNeighbors':
        n_neighbors = trial.suggest_int('kn_n_neighbors', 3, 7)
        classifier_obj = KNeighborsClassifier(n_neighbors=n_neighbors)
    
    elif classifier_name == 'RandomForest':
        n_estimators = trial.suggest_int('rf_n_estimators', 100, 200)
        max_depth = trial.suggest_categorical('rf_max_depth', [10, 20, None])
        classifier_obj = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
    
    elif classifier_name == 'LightGBM':
        num_leaves = trial.suggest_int('lgb_num_leaves', 31, 50)
        learning_rate = trial.suggest_float('lgb_learning_rate', 0.01, 0.2)
        n_estimators = trial.suggest_int('lgb_n_estimators', 100, 200)
        classifier_obj = lgb.LGBMClassifier(num_leaves=num_leaves, learning_rate=learning_rate, n_estimators=n_estimators, verbose=-1)
    
    elif classifier_name == 'CatBoost':
        iterations = trial.suggest_int('cat_iterations', 100, 200)
        learning_rate = trial.suggest_float('cat_learning_rate', 0.01, 0.2)
        depth = trial.suggest_int('cat_depth', 3, 9)
        classifier_obj = CatBoostClassifier(iterations=iterations, learning_rate=learning_rate, depth=depth, verbose=0)
    
    classifier_obj.fit(X_train, y_train)
    y_prob = classifier_obj.predict_proba(X_test)[:, 1]
    
    # Threshold optimization
    thresholds = np.arange(0.1, 0.9, 0.01)
    best_threshold = 0.5
    best_score = 0.0
    
    for threshold in thresholds:
        y_pred = (y_prob >= threshold).astype(int)
        score = f1_score(y_test, y_pred)
        if score > best_score:
            best_score = score
            best_threshold = threshold
    
    trial.set_user_attr('best_threshold', best_threshold)
    trial.set_user_attr('classifier_obj', classifier_obj)
    return roc_auc_score(y_test, (y_prob >= best_threshold).astype(int))

In [16]:
# Optuna Study
study = optuna.create_study(direction='maximize', sampler=TPESampler())
study.optimize(objective, n_trials=100)

[I 2024-11-04 13:36:02,576] A new study created in memory with name: no-name-0a269b6f-9ebc-41c1-bf4b-e514a4bcef22
[I 2024-11-04 13:36:03,574] Trial 0 finished with value: 0.704069190630183 and parameters: {'classifier': 'LightGBM', 'lgb_num_leaves': 49, 'lgb_learning_rate': 0.04513541589244335, 'lgb_n_estimators': 112}. Best is trial 0 with value: 0.704069190630183.
[I 2024-11-04 13:36:06,969] Trial 1 finished with value: 0.6229883142357299 and parameters: {'classifier': 'CatBoost', 'cat_iterations': 113, 'cat_learning_rate': 0.15001887596073143, 'cat_depth': 5}. Best is trial 0 with value: 0.704069190630183.
[I 2024-11-04 13:36:07,919] Trial 2 finished with value: 0.6545012533985657 and parameters: {'classifier': 'LightGBM', 'lgb_num_leaves': 38, 'lgb_learning_rate': 0.1648279313499861, 'lgb_n_estimators': 139}. Best is trial 0 with value: 0.704069190630183.
[I 2024-11-04 13:36:10,222] Trial 3 finished with value: 0.6647064169462514 and parameters: {'classifier': 'CatBoost', 'cat_iter

In [17]:
best_trial = study.best_trial
print('Best trial: ', best_trial.values)
print('Best hyperparameters: ', best_trial.params)

Best trial:  [0.7526412250740301]
Best hyperparameters:  {'classifier': 'LightGBM', 'lgb_num_leaves': 33, 'lgb_learning_rate': 0.010083903457449599, 'lgb_n_estimators': 146}


In [18]:
# Optimized Alogrithm Combination
best_algorithms = [trial.params['classifier'] for trial in study.trials]
algorithm_counts = pd.Series(best_algorithms).value_counts()
print('Optimal algorithm proportions: ', algorithm_counts)

Optimal algorithm proportions:  LightGBM              67
CatBoost               8
LogisticRegression     7
RandomForest           6
SVM                    6
KNeighbors             6
Name: count, dtype: int64
