In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import OrdinalEncoder

import optuna
import lightgbm as lgb
import xgboost as xgb
from tqdm.notebook import tqdm


In [2]:
# Configs
BASE_PATH = '/Users/AnshulSrivastava/Desktop/Fall24/CMSE 492/Project/isic-2024-challenge/'
RANDOM_SEED = 69
N_FOLDS = 5

In [3]:
# set random seed
np.random.seed(RANDOM_SEED)

In [4]:
# Load data
metadata = pd.read_csv('metadata_with_cnn.csv')

metadata = metadata.drop(columns='kfold', axis=1)

In [5]:
# Add noise to the CNN confidence
noise_level = 0.02

metadata['cnn_confidence'] = metadata['cnn_confidence'] + np.random.normal(0, noise_level, metadata.shape[0])
metadata['cnn_confidence'] = metadata['cnn_confidence'].clip(0, 1)

In [6]:
# Add folds
metadata['kfold'] = -1
stratified_kfold = StratifiedGroupKFold(n_splits=N_FOLDS)

for fold, (train_idx, val_idx) in enumerate(stratified_kfold.split(X=metadata, y=metadata['target'], groups=metadata['patient_id'])):
    metadata.loc[val_idx, 'kfold'] = fold

In [7]:
# Ordinal encode 'age_group'
metadata['age_group'] = metadata['age_group'].fillna('nan')
ordinal_encoder = OrdinalEncoder()
metadata['age_group'] = ordinal_encoder.fit_transform(metadata['age_group'].values.reshape(-1, 1))

In [8]:
train_cols = metadata.columns.to_list()
train_cols.remove('target')
train_cols.remove('patient_id')
train_cols.remove('kfold')
train_cols.remove('isic_id')


In [9]:
def custom_lgbm_metric(y_true, y_pred):
    min_tpr = 0.80
    max_fpr = abs(1 - min_tpr)

    # Convert y_true to 0/1 format for binary classification
    v_gt = abs(y_true - 1)
    v_pred = np.array([1.0 - x for x in y_pred])

    # Calculate partial AUC
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

    # Return the metric name, value, and maximization indicator
    return 'pAUC', partial_auc, True

def partial_auc_score(y_true, y_pred):
    min_tpr = 0.80
    max_fpr = abs(1 - min_tpr)

    # Convert y_true to 0/1 format for binary classification
    v_gt = abs(y_true - 1)
    v_pred = np.array([1.0 - x for x in y_pred])

    # Calculate partial AUC
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

    # Return the metric name, value, and maximization indicator
    return partial_auc

# LGBM Classifier

In [17]:
# Define objective function

def objective(trial):
    # Define hyperparameters
    param = {
        "objective": "binary",
        "metric": "custom",
        "verbosity": -1,
        "boosting_type": "gbdt",
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "device": "cpu",
        'random state': RANDOM_SEED
    }

    # Initialize list to store AUC scores
    aucs = []

    # Iterate over folds
    for fold in range(N_FOLDS):
        # Get train and validation data
        train_data = metadata[metadata['kfold'] != fold].reset_index(drop=True)
        val_data = metadata[metadata['kfold'] == fold].reset_index(drop=True)

        # Define features and target
        features = train_data[train_cols]
        target = train_data['target']

        dtrain = lgb.Dataset(features, target)

        # Train model
        model = lgb.train(param, dtrain,)
        preds = model.predict(val_data[train_cols])
        auc = partial_auc_score(val_data['target'], preds)
        aucs.append(auc)

    return np.mean(aucs)

In [18]:
# Define study
study = optuna.create_study(direction="maximize", study_name="lgbm_optimization")
study.optimize(objective, n_trials=20)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-11-22 22:41:39,705] A new study created in memory with name: lgbm_optimization
[I 2024-11-22 22:41:43,731] Trial 0 finished with value: 0.1601477163166451 and parameters: {'learning_rate': 0.02940714881841564, 'lambda_l1': 1.5856234362030003e-07, 'lambda_l2': 0.5499446320731312, 'num_leaves': 99, 'feature_fraction': 0.5873137275143606, 'bagging_fraction': 0.45172514662065333, 'bagging_freq': 2, 'min_child_samples': 14}. Best is trial 0 with value: 0.1601477163166451.
[I 2024-11-22 22:41:47,772] Trial 1 finished with value: 0.1586062563423794 and parameters: {'learning_rate': 0.07855810851734218, 'lambda_l1': 0.045815518854756654, 'lambda_l2': 1.604684760338536e-08, 'num_leaves': 112, 'feature_fraction': 0.8202811046063471, 'bagging_fraction': 0.7202837813953901, 'bagging_freq': 5, 'min_child_samples': 96}. Best is trial 0 with value: 0.1601477163166451.
[I 2024-11-22 22:41:51,003] Trial 2 finished with value: 0.15576599279103423 and parameters: {'learning_rate': 0.0003169504132

Number of finished trials: 20
Best trial:
  Value: 0.16247064681217246
  Params: 
    learning_rate: 0.05393530722692628
    lambda_l1: 0.5906720640598918
    lambda_l2: 0.9079354829661402
    num_leaves: 35
    feature_fraction: 0.486237214335383
    bagging_fraction: 0.8581182837877586
    bagging_freq: 4
    min_child_samples: 80


In [19]:
trial.params

{'learning_rate': 0.05393530722692628,
 'lambda_l1': 0.5906720640598918,
 'lambda_l2': 0.9079354829661402,
 'num_leaves': 35,
 'feature_fraction': 0.486237214335383,
 'bagging_fraction': 0.8581182837877586,
 'bagging_freq': 4,
 'min_child_samples': 80}

In [20]:
# Define best model

best_params = trial.params
best_params['objective'] = 'binary'
best_params['metric'] = 'custom'
best_params['verbosity'] = -1
best_params['boosting_type'] = 'gbdt'
best_params['device'] = 'cpu'
best_params['random state'] = RANDOM_SEED
best_params['n_estimators'] = 1000

# Initialize list to store AUC scores
aucs = []
models = []

# Iterate over folds
for fold in tqdm(range(N_FOLDS), total=N_FOLDS):
    # Get train and validation data
    train_data = metadata[metadata['kfold'] != fold].reset_index(drop=True)
    val_data = metadata[metadata['kfold'] == fold].reset_index(drop=True)

    # Define features and target
    features = train_data[train_cols]
    target = train_data['target']

    model = VotingClassifier([(f"lgb_{i}", lgb.LGBMClassifier(random_state=i, **best_params)) for i in range(3)], voting="soft")
    model.fit(features, target)
    # Predict probabilities for validation data
    preds_proba = model.predict_proba(val_data[train_cols])[:, 1]  # Probability of the positive class
    auc = partial_auc_score(val_data['target'], preds_proba)
    print(f"Fold: {fold+1} - Partial AUC Score: {auc:.5f}")
    aucs.append(auc)
    models.append(model)

  0%|          | 0/5 [00:00<?, ?it/s]

Fold: 1 - Partial AUC Score: 0.15600
Fold: 2 - Partial AUC Score: 0.16550
Fold: 3 - Partial AUC Score: 0.15009
Fold: 4 - Partial AUC Score: 0.16524
Fold: 5 - Partial AUC Score: 0.16908


In [21]:
# Mean AUC score
mean_auc = np.mean(aucs)
print(f"Mean Partial AUC for LGBM: {mean_auc:.5f}")

Mean Partial AUC for LGBM: 0.16118


In [23]:
from joblib import dump

# Save models
for i, model in enumerate(models):
    dump(model, f"lgb_model_{i}.joblib")

# XGBoost

In [24]:
# Define objective function for XGBoost

def objective(trial):

    param = {
        "objective": "binary:logistic",
        "eval_metric": "auc",  # AUC will still be tracked as an eval metric
        "booster": "gbtree",
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.1, log=True),
        "lambda": trial.suggest_float("lambda", 1e-8, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 10.0, log=True),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "subsample": trial.suggest_float("subsample", 0.4, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 100),
        "random_state": RANDOM_SEED
    }

    aucs = []

    for fold in range(N_FOLDS):
        train_data = metadata[metadata['kfold'] != fold].reset_index(drop=True)
        val_data = metadata[metadata['kfold'] == fold].reset_index(drop=True)

        features = train_data[train_cols]
        target = train_data['target']

        model = xgb.XGBClassifier(**param)
        # Train XGBoost model
        dtrain = xgb.DMatrix(features, label=target)
        dval = xgb.DMatrix(val_data[train_cols], label=val_data['target'])
        evals = [(dval, 'valid')]
        
        model = xgb.train(param, dtrain, evals=evals, num_boost_round=1000,
                          early_stopping_rounds=50, verbose_eval=False)
        
        preds = model.predict(dval)
        auc = partial_auc_score(val_data['target'], preds)
        aucs.append(auc)
    
    return np.mean(aucs)

In [26]:
# Define study
study = optuna.create_study(direction="maximize", study_name="xgb_optimization")
study.optimize(objective, n_trials=20)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-11-22 22:47:22,644] A new study created in memory with name: xgb_optimization
[I 2024-11-22 22:47:32,642] Trial 0 finished with value: 0.15971507395240503 and parameters: {'learning_rate': 0.011546540000196066, 'lambda': 2.240993966620505, 'alpha': 1.0929072965331964e-06, 'max_depth': 9, 'subsample': 0.7858816807755473, 'colsample_bytree': 0.5474771462223389, 'min_child_weight': 84}. Best is trial 0 with value: 0.15971507395240503.
[I 2024-11-22 22:47:34,661] Trial 1 finished with value: 0.14532371615820266 and parameters: {'learning_rate': 0.0009127325431013793, 'lambda': 2.826464166115527e-07, 'alpha': 0.00015856838250149165, 'max_depth': 4, 'subsample': 0.5757727836828336, 'colsample_bytree': 0.6056020553581936, 'min_child_weight': 99}. Best is trial 0 with value: 0.15971507395240503.
[I 2024-11-22 22:47:50,542] Trial 2 finished with value: 0.15935380276061323 and parameters: {'learning_rate': 0.004812012390943991, 'lambda': 1.3158725588979303e-05, 'alpha': 0.000575718461407

Number of finished trials: 20
Best trial:
  Value: 0.16312132150405126
  Params: 
    learning_rate: 0.024387184674775072
    lambda: 2.7537038054884306e-08
    alpha: 0.22390734815967694
    max_depth: 7
    subsample: 0.6786181909732795
    colsample_bytree: 0.41387785641011565
    min_child_weight: 1


In [27]:
# Define best model
best_params = trial.params
best_params['objective'] = 'binary:logistic'
best_params['eval_metric'] = 'auc'
best_params['booster'] = 'gbtree'
best_params['n_estimators'] = 1000

# Initialize list to store AUC scores and models for each fold
aucs = []
models = []

# Iterate over folds
for fold in tqdm(range(N_FOLDS)):
    # Get train and validation data for this fold
    train_data = metadata[metadata['kfold'] != fold].reset_index(drop=True)
    val_data = metadata[metadata['kfold'] == fold].reset_index(drop=True)

    # Define features and target
    X_train, y_train = train_data[train_cols], train_data['target']
    X_val, y_val = val_data[train_cols], val_data['target']

    model = VotingClassifier([(f"xgb_{i}", xgb.XGBClassifier(random_state=i, **best_params)) for i in range(3)], voting="soft")
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_val)[:, 1]
    # Calculate AUC score
    auc = partial_auc_score(y_val, preds)
    print(f"Fold {fold} - AUC Score: {auc:.5f}")
    
    # Append results
    aucs.append(auc)
    models.append(model)

# Display average AUC across folds
print(f"Average AUC Score across folds: {np.mean(aucs):.5f}")

  0%|          | 0/5 [00:00<?, ?it/s]

Fold 0 - AUC Score: 0.16119
Fold 1 - AUC Score: 0.16821
Fold 2 - AUC Score: 0.15332
Fold 3 - AUC Score: 0.16266
Fold 4 - AUC Score: 0.16597
Average AUC Score across folds: 0.16227


In [28]:
# Save models

for i, model in enumerate(models):
    dump(model, f"xgb_model_{i}.joblib")