In [1]:
# Define a function to train the model
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
from sklearn.model_selection import train_test_split
from optuna import Trial, visualization
import os
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss

In [2]:
def score(solution: np.ndarray, submission: np.ndarray, min_tpr: float=0.80) -> float:
    v_gt = abs(solution-1)
    v_pred = np.array([1.0 - x for x in submission])
    max_fpr = abs(1-min_tpr)
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    return(partial_auc)

def pauc_80(preds, data):
    score_value = score(data.get_label(), preds, min_tpr=0.8)
    return 'pauc_80', score_value, True

In [3]:
# Setting up features and target
df_raw = pd.read_csv("/kaggle/input/isic-2024-challenge/train-metadata.csv", low_memory=False)
df_test1 = df_raw.drop(['patient_id', 'age_approx', 'sex', 'image_type', 'tbp_lv_Aext', 'tbp_lv_Bext', 'tbp_lv_Cext', 'tbp_lv_Hext', 'tbp_lv_Lext', 'tbp_lv_stdLExt', 'tbp_lv_color_std_mean', 'tbp_lv_deltaA', 'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLBnorm', 'tbp_lv_eccentricity', 'tbp_lv_location', 'tbp_lv_location_simple', 'tbp_lv_radial_color_std_max', 'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z', 'attribution', 'copyright_license', 'lesion_id', 'iddx_full', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5', 'mel_mitotic_index', 'mel_thick_mm', 'tbp_lv_dnn_lesion_confidence', 'tbp_lv_deltaLB'], axis=1)
df_test1['anatom_site_general'] = df_test1['anatom_site_general'].fillna('Unknown')
df_test1['anatom_site_general'] = df_test1['anatom_site_general'].replace({'lower extremity': 0, 'head/neck': 1, 'posterior torso': 2, 'anterior torso': 3, 'upper extremity': 4, 'Unknown': 5}, regex=True)
df_test1['tbp_tile_type'] = df_test1['tbp_tile_type'].replace({'3D: white': 0, '3D: XP': 1}, regex=True)
cat_columns = ['anatom_site_general', 'tbp_tile_type']
columns_list = df_test1[cat_columns].columns.tolist()

data = df_test1.columns.values.tolist()
print (data)

exclude = ['isic_id', 'target']
input = [i for i in data if i not in exclude]

X = df_test1[input]
y = df_test1['target']

  df_test1['anatom_site_general'] = df_test1['anatom_site_general'].replace({'lower extremity': 0, 'head/neck': 1, 'posterior torso': 2, 'anterior torso': 3, 'upper extremity': 4, 'Unknown': 5}, regex=True)


['isic_id', 'target', 'anatom_site_general', 'clin_size_long_diam_mm', 'tbp_tile_type', 'tbp_lv_A', 'tbp_lv_B', 'tbp_lv_C', 'tbp_lv_H', 'tbp_lv_L', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio', 'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence', 'tbp_lv_norm_border', 'tbp_lv_norm_color', 'tbp_lv_perimeterMM', 'tbp_lv_stdL']


  df_test1['tbp_tile_type'] = df_test1['tbp_tile_type'].replace({'3D: white': 0, '3D: XP': 1}, regex=True)


In [4]:
# Create Dataset for LightGBM
train_df_dataset = lgb.Dataset(
X,
y,
categorical_feature=[0,2],  # Ensure categorical features are correctly specified
free_raw_data=False
)

In [5]:
# Lgb parameters
lgb_params = {
'objective': 'binary',
'metric': 'none',  # Use a standard metric for evaluation
'verbose': -1,
'learning_rate': 0.01,  # Increase if model is converging too slowly
'num_leaves': 10,  # Reduce for simpler models
'min_data_in_leaf': 90,  # Increase to prevent overfitting
'pos_bagging_fraction': 0.9,  # Adjust based on variance
'neg_bagging_fraction': 0.05,  # Adjust based on variance
'bagging_freq': 1,  # Reduce or disable if bagging is not helping
'feature_fraction': 1,  # Increase to use more features
'lambda_l1': 0.25,  # Keep low or 0 if L1 regularization is not needed
'lambda_l2': 0.1,  # Keep low or 0 if L2 regularization is not needed
'n_jobs' : 4
} 

In [6]:
kf = StratifiedKFold()

cv_results = lgb.cv(
    lgb_params,
    train_df_dataset,
    folds=kf.split(X, y),
    feval=pauc_80,
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50), 
        lgb.log_evaluation(30)
    ],
    stratified=True,
    return_cvbooster=True,
)

Training until validation scores don't improve for 50 rounds
[30]	cv_agg's valid pauc_80: 0.137182 + 0.0182989
[60]	cv_agg's valid pauc_80: 0.138861 + 0.016559
[90]	cv_agg's valid pauc_80: 0.141743 + 0.0168614
[120]	cv_agg's valid pauc_80: 0.143686 + 0.0173565
[150]	cv_agg's valid pauc_80: 0.144618 + 0.0169869
[180]	cv_agg's valid pauc_80: 0.145382 + 0.0166903
[210]	cv_agg's valid pauc_80: 0.145857 + 0.0166322
[240]	cv_agg's valid pauc_80: 0.146223 + 0.0165019
[270]	cv_agg's valid pauc_80: 0.146335 + 0.0165122
[300]	cv_agg's valid pauc_80: 0.146901 + 0.0160785
[330]	cv_agg's valid pauc_80: 0.147281 + 0.0158088
[360]	cv_agg's valid pauc_80: 0.147412 + 0.0158728
[390]	cv_agg's valid pauc_80: 0.147769 + 0.0156686
[420]	cv_agg's valid pauc_80: 0.147879 + 0.0156375
[450]	cv_agg's valid pauc_80: 0.147904 + 0.0156029
[480]	cv_agg's valid pauc_80: 0.148098 + 0.0152845
[510]	cv_agg's valid pauc_80: 0.148209 + 0.0153107
[540]	cv_agg's valid pauc_80: 0.148383 + 0.0154425
[570]	cv_agg's valid pauc

In [7]:
df_test_real = pd.read_csv("/kaggle/input/isic-2024-challenge/test-metadata.csv", low_memory=False)
df_test = df_test_real.drop(['isic_id', 'patient_id', 'age_approx', 'sex', 'image_type', 'tbp_lv_Aext', 'tbp_lv_Bext', 'tbp_lv_Cext', 'tbp_lv_Hext', 'tbp_lv_Lext', 'tbp_lv_stdLExt', 'tbp_lv_color_std_mean', 'tbp_lv_deltaA', 'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLBnorm', 'tbp_lv_eccentricity', 'tbp_lv_location', 'tbp_lv_location_simple', 'tbp_lv_radial_color_std_max', 'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z', 'attribution', 'copyright_license', 'tbp_lv_deltaLB'], axis=1)
df_test['anatom_site_general'] = df_test['anatom_site_general'].fillna('Unknown')
df_test['anatom_site_general'] = df_test['anatom_site_general'].replace({'lower extremity': 0, 'head/neck': 1, 'posterior torso': 2, 'anterior torso': 3, 'upper extremity': 4, 'Unknown': 5}, regex=True)
df_test['tbp_tile_type'] = df_test['tbp_tile_type'].replace({'3D: white': 0, '3D: XP': 1}, regex=True)
df_temp = df_test_real.drop(['patient_id', 'age_approx', 'sex', 'image_type', 'tbp_lv_Aext', 'tbp_lv_Bext', 'tbp_lv_Cext', 'tbp_lv_Hext', 'tbp_lv_Lext', 'tbp_lv_stdLExt', 'tbp_lv_color_std_mean', 'tbp_lv_deltaA', 'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLBnorm', 'tbp_lv_eccentricity', 'tbp_lv_location', 'tbp_lv_location_simple', 'tbp_lv_radial_color_std_max', 'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z', 'attribution', 'copyright_license', 'tbp_lv_deltaLB'], axis=1)
                             
data = df_test.columns.values.tolist()
print (data)

['anatom_site_general', 'clin_size_long_diam_mm', 'tbp_tile_type', 'tbp_lv_A', 'tbp_lv_B', 'tbp_lv_C', 'tbp_lv_H', 'tbp_lv_L', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio', 'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence', 'tbp_lv_norm_border', 'tbp_lv_norm_color', 'tbp_lv_perimeterMM', 'tbp_lv_stdL']


  df_test['anatom_site_general'] = df_test['anatom_site_general'].replace({'lower extremity': 0, 'head/neck': 1, 'posterior torso': 2, 'anterior torso': 3, 'upper extremity': 4, 'Unknown': 5}, regex=True)
  df_test['tbp_tile_type'] = df_test['tbp_tile_type'].replace({'3D: white': 0, '3D: XP': 1}, regex=True)


In [8]:
models = cv_results['cvbooster'].boosters
pred_per_cv = [model.predict(df_test) for model in models]
pred_average = np.array(pred_per_cv).mean(axis=0)

In [9]:
submission_df = pd.DataFrame({'isic_id':df_temp['isic_id'].tolist(), 'target':pred_average})
submission_df.to_csv("submission.csv", index=False)
print (submission_df.head())

        isic_id    target
0  ISIC_0015657  0.001688
1  ISIC_0015729  0.000519
2  ISIC_0015740  0.002844
