In [9]:
import numpy as np
import pandas as pd
import pandas.api.types
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold

import wandb
import lightgbm as lgb

In [13]:
def get_df_train():
    df_train = pd.read_csv("../data/train-metadata.csv")

    def feature_engineering(df):
        # New features to try...
        df["lesion_size_ratio"] = df["tbp_lv_minorAxisMM"] / df["clin_size_long_diam_mm"]
        df["lesion_shape_index"] = df["tbp_lv_areaMM2"] / (df["tbp_lv_perimeterMM"] ** 2)
        df["hue_contrast"] = (df["tbp_lv_H"] - df["tbp_lv_Hext"]).abs()
        df["luminance_contrast"] = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs()
        df["lesion_color_difference"] = np.sqrt(df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2)
        df["border_complexity"] = df["tbp_lv_norm_border"] + df["tbp_lv_symm_2axis"]
        df["color_uniformity"] = df["tbp_lv_color_std_mean"] / df["tbp_lv_radial_color_std_max"]
        df["3d_position_distance"] = np.sqrt(df["tbp_lv_x"] ** 2 + df["tbp_lv_y"] ** 2 + df["tbp_lv_z"] ** 2) 
        df["perimeter_to_area_ratio"] = df["tbp_lv_perimeterMM"] / df["tbp_lv_areaMM2"]
        df["lesion_visibility_score"] = df["tbp_lv_deltaLBnorm"] + df["tbp_lv_norm_color"]
        df["combined_anatomical_site"] = df["anatom_site_general"] + "_" + df["tbp_lv_location"]
        df["symmetry_border_consistency"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_norm_border"]
        df["color_consistency"] = df["tbp_lv_stdL"] / df["tbp_lv_Lext"]

        df["size_age_interaction"] = df["clin_size_long_diam_mm"] * df["age_approx"]
        df["hue_color_std_interaction"] = df["tbp_lv_H"] * df["tbp_lv_color_std_mean"]
        df["lesion_severity_index"] = (df["tbp_lv_norm_border"] + df["tbp_lv_norm_color"] + df["tbp_lv_eccentricity"]) / 3
        df["shape_complexity_index"] = df["border_complexity"] + df["lesion_shape_index"]
        df["color_contrast_index"] = df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"] + df["tbp_lv_deltaLBnorm"]
        df["log_lesion_area"] = np.log(df["tbp_lv_areaMM2"] + 1)
        df["normalized_lesion_size"] = df["clin_size_long_diam_mm"] / df["age_approx"]
        df["mean_hue_difference"] = (df["tbp_lv_H"] + df["tbp_lv_Hext"]) / 2
        df["std_dev_contrast"] = np.sqrt((df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2) / 3)
        df["color_shape_composite_index"] = (df["tbp_lv_color_std_mean"] + df["tbp_lv_area_perim_ratio"] + df["tbp_lv_symm_2axis"]) / 3
        df["3d_lesion_orientation"] = np.arctan2(df_train["tbp_lv_y"], df_train["tbp_lv_x"])
        df["overall_color_difference"] = (df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"]) / 3
        df["symmetry_perimeter_interaction"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_perimeterMM"]
        df["comprehensive_lesion_index"] = (df["tbp_lv_area_perim_ratio"] + df["tbp_lv_eccentricity"] + df["tbp_lv_norm_color"] + df["tbp_lv_symm_2axis"]) / 4

        # Taken from: https://www.kaggle.com/code/dschettler8845/isic-detect-skin-cancer-let-s-learn-together
        df["color_variance_ratio"] = df["tbp_lv_color_std_mean"] / df["tbp_lv_stdLExt"]
        df["border_color_interaction"] = df["tbp_lv_norm_border"] * df["tbp_lv_norm_color"]
        df["size_color_contrast_ratio"] = df["clin_size_long_diam_mm"] / df["tbp_lv_deltaLBnorm"]
        df["age_normalized_nevi_confidence"] = df["tbp_lv_nevi_confidence"] / df["age_approx"]
        df["color_asymmetry_index"] = df["tbp_lv_radial_color_std_max"] * df["tbp_lv_symm_2axis"]
        df["3d_volume_approximation"] = df["tbp_lv_areaMM2"] * np.sqrt(df["tbp_lv_x"]**2 + df["tbp_lv_y"]**2 + df["tbp_lv_z"]**2)
        df["color_range"] = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs() + (df["tbp_lv_A"] - df["tbp_lv_Aext"]).abs() + (df["tbp_lv_B"] - df["tbp_lv_Bext"]).abs()
        df["shape_color_consistency"] = df["tbp_lv_eccentricity"] * df["tbp_lv_color_std_mean"]
        df["border_length_ratio"] = df["tbp_lv_perimeterMM"] / (2 * np.pi * np.sqrt(df["tbp_lv_areaMM2"] / np.pi))
        df["age_size_symmetry_index"] = df["age_approx"] * df["clin_size_long_diam_mm"] * df["tbp_lv_symm_2axis"]
        # Until here.
        
        new_num_cols = [
            "lesion_size_ratio", "lesion_shape_index", "hue_contrast",
            "luminance_contrast", "lesion_color_difference", "border_complexity",
            "color_uniformity", "3d_position_distance", "perimeter_to_area_ratio",
            "lesion_visibility_score", "symmetry_border_consistency", "color_consistency",

            "size_age_interaction", "hue_color_std_interaction", "lesion_severity_index", 
            "shape_complexity_index", "color_contrast_index", "log_lesion_area",
            "normalized_lesion_size", "mean_hue_difference", "std_dev_contrast",
            "color_shape_composite_index", "3d_lesion_orientation", "overall_color_difference",
            "symmetry_perimeter_interaction", "comprehensive_lesion_index",
            
            "color_variance_ratio", "border_color_interaction", "size_color_contrast_ratio",
            "age_normalized_nevi_confidence", "color_asymmetry_index", "3d_volume_approximation",
            "color_range", "shape_color_consistency", "border_length_ratio", "age_size_symmetry_index",
        ]
        new_cat_cols = ["combined_anatomical_site"]
        return df, new_num_cols, new_cat_cols

    num_cols = [
        'age_approx', 'clin_size_long_diam_mm', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext', 
        'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L', 
        'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio', 'tbp_lv_color_std_mean', 
        'tbp_lv_deltaA', 'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLB',
        'tbp_lv_deltaLBnorm', 'tbp_lv_eccentricity', 'tbp_lv_minorAxisMM',
        'tbp_lv_nevi_confidence', 'tbp_lv_norm_border', 'tbp_lv_norm_color',
        'tbp_lv_perimeterMM', 'tbp_lv_radial_color_std_max', 'tbp_lv_stdL',
        'tbp_lv_stdLExt', 'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle',
        'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z',
    ]
    df_train[num_cols] = df_train[num_cols].fillna(df_train[num_cols].median())
    df_train, new_num_cols, new_cat_cols = feature_engineering(df_train.copy())
    num_cols += new_num_cols
    # anatom_site_general
    cat_cols = ["sex", "tbp_tile_type", "tbp_lv_location", "tbp_lv_location_simple"] + new_cat_cols
    train_cols = num_cols + cat_cols

    category_encoder = OrdinalEncoder(
        categories='auto',
        dtype=int,
        handle_unknown='use_encoded_value',
        unknown_value=-2,
        encoded_missing_value=-1,
    )

    X_cat = category_encoder.fit_transform(df_train[cat_cols])
    for c, cat_col in enumerate(cat_cols):
        df_train[cat_col] = X_cat[:, c]

    return df_train, train_cols

In [14]:
def comp_score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str, min_tpr: float=0.80):
    v_gt = abs(np.asarray(solution.values)-1)
    v_pred = np.array([1.0 - x for x in submission.values])
    max_fpr = abs(1-min_tpr)
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # change scale from [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]
    # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    return partial_auc


lgb_params =  {
    "objective": "binary",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "n_estimators": 531, # 200,
    'learning_rate': 0.020961882958412847, # 0.05,    
    'lambda_l1': 0.031850406238810657, #0.0004681884533249742, 
    'lambda_l2': 9.352820586794987, # 8.765240856362274, 
    'num_leaves': 60, # 136, 
    'feature_fraction': 0.5270436775267011, # 0.5392005444882538, 
    'bagging_fraction': 0.9745808187865352, # 0.9577412548866563, 
    'bagging_freq': 9, # 6,
    'min_child_samples': 65, # 60,
    "device": "gpu"
}

In [17]:
splits = np.arange(2,11)

splits_df = []

run = wandb.init(project="isic_lesions_24", job_type="lgbm_train_split")

for split in splits:
    df_train, train_cols = get_df_train()
    gkf = StratifiedGroupKFold(n_splits=split)
    df_train["fold"] = -1

    for idx, (train_idx, val_idx) in enumerate(
        gkf.split(df_train, df_train["target"], groups=df_train["patient_id"])
    ):
        df_train.loc[val_idx, "fold"] = idx
    df_train.to_csv(f"../data/splits/stratified_split_num_{split}.csv", index=False)

    scores = []
    models = []
    targets = []
    preds_fold = []
    for fold in range(split):
        _df_train = df_train[df_train["fold"] != fold].reset_index(drop=True)
        _df_valid = df_train[df_train["fold"] == fold].reset_index(drop=True)
        model = lgb.LGBMRegressor(
            **lgb_params
        )
        model.fit(_df_train[train_cols], _df_train["target"])
        preds = model.predict(_df_valid[train_cols])
        score = comp_score(_df_valid[["target"]], pd.DataFrame(preds, columns=["prediction"]), "")
        print(f"fold: {fold} - Partial AUC Score: {score:.5f}")

        targets.extend(_df_valid["target"])
        preds_fold.extend(preds)

        scores.append(score)
        models.append(model)

    score = np.mean(scores)
    score_std = np.std(scores)
    print(f"LGBM Score: {score:.5f}")
    print(f"LGBM Score std: {score_std:.5f}")
    wandb.log({"pAUC": score, "pAUC std": score_std})

VBox(children=(Label(value='0.026 MB of 0.026 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112431147032314, max=1.0…

  df_train = pd.read_csv("../data/train-metadata.csv")


fold: 0 - Partial AUC Score: 0.15837
fold: 1 - Partial AUC Score: 0.14839
LGBM Score: 0.15338
LGBM Score std: 0.00499


  df_train = pd.read_csv("../data/train-metadata.csv")


fold: 0 - Partial AUC Score: 0.15866
fold: 1 - Partial AUC Score: 0.15181
fold: 2 - Partial AUC Score: 0.16515
LGBM Score: 0.15854
LGBM Score std: 0.00545


  df_train = pd.read_csv("../data/train-metadata.csv")


fold: 0 - Partial AUC Score: 0.15851
fold: 1 - Partial AUC Score: 0.14347
fold: 2 - Partial AUC Score: 0.16908
fold: 3 - Partial AUC Score: 0.16307
LGBM Score: 0.15854
LGBM Score std: 0.00947


  df_train = pd.read_csv("../data/train-metadata.csv")


fold: 0 - Partial AUC Score: 0.17203
fold: 1 - Partial AUC Score: 0.14806
fold: 2 - Partial AUC Score: 0.17046
fold: 3 - Partial AUC Score: 0.15567
fold: 4 - Partial AUC Score: 0.15677
LGBM Score: 0.16060
LGBM Score std: 0.00921


  df_train = pd.read_csv("../data/train-metadata.csv")


fold: 0 - Partial AUC Score: 0.14764
fold: 1 - Partial AUC Score: 0.12947
fold: 2 - Partial AUC Score: 0.16224
fold: 3 - Partial AUC Score: 0.16689
fold: 4 - Partial AUC Score: 0.18061
fold: 5 - Partial AUC Score: 0.17656
LGBM Score: 0.16057
LGBM Score std: 0.01748


  df_train = pd.read_csv("../data/train-metadata.csv")


fold: 0 - Partial AUC Score: 0.13381
fold: 1 - Partial AUC Score: 0.15973
fold: 2 - Partial AUC Score: 0.17148
fold: 3 - Partial AUC Score: 0.16169
fold: 4 - Partial AUC Score: 0.18147
fold: 5 - Partial AUC Score: 0.16447
fold: 6 - Partial AUC Score: 0.14067
LGBM Score: 0.15904
LGBM Score std: 0.01544


  df_train = pd.read_csv("../data/train-metadata.csv")


fold: 0 - Partial AUC Score: 0.15620
fold: 1 - Partial AUC Score: 0.15397
fold: 2 - Partial AUC Score: 0.15595
fold: 3 - Partial AUC Score: 0.16117
fold: 4 - Partial AUC Score: 0.17933
fold: 5 - Partial AUC Score: 0.15527
fold: 6 - Partial AUC Score: 0.17564
fold: 7 - Partial AUC Score: 0.14949
LGBM Score: 0.16088
LGBM Score std: 0.01008


  df_train = pd.read_csv("../data/train-metadata.csv")


fold: 0 - Partial AUC Score: 0.18225
fold: 1 - Partial AUC Score: 0.17077
fold: 2 - Partial AUC Score: 0.16586
fold: 3 - Partial AUC Score: 0.16742
fold: 4 - Partial AUC Score: 0.17606
fold: 5 - Partial AUC Score: 0.14638
fold: 6 - Partial AUC Score: 0.13956
fold: 7 - Partial AUC Score: 0.17297
fold: 8 - Partial AUC Score: 0.14391
LGBM Score: 0.16280
LGBM Score std: 0.01460


  df_train = pd.read_csv("../data/train-metadata.csv")


fold: 0 - Partial AUC Score: 0.12951
fold: 1 - Partial AUC Score: 0.15516
fold: 2 - Partial AUC Score: 0.16483
fold: 3 - Partial AUC Score: 0.15944
fold: 4 - Partial AUC Score: 0.17367
fold: 5 - Partial AUC Score: 0.17705
fold: 6 - Partial AUC Score: 0.18102
fold: 7 - Partial AUC Score: 0.14212
fold: 8 - Partial AUC Score: 0.14792
fold: 9 - Partial AUC Score: 0.16676
LGBM Score: 0.15975
LGBM Score std: 0.01551


In [18]:
run.finish()

VBox(children=(Label(value='0.026 MB of 0.026 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
pAUC,▁▅▅▆▆▅▇█▆
pAUC std,▁▁▄▃█▇▄▆▇

0,1
pAUC,0.15975
pAUC std,0.01551
