# Skin Cancer Classification

This notebook describes the classification of skin cancer based on a dataset 

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.impute import KNNImputer

import xgboost as xgb

import optuna

%config InlineBackend.figure_format = 'svg'

  from .autonotebook import tqdm as notebook_tqdm


# Exploratory Data Analysis

In [2]:
data = pd.read_csv("train-metadata.csv")

  data = pd.read_csv("train-metadata.csv")


In [3]:
data

Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,lesion_id,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,,Benign,Benign,,,,,,,97.517282
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.10,TBP tile: close-up,3D: white,31.712570,...,IL_6727506,Benign,Benign,,,,,,,3.141455
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.40,TBP tile: close-up,3D: XP,22.575830,...,,Benign,Benign,,,,,,,99.804040
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,,Benign,Benign,,,,,,,99.989998
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.725520,...,,Benign,Benign,,,,,,,70.442510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401054,ISIC_9999937,0,IP_1140263,70.0,male,anterior torso,6.80,TBP tile: close-up,3D: XP,22.574335,...,IL_9520694,Benign,Benign,,,,,,,99.999988
401055,ISIC_9999951,0,IP_5678181,60.0,male,posterior torso,3.11,TBP tile: close-up,3D: white,19.977640,...,,Benign,Benign,,,,,,,99.999820
401056,ISIC_9999960,0,IP_0076153,65.0,female,anterior torso,2.05,TBP tile: close-up,3D: XP,17.332567,...,IL_9852274,Benign,Benign,,,,,,,99.999416
401057,ISIC_9999964,0,IP_5231513,30.0,female,anterior torso,2.80,TBP tile: close-up,3D: XP,22.288570,...,,Benign,Benign,,,,,,,100.000000


In [28]:
""" Preprocess dataset """

def labelEncode(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for col in df.columns:
        ser = df[col]
        if ser.dtype == object:
            replacements = {val: i for i, val in enumerate(ser.unique())}
            df[col].replace(replacements.keys(), replacements.values(), inplace=True)
    return df

#TODO: reimplement one-hot encoding & compare with label encoding

test_cols = pd.read_csv("test-metadata.csv").columns.tolist()

filtered = data[['target'] + test_cols]
filtered = filtered.drop(columns=['isic_id', 'patient_id', 'image_type', 'attribution', 'copyright_license'], inplace=False)

newcols = {
    'sex': 'sex',
    'anatom_site_general': 'body_site',
    'tbp_tile_type': 'tiletype',
    'tbp_lv_location': 'tbp_loc',
    'tbp_lv_location_simple': 'tbp_loc_simple'
}

filtered.columns = [newcols[col] if filtered[col].dtype == object else col for col in filtered.columns]

# label encode categorical variables
encoded = labelEncode(filtered)

# impute nan values with knn
nanIdx = encoded.drop(index=encoded.dropna().index, inplace=False).index
knn = KNNImputer()
imputed = knn.fit_transform(encoded)

# min-max scale
scaler = MinMaxScaler()
scaled = pd.DataFrame(data=scaler.fit_transform(imputed), columns=encoded.columns)
scaled

Unnamed: 0,target,age_approx,sex,body_site,clin_size_long_diam_mm,tiletype,tbp_lv_A,tbp_lv_Aext,tbp_lv_B,tbp_lv_Bext,...,tbp_lv_norm_color,tbp_lv_perimeterMM,tbp_lv_radial_color_std_max,tbp_lv_stdL,tbp_lv_stdLExt,tbp_lv_symm_2axis,tbp_lv_symm_2axis_angle,tbp_lv_x,tbp_lv_y,tbp_lv_z
0,0.0,0.6875,0.0,0.0,0.074453,0.0,0.448560,0.549700,0.502444,0.376069,...,0.000000,0.067335,0.000000,0.102225,0.080388,0.582086,0.485714,0.356776,0.566559,0.408087
1,0.0,0.6875,0.0,0.2,0.003650,0.0,0.674860,0.747149,0.491697,0.391260,...,0.000000,0.007756,0.000000,0.033828,0.131598,0.252621,0.314286,0.504132,0.893847,0.571023
2,0.0,0.6875,0.0,0.4,0.087591,1.0,0.494565,0.568489,0.703178,0.619598,...,0.000000,0.063125,0.000000,0.085311,0.052795,0.334987,0.600000,0.603966,0.858581,0.858501
3,0.0,0.7500,0.0,0.6,0.081022,1.0,0.330121,0.460827,0.402979,0.303669,...,0.177170,0.069412,0.057844,0.057262,0.037652,0.170317,0.742857,0.390405,0.848437,0.572962
4,0.0,0.6250,0.0,0.6,0.063139,0.0,0.536985,0.632028,0.494130,0.420931,...,0.000000,0.038916,0.000000,0.105071,0.074071,0.282586,0.114286,0.445845,0.864265,0.512548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401054,0.0,0.8125,0.0,0.6,0.211679,1.0,0.494536,0.521126,0.515904,0.447931,...,0.734813,0.176467,0.202597,0.392395,0.061576,0.256087,0.571429,0.622958,0.760043,0.678136
401055,0.0,0.6875,0.0,0.4,0.077007,0.0,0.443295,0.544600,0.633924,0.556393,...,0.279525,0.067668,0.091213,0.093165,0.091352,0.441034,0.142857,0.546435,0.831770,0.685676
401056,0.0,0.7500,0.5,0.6,0.038321,1.0,0.391100,0.465157,0.555550,0.441107,...,0.166041,0.034235,0.052872,0.082950,0.063017,0.141688,0.228571,0.548269,0.720228,0.302728
401057,0.0,0.3125,0.5,0.6,0.065693,1.0,0.488897,0.404430,0.529857,0.454194,...,0.358397,0.065396,0.093829,0.197278,0.053052,0.118717,0.800000,0.496238,0.656369,0.351561


In [24]:
filtered

Unnamed: 0,target,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,tbp_tile_type,tbp_lv_A,tbp_lv_Aext,tbp_lv_B,tbp_lv_Bext,...,tbp_lv_norm_color,tbp_lv_perimeterMM,tbp_lv_radial_color_std_max,tbp_lv_stdL,tbp_lv_stdLExt,tbp_lv_symm_2axis,tbp_lv_symm_2axis_angle,tbp_lv_x,tbp_lv_y,tbp_lv_z
0,0,60.0,male,lower extremity,3.04,3D: white,20.244422,16.261975,26.922447,23.954773,...,0.000000,9.307003,0.000000,2.036195,2.637780,0.590476,85,-182.703552,613.493652,-42.427948
1,0,60.0,male,head/neck,1.10,3D: white,31.712570,25.364740,26.331000,24.549290,...,0.000000,3.354148,0.000000,0.853227,3.912844,0.285714,55,-0.078308,1575.687000,57.174500
2,0,60.0,male,posterior torso,3.40,3D: XP,22.575830,17.128170,37.970460,33.485410,...,0.000000,8.886309,0.000000,1.743651,1.950777,0.361905,105,123.649700,1472.010000,232.908900
3,0,65.0,male,anterior torso,3.22,3D: XP,14.242329,12.164757,21.448144,21.121356,...,1.771705,9.514499,0.664690,1.258541,1.573733,0.209581,130,-141.024780,1442.185791,58.359802
4,0,55.0,male,anterior torso,2.73,3D: white,24.725520,20.057470,26.464900,25.710460,...,0.000000,6.467562,0.000000,2.085409,2.480509,0.313433,20,-72.315640,1488.720000,21.428960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401054,0,70.0,male,anterior torso,6.80,3D: XP,22.574335,14.944666,27.663259,26.767135,...,7.348126,20.210836,2.328066,7.054819,2.169398,0.288920,100,147.187256,1182.317505,122.652588
401055,0,60.0,male,posterior torso,3.11,3D: white,19.977640,16.026870,34.158840,31.011870,...,2.795246,9.340242,1.048147,1.879502,2.910780,0.460000,25,52.349740,1393.187000,127.261700
401056,0,65.0,female,anterior torso,2.05,3D: XP,17.332567,12.364397,29.845326,26.500073,...,1.660411,5.999862,0.607554,1.702824,2.205272,0.183099,40,54.622246,1065.263672,-106.833740
401057,0,30.0,female,anterior torso,2.80,3D: XP,22.288570,9.564721,28.431200,27.012250,...,3.583966,9.113276,1.078204,3.680175,1.957157,0.161850,140,-9.861557,877.527000,-76.982120


['target',
 'age_approx',
 'sex',
 'body_site',
 'clin_size_long_diam_mm',
 'tiletype',
 'tbp_lv_A',
 'tbp_lv_Aext',
 'tbp_lv_B',
 'tbp_lv_Bext',
 'tbp_lv_C',
 'tbp_lv_Cext',
 'tbp_lv_H',
 'tbp_lv_Hext',
 'tbp_lv_L',
 'tbp_lv_Lext',
 'tbp_lv_areaMM2',
 'tbp_lv_area_perim_ratio',
 'tbp_lv_color_std_mean',
 'tbp_lv_deltaA',
 'tbp_lv_deltaB',
 'tbp_lv_deltaL',
 'tbp_lv_deltaLB',
 'tbp_lv_deltaLBnorm',
 'tbp_lv_eccentricity',
 'tbp_loc',
 'tbp_loc_simple',
 'tbp_lv_minorAxisMM',
 'tbp_lv_nevi_confidence',
 'tbp_lv_norm_border',
 'tbp_lv_norm_color',
 'tbp_lv_perimeterMM',
 'tbp_lv_radial_color_std_max',
 'tbp_lv_stdL',
 'tbp_lv_stdLExt',
 'tbp_lv_symm_2axis',
 'tbp_lv_symm_2axis_angle',
 'tbp_lv_x',
 'tbp_lv_y',
 'tbp_lv_z']

In [19]:
""" Isolate features & labels """

features = scaled.drop(columns='target', inplace=False)
labels = scaled['target']

In [15]:
""" Optimize for hyperparameters with optuna """

def objective(trial: optuna.trial.BaseTrial):
    params = {
        'objective': 'binary:logistic',
        'device': 'cuda',
        'eval_metric': 'aucpr',
        'lambda': trial.suggest_float("lambda", 1e-10, 1, log=True),
        'alpha': trial.suggest_float('alpha', 1e-10, 1, log=True),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
        'scale_pos_weight': trial.suggest_int('scale_pos_weight', 500, 2000),
        'max_depth': trial.suggest_int('max_depth', 2, 5),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05)
    }
    
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
    balanced_accs = []
    
    for train, test in kf.split(X=features, y=labels):
        feat_train, feat_test = features.iloc[train], features.iloc[test]
        label_train, label_test = labels.iloc[train], labels.iloc[test]
        
        booster = xgb.train(
            params,
            xgb.DMatrix(feat_train, label=label_train),
            num_boost_round=trial.suggest_int('num_boost_round', 1500, 2000),
            maximize=True
        )
        
        label_pred = booster.predict(xgb.DMatrix(feat_test))
        (tn, fp), (fn, tp) = confusion_matrix(label_test, [0 if i < .5 else 1 for i in label_pred])
        sensitivity, specificity = tp / (tp + fn), tn / (tn + fp)
        balanced_accs.append((sensitivity + specificity) / 2)
    
    return sum(balanced_accs) / len(balanced_accs)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

[I 2024-12-22 12:10:03,768] A new study created in memory with name: no-name-c2221e41-5a33-465c-87de-4fd7f967e990


[W 2024-12-22 12:10:05,901] Trial 0 failed with parameters: {'lambda': 1.4205954621309722e-10, 'alpha': 0.629939912706138, 'subsample': 0.5819060924442414, 'colsample_bytree': 0.385239548355099, 'scale_pos_weight': 703, 'max_depth': 5, 'learning_rate': 0.03890034440433634, 'num_boost_round': 1888} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/aspandit/.local/lib/python3.10/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_775670/3648819230.py", line 24, in objective
    booster = xgb.train(
  File "/home/aspandit/.local/lib/python3.10/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/home/aspandit/.local/lib/python3.10/site-packages/xgboost/training.py", line 181, in train
    bst.update(dtrain, iteration=i, fobj=obj)
  File "/home/aspandit/.local/lib/python3.10/site-packages/xgboost/core.py", line 2101, in update
    _LI

KeyboardInterrupt: 

In [20]:
""" Train model with optimized hyperparameters """

params = {
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'device': 'cuda',
    'eval_metric': 'aucpr',
    'lambda': 9.03301684175667e-08,  # FILL IN WITH OPTUNA
    'alpha': 9.018595902076856e-06,  # FILL IN WITH OPTUNA
    'subsample': 0.630992385329696,  # FILL IN WITH OPTUNA
    'colsample_bytree': 0.966089992688949,  # FILL IN WITH OPTUNA
    'scale_pos_weight': 1225,  # FILL IN WITH OPTUNA
    'max_depth': 2,
    'learning_rate': 0.00605754412105179,  # FILL IN WITH OPTUNA
}
    
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
balanced_accs = []
    
for train, test in kf.split(X=features, y=labels):
    feat_train, feat_test = features.iloc[train], features.iloc[test]
    label_train, label_test = labels.iloc[train], labels.iloc[test]
        
    booster = xgb.train(
        params,
        xgb.DMatrix(feat_train, label=label_train),
        num_boost_round=1638,  # FILL IN WITH OPTUNA
        maximize=True
    )
        
    label_pred = booster.predict(xgb.DMatrix(feat_test))
    (tn, fp), (fn, tp) = confusion_matrix(label_test, [0 if i < .5 else 1 for i in label_pred])
    sensitivity, specificity = tp / (tp + fn), tn / (tn + fp)
    balanced_accs.append((sensitivity + specificity) / 2)
    
balanced_accs, sum(balanced_accs) / 5

([0.8979155427479485,
  0.8791274537726599,
  0.8557016049425425,
  0.8738528367475149,
  0.8694348530183953],
 0.8752064582458123)

In [18]:
sum(balanced_accs) / 5

0.8752064582458123