In [1]:
import numpy as np 
import pandas as pd
from tqdm import tqdm
from PIL import Image
import random
import os
import warnings
warnings.filterwarnings('ignore')
import lightgbm as lgb
from sklearn.model_selection import GroupKFold, StratifiedKFold, KFold
from sklearn import metrics
from sklearn import preprocessing
from bayes_opt import BayesianOptimization
from catboost import CatBoostClassifier

In [2]:
# seed every random number generator this notebook relies on
def seed_everything(seed):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``.

    The seed is also written, as a string, to the ``PYTHONHASHSEED`` entry of
    ``os.environ``.

    NOTE(review): the interpreter reads PYTHONHASHSEED at start-up, so the
    environment entry presumably only matters for processes launched later -
    confirm whether that is the intent.
    """
    # same order as before: stdlib generator first, then NumPy
    for reseed in (random.seed, np.random.seed):
        reseed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

In [3]:
# seed the random number generators before anything else runs
seed_everything(42)

# read the train, test and sample-submission CSV files from the local ISIC folder
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in ('./ISIC/train.csv', './ISIC/test.csv', './ISIC/sample_submission.csv')
)

train.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0


In [4]:
# Turn the two categorical metadata columns into integer codes.
for col in ['sex', 'anatom_site_general_challenge']:
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: that chained in-place form is deprecated in recent
    # pandas and leaves the frame untouched under Copy-on-Write (the warning is
    # hidden here because warnings are globally ignored).
    train[col] = train[col].fillna('unknown')
    test[col] = test[col].fillna('unknown')

    encoder = preprocessing.LabelEncoder()
    # Fit on train and test together so a category that only appears in test
    # cannot make transform() raise. LabelEncoder sorts its classes, so the
    # codes are the same as a train-only fit whenever test adds no new category.
    encoder.fit(pd.concat([train[col], test[col]], ignore_index = True))
    train[col] = encoder.transform(train[col])
    test[col] = encoder.transform(test[col])

# Missing ages are replaced by the mean age over train and test combined.
age_approx = np.nanmean(np.concatenate([np.array(train['age_approx']), np.array(test['age_approx'])]))
train['age_approx'] = train['age_approx'].fillna(age_approx)
test['age_approx'] = test['age_approx'].fillna(age_approx)
# patient_id is later used as the grouping key of the cross-validation, so it
# must not contain NaN; rows without an id all share the 'unknown' group.
train['patient_id'] = train['patient_id'].fillna('unknown')

train.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,ISIC_2637011,IP_7279968,1,45.0,0,unknown,benign,0
1,ISIC_0015719,IP_3075186,0,45.0,6,unknown,benign,0
2,ISIC_0052212,IP_2842074,0,50.0,1,nevus,benign,0
3,ISIC_0068279,IP_6890425,0,45.0,0,unknown,benign,0
4,ISIC_0074268,IP_8723313,0,55.0,6,unknown,benign,0


In [5]:
# search space handed to the bayesian optimiser:
# one (lower, upper) pair per tuned CatBoost hyperparameter
bounds_cat = dict(
    learning_rate = (0.1, 0.5),
    depth = (2, 12),
    bagging_temperature = (0.0, 2.0),
    colsample_bylevel = (0.5, 1.0),
)

In [6]:
def train_and_evaluate_cat(train, test, cat_params, verbose_eval, folds = 5, return_test_pred = False):
    """Cross-validate a CatBoost classifier with patient-grouped folds.

    One model per fold is fitted on whichever of the columns 'sex',
    'age_approx' and 'anatom_site_general_challenge' exist in ``train``. The
    held-out part of each fold is passed to ``fit`` as the eval set and is
    also used to fill the out-of-fold predictions.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the feature columns plus 'target' and 'patient_id'.
    test : pandas.DataFrame
        Must contain the feature columns; only read when
        ``return_test_pred`` is true.
    cat_params : dict
        Keyword arguments forwarded to ``CatBoostClassifier``.
    verbose_eval : bool or int
        Forwarded to ``fit``. Any value that does not compare equal to
        ``False`` also switches on the progress prints of this function.
    folds : int, default 5
        Number of GroupKFold splits.
    return_test_pred : bool, default False
        When true, also average the per-fold predictions on ``test`` and
        return them together with the score.

    Returns
    -------
    float, or (float, numpy.ndarray) when ``return_test_pred`` is true
        ROC AUC of the out-of-fold predictions and, optionally, the
        fold-averaged positive-class probabilities for ``test``.
    """
    features = [col for col in train.columns if col in ['sex', 'age_approx', 'anatom_site_general_challenge']]
    verbose = verbose_eval != False
    if verbose:
        print('Training with features: ', features)

    # `folds` used to be ignored in favour of a hard-coded 5
    kf = GroupKFold(n_splits = folds)
    x_all = train[features]
    y_all = train['target']
    oof_pred = np.zeros(len(train))
    if return_test_pred:
        x_test = test[features]
        y_pred = np.zeros(len(test))

    for fold, (tr_ind, val_ind) in enumerate(kf.split(train, groups = train['patient_id'])):

        if verbose:
            print('\n')
            print('-'*50)
            print(f'Training fold {fold + 1}')

        # split() yields positions, so select rows with .iloc; label-based
        # lookup only works while the frame still has its default index
        x_train, x_val = x_all.iloc[tr_ind], x_all.iloc[val_ind]
        y_train, y_val = y_all.iloc[tr_ind], y_all.iloc[val_ind]
        model = CatBoostClassifier(**cat_params)
        model.fit(x_train,
                  y_train,
                  eval_set = (x_val, y_val),
                  cat_features = ['sex', 'anatom_site_general_challenge'],
                  use_best_model = True,
                  early_stopping_rounds = 50,
                  verbose_eval = verbose_eval)

        oof_pred[val_ind] = model.predict_proba(x_val)[:, 1]
        if return_test_pred:
            y_pred += model.predict_proba(x_test)[:, 1] / kf.n_splits

    # compute the score before reporting it (the print used to come first and
    # failed with UnboundLocalError on every verbose call)
    rauc = metrics.roc_auc_score(y_all, oof_pred)
    if verbose:
        print(f'Our oof roc auc score for our cat boost model is {rauc}')

    if return_test_pred:
        return rauc, y_pred
    return rauc

In [7]:
# objective function for the bayesian optimization search
def run_cat_bayesian(learning_rate, depth, bagging_temperature, colsample_bylevel):
    """Score one hyperparameter candidate proposed by the optimiser.

    The parameter names match the keys of ``bounds_cat``. The candidate is
    evaluated with ``train_and_evaluate_cat`` on the notebook-level ``train``
    and ``test`` frames, with all training output silenced.

    Parameters
    ----------
    learning_rate, bagging_temperature, colsample_bylevel : float
        Passed to CatBoost unchanged.
    depth : float
        Truncated with ``int`` before use.

    Returns
    -------
    float
        Out-of-fold ROC AUC of the candidate (the value being maximised).
    """
    params = {
        'learning_rate': learning_rate,
        'eval_metric': 'AUC',
        'loss_function': 'Logloss',
        'random_seed': 42,
        'task_type': 'CPU',
        'depth': int(depth),
        'bagging_temperature': bagging_temperature,
        'colsample_bylevel': colsample_bylevel,
    }
    # train_and_evaluate_cat returns the score only (a single float, not a
    # tuple), which is exactly what the optimiser needs; reuse it rather than
    # repeating the cross-validation loop here.
    return train_and_evaluate_cat(train, test, params, False)

In [8]:
# maximise the out-of-fold ROC AUC over the bounds_cat search space:
# 20 random starting points followed by 20 guided iterations
cat_bo = BayesianOptimization(run_cat_bayesian, bounds_cat, random_state = 42)
# Gaussian-process settings go through set_gp_params; newer bayes_opt releases
# deprecate or reject GP / acquisition keyword arguments passed to maximize().
# NOTE(review): upper confidence bound is the library's default acquisition and
# xi is not used by it, so dropping acq='ucb' / xi=0.0 should leave the search
# unchanged - confirm against the installed bayes_opt version.
cat_bo.set_gp_params(alpha = 1e-6)
cat_bo.maximize(init_points = 20, n_iter = 20)

|   iter    |  target   | baggin... | colsam... |   depth   | learni... |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.6797  [0m | [0m 0.7491  [0m | [0m 0.9754  [0m | [0m 9.32    [0m | [0m 0.3395  [0m |
| [0m 2       [0m | [0m 0.6742  [0m | [0m 0.312   [0m | [0m 0.578   [0m | [0m 2.581   [0m | [0m 0.4465  [0m |
| [95m 3       [0m | [95m 0.6816  [0m | [95m 1.202   [0m | [95m 0.854   [0m | [95m 2.206   [0m | [95m 0.488   [0m |
| [0m 4       [0m | [0m 0.6787  [0m | [0m 1.665   [0m | [0m 0.6062  [0m | [0m 3.818   [0m | [0m 0.1734  [0m |
| [0m 5       [0m | [0m 0.6384  [0m | [0m 0.6085  [0m | [0m 0.7624  [0m | [0m 6.319   [0m | [0m 0.2165  [0m |
| [0m 6       [0m | [0m 0.6772  [0m | [0m 1.224   [0m | [0m 0.5697  [0m | [0m 4.921   [0m | [0m 0.2465  [0m |
| [0m 7       [0m | [0m 0.6776  [0m | [0m 0.9121  [0m | [0m 0.8926  [0m | [0m 3.997   [0m | [0m 0.3057 

In [9]:
# rebuild the CatBoost configuration from the best point found by the optimiser
best = cat_bo.max['params']
params = {
    'learning_rate': best['learning_rate'],
    'eval_metric': 'AUC',
    'loss_function': 'Logloss',
    'random_seed': 42,
    'task_type': 'CPU',
    # depth is searched as a float; truncate it the same way the objective does
    'depth': int(best['depth']),
    'bagging_temperature': best['bagging_temperature'],
    'colsample_bylevel': best['colsample_bylevel']
}

# train with the new hyperparameters: one model per patient-grouped fold,
# collecting out-of-fold predictions and the fold-averaged test predictions
features = [col for col in train.columns if col in ['sex', 'age_approx', 'anatom_site_general_challenge']]
kf = GroupKFold(n_splits = 5)
target = 'target'
# slice the frames once instead of on every fold
x_all, y_all, x_test = train[features], train[target], test[features]
oof_pred = np.zeros(len(train))
y_pred = np.zeros(len(test))
for fold, (tr_ind, val_ind) in enumerate(kf.split(train, groups = train['patient_id'])):

    # split() yields positions, so select rows with .iloc; label-based lookup
    # only works while the frame still has its default index
    x_train, x_val = x_all.iloc[tr_ind], x_all.iloc[val_ind]
    y_train, y_val = y_all.iloc[tr_ind], y_all.iloc[val_ind]

    model = CatBoostClassifier(**params)
    model.fit(x_train,
              y_train,
              eval_set = (x_val, y_val),
              cat_features = ['sex', 'anatom_site_general_challenge'],
              use_best_model = True,
              early_stopping_rounds = 50,
              verbose_eval = False)

    oof_pred[val_ind] = model.predict_proba(x_val)[:, 1]
    y_pred += model.predict_proba(x_test)[:, 1] / kf.n_splits

rauc = metrics.roc_auc_score(y_all, oof_pred)
# show the out-of-fold score as the cell output (it used to be computed and never displayed)
rauc

In [11]:
# store the test predictions in the 'target' column
test = test.assign(target = y_pred)
test.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,target
0,ISIC_0052060,IP_3579794,1,70.0,5,0.046189
1,ISIC_0052349,IP_7782715,1,40.0,1,0.008976
2,ISIC_0058510,IP_7960270,0,55.0,4,0.015463
3,ISIC_0073313,IP_6375035,0,50.0,4,0.011688
4,ISIC_0073502,IP_0589375,0,45.0,1,0.010681


In [13]:
# keep only the image id and the predicted target, then write the submission file
sub = test.loc[:, ['image_name', 'target']]
sub.to_csv('./submission.csv', index = False)