In [1]:
import os
import sys; sys.path.insert(0, os.path.abspath("../"))
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from dataset.dataset import SkinLesion_Dataset, SegExamples
from pipeline.preprocessing import SkinLesionPreprocessing
from pipeline.feature_extraction import FeaturesExtraction
import time

### Full pipeline example for binary challenge

In [None]:
# Init object instances
# Dataset wrapper configured for the binary task; the timing cell below reads
# images from it via data[i]['img'] and sizes the sampling range with data.md_df.
data = SkinLesion_Dataset(class_task='binary')
# Preprocessor; the timing cell below calls preprocess() and get_seg_mask() on it.
preproc = SkinLesionPreprocessing()
# Feature extractor created with the 'global' and 'local' options; it provides
# extract_features() and the features_names column list used below.
cfe = FeaturesExtraction(['global','local']) 

In [None]:
# Inference time test
# Measures the wall time of the whole binary pipeline on 100 images: model
# loading, preprocessing, segmentation, feature extraction and prediction.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()

# Read model pickle
# NOTE(security): unpickling can execute arbitrary code -- only load a model
# file obtained from the trusted link below.
clf = pd.read_pickle('../data/binary/bin_model.pkl') # trained model available at 
# https://drive.google.com/file/d/1S5VT32gV5G53bgeyts9CxgZEzlFzh1yc/view?usp=share_link

# Select 100 random images
# Indices are drawn with replacement and without a fixed seed, so an image can
# be picked more than once and the selection differs between runs.
rand_images = np.random.randint(0, len(data.md_df), 100)

# Extract features: preprocess -> segmentation mask -> feature vector
feats = []
for i in tqdm(rand_images, total=len(rand_images)):
    image = data[i]['img']
    image_p = preproc.preprocess(image)
    mask = preproc.get_seg_mask(image_p)
    feats.append(cfe.extract_features(image_p, mask))

df = pd.DataFrame(feats, columns=cfe.features_names)
test_shape = df.shape

# Prepare data: every column is a feature, so the whole frame is converted
X_test = df.to_numpy(dtype=np.float32)

# Predict
y_pred = clf.predict(X_test)


print("--- %s seconds ---" % (time.perf_counter() - start_time))

### Full pipeline example for three-class challenge

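Since we need specific trained models that will be ensembled, and we had issues writing the trained pipelines (pickling) and reading them back, it is necessary to run the training of some classifiers once again. The code to do so is in the single cell below. It is only necessary to download the three pre-trained models and their scalers from https://drive.google.com/drive/folders/1cr_KMZTzZeTXF2hq834ZXEu4X6w9FNqo?usp=share_link and place them in `../data/three_class/models/`.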

In [None]:
from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
import pickle
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, balanced_accuracy_score, roc_auc_score, cohen_kappa_score, make_scorer
from sklearn.utils import resample
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, SVMSMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as impipe

from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.utils import class_weight
from joblib import dump, load

from scipy import stats

# Read pre-trained models and scalers from https://drive.google.com/drive/folders/1cr_KMZTzZeTXF2hq834ZXEu4X6w9FNqo?usp=share_link
# NOTE(security): pd.read_pickle unpickles the files, and unpickling can execute
# arbitrary code -- only load files obtained from the trusted location above.
# Each classifier is paired with its own saved scaler inside a Pipeline, so the
# scaler step runs before the classifier step whenever the pipeline is used.
# The loaded objects are presumably already fitted -- TODO confirm.

# Model 1 (file name suggests an SVC)
clf1 = pd.read_pickle('../data/three_class/models/clf1_svc.pkl') 
clf1_scaler = pd.read_pickle('../data/three_class/models/clf1_scaler.pkl')
pipe_clf1_svc = Pipeline([('scaler', clf1_scaler), ('classifier', clf1)])

# Model 2 (file name suggests an SVC). NOTE(review): pipe_clf2_svc is built here
# but is not included in the clf_dict used by the final inference cell.
clf2 = pd.read_pickle('../data/three_class/models/clf2_svc.pkl')
clf2_scaler = pd.read_pickle('../data/three_class/models/clf2_scaler.pkl') 
pipe_clf2_svc = Pipeline([('scaler', clf2_scaler), ('classifier', clf2)])

# Model 3 (file name suggests an XGBoost classifier)
clf3 = pd.read_pickle('../data/three_class/models/clf3_xgb.pkl')
clf3_scaler = pd.read_pickle('../data/three_class/models/clf3_scaler.pkl')
pipe_clf3_xgb = Pipeline([('scaler', clf3_scaler), ('classifier', clf3)])

# Other models running on the go
#
# Three tree-based pipelines (clf4, clf5, clf6) are re-trained here from the
# stored training features. They share the same feature table and differ only
# in how the classes are re-balanced and in the classifier used. The training
# table is read from disk once and never modified in place, so each model
# starts from the same untouched data. The validation tables are not needed for
# training and are therefore not loaded.

# Integer codes assigned to the string class labels.
LABEL_CODES = {'bcc': 0, 'mel': 1, 'scc': 2}


def _load_three_class_features(split: str) -> pd.DataFrame:
    """Read and assemble the stored three-class feature table for ``split``.

    The first 60 columns of ``<split>_color_local_feat.f`` are placed in front
    of all columns of ``<split>_all_feat.f``. Columns whose name contains
    'lbp' are then moved to the front; the relative order inside both groups
    is kept.

    Args:
        split: Prefix of the feather files to read, e.g. ``'train'``.

    Returns:
        The combined, column-reordered feature table (labels still strings).
    """
    all_feat = pd.read_feather(f"../data/three_class/{split}_all_feat.f")
    color_local_feat = pd.read_feather(f"../data/three_class/{split}_color_local_feat.f")
    frame = pd.concat([color_local_feat.iloc[:, :60], all_feat], axis=1)

    # move lbp features at the beginning
    lbp_cols = [col for col in frame.columns if 'lbp' in col]
    other_cols = [col for col in frame.columns if 'lbp' not in col]
    return frame[lbp_cols + other_cols]


def _shuffled_xy(frame: pd.DataFrame):
    """Encode labels, shuffle the rows and split the table into ``(X, y)``.

    String labels are replaced by their ``LABEL_CODES`` value, rows are
    shuffled with a fixed seed (42), every column but the last becomes the
    float32 feature matrix and the last column becomes the target vector.
    ``frame`` itself is left unchanged.

    Args:
        frame: Feature table whose last column holds the class label.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays.
    """
    shuffled = frame.replace(LABEL_CODES).sample(frac=1, random_state=42)
    X = shuffled.iloc[:, :-1].to_numpy(dtype=np.float32)
    y = shuffled.iloc[:, -1].to_numpy()
    return X, y


def _fit_scaled(classifier, X, y):
    """Fit and return a ``StandardScaler`` -> ``classifier`` pipeline."""
    pipeline = Pipeline([('scaler', StandardScaler()), ('classifier', classifier)])
    pipeline.fit(X, y)
    return pipeline


# data preparation (shared by the three models below)
train_features = _load_three_class_features('train')
X_train, y_train = _shuffled_xy(train_features)

# clf4: random forest trained on the training set as it is stored
pipe_clf4_rf = _fit_scaled(
    RandomForestClassifier(class_weight='balanced', max_depth=100, n_estimators=300, random_state=42),
    X_train, y_train,
)

# clf5: random forest trained after re-balancing the same arrays with SMOTE
# (class 2 over-sampled to 500) followed by random under-sampling
# (class 1 -> 1200, class 0 -> 1000). Fitting clf4 does not modify X_train /
# y_train (StandardScaler copies its input by default), so they are reused.
over = SMOTE(sampling_strategy={2: 500}, random_state=42)
under = RandomUnderSampler(sampling_strategy={1: 1200, 0: 1000}, random_state=123)
smote_pipe = impipe(steps=[('o', over), ('u', under)])
X_smote, y_smote = smote_pipe.fit_resample(X_train, y_train)

pipe_clf5_rf = _fit_scaled(
    RandomForestClassifier(class_weight='balanced', max_depth=50, n_estimators=300, random_state=42),
    X_smote, y_smote,
)

# clf6: extra-trees trained on a table where 'mel' is sub-sampled (without
# replacement) and 'scc' is over-sampled (with replacement) to 1993 rows each;
# 'bcc' rows are kept as they are.
# NOTE(review): 1993 is presumably the number of 'bcc' rows -- confirm.
train_bcc = train_features.loc[train_features.label == 'bcc']
train_mel = train_features.loc[train_features.label == 'mel']
train_scc = train_features.loc[train_features.label == 'scc']

train_mel_resamp = resample(train_mel, replace=False, n_samples=1993, random_state=123)
train_scc_resamp = resample(train_scc, replace=True, n_samples=1993, random_state=123)
train_resampled = pd.concat([train_bcc, train_mel_resamp, train_scc_resamp])

X_resampled, y_resampled = _shuffled_xy(train_resampled)

pipe_clf6_ert = _fit_scaled(
    ExtraTreesClassifier(class_weight='balanced', max_depth=50, n_estimators=200, random_state=42),
    X_resampled, y_resampled,
)



In [None]:
# Init object instances
# Dataset wrapper configured for the three-class task; the timing cell below
# reads images from it via data[i]['img'] and sizes the sampling range with
# data.md_df.
data = SkinLesion_Dataset(class_task='three_class')
# Preprocessor; the timing cell below calls preprocess() and get_seg_mask() on it.
preproc = SkinLesionPreprocessing()
# Feature extractor created with the 'global' and 'local' options; it provides
# extract_features() and the features_names column list used below.
cfe = FeaturesExtraction(['global','local'])

# Ensemble prediction function
def prediction_ensemble(classifiers: dict, X: np.ndarray, weights = (1, 1, 4, 1, 1)):
    """Predict class indices by weighted soft voting over several classifiers.

    Every classifier's ``predict_proba(X)`` output is multiplied by its weight,
    the weighted probabilities are averaged over the classifiers and the class
    with the highest average is returned for each sample. The number of
    classes is taken from the probability matrices, so it is not limited to
    three.

    Args:
        classifiers: Mapping of name -> object exposing ``predict_proba``.
            Classifiers are used in the mapping's iteration order, which is
            also the order the weights refer to.
        X: Feature matrix with one row per sample.
        weights: One weight per classifier (same order as ``classifiers``), or
            a single value applied to all of them. The default holds five
            weights and therefore expects five classifiers.

    Returns:
        1-D integer array with the index of the winning probability column for
        each row of ``X``.

    Raises:
        ValueError: If ``classifiers`` is empty, if the number of weights
            matches neither 1 nor the number of classifiers, or if the
            classifiers return probability matrices of different shapes.
    """
    if not classifiers:
        raise ValueError("classifiers must contain at least one model")

    weights = np.atleast_1d(weights)
    n_models = len(classifiers)
    if weights.ndim != 1 or weights.shape[0] not in (1, n_models):
        raise ValueError(
            "expected 1 or %d weights (one per classifier), got shape %s"
            % (n_models, weights.shape)
        )

    # Stack the per-model probabilities on a trailing axis:
    # (n_samples, n_classes, n_models). float64 keeps the arithmetic identical
    # regardless of the dtype each model returns.
    probabilities = [
        np.asarray(clf.predict_proba(X), dtype=np.float64)
        for clf in classifiers.values()
    ]
    results_soft = np.stack(probabilities, axis=2)

    # The weights broadcast along the trailing (model) axis.
    results_w = results_soft * weights

    y_pred_soft_w = np.argmax(np.mean(results_w, axis=2), axis=1)

    return y_pred_soft_w

In [None]:
# Inference time test
# Measures the wall time of the three-class pipeline on 100 images:
# preprocessing, segmentation, feature extraction and ensemble prediction
# (the models themselves are already in memory from the cells above).
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()

# Select 100 random images
# Indices are drawn with replacement and without a fixed seed, so an image can
# be picked more than once and the selection differs between runs.
rand_images = np.random.randint(0, len(data.md_df), 100)

# Extract features: preprocess -> segmentation mask -> feature vector
feats = []
for i in tqdm(rand_images, total=len(rand_images)):
    image = data[i]['img']
    image_p = preproc.preprocess(image)
    mask = preproc.get_seg_mask(image_p)
    feats.append(cfe.extract_features(image_p, mask))

# Prepare data
df = pd.DataFrame(feats, columns=cfe.features_names)
test_shape = df.shape
# Move the 'lbp' columns to the front, the same reordering applied to the
# training table before the on-the-fly models were fitted.
lbp_cols = [col for col in df.columns if 'lbp' in col]
other_cols = [col for col in df.columns if 'lbp' not in col]
df = df[lbp_cols + other_cols]
# Every column is a feature, so the whole frame is converted.
X_test = df.to_numpy(dtype=np.float32)

# Predict
# Five pipelines, in the order the default ensemble weights refer to.
clf_dict = {'clf1_svc': pipe_clf1_svc, 'clf3_xgb': pipe_clf3_xgb, 'clf4_rf': pipe_clf4_rf, 'clf5_rf': pipe_clf5_rf, 'clf6_ert': pipe_clf6_ert}
y_pred = prediction_ensemble(clf_dict, X_test)


print("--- %s seconds ---" % (time.perf_counter() - start_time))