In [1]:
import os
import sys; sys.path.insert(0, os.path.abspath("../"))
import cv2
import numpy as np 
import matplotlib.pyplot as plt
# from utils.plots import get_image, get_image_and_plot
from tqdm import tqdm
import pandas as pd
from scipy.stats import skew, kurtosis
from skimage.measure import shannon_entropy
from pipeline.preprocessing import SkinLesionPreprocessing
from dataset.dataset import SkinLesion_Dataset
from pipeline.feature_extraction import FeaturesExtraction
import pyarrow

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, accuracy_score

In [2]:
data = SkinLesion_Dataset()
preproc = SkinLesionPreprocessing()
cfe = FeaturesExtraction(levels=['global'], color_params={})

In [3]:
def segment(img):

    r_norm = img[:,:,2]*(1/np.sqrt(np.sum(img.astype(np.float32)**2, axis=-1)))
    rnormg = (cv2.GaussianBlur(r_norm, ksize = (0,0), sigmaX=3, borderType = cv2.BORDER_DEFAULT)*255).astype(np.uint8)
    _,mask_r = cv2.threshold(rnormg, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU)

    # Filling holes
    contour_r,_ = cv2.findContours(mask_r,cv2.RETR_CCOMP,cv2.CHAIN_APPROX_SIMPLE)
    for cnt in contour_r:
        cv2.drawContours(mask_r,[cnt],0,255,-1)
    
    return mask_r

class TextureScaler:
    def __init__(self, no_scale_feat: int = 20):
        super(TextureScaler, self).__init__()
        self.no_scale_feat = no_scale_feat
    def fit(self, X):
        self.means = np.mean(X, axis=0)
        self.stds = np.std(X, axis=0)
    def transform(self, X):
        X_scaled = X.copy()
        # scale only non-lbp features (20:)
        X_scaled[:, self.no_scale_feat:] = np.asarray([(X[:,i] - self.means[i])/self.stds[i] \
                                            for i in range(self.no_scale_feat, X.shape[1])]).T
        
        return X_scaled


In [9]:
df_text = pd.read_feather('../data/binary/ds1000_texture_features.f').iloc[:,:-1]
df_color = pd.read_feather('../data/binary/ds1000_features.f')
df = pd.concat([df_text, df_color], axis=1)
df

Unnamed: 0,global_rad_1_lbp0,global_rad_1_lbp1,global_rad_1_lbp2,global_rad_1_lbp3,global_rad_1_lbp4,global_rad_1_lbp5,global_rad_1_lbp6,global_rad_1_lbp7,global_rad_1_lbp8,global_rad_1_lbp9,...,local_hsv_skew_1,local_hsv_skew_2,local_hsv_skew_3,local_hsv_kur_1,local_hsv_kur_2,local_hsv_kur_3,local_hsv_ent_1,local_hsv_ent_2,local_hsv_ent_3,label
0,0.077620,0.095471,0.060913,0.081342,0.091314,0.097138,0.089875,0.099987,0.121429,0.184911,...,0.517842,-0.215365,-0.078190,-1.729880,-0.749449,-1.219490,3.625263,6.603793,7.128266,nevus
1,0.030114,0.060068,0.032623,0.120833,0.159114,0.174400,0.089783,0.083645,0.138978,0.110443,...,0.374698,0.035333,-1.022156,-1.852144,0.830507,0.848577,4.190969,6.576946,6.977806,nevus
2,0.015907,0.055590,0.016333,0.124239,0.106435,0.209330,0.099168,0.095137,0.176871,0.100989,...,0.181642,-1.304465,0.148821,-0.488802,1.191417,-0.507097,2.908433,5.975254,5.578350,nevus
3,0.069879,0.090704,0.034849,0.065227,0.056720,0.080805,0.078677,0.112275,0.189339,0.221525,...,-0.220330,-0.745734,-0.526534,-1.260615,-0.288970,-1.010442,3.777607,6.517842,6.562462,nevus
4,0.028894,0.056170,0.033833,0.112186,0.116448,0.180569,0.101057,0.098457,0.151373,0.121013,...,5.467453,-0.427335,-0.066131,28.058980,-0.874860,-0.900924,3.154020,6.557076,5.479958,nevus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.052319,0.075735,0.041267,0.089738,0.091145,0.121390,0.091563,0.104836,0.154575,0.177433,...,5.553718,0.749792,-1.380261,30.247867,0.244036,0.755363,4.462614,6.587884,6.405933,others
996,0.036633,0.062980,0.045714,0.114827,0.145520,0.164558,0.102591,0.087607,0.105939,0.133631,...,2.740288,0.546554,-0.617885,5.523847,-0.029864,0.404142,3.251255,5.672496,4.422095,others
997,0.041522,0.074488,0.046688,0.114118,0.133883,0.152429,0.102246,0.085888,0.110532,0.138207,...,-0.522070,0.759234,-0.718166,1.891386,0.847696,0.216139,2.301236,6.912207,6.648158,others
998,0.097709,0.105852,0.047562,0.052171,0.045184,0.054820,0.070656,0.114214,0.176345,0.235488,...,4.218103,0.547971,-1.341055,16.101893,-0.220937,2.962121,4.258150,6.123771,4.505319,others


In [13]:
df.replace({'nevus': 1, 'others': 0}, inplace=True)

df_train = df.iloc[:800, :].sample(frac=1, random_state=42)
df_val = df.iloc[800:, :].sample(frac=1, random_state=42)
# df_val

X_train, y_train = df_train.iloc[:, :188].to_numpy(dtype=np.float32), df_train.iloc[:, 188].to_numpy()
X_test, y_test = df_val.iloc[:, :188].to_numpy(dtype= np.float32), df_val.iloc[:, 188].to_numpy()

print(f'X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}')

X_train: (800, 188), y_train: (800,), X_test: (200, 188), y_test: (200,)


In [14]:
classifier = RandomForestClassifier(random_state=42, n_jobs=-1)
param_grid = { 
    'classifier__n_estimators': [50, 100, 500],
    'classifier__max_depth': [50, 70, 100],
}

#Let's apply a stratified 10-fold cross validation
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

#Let's build a pipeline with only a scaler and a linear SVM
pipe = Pipeline([('scaler', StandardScaler()),('classifier', classifier)]) #SVC(kernel='linear'))])


In [15]:
grid_search = GridSearchCV(pipe, param_grid, cv=cv)
#Let's train and test the learning system
grid_search.fit(X_train, y_train)

#Here we just print the best accuracy and the corresponding values for the parameters
print("TRAIN- The best parameters are %s with an accuracy of %0.4f"%(grid_search.best_params_, grid_search.best_score_))

# Store the trained models
with open('../data/binary/RF_800_text_color.sav', 'wb') as f:
                    pickle.dump(grid_search, f)

TRAIN- The best parameters are {'classifier__max_depth': 50, 'classifier__n_estimators': 500} with an accuracy of 0.8100


In [16]:
y_train_predicted = grid_search.predict(X_train)
y_test_predicted =  grid_search.predict(X_test)
print('F1 Score of train data: %0.4f' %f1_score(y_train,y_train_predicted,average='macro'), f'Acc: {accuracy_score(y_train, y_train_predicted)}')
print('F1 Score of test data: %0.4f' %f1_score(y_test,y_test_predicted,average='macro'), f'Acc: {accuracy_score(y_test, y_test_predicted)}')

F1 Score of train data: 1.0000 Acc: 1.0
F1 Score of test data: 0.7249 Acc: 0.725


In [17]:
# CV
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(estimator= grid_search, X = X_train, y = y_train, scoring= "accuracy", cv=10, n_jobs=-1)
cv_scores

array([0.7125, 0.7625, 0.8875, 0.675 , 0.7875, 0.825 , 0.8375, 0.8   ,
       0.925 , 0.825 ])

In [19]:
# Features importances
feature_importances = grid_search.best_estimator_._final_estimator.feature_importances_
cfe = FeaturesExtraction(levels=['global','local'])
fimp_df = pd.DataFrame(data=[feature_importances], columns=cfe.features_names).transpose()
# fimp_df
fimp_df.sort_values(0,axis=0,inplace=True, ascending=False)
fimp_df.head(50)

Unnamed: 0,0
global_dist2_ang1_correlation,0.025745
global_dist1_ang3_homogeneity,0.023549
global_dist2_ang0_contrast,0.021263
global_dist1_ang2_energy,0.019449
local_lab_kur_1,0.017523
global_rgb_kur_2,0.017064
local_ycrbcb_kur_1,0.016556
global_rgb_ent_2,0.016471
global_lab_mean_1,0.015842
global_dist2_ang0_dissimilarity,0.015795


### Simple CV with different scaler

In [25]:
def kfcv_RF(X, y):
    scaler = TextureScaler()
    
    # Create  classifier object.
    clf = RandomForestClassifier(max_depth=50, n_estimators=500,random_state=42, n_jobs=-1)
    # lr = svm.SVC()

    # Create StratifiedKFold object.
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=10)
    lst_accu_stratified = []
    feat_impt = []

    for train_index, test_index in skf.split(X, y):
      scaler.fit(X[train_index])
      x_train_fold, x_test_fold = scaler.transform(X[train_index]), scaler.transform(X[test_index])
      y_train_fold, y_test_fold = y[train_index], y[test_index]
      clf.fit(x_train_fold, y_train_fold)
      feat_impt.append(clf.feature_importances_)
      lst_accu_stratified.append(clf.score(x_test_fold, y_test_fold))

    # Print the output.
    print('List of possible accuracy:', lst_accu_stratified)
    print('\nMaximum Accuracy That can be obtained from this model is:',
          max(lst_accu_stratified)*100, '%')
    print('\nMinimum Accuracy:',
          min(lst_accu_stratified)*100, '%')
    print('\nOverall Accuracy:',
          np.mean(lst_accu_stratified)*100, '%')
    print('\nStandard Deviation is:', np.std(lst_accu_stratified))

    return feat_impt, lst_accu_stratified

In [26]:
feat_impt, lst_acc_str = kfcv_RF(X_train, y_train)

List of possible accuracy: [0.7375, 0.8375, 0.7375, 0.825, 0.85, 0.8375, 0.7875, 0.825, 0.7375, 0.8625]

Maximum Accuracy That can be obtained from this model is: 86.25 %

Minimum Accuracy: 73.75 %

Overall Accuracy: 80.375 %

Standard Deviation is: 0.047120192062426895


In [23]:
def sort_relevances(feat_importances):
    """ Get the mean importance for each feature across the runs and then sort
    them in decreasing order.
    """
    feature_importances_array = np.asarray(feat_importances)
    mean_feature_importances_array = np.mean(feature_importances_array, axis=0)
    sorted_features = np.argsort(mean_feature_importances_array)[::-1]
    sorted_mean_feature_importance = mean_feature_importances_array[sorted_features]
    
    return sorted_features, sorted_mean_feature_importance

In [28]:
sorted_features, mean_importance = sort_relevances(feat_impt)
selected_feats_50 = df.columns.values[sorted_features[:50].astype(int)]
selected_feats_50

array(['local_hsv_ent_1', 'local_ycrbcb_mean_2', 'local_hsv_mean_2',
       'local_lab_mean_2', 'global_rad_3_lbp0', 'global_rad_3_lbp3',
       'global_lab_mean_2', 'local_lab_kur_1', 'local_hsv_skew_2',
       'global_rad_3_lbp4', 'global_ycrbcb_mean_2', 'local_ycrbcb_kur_1',
       'global_rad_3_lbp6', 'global_rad_3_lbp5', 'local_lab_mean_3',
       'local_rgb_mean_3', 'local_rgb_skew_3', 'global_hsv_ent_1',
       'local_rgb_kur_2', 'global_rad_3_lbp9', 'global_rgb_skew_3',
       'global_ycrbcb_kur_2', 'local_ycrbcb_skew_2', 'local_hsv_kur_3',
       'local_rgb_kur_1', 'global_rad_3_lbp2', 'local_lab_ent_2',
       'local_ycrbcb_skew_1', 'local_lab_skew_1', 'global_ycrbcb_kur_3',
       'local_ycrbcb_mean_3', 'local_lab_kur_2', 'global_lab_kur_3',
       'local_hsv_kur_2', 'local_rgb_mean_2', 'global_lab_skew_3',
       'global_rgb_skew_2', 'global_ycrbcb_ent_2', 'local_hsv_std_1',
       'local_ycrbcb_ent_3', 'global_lab_ent_2', 'global_rad_3_lbp8',
       'global_ycrbcb_skew_3',

In [29]:
sorted_features

array([185, 159, 174, 144,  10,  13,  84, 152, 180,  14,  99, 167,  16,
        15, 145, 130, 136, 125, 138,  19,  76, 108, 165, 184, 137,  12,
       156, 164, 149, 109, 160, 153,  94, 183, 129,  91,  75, 111, 176,
       172,  96,  18, 106, 135, 104, 168, 147,  68, 171, 150, 162, 134,
       112,  87, 181, 173, 148, 114, 105, 157, 120,  97, 163,  17,  79,
       115, 151, 158, 123,  89,  71, 117, 118,   1, 126, 143, 169,  93,
         0,  73, 177, 139, 154, 186, 103, 166,  11, 170, 133, 155, 102,
         5,  60, 140, 182,  88, 179,   9,   4,  64,  67,   3, 187, 142,
         6, 116, 175,  69,  61,  85,  83, 131, 146,  98, 141,  62, 121,
       128,  74,  66, 132,  90, 122, 178, 161, 127,   7, 113,  78,  63,
       100,   2, 101, 119,  86,  65,  70, 107,  77,  80,  72,  95,  92,
         8,  82,  81, 124, 110,  24,  20,  21,  26,  22,  27,  50,  25,
        52,  56,  23,  38,  36,  48,  32,  39,  58,  28,  30,  55,  43,
        42,  47,  51,  54,  31,  44,  37,  46,  59,  41,  34,  2