In [15]:
import os
import sys; sys.path.insert(0, os.path.abspath("../"))
import cv2
import numpy as np 
import matplotlib.pyplot as plt
# from utils.plots import get_image, get_image_and_plot
from tqdm import tqdm
import pandas as pd
from scipy.stats import skew, kurtosis
from skimage.measure import shannon_entropy
from pipeline.preprocessing import SkinLesionPreprocessing
from dataset.dataset import SkinLesion_Dataset
from pipeline.feature_extraction import FeaturesExtraction
from tqdm import tqdm
import pyarrow

In [2]:
# Project objects shared by the cells below: dataset wrapper, preprocessing
# pipeline and feature extractor.
data = SkinLesion_Dataset()
preproc = SkinLesionPreprocessing()
# Only the 'global' level is requested and no colour parameters are passed.
cfe = FeaturesExtraction(levels=['global'], color_params={})
# NOTE(review): this is not the last expression of the cell (a function
# definition follows), so the preview is presumably not rendered.
data.md_df.head(10)

def segment(img):
    """Build a binary mask from the norm-normalised channel 2 of an image.

    Channel index 2 is divided by the per-pixel Euclidean norm over the last
    axis, smoothed with a Gaussian (sigma 3), rescaled to 8 bit and binarised
    with Otsu's threshold. Every contour found in the result is then drawn
    filled so that holes inside the mask are closed.

    Args:
        img: array indexed as ``img[:, :, 2]`` whose last axis holds the
            channels.
            NOTE(review): presumably a BGR uint8 image as loaded by OpenCV,
            which would make channel 2 the red channel -- confirm.

    Returns:
        The uint8 mask produced by ``cv2.threshold`` (values 0 / 255) after
        hole filling.
    """
    norm = np.sqrt(np.sum(img.astype(np.float32)**2, axis=-1))

    # Pure-black pixels have a zero norm. The unguarded reciprocal produced
    # inf there and 0 * inf = NaN, which the blur then spread to neighbours.
    # Such pixels now get a normalised value of 0; all others are unchanged.
    inv_norm = np.zeros_like(norm)
    np.divide(1, norm, out=inv_norm, where=norm > 0)

    r_norm = img[:,:,2]*inv_norm
    rnormg = (cv2.GaussianBlur(r_norm, ksize = (0,0), sigmaX=3, borderType = cv2.BORDER_DEFAULT)*255).astype(np.uint8)
    _,mask_r = cv2.threshold(rnormg, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU)

    # Filling holes: each contour is drawn filled on its own.
    contour_r,_ = cv2.findContours(mask_r,cv2.RETR_CCOMP,cv2.CHAIN_APPROX_SIMPLE)
    for cnt in contour_r:
        cv2.drawContours(mask_r,[cnt],0,255,-1)

    return mask_r



In [4]:
# Metadata of the sampled images; the first CSV column is used as the index
# and is later used to look samples up in `data`.
data_sample = pd.read_csv('../metadata/data_1000_sample.csv', index_col=[0])
data_sample

Unnamed: 0,img_id,label,sublabel,size,height,width,path,split
8211,2005,nevus,nev,"(450, 600, 3)",450,600,../data/binary/train/nevus/nev02005.jpg,train
10349,3045,nevus,nev,"(680, 1024, 3)",680,1024,../data/binary/train/nevus/nev03045.jpg,train
7087,1714,nevus,nev,"(768, 1024, 3)",768,1024,../data/binary/train/nevus/nev01714.jpg,train
6408,6957,nevus,nev,"(1024, 1024, 3)",1024,1024,../data/binary/train/nevus/nev06957.jpg,train
4482,479,nevus,nev,"(450, 600, 3)",450,600,../data/binary/train/nevus/nev00479.jpg,train
...,...,...,...,...,...,...,...,...
3092,2860,others,mel,"(1024, 1024, 3)",1024,1024,../data/binary/val/others/mel02860.jpg,val
2225,544,others,ack,"(450, 600, 3)",450,600,../data/binary/val/others/ack00544.jpg,val
2238,3364,others,mel,"(680, 1024, 3)",680,1024,../data/binary/val/others/mel03364.jpg,val
3083,455,others,scc,"(1024, 1024, 3)",1024,1024,../data/binary/val/others/scc00455.jpg,val


## Extract texture features

In [5]:
# Names of the features produced by the extractor; they become the column
# names of the feature table built below.
cfe.features_names

['global_rad_1_lbp0',
 'global_rad_1_lbp1',
 'global_rad_1_lbp2',
 'global_rad_1_lbp3',
 'global_rad_1_lbp4',
 'global_rad_1_lbp5',
 'global_rad_1_lbp6',
 'global_rad_1_lbp7',
 'global_rad_1_lbp8',
 'global_rad_1_lbp9',
 'global_rad_3_lbp0',
 'global_rad_3_lbp1',
 'global_rad_3_lbp2',
 'global_rad_3_lbp3',
 'global_rad_3_lbp4',
 'global_rad_3_lbp5',
 'global_rad_3_lbp6',
 'global_rad_3_lbp7',
 'global_rad_3_lbp8',
 'global_rad_3_lbp9',
 'global_dist1_ang0_contrast',
 'global_dist1_ang1_contrast',
 'global_dist1_ang2_contrast',
 'global_dist1_ang3_contrast',
 'global_dist2_ang0_contrast',
 'global_dist2_ang1_contrast',
 'global_dist2_ang2_contrast',
 'global_dist2_ang3_contrast',
 'global_dist1_ang0_dissimilarity',
 'global_dist1_ang1_dissimilarity',
 'global_dist1_ang2_dissimilarity',
 'global_dist1_ang3_dissimilarity',
 'global_dist2_ang0_dissimilarity',
 'global_dist2_ang1_dissimilarity',
 'global_dist2_ang2_dissimilarity',
 'global_dist2_ang3_dissimilarity',
 'global_dist1_ang0_homo

In [6]:
# Extract one feature vector and one label per sampled image, then persist
# the resulting table so the (slow) extraction does not have to be repeated.
all_feat = []
labels = []

# tqdm takes the total from the length of the index array, so the sample
# count is no longer hard-coded.
for i in tqdm(data_sample.index.values):
    # Look the sample up once; it was previously indexed twice per iteration
    # (once for the image, once for the label).
    sample = data[i]
    img = preproc.preprocess(sample['img'])

    labels.append(sample['label'])
    all_feat.append(cfe.extract_features(img))

df = pd.DataFrame(all_feat, columns=cfe.features_names)
df['label'] = labels
df.to_feather('../data/binary/ds1000_texture_features.f')

100%|██████████| 1000/1000 [37:25<00:00,  2.25s/it] 


## Classifier

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, accuracy_score

In [4]:
# Reload the feature table written by the extraction cell (feature columns
# followed by a 'label' column).
df = pd.read_feather('../data/binary/ds1000_texture_features.f')
df

Unnamed: 0,global_rad_1_lbp0,global_rad_1_lbp1,global_rad_1_lbp2,global_rad_1_lbp3,global_rad_1_lbp4,global_rad_1_lbp5,global_rad_1_lbp6,global_rad_1_lbp7,global_rad_1_lbp8,global_rad_1_lbp9,...,global_dist2_ang3_energy,global_dist1_ang0_correlation,global_dist1_ang1_correlation,global_dist1_ang2_correlation,global_dist1_ang3_correlation,global_dist2_ang0_correlation,global_dist2_ang1_correlation,global_dist2_ang2_correlation,global_dist2_ang3_correlation,label
0,0.077620,0.095471,0.060913,0.081342,0.091314,0.097138,0.089875,0.099987,0.121429,0.184911,...,0.037384,0.989650,0.986135,0.991963,0.987296,0.981559,0.986135,0.984382,0.987296,nevus
1,0.030114,0.060068,0.032623,0.120833,0.159114,0.174400,0.089783,0.083645,0.138978,0.110443,...,0.027033,0.997198,0.996688,0.998238,0.995860,0.993261,0.996688,0.995644,0.995860,nevus
2,0.015907,0.055590,0.016333,0.124239,0.106435,0.209330,0.099168,0.095137,0.176871,0.100989,...,0.089894,0.998387,0.997319,0.998229,0.997124,0.996030,0.997319,0.995765,0.997124,nevus
3,0.069879,0.090704,0.034849,0.065227,0.056720,0.080805,0.078677,0.112275,0.189339,0.221525,...,0.089619,0.998426,0.998003,0.998462,0.997996,0.997391,0.998003,0.997461,0.997996,nevus
4,0.028894,0.056170,0.033833,0.112186,0.116448,0.180569,0.101057,0.098457,0.151373,0.121013,...,0.057029,0.998286,0.997201,0.998180,0.997326,0.996023,0.997201,0.995816,0.997326,nevus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.052319,0.075735,0.041267,0.089738,0.091145,0.121390,0.091563,0.104836,0.154575,0.177433,...,0.049155,0.998059,0.997481,0.997929,0.996885,0.996203,0.997481,0.995858,0.996885,others
996,0.036633,0.062980,0.045714,0.114827,0.145520,0.164558,0.102591,0.087607,0.105939,0.133631,...,0.057125,0.974052,0.961989,0.974262,0.957002,0.937740,0.961989,0.939373,0.957002,others
997,0.041522,0.074488,0.046688,0.114118,0.133883,0.152429,0.102246,0.085888,0.110532,0.138207,...,0.044601,0.998139,0.997055,0.998127,0.996880,0.995634,0.997055,0.995523,0.996880,others
998,0.097709,0.105852,0.047562,0.052171,0.045184,0.054820,0.070656,0.114214,0.176345,0.235488,...,0.045571,0.982546,0.979195,0.982493,0.978568,0.976482,0.979195,0.976457,0.978568,others


In [6]:
# data preparation

# data preparation

LABEL_COL = 'label'
LABEL_TO_INT = {'nevus': 1, 'others': 0}
# Rows [0, N_TRAIN) are used for training and the rest for validation.
# NOTE(review): this relies on the row order of the sample file (train rows
# first) -- confirm.
N_TRAIN = 800

# Encode only the label column, into a new frame: `df` itself is left
# untouched so the cell can be re-run, and the integer dtype is explicit
# instead of depending on pandas' implicit downcasting in `replace`.
encoded_labels = df[LABEL_COL].map(LABEL_TO_INT)
assert encoded_labels.notna().all(), "unexpected value in the label column"
df_encoded = df.assign(**{LABEL_COL: encoded_labels.astype(np.int64)})

# Shuffle each partition independently with a fixed seed.
df_train = df_encoded.iloc[:N_TRAIN, :].sample(frac=1, random_state=42)
df_val = df_encoded.iloc[N_TRAIN:, :].sample(frac=1, random_state=42)

# Select features by name rather than by the magic column position 68.
feature_cols = df_encoded.columns.drop(LABEL_COL)

X_train, y_train = df_train[feature_cols].to_numpy(dtype=np.float32), df_train[LABEL_COL].to_numpy()
X_test, y_test = df_val[feature_cols].to_numpy(dtype=np.float32), df_val[LABEL_COL].to_numpy()

In [7]:
# Sanity check of the split sizes and the number of feature columns.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(800, 68) (800,) (200, 68) (200,)


In [9]:
# Random forest with a fixed seed; n_jobs=-1 requests all available cores.
classifier = RandomForestClassifier(random_state=42, n_jobs=-1)
# The 'classifier__' prefix routes each parameter to the pipeline step named
# 'classifier'.
param_grid = { 
    'classifier__n_estimators': [50, 100, 500],
    'classifier__max_depth': [50, 70, 100],
}

# Stratified 10-fold cross-validation, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Pipeline made of a standard scaler followed by the random forest above.
pipe = Pipeline([('scaler', StandardScaler()),('classifier', classifier)]) # a linear SVC was used here in an earlier variant


In [10]:
# Exhaustive search over `param_grid`, scored with the stratified folds in `cv`.
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=cv)
grid_search.fit(X_train, y_train)

# Report the best cross-validated accuracy and the parameters that achieved it.
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print("TRAIN- The best parameters are %s with an accuracy of %0.4f" % (best_params, best_score))

# Persist the fitted search object for later reuse.
with open('../data/binary/RF_800_texture.sav', 'wb') as model_file:
    pickle.dump(grid_search, model_file)

TRAIN- The best parameters are {'classifier__max_depth': 50, 'classifier__n_estimators': 100} with an accuracy of 0.7362


In [11]:
# Predict both partitions with the refitted best pipeline.
y_train_predicted = grid_search.predict(X_train)
y_test_predicted = grid_search.predict(X_test)

# Macro-averaged F1 and plain accuracy for each partition.
train_f1 = f1_score(y_train, y_train_predicted, average='macro')
train_acc = accuracy_score(y_train, y_train_predicted)
test_f1 = f1_score(y_test, y_test_predicted, average='macro')
test_acc = accuracy_score(y_test, y_test_predicted)

print('F1 Score of train data: %0.4f' % train_f1, f'Acc: {train_acc}')
print('F1 Score of test data: %0.4f' % test_f1, f'Acc: {test_acc}')

F1 Score of train data: 1.0000 Acc: 1.0
F1 Score of test data: 0.7149 Acc: 0.715


### Cross-validation

In [12]:
# NOTE(review): this import would sit better with the other sklearn imports
# at the top of the classifier section.
from sklearn.model_selection import cross_val_score
# The estimator is the GridSearchCV object itself, so the whole
# hyper-parameter search is re-fitted inside each of the 10 outer folds
# (nested cross-validation). Expect this cell to be slow.
cv_scores = cross_val_score(estimator= grid_search, X = X_train, y = y_train, scoring= "accuracy", cv=10, n_jobs=-1)
cv_scores

array([0.625 , 0.7125, 0.7375, 0.675 , 0.6875, 0.7875, 0.6875, 0.6625,
       0.8   , 0.7625])

### Feature importances

In [13]:
# Importances of the random forest inside the best pipeline found by the
# grid search. The step is looked up by its public name ('classifier')
# rather than through the private `_final_estimator` attribute.
feature_importances = grid_search.best_estimator_.named_steps['classifier'].feature_importances_

In [16]:
# Recreate the extractor so the feature names are available in this session.
cfe = FeaturesExtraction(levels=['global'], color_params={})

# One row per feature name, with a single column (0) holding its importance.
fimp_df = pd.DataFrame(feature_importances, index=cfe.features_names)
fimp_df = fimp_df.sort_values(by=0, axis=0, ascending=False)
fimp_df.head(20)

Unnamed: 0,0
global_rad_3_lbp0,0.055308
global_rad_3_lbp6,0.04078
global_rad_3_lbp3,0.035899
global_rad_3_lbp4,0.034196
global_rad_3_lbp5,0.027615
global_rad_3_lbp2,0.025639
global_dist1_ang0_correlation,0.023304
global_rad_3_lbp9,0.023287
global_rad_3_lbp8,0.023161
global_dist1_ang2_correlation,0.022906


In [21]:
# using a different scaler
means = np.mean(X_train, axis=0)
stds = np.std(X_train, axis=0)
print(means.shape, stds.shape)

(68,) (68,)


In [64]:
class TextureScaler:
    """Standardise all feature columns except the first ``no_scale_feat``.

    Columns ``[0, no_scale_feat)`` are passed through unchanged; every later
    column is centred on the mean and divided by the standard deviation
    learned in :meth:`fit`.

    Args:
        no_scale_feat: number of leading columns to leave unscaled
            (default 20).

    Attributes set by ``fit``:
        means: per-column mean of the fitted data.
        stds: per-column standard deviation of the fitted data.
    """

    def __init__(self, no_scale_feat: int = 20):
        self.no_scale_feat = no_scale_feat

    def fit(self, X, y=None):
        """Learn per-column mean and standard deviation from ``X``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: ignored; accepted so the scaler can be called with the usual
                ``fit(X, y)`` convention.

        Returns:
            self, so calls can be chained.
        """
        X = np.asarray(X)
        self.means = np.mean(X, axis=0)
        self.stds = np.std(X, axis=0)
        return self

    def transform(self, X):
        """Return a scaled copy of ``X``; the input is not modified.

        Args:
            X: 2-D array-like with the same number of columns as the data
                passed to ``fit``.

        Returns:
            Floating-point array of the same shape as ``X``.
        """
        X = np.asarray(X)
        # Keep floating dtypes as they are, but promote integer input to
        # float so the scaled values are not truncated on assignment.
        X_scaled = X.astype(np.result_type(X.dtype, np.float32), copy=True)

        first = self.no_scale_feat
        # A constant column has a zero standard deviation; dividing by 1
        # instead maps it to all zeros rather than NaN/inf.
        column_stds = self.stds[first:]
        safe_stds = np.where(column_stds == 0, 1, column_stds)

        # scale only the columns from `no_scale_feat` onwards
        X_scaled[:, first:] = (X[:, first:] - self.means[first:]) / safe_stds

        return X_scaled

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its scaled copy."""
        return self.fit(X).transform(X)

In [70]:
# First ten raw feature values of the first training row, as a reference for
# the scaler comparisons below.
X_train[0,:10]

array([0.0391515 , 0.0675836 , 0.04121076, 0.11328375, 0.12875994,
       0.15225904, 0.09358147, 0.09198017, 0.13665581, 0.13553397],
      dtype=float32)

In [68]:
# Reference: StandardScaler standardises every column, including the first ten.
full_scaler = StandardScaler()
X_fs = full_scaler.fit_transform(X_train)
X_fs[0,:10]

array([-0.46701208, -0.29016966, -0.09994659,  0.65352124,  0.47819766,
        0.31863004, -0.05756978, -0.49313104, -0.12634386, -0.51615494],
      dtype=float32)

In [69]:
# With the default no_scale_feat=20 the first ten columns are left unscaled,
# so this row should match the raw values shown earlier.
scaler = TextureScaler()
scaler.fit(X_train)
X_scaled = scaler.transform(X_train)
X_scaled[0,:10]

array([0.0391515 , 0.0675836 , 0.04121076, 0.11328375, 0.12875994,
       0.15225904, 0.09358147, 0.09198017, 0.13665581, 0.13553397],
      dtype=float32)

In [65]:
def classifier(X, y):
    """Print stratified 10-fold accuracy of a random forest on TextureScaler features.

    For each fold the scaler is fitted on the training part only, both parts
    are transformed with it, the forest is fitted on the training part and
    scored on the held-out part. The per-fold accuracies and their maximum,
    minimum, mean and standard deviation are printed; nothing is returned.

    NOTE: defining this function rebinds the notebook-level name
    ``classifier``, which an earlier cell assigns to a RandomForestClassifier
    instance.

    Args:
        X: feature matrix, indexed with integer index arrays along axis 0.
        y: label array, indexed the same way.
    """
    fold_scaler = TextureScaler()
    forest = RandomForestClassifier(max_depth=50, n_estimators=100,random_state=42, n_jobs=-1)
    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=10)

    def _fold_accuracy(train_idx, test_idx):
        # Fit the scaler on the training part only, then score the held-out part.
        fold_scaler.fit(X[train_idx])
        forest.fit(fold_scaler.transform(X[train_idx]), y[train_idx])
        return forest.score(fold_scaler.transform(X[test_idx]), y[test_idx])

    fold_accuracies = [_fold_accuracy(train_idx, test_idx)
                       for train_idx, test_idx in splitter.split(X, y)]

    # Summary of the per-fold accuracies.
    print('List of possible accuracy:', fold_accuracies)
    print('\nMaximum Accuracy That can be obtained from this model is:',
          max(fold_accuracies)*100, '%')
    print('\nMinimum Accuracy:',
          min(fold_accuracies)*100, '%')
    print('\nOverall Accuracy:',
          np.mean(fold_accuracies)*100, '%')
    print('\nStandard Deviation is:', np.std(fold_accuracies))

In [66]:
# Cross-validate the random forest with the custom TextureScaler on the
# training split.
classifier(X_train,y_train)

List of possible accuracy: [0.725, 0.7, 0.7125, 0.725, 0.8, 0.7125, 0.6625, 0.75, 0.6875, 0.8]

Maximum Accuracy That can be obtained from this model is: 80.0 %

Minimum Accuracy: 66.25 %

Overall Accuracy: 72.74999999999999 %

Standard Deviation is: 0.04250000000000002


In [67]:
# Mean accuracy of the StandardScaler pipeline for comparison; the values are
# copied by hand from the `cv_scores` output above.
np.mean([0.625 , 0.7125, 0.7375, 0.675 , 0.6875, 0.7875, 0.6875, 0.6625,
       0.8   , 0.7625])

0.71375