In [1]:
import os
import sys; sys.path.insert(0, os.path.abspath("../"))
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from dataset.dataset import SkinLesion_Dataset, SegExamples
from pipeline.preprocessing import SkinLesionPreprocessing
from pipeline.feature_extraction import FeaturesExtraction

import multiprocessing as mp
from tqdm import tqdm

In [2]:
# Validation partition of the dataset, loaded with the dataset's default settings.
data = SkinLesion_Dataset(partitions=['val'])
# Default preprocessing pipeline applied to every image.
preproc = SkinLesionPreprocessing()
# Fallback preprocessor: obtain_data switches to it when the default pipeline
# returns an all-zero image, so it has to exist at module level (otherwise that
# branch raises NameError). The `remove_fov=False` argument is restored from the
# previously commented-out definition -- TODO confirm it is still the intended
# configuration.
preproc_no_fov = SkinLesionPreprocessing(remove_fov=False)
# Feature extractor restricted to the 'global' level.
cfe = FeaturesExtraction(levels=['global'])

In [3]:
def obtain_data(data_sample):
    """Preprocess one dataset sample and extract its feature vector.

    Relies on the module-level objects ``preproc``, ``preproc_no_fov`` and
    ``cfe``; ``preproc_no_fov`` must be defined before the fallback branch is
    reached, otherwise that branch raises ``NameError``.

    Args:
        data_sample: mapping that provides the image under ``'img'`` and its
            class label under ``'label'``.

    Returns:
        A ``(features, label)`` tuple, where ``features`` is whatever
        ``cfe.extract_features`` returns for the preprocessed image.
    """
    image = data_sample['img']
    processed = preproc.preprocess(image)
    # An all-zero result means the default pipeline left nothing to analyse,
    # so the image is preprocessed again with the fallback preprocessor.
    if not processed.any():
        processed = preproc_no_fov.preprocess(image)
    sample_features = cfe.extract_features(processed)
    return sample_features, data_sample['label']

In [4]:
# Extract features for every sample in parallel, one worker process per CPU core.
n_jobs = mp.cpu_count()
with mp.Pool(n_jobs) as pool:
    # imap yields results in dataset order, so features and labels stay aligned.
    extracted = list(tqdm(pool.imap(obtain_data, data), total= len(data)))

features = [sample_features for sample_features, _ in extracted]
labels = [sample_label for _, sample_label in extracted]

# One row per sample: the feature columns followed by the class label.
df = pd.DataFrame(features, columns=cfe.features_names).assign(label=labels)
df.to_feather('../data/binary/val_all_features.f')


100%|██████████| 3796/3796 [11:20<00:00,  5.58it/s]


In [5]:
# Display the extracted feature table (pandas abbreviates the rows and columns of a frame this large).
df

Unnamed: 0,global_rgb_mean_1,global_rgb_mean_2,global_rgb_mean_3,global_rgb_std_1,global_rgb_std_2,global_rgb_std_3,global_rgb_skew_1,global_rgb_skew_2,global_rgb_skew_3,global_rgb_kur_1,...,global_gabor2_var,global_gabor2_skew,global_gabor2_kur,global_gabor2_min,global_gabor3_mean,global_gabor3_var,global_gabor3_skew,global_gabor3_kur,global_gabor3_min,label
0,167.713852,138.606186,157.837387,12.956479,19.675039,22.076326,-1.701084,-1.893283,-2.193939,3.850674,...,-3.963011,-3.960424,-1.489729,16.660029,16.625087,3.345335,106.0,106.0,11.0,nevus
1,204.255112,147.525864,123.066414,32.670475,27.367023,25.624580,-4.071280,-3.436913,-2.195173,16.637374,...,-4.653739,-4.656256,-1.842537,20.338494,20.373392,7.859289,15.0,14.0,0.0,nevus
2,182.023056,141.556702,163.088638,8.393549,20.507551,30.768713,-0.713030,-1.203962,-1.467290,0.127118,...,-3.081772,-3.074336,-1.288555,8.983521,8.922923,1.561351,164.0,161.0,19.0,nevus
3,193.503586,183.588058,154.457581,12.072448,17.058140,21.492414,-0.866403,-0.883047,-0.841835,0.300937,...,-18.920018,-18.623811,-0.730887,412.158710,396.804219,0.194586,196.0,199.0,30.0,nevus
4,170.013565,154.335953,180.735474,8.332796,11.624048,11.898982,-1.213859,-1.452731,-1.359126,1.923031,...,-6.617694,-7.187752,-1.146482,49.031998,66.141802,2.692014,179.0,154.0,25.0,nevus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3791,146.015930,121.785767,129.195709,13.844839,22.206858,30.026955,-0.537093,-0.472818,-0.454131,0.545049,...,-1.369788,-1.370844,-0.538658,1.720938,1.718004,0.351629,109.0,107.0,7.0,others
3792,166.418121,139.404846,111.242699,4.677742,7.194498,7.555755,-1.885355,-1.423033,-1.289074,5.901074,...,-5.761848,-5.801903,-1.196449,41.929813,42.674424,7.769903,181.0,178.0,19.0,others
3793,204.600632,163.537766,156.788605,17.821602,39.091873,44.373962,-1.486913,-1.080801,-0.952156,1.330181,...,-2.387764,-2.389610,-1.068337,4.479815,4.488611,0.008388,149.0,148.0,25.0,others
3794,197.494644,173.369904,161.971893,15.566128,34.179367,41.048100,-0.433357,-0.097371,0.239589,-0.341057,...,-5.070210,-5.087548,-0.111095,28.130345,28.336168,-0.953785,185.0,184.0,13.0,others


### Classifier

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, accuracy_score

In [7]:
# data preparation

# Encode the target on the label column only, with an explicit integer dtype.
# The earlier frame-wide `replace(..., inplace=True)` depended on pandas
# implicitly downcasting the label column to integers, which recent pandas
# releases deprecate. The `.get(lab, lab)` fallback keeps this cell idempotent:
# re-running it on already-encoded labels leaves them unchanged, while an
# unexpected label string makes `astype` fail loudly.
label_codes = {'nevus': 1, 'others': 0}
df['label'] = df['label'].map(lambda lab: label_codes.get(lab, lab)).astype(np.int64)

# Shuffle the rows reproducibly.
df_train = df.sample(frac=1, random_state=42)

# Select features by name instead of a hard-coded column count, so the split
# stays correct if the extractor's feature set changes.
feature_columns = [col for col in df_train.columns if col != 'label']
X_train = df_train[feature_columns].to_numpy(dtype=np.float32)
y_train = df_train['label'].to_numpy()
print(X_train.shape, y_train.shape)

(3796, 143) (3796,)


In [10]:
def classifier(X, y):
    """Cross-validate a random forest with 10 stratified folds and print accuracy statistics.

    For each fold the features are standardised with statistics fitted on the
    training split only, the forest is refitted on that split, and its score on
    the held-out split is recorded.

    Args:
        X: feature matrix that can be indexed with integer index arrays
            (one row per sample).
        y: label array aligned with the rows of ``X``.

    Returns:
        None. The per-fold scores and their maximum, minimum, mean and
        standard deviation are printed.
    """
    # Fixed seeds keep both the fold assignment and the forest reproducible.
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=10)
    forest = RandomForestClassifier(max_depth=50, n_estimators=100,random_state=42, n_jobs=-1)
    standardiser = StandardScaler()

    lst_accu_stratified = []
    for fit_idx, eval_idx in folds.split(X, y):
        # Scaling statistics come from the training rows only, so nothing
        # about the held-out rows leaks into the fitted model.
        x_fit = standardiser.fit(X[fit_idx]).transform(X[fit_idx])
        x_eval = standardiser.transform(X[eval_idx])
        forest.fit(x_fit, y[fit_idx])
        fold_score = forest.score(x_eval, y[eval_idx])
        lst_accu_stratified.append(fold_score)

    # Summarise the folds.
    highest = max(lst_accu_stratified)*100
    lowest = min(lst_accu_stratified)*100
    average = np.mean(lst_accu_stratified)*100
    spread = np.std(lst_accu_stratified)

    print('List of possible accuracy:', lst_accu_stratified)
    print('\nMaximum Accuracy That can be obtained from this model is:', highest, '%')
    print('\nMinimum Accuracy:', lowest, '%')
    print('\nOverall Accuracy:', average, '%')
    print('\nStandard Deviation is:', spread)

In [11]:
# Cross-validate on the features prepared above.
classifier(X=X_train, y=y_train)

List of possible accuracy: [0.7973684210526316, 0.8026315789473685, 0.8263157894736842, 0.7947368421052632, 0.7763157894736842, 0.7736842105263158, 0.8021108179419525, 0.7994722955145118, 0.8126649076517151, 0.7941952506596306]

Maximum Accuracy That can be obtained from this model is: 82.63157894736842 %

Minimum Accuracy: 77.36842105263158 %

Overall Accuracy: 79.79495903346758 %

Standard Deviation is: 0.014643503177932855


## Same experiment without resizing the images

In [12]:
# Validation partition again, this time with image resizing disabled.
data = SkinLesion_Dataset(partitions=['val'], resize_image=False)
# Default preprocessing pipeline applied to every image.
preproc = SkinLesionPreprocessing()
# Fallback preprocessor: obtain_data switches to it when the default pipeline
# returns an all-zero image, so it has to exist at module level (otherwise that
# branch raises NameError). The `remove_fov=False` argument is restored from the
# previously commented-out definition -- TODO confirm it is still the intended
# configuration.
preproc_no_fov = SkinLesionPreprocessing(remove_fov=False)
# Feature extractor restricted to the 'global' level.
cfe = FeaturesExtraction(levels=['global'])

In [13]:
def obtain_data(data_sample):
    """Preprocess one dataset sample and extract its feature vector.

    Relies on the module-level objects ``preproc``, ``preproc_no_fov`` and
    ``cfe``; ``preproc_no_fov`` must be defined before the fallback branch is
    reached, otherwise that branch raises ``NameError``.

    Args:
        data_sample: mapping that provides the image under ``'img'`` and its
            class label under ``'label'``.

    Returns:
        A ``(features, label)`` tuple, where ``features`` is whatever
        ``cfe.extract_features`` returns for the preprocessed image.
    """
    image = data_sample['img']
    processed = preproc.preprocess(image)
    # An all-zero result means the default pipeline left nothing to analyse,
    # so the image is preprocessed again with the fallback preprocessor.
    if not processed.any():
        processed = preproc_no_fov.preprocess(image)
    sample_features = cfe.extract_features(processed)
    return sample_features, data_sample['label']

In [14]:
# Extract features for every sample in parallel, one worker process per CPU core.
n_jobs = mp.cpu_count()
with mp.Pool(n_jobs) as pool:
    # imap yields results in dataset order, so features and labels stay aligned.
    extracted = list(tqdm(pool.imap(obtain_data, data), total= len(data)))

features = [sample_features for sample_features, _ in extracted]
labels = [sample_label for _, sample_label in extracted]

# One row per sample: the feature columns followed by the class label.
df = pd.DataFrame(features, columns=cfe.features_names).assign(label=labels)
df.to_feather('../data/binary/val_all_features_ogsize.f')


100%|██████████| 3796/3796 [27:44<00:00,  2.28it/s]  


In [15]:
# data preparation

# Encode the target on the label column only, with an explicit integer dtype.
# The earlier frame-wide `replace(..., inplace=True)` depended on pandas
# implicitly downcasting the label column to integers, which recent pandas
# releases deprecate. The `.get(lab, lab)` fallback keeps this cell idempotent:
# re-running it on already-encoded labels leaves them unchanged, while an
# unexpected label string makes `astype` fail loudly.
label_codes = {'nevus': 1, 'others': 0}
df['label'] = df['label'].map(lambda lab: label_codes.get(lab, lab)).astype(np.int64)

# Shuffle the rows reproducibly.
df_train = df.sample(frac=1, random_state=42)

# Select features by name instead of a hard-coded column count, so the split
# stays correct if the extractor's feature set changes.
feature_columns = [col for col in df_train.columns if col != 'label']
X_train = df_train[feature_columns].to_numpy(dtype=np.float32)
y_train = df_train['label'].to_numpy()
print(X_train.shape, y_train.shape)

(3796, 143) (3796,)


In [16]:
# Cross-validate on the features extracted from the non-resized images.
classifier(X=X_train, y=y_train)

List of possible accuracy: [0.8131578947368421, 0.8026315789473685, 0.8368421052631579, 0.8052631578947368, 0.7631578947368421, 0.7789473684210526, 0.8126649076517151, 0.7994722955145118, 0.7968337730870713, 0.8047493403693932]

Maximum Accuracy That can be obtained from this model is: 83.6842105263158 %

Minimum Accuracy: 76.31578947368422 %

Overall Accuracy: 80.1372031662269 %

Standard Deviation is: 0.018815115852328344
