In [1]:
import os
import sys; sys.path.insert(0, os.path.abspath("../"))
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from dataset.dataset import SkinLesion_Dataset, SegExamples
from pipeline.preprocessing import SkinLesionPreprocessing
from pipeline.feature_extraction import FeaturesExtraction

import multiprocessing as mp
from tqdm import tqdm

In [2]:
# Training partition of the dataset, configured for the binary classification task.
data_train = SkinLesion_Dataset(class_task='binary', partitions=['train']) # default resize as True
# data_val = SkinLesion_Dataset(class_task='three_class', partitions=['val'])
# Preprocessing stage; applied to each image before feature extraction (see obtain_data).
preproc = SkinLesionPreprocessing()
# Feature extractor limited to the 'global' level.
cfe = FeaturesExtraction(levels=['global'])
# Report how many training samples were found.
print(f'train: {len(data_train)}')

train: 15195


In [3]:
def obtain_data(data_sample):
    """Preprocess one dataset sample and extract its feature vector.

    Uses the module-level ``preproc`` and ``cfe`` objects, so both must be
    defined before this function is called.

    Args:
        data_sample: Mapping that provides the image under ``'img'`` and
            its class under ``'label'``.

    Returns:
        A ``(features, label)`` tuple: the result of
        ``cfe.extract_features`` on the preprocessed image, and the
        sample's label unchanged.
    """
    preprocessed = preproc.preprocess(data_sample['img'])
    extracted = cfe.extract_features(preprocessed)
    return extracted, data_sample['label']

In [4]:
# Extract features for the whole training set in parallel, one worker per CPU core.
n_jobs = mp.cpu_count()
with mp.Pool(processes=n_jobs) as pool:
    # imap yields results in input order, so features and labels stay aligned
    # with the dataset; tqdm only wraps the iterator to show progress.
    ordered_results = tqdm(pool.imap(obtain_data, data_train), total=len(data_train))
    pairs = list(ordered_results)

features = [feat for feat, _ in pairs]
labels = [lab for _, lab in pairs]

# One row per image; the label is appended as the last column, then the table
# is cached on disk for the modelling cells below.
df = pd.DataFrame(features, columns=cfe.features_names)
df['label'] = labels
df.to_feather('../data/binary/train_all_feat.f')

100%|██████████| 15195/15195 [22:25<00:00, 11.29it/s]


In [10]:
# Load the cached feature tables for the train and validation partitions.
train_df, val_df = (
    pd.read_feather(feather_path)
    for feather_path in (
        "../data/binary/train_all_feat.f",
        "../data/binary/val_all_features.f",
    )
)

# Keep the shapes around: (n_samples, n_features + 1 label column).
train_shape, val_shape = train_df.shape, val_df.shape
print(f'train shape: {train_shape}, val shape: {val_shape}')

train shape: (15195, 144), val shape: (3796, 144)


### Quick grid search with a Random Forest (RF)

In [3]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, accuracy_score

In [11]:
# String class names -> integer targets used by the classifier.
LABEL_MAP = {'nevus': 1, 'others': 0}


def encode_and_shuffle(df, label_map=LABEL_MAP, seed=42):
    """Return a shuffled copy of ``df`` with integer labels in its last column.

    Only the last column is encoded, and the result is cast to int64
    explicitly instead of relying on ``DataFrame.replace`` to downcast an
    object column (deprecated behaviour in recent pandas). The input frame is
    left untouched, so the cell can be re-run safely.

    Args:
        df: Feature table whose last column holds the class label.
        label_map: Mapping from class name to integer code. Values that are
            not keys of the mapping are kept as they are, so a frame that is
            already encoded passes through unchanged.
        seed: Random state used for the row shuffle.

    Returns:
        A new DataFrame with the same columns, rows shuffled.

    Raises:
        ValueError / TypeError: if a label can neither be mapped nor cast to
            an integer.
    """
    label_col = df.columns[-1]
    encoded = df.copy()
    encoded[label_col] = (
        encoded[label_col]
        .map(lambda value: label_map.get(value, value))
        .astype(np.int64)
    )
    return encoded.sample(frac=1, random_state=seed)


def split_features_labels(df):
    """Split a table into a float32 feature matrix and its label vector.

    Every column except the last is a feature; the last column is the label.
    """
    feature_matrix = df.iloc[:, :-1].to_numpy(dtype=np.float32)
    targets = df.iloc[:, -1].to_numpy()
    return feature_matrix, targets


df_train = encode_and_shuffle(train_df)
df_val = encode_and_shuffle(val_df)

# The validation partition plays the role of the held-out "test" set below.
X_train, y_train = split_features_labels(df_train)
X_test, y_test = split_features_labels(df_val)

print(f'X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}')

X_train: (15195, 143), y_train: (15195,), X_test: (3796, 143), y_test: (3796,)


In [12]:
# Hyper-parameter candidates; the 'classifier__' prefix routes each entry to
# the pipeline step named 'classifier'.
param_grid = {
    'classifier__n_estimators': [50, 100, 500, 1000],
    'classifier__max_depth': [50, 70, 100, 150],
}

# Stratified 10-fold cross-validation, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Pipeline: standardise the features, then fit a random forest.
classifier = RandomForestClassifier(random_state=42, n_jobs=-1)
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', classifier),
])


In [13]:
# Exhaustive search over param_grid, scored with the estimator's default
# metric on the stratified folds defined above.
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=cv)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its mean cross-validated score.
best_summary = (grid_search.best_params_, grid_search.best_score_)
print("TRAIN- The best parameters are %s with an accuracy of %0.4f" % best_summary)


In [None]:
# Predict on both splits with the fitted grid search.
y_train_predicted = grid_search.predict(X_train)
y_test_predicted = grid_search.predict(X_test)

# Macro-averaged F1 and plain accuracy; "test" here is the validation partition.
train_f1 = f1_score(y_train, y_train_predicted, average='macro')
test_f1 = f1_score(y_test, y_test_predicted, average='macro')
print('F1 Score of train data: %0.4f' % train_f1, f'Acc: {accuracy_score(y_train, y_train_predicted)}')
print('F1 Score of test data: %0.4f' % test_f1, f'Acc: {accuracy_score(y_test, y_test_predicted)}')

In [None]:
# Outer 10-fold cross-validation around the whole grid search: each outer fold
# re-runs the full hyper-parameter search on its training part (nested CV),
# so this cell is far more expensive than the single grid search above.
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(
    grid_search,
    X_train,
    y_train,
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
)
cv_scores

In [None]:
# Feature importances of the best random forest found by the grid search.
# The forest is fetched through the pipeline's public `named_steps` accessor
# (step name 'classifier') instead of the private `_final_estimator` attribute.
best_forest = grid_search.best_estimator_.named_steps['classifier']
feature_importances = best_forest.feature_importances_

# Re-create the extractor only to recover the feature names, in the same
# order used when the feature table was built.
cfe = FeaturesExtraction(levels=['global'])

# One row per feature (column 0 holds the importance), most important first.
fimp_df = pd.DataFrame({0: feature_importances}, index=cfe.features_names)
fimp_df = fimp_df.sort_values(0, ascending=False)

# Min-max scale so the most important feature maps to 1 and the least to 0.
normalized_df = (fimp_df - fimp_df.min()) / (fimp_df.max() - fimp_df.min())
normalized_df.head(50)