In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# Let's collect missclassified images

In [2]:
# !pip install torchsummary

In [3]:
#IMPORTS
from tqdm.notebook import tqdm

import torch
import torchvision
import torchvision.transforms as T
import numpy as np
import matplotlib.pyplot as plt
from torchsummary import summary
import requests
from PIL import Image
import os
import io
import base64

In [122]:
import sys
sys.path.append(os.path.abspath("../../skin_cancer_nas")) # go to parent dir
sys.path.append(os.path.abspath("/mnt/skin_cancer_nas/data"))
sys.path.append(os.path.abspath("/mnt/skin_cancer_nas/data/torch_generator"))
sys.path.append(os.path.abspath("/mnt/skin_cancer_nas/nas/darts_torch"))

sys.path.append('/mnt')
sys.path.append('/mnt/skin_cancer_nas')
sys.path.append('/mnt/skin_cancer_nas/data/torch_generator')
from skin_cancer_nas.data.torch_generator import generator as data_gen
from skin_cancer_nas.data.torch_generator import base_classes
from skin_cancer_nas.data.torch_generator.config import *

from nas.darts_torch import *
import cv2



ROOT_PATHS = [  Path('/mnt/data/interim/_melanoma_20200728_REGISTERED_OCV_WHITE_Split_Channels/checkyourskin'),
                Path('/mnt/data/interim/_melanoma_20200728_REGISTERED_OCV_WHITE_Split_Channels/loc'),
                Path('/mnt/data/interim/_melanoma_20200728_REGISTERED_OCV_WHITE_Split_Channels/loc_old_colored')]

CLS1 = ClassesDefinition(root_folders_list=ROOT_PATHS, diagnoses_names_list=['c43', 'd03', 'd03.9'], class_name='Melanoma_like_lesions', int_label=0)
CLS2 = ClassesDefinition(root_folders_list=ROOT_PATHS, diagnoses_names_list=['d22', 'd81', 'l81.2','l81.4', 'q82.5'], class_name='Pigmented_benign', int_label=1)
CLS3 = ClassesDefinition(root_folders_list=ROOT_PATHS, diagnoses_names_list=['d86.3', 'l21', 'l57', 'l57.0', 'l82', 'l85', 'l85.1', 'l85.5', 'l85.8', 'q80'], class_name='Keratin_lesions', int_label=2)
CLS4 = ClassesDefinition(root_folders_list=ROOT_PATHS, diagnoses_names_list=['c44', 'c46', 'd09'], class_name='Nonmelanoma_skin_cancer', int_label=3)
CLS5 = ClassesDefinition(root_folders_list=ROOT_PATHS, diagnoses_names_list=['a63', 'd18', 'd21.9', 'd48', 'l92', 'l94.2', 'l98.8', 'pxe', 'b07', 'ada', 'l57.9', 'l98.9'], class_name='Other', int_label=4)

CLASSES_SET_1 = [CLS1, CLS2]
CLASSES_SET_2 = [CLS1, CLS2, CLS3]
CLASSES_SET_3 = [CLS1, CLS2, CLS3, CLS4]
CLASSES_SET_4 = [ \
    ClassesDefinition(root_folders_list=ROOT_PATHS, diagnoses_names_list=['c43'], class_name='Melanoma', int_label=0), \
    ClassesDefinition(root_folders_list=ROOT_PATHS, diagnoses_names_list=['d22'], class_name='Nevuss', int_label=1) \
]
CLASSES_SET_5 = [ \
    ClassesDefinition(root_folders_list=ROOT_PATHS, diagnoses_names_list=['c43'], class_name='Melanoma', int_label=0), \
    ClassesDefinition(root_folders_list=ROOT_PATHS, diagnoses_names_list=['d22'], class_name='Nevuss', int_label=1), \
    ClassesDefinition(root_folders_list=ROOT_PATHS, diagnoses_names_list=['l82'], class_name='Keratin lesion', int_label=2) \
]
CLASSES_SET_6 = [ \
   ClassesDefinition(root_folders_list=ROOT_PATHS, diagnoses_names_list=['c43'], class_name='Melanoma', int_label=0), \
   ClassesDefinition(root_folders_list=ROOT_PATHS, diagnoses_names_list=['c44'], class_name='NonMelanomaCancer', int_label=1), \
   ClassesDefinition(root_folders_list=ROOT_PATHS, diagnoses_names_list=['d22'], class_name='Nevuss', int_label=2) \
]

CLASSES_SET_8 = [CLS1, CLS2, CLS3, CLS4, CLS5]

VALID_CHANNELS = ['r-r', 'ir-r', 'g-g', 'uv-0-r', 'uv-0-g', 'white-g']

IMG_WIDTH = 128
IMG_HEIGHT = 128
VALUE_MISSING = 0
MISSING_CH_IMG = np.ones((IMG_HEIGHT, IMG_WIDTH)) * VALUE_MISSING

In [5]:
import logging
logger = logging.getLogger('skin_cancer_nas__darts_misclassified_img_collection_notebook')

### Load model

In [6]:
# Load model
device = 'cpu'

model = torch.load("/mnt/models/darts_retrained/6ch_128x128_no_metainfo_registered/3lrs_2oct_ClassSet3_registered/final_model1.pt")
model.to(device)
for param in model.parameters():
    param.requires_grad = False

### Data preparation

In [48]:
def get_input_output_from_path(sample_path, labels):
    '''
    Generates one sample of data
    '''
    try:
        # logger.info('Sample path={}'.format(sample_path))
        x_out = _convert_img_to_array(sample_path)
        x_out = x_out.astype('float32')
        x_out /= 255

        X = torch.as_tensor(x_out)
        y = labels[str(sample_path)]

        X = X.to(device)
        y = torch.tensor(y).to(device)
        return X, y
    except Exception as e:
        msg = 'Something went wrong at path={}, e={}'.format(sample_path, e)
        print(msg)
        logger.info(msg)


def _convert_img_to_array(sample_path):
    'Converts n grayscale images to 3D array with n channels'
    x_array = []
    for channel in VALID_CHANNELS:
        channel_path = os.path.join(sample_path, channel)
        if not os.path.exists(channel_path):
            x_array.append(MISSING_CH_IMG)
            continue
        image = os.listdir(channel_path)
        if not image:
            x_array.append(MISSING_CH_IMG)
            continue
        full_image_path = os.path.join(channel_path, image[0])
        img = cv2.imread(full_image_path, flags=cv2.IMREAD_GRAYSCALE)
        if img is None:
            x_array.append(MISSING_CH_IMG)
            continue
        else:
            shape = img.shape
            # logger.info(shape)
            img = cv2.resize(img, (IMG_HEIGHT, IMG_WIDTH), interpolation=cv2.INTER_CUBIC)

        x_array.append(img)

    return np.stack(x_array, axis=0)

! IMPORTANT ! - for now we don't have any image transformations (like normalization), but once they will be - they have to be added here!

In [None]:
partition, labels = data_gen.train_val_split(val_ratio=0.1, classes_list=CLASSES_SET_2)
X, y = get_input_output_from_path(sample_path=partition['validation'][0], labels=labels)

In [70]:
def gather_predictions(partition, partition_key, labels, model):

    # Gather list of input 'image' matrices, labels and paths (for titles)
    X_list = []
    y_list = []  # y_true
    y_pred_list = []
    scores_list = []
    paths_list = []
    len_val = len(partition[partition_key])

    model.eval()

    for i in tqdm(range(len_val)):
        sample_path = partition[partition_key][i]
        _X, _y = get_input_output_from_path(sample_path=sample_path, labels=labels)
        scores = model(_X.unsqueeze(0))
        scores = scores.detach().numpy()
        y_pred = scores.argmax()

        X_list.append(_X)
        y_list.append(_y.detach().numpy())
        paths_list.append(sample_path)
        scores_list.append(scores)
        y_pred_list.append(y_pred)
        
    return X_list, y_list, y_pred_list, scores_list, paths_list

In [71]:
X_list, y_list, y_pred_list, scores_list, paths_list = gather_predictions(partition, 'validation', labels, model)

HBox(children=(FloatProgress(value=0.0, max=90.0), HTML(value='')))




In [72]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

print('f1_score', f1_score(y_list, y_pred_list, average='weighted'))
print('precision_score', precision_score(y_list, y_pred_list, average='weighted'))
print('recall_score', recall_score(y_list, y_pred_list, average='weighted'))

f1_score 0.8327765674170902
precision_score 0.8339285714285715
recall_score 0.8333333333333334


In [58]:
# Gather missclassified cases
subfolders_to_gather = ['r', 'ir', 'g', 'uv-0']
channels_to_gather = ['b','g','r']

output_folder = '/mnt/data/interim/misclassification_visualization/_melanoma_20200728_REGISTERED_OCV_WHITE_Split_Channels__MISSCLASSIFIED__Set3_Original_9-1_split_validation'

for y_true, y_pred, path in tqdm(zip(y_list, y_pred_list, paths_list), total=len(y_list)):
    if y_true != y_pred:
#         print(y_true, y_pred, path)
        for sub_folder in subfolders_to_gather:
            if os.path.exists(os.path.join(path, sub_folder + '-' + channels_to_gather[0])):
                channels = {}
                for ch in channels_to_gather:
                    sub_folder_path = os.path.join(path, sub_folder + '-' + ch)
                    channel_file = os.listdir(sub_folder_path)[0]
                    channels[ch] = cv2.imread(os.path.join(sub_folder_path, channel_file), cv2.IMREAD_GRAYSCALE)
                    
                color_img = cv2.merge((channels['b'], channels['g'], channels['r']))
                color_img_name = path.replace('/mnt/data/interim/_melanoma_20200728_REGISTERED_OCV_WHITE_Split_Channels/', '').replace('/', '---') + sub_folder + '.png'
                error_type_folder_name = str(y_true) + "_classified_as_" + str(y_pred)
                
                os.makedirs(os.path.join(output_folder, error_type_folder_name), exist_ok=True)
                
                color_img_path = os.path.join(output_folder, error_type_folder_name, color_img_name)
                cv2.imwrite(color_img_path, color_img)
#                 print('Saving file - {}'.format(color_img_path))

HBox(children=(FloatProgress(value=0.0, max=90.0), HTML(value='')))




### Gather misclassified images for 5-Fold CV - to get all of them!

In [79]:
def write_misclassified_images(fold_no, y_list, y_pred_list, paths_list):
    subfolders_to_gather = ['r', 'ir', 'g', 'uv-0']
    channels_to_gather = ['b','g','r']

    output_folder = '/mnt/data/interim/misclassification_visualization/_melanoma_20200728_REGISTERED_OCV_WHITE_Split_Channels__MISSCLASSIFIED__5layers_Set3_Original_5FoldCV_foldN{}'.format(fold_no)

    for y_true, y_pred, path in tqdm(zip(y_list, y_pred_list, paths_list), total=len(y_list)):
        if y_true != y_pred:
    #         print(y_true, y_pred, path)
            for sub_folder in subfolders_to_gather:
                if os.path.exists(os.path.join(path, sub_folder + '-' + channels_to_gather[0])):
                    channels = {}
                    for ch in channels_to_gather:
                        sub_folder_path = os.path.join(path, sub_folder + '-' + ch)
                        channel_file = os.listdir(sub_folder_path)[0]
                        channels[ch] = cv2.imread(os.path.join(sub_folder_path, channel_file), cv2.IMREAD_GRAYSCALE)

                    color_img = cv2.merge((channels['b'], channels['g'], channels['r']))
                    color_img_name = path.replace('/mnt/data/interim/_melanoma_20200728_REGISTERED_OCV_WHITE_Split_Channels/', '').replace('/', '---') + sub_folder + '.png'
                    error_type_folder_name = str(y_true) + "_classified_as_" + str(y_pred)

                    os.makedirs(os.path.join(output_folder, error_type_folder_name), exist_ok=True)

                    color_img_path = os.path.join(output_folder, error_type_folder_name, color_img_name)
                    cv2.imwrite(color_img_path, color_img)

In [81]:
from torchvision import transforms
from base_classes import Dataset

class RandomRot(object):
    
    '''
    Randomly rotates +90 / -90 degrees (or not rotates) passed in 3D tensor
    '''

    def __call__(self, sample):
        """
        :param sample: torch tensor 

        :return: randomly rotated for 90, 180, 270 or 0 times
        """
        k = random.randint(0, 2) - 1
        if k != 0:
            sample = torch.rot90(sample, k, [1, 2])
        return sample

class RandomFlip(object):
    
    def __init__(self, horizontal=True, prob_threshold=0.5):
        self.prob_threshold = prob_threshold
        self.horizontal = horizontal

    def __call__(self, sample):
        """
        :param sample: torch tensor 

        :return: randomly horizontally/vertically flipped torch 3D tensor
        """
        flip_prob = random.uniform(0, 1)
        if flip_prob > self.prob_threshold:
            if self.horizontal:
                sample = torch.flip(sample, [2])  # vertical flip of 3D tensor
            else:
                sample = torch.flip(sample, [1]) # horizontal flip of 3D tensor
        return sample

def prepare_train_val_generators(partition, labels):
    # MEAN = [0.2336, 0.6011, 0.3576, 0.4543]
    # STD = [0.0530, 0.0998, 0.0965, 0.1170]
    normalize = [
        # # transforms.Normalize(MEAN, STD)
        # transforms.ToPILImage(),
        # transforms.RandomHorizontalFlip(p=0.5),
        # transforms.RandomRotation(degrees=(-90, 90)),
        # transforms.RandomVerticalFlip(p=0.5),
        # transforms.ToTensor(),
        RandomFlip(horizontal=True, prob_threshold=0.5),
        RandomRot(),
        RandomFlip(horizontal=False, prob_threshold=0.5)
    ]
    train_transform = transforms.Compose(normalize)
    valid_transform = transforms.Compose([])

    # Generators Declaration
    data_device = torch.device("cpu")
    # data_device = device

    def _init_fn(worker_id):
        np.random.seed(int(seed))

    training_set = Dataset( partition['train'], 
                            labels, 
                            transform=train_transform, 
                            device=data_device,
                            valid_channels=VALID_CHANNELS,
                            channels_to_zero_out=[])
    train_loader = torch.utils.data.DataLoader( training_set, 
                                                **data_gen.PARAMS, 
                                                pin_memory=True, 
                                                worker_init_fn=_init_fn)

    validation_set = Dataset(partition['validation'], 
                            labels, 
                            transform=valid_transform, 
                            device=data_device,
                            valid_channels=VALID_CHANNELS,
                            channels_to_zero_out=[])
    valid_loader = torch.utils.data.DataLoader( validation_set, 
                                                **data_gen.PARAMS, 
                                                pin_memory=True, 
                                                worker_init_fn=_init_fn)

    return train_loader, valid_loader


folds_partions_dict, labels = data_gen.train_val_split_kfolds(folds_n=5, classes_list=CLASSES_SET_3)

for fold_n, partition in folds_partions_dict.items():
    print('fold_n',fold_n)
    
    train_loader, valid_loader = prepare_train_val_generators(partition, labels)
    
    model = torch.load("/mnt/models/darts_retrained/6ch_128x128_no_metainfo_registered_5Fold/5lrs_2oct_ClassSet3_registered_fold-{}/final_model1.pt".format(fold_n))
    model.to(device)
    
    X_list, y_list, y_pred_list, scores_list, paths_list = gather_predictions(partition, 'validation', labels, model)
    
    print('f1_score', f1_score(y_list, y_pred_list, average='weighted'))
    print('precision_score', precision_score(y_list, y_pred_list, average='weighted'))
    print('recall_score', recall_score(y_list, y_pred_list, average='weighted'))
    print('-------------------')
    
    write_misclassified_images(fold_n, y_list, y_pred_list, paths_list)

fold_n 0


HBox(children=(FloatProgress(value=0.0, max=178.0), HTML(value='')))


f1_score 0.8100215551424639
precision_score 0.8127298456115375
recall_score 0.8089887640449438
-------------------


HBox(children=(FloatProgress(value=0.0, max=178.0), HTML(value='')))


fold_n 1


HBox(children=(FloatProgress(value=0.0, max=178.0), HTML(value='')))


f1_score 0.769737730479169
precision_score 0.7718440185062788
recall_score 0.7696629213483146
-------------------


HBox(children=(FloatProgress(value=0.0, max=178.0), HTML(value='')))


fold_n 2


HBox(children=(FloatProgress(value=0.0, max=176.0), HTML(value='')))


f1_score 0.7614734410710224
precision_score 0.7795940840030361
recall_score 0.7670454545454546
-------------------


HBox(children=(FloatProgress(value=0.0, max=176.0), HTML(value='')))


fold_n 3


HBox(children=(FloatProgress(value=0.0, max=175.0), HTML(value='')))


f1_score 0.8047816227940452
precision_score 0.8086538605650337
recall_score 0.8057142857142857
-------------------


HBox(children=(FloatProgress(value=0.0, max=175.0), HTML(value='')))


fold_n 4


HBox(children=(FloatProgress(value=0.0, max=175.0), HTML(value='')))


f1_score 0.7723788638262323
precision_score 0.7708861748900829
recall_score 0.7771428571428571
-------------------


HBox(children=(FloatProgress(value=0.0, max=175.0), HTML(value='')))




FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/interim/_melanoma_20200728_REGISTERED_OCV_WHITE_Split_Channels/checkyourskin/L82/2019-01-25_11-39-37/uv-0-r'

### Collect estimations for randomly dropped channels original 3 layers Set3 network

In [86]:
from sklearn.metrics import confusion_matrix

In [123]:
partition, labels = data_gen.train_val_split(val_ratio=0.1, classes_list=CLASSES_SET_8)
X, y = get_input_output_from_path(sample_path=partition['validation'][0], labels=labels)

In [124]:
# Load model
device = 'cpu'

for L in range(1,6):
    print('Layers {}'.format(L))
    model = torch.load('/mnt/models/darts_retrained/6ch_128x128_no_metainfo_registered/XV2_SGD_orig_02DropChannel_{}lrs_2oct_ClassSet8_ManCorected_registered/final_model1.pt'.format(L))
    model.to(device)

    X_list, y_list, y_pred_list, scores_list, paths_list = gather_predictions(partition, 'train', labels, model)  #validation

    print('f1_score', f1_score(y_list, y_pred_list, average='weighted'))
    print('precision_score', precision_score(y_list, y_pred_list, average='weighted'))
    print('recall_score', recall_score(y_list, y_pred_list, average='weighted'))
    print('confusion_matrix \n', confusion_matrix(y_list, y_pred_list))

Layers 1


HBox(children=(FloatProgress(value=0.0, max=1249.0), HTML(value='')))


f1_score 0.7906912632362361
precision_score 0.8138043505146004
recall_score 0.7934347477982386
confusion_matrix 
 [[ 29   3   0   2   8]
 [  5 305   8   1  50]
 [  1  18 147   3  57]
 [  3   2   2  92  56]
 [  0  32   3   4 418]]
Layers 2


HBox(children=(FloatProgress(value=0.0, max=1249.0), HTML(value='')))


f1_score 0.9525632153904127
precision_score 0.9531127398079756
recall_score 0.9527622097678142
confusion_matrix 
 [[ 41   0   0   1   0]
 [  1 356   4   1   7]
 [  0   7 203   5  11]
 [  2   0   1 144   8]
 [  0   8   1   2 446]]
Layers 3


HBox(children=(FloatProgress(value=0.0, max=1249.0), HTML(value='')))


f1_score 0.9316971649951324
precision_score 0.9334437088264412
recall_score 0.9319455564451561
confusion_matrix 
 [[ 35   1   0   2   4]
 [  0 352   6   1  10]
 [  0   5 198   7  16]
 [  0   0   0 139  16]
 [  0  14   1   2 440]]
Layers 4


HBox(children=(FloatProgress(value=0.0, max=1249.0), HTML(value='')))


f1_score 0.9493151079532723
precision_score 0.9499189548308733
recall_score 0.9495596477181746
confusion_matrix 
 [[ 41   0   0   0   1]
 [  0 363   3   1   2]
 [  0   5 207   5   9]
 [  2   0   1 138  14]
 [  0  17   2   1 437]]
Layers 5


HBox(children=(FloatProgress(value=0.0, max=1249.0), HTML(value='')))


f1_score 0.9590270264667781
precision_score 0.9599253017333931
recall_score 0.9591673338670936
confusion_matrix 
 [[ 42   0   0   0   0]
 [  0 361   4   0   4]
 [  0   8 211   3   4]
 [  4   0   0 139  12]
 [  1  11   0   0 445]]


In [125]:
# Load model
device = 'cpu'

for L in range(1,6):
    print('Layers {}'.format(L))
    model = torch.load('/mnt/models/darts_retrained/6ch_128x128_no_metainfo_registered/XV2_SGD_orig_02DropChannel_{}lrs_2oct_ClassSet8_ManCorected_registered/final_model1.pt'.format(L))
    model.to(device)

    X_list, y_list, y_pred_list, scores_list, paths_list = gather_predictions(partition, 'validation', labels, model)  #validation | train

    print('f1_score', f1_score(y_list, y_pred_list, average='weighted'))
    print('precision_score', precision_score(y_list, y_pred_list, average='weighted'))
    print('recall_score', recall_score(y_list, y_pred_list, average='weighted'))
    print('confusion_matrix \n', confusion_matrix(y_list, y_pred_list))

Layers 1


HBox(children=(FloatProgress(value=0.0, max=141.0), HTML(value='')))


f1_score 0.6784275753572818
precision_score 0.726462965925452
recall_score 0.6879432624113475
confusion_matrix 
 [[ 4  1  0  0  0]
 [ 0 30  0  1 10]
 [ 0  2 13  1 10]
 [ 1  0  0  6 11]
 [ 0  5  1  1 44]]
Layers 2


HBox(children=(FloatProgress(value=0.0, max=141.0), HTML(value='')))


f1_score 0.7708284415928545
precision_score 0.7773271454327264
recall_score 0.7730496453900709
confusion_matrix 
 [[ 4  1  0  0  0]
 [ 0 35  0  1  5]
 [ 0  1 17  2  6]
 [ 1  0  1 11  5]
 [ 0  6  2  1 42]]
Layers 3


HBox(children=(FloatProgress(value=0.0, max=141.0), HTML(value='')))


f1_score 0.7458565671237485
precision_score 0.757841166180219
recall_score 0.7446808510638298
confusion_matrix 
 [[ 4  1  0  0  0]
 [ 1 31  0  0  9]
 [ 0  1 17  0  8]
 [ 1  0  1 12  4]
 [ 0  3  4  3 41]]
Layers 4


HBox(children=(FloatProgress(value=0.0, max=141.0), HTML(value='')))


f1_score 0.7933521314760108
precision_score 0.7978948850918174
recall_score 0.7943262411347518
confusion_matrix 
 [[ 4  1  0  0  0]
 [ 1 36  0  0  4]
 [ 0  1 19  2  4]
 [ 1  0  1 11  5]
 [ 0  5  1  3 42]]
Layers 5


HBox(children=(FloatProgress(value=0.0, max=141.0), HTML(value='')))


f1_score 0.7681348030216923
precision_score 0.7757245709373369
recall_score 0.7730496453900709
confusion_matrix 
 [[ 4  1  0  0  0]
 [ 1 37  1  0  2]
 [ 0  1 18  0  7]
 [ 1  0  2  9  6]
 [ 0  5  3  2 41]]
