In [7]:
%load_ext autoreload
%autoreload 2
import os
import numpy as np
import math
import pandas as pd

import sys
sys.path.append('../')
import torch
from torchvision import utils
import matplotlib.pyplot as plt
from src.pl_module import MelanomaModel
from src.datasets.melanoma_dataset import MelanomaDataset
from src.models.layers import AdaptiveConcatPool2d
from src.transforms.albu import get_valid_transforms_with_resize
import albumentations as A
import torch.nn as nn
from tqdm.auto import tqdm
from catalyst.utils import set_global_seed
import skimage.io
import pandas as pd
from torch.autograd import Variable
import timm
import cv2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Load the training metadata table and preview its first rows.
train_data = pd.read_csv('../data/train.csv')
train_data.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,tfrecord,width,height
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0,0,6000,4000
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0,0,6000,4000
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0,6,1872,1053
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0,0,1872,1053
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0,11,6000,4000


In [14]:
# Row counts per anatomical site in the training metadata.
# value_counts() drops NaN by default, so rows with a missing site are not shown.
train_data.anatom_site_general_challenge.value_counts()

torso              16632
lower extremity     8318
upper extremity     4902
head/neck           1815
palms/soles          375
oral/genital         124
Name: anatom_site_general_challenge, dtype: int64

In [9]:
# Row counts per sex in the training metadata (NaN excluded by value_counts' default).
train_data.sex.value_counts()

male      16868
female    15783
Name: sex, dtype: int64

In [5]:
# Load the test metadata table and preview its first rows.
test_data = pd.read_csv('../data/test.csv')
test_data.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,width,height
0,ISIC_0052060,IP_3579794,male,70.0,,6000,4000
1,ISIC_0052349,IP_7782715,male,40.0,lower extremity,6000,4000
2,ISIC_0058510,IP_7960270,female,55.0,torso,6000,4000
3,ISIC_0073313,IP_6375035,female,50.0,torso,6000,4000
4,ISIC_0073502,IP_0589375,female,45.0,lower extremity,1920,1080


In [15]:
# Row counts per anatomical site in the test metadata.
# value_counts() drops NaN by default, so rows with a missing site are not shown.
test_data.anatom_site_general_challenge.value_counts()

torso              5847
lower extremity    2501
upper extremity    1573
head/neck           576
palms/soles         108
oral/genital         26
Name: anatom_site_general_challenge, dtype: int64

In [16]:
class ClassificationSingleHeadMax(nn.Module):
    """Single-head classifier combining CNN image features with metadata embeddings.

    The image goes through a timm backbone (with its last two child modules
    removed), is global-max-pooled and flattened, then concatenated with learned
    embeddings of three categorical inputs (site, age, sex). A dropout + linear
    layer maps the fused vector to ``num_classes`` outputs.

    Args:
        model_name: Name passed to ``timm.create_model``. The backbone's last
            child module must expose ``in_features`` (e.g. a ``Linear`` classifier).
        num_classes: Number of outputs of the final linear layer.
        dim_embedding_sex: Embedding width for ``sex`` (table has 3 entries).
        dim_embedding_age: Embedding width for ``age`` (table has 9 entries).
        dim_embedding_site: Embedding width for ``site`` (table has 6 entries).
    """

    def __init__(self, model_name='resnet34', num_classes=1, dim_embedding_sex=64, dim_embedding_age=64, dim_embedding_site=64):
        super().__init__()
        m = timm.create_model(
            model_name,
            pretrained=True)
        backbone_children = list(m.children())
        # Drop the last two children (presumably the backbone's own pooling and
        # classifier) so the encoder emits a spatial feature map.
        self.enc = nn.Sequential(*backbone_children[:-2])
        self.embedding_age = nn.Embedding(9, dim_embedding_age)
        self.embedding_sex = nn.Embedding(3, dim_embedding_sex)
        self.embedding_site = nn.Embedding(6, dim_embedding_site)
        # Width of the fused vector: backbone feature channels + all embedding widths.
        nc = backbone_children[-1].in_features + dim_embedding_age + dim_embedding_sex + dim_embedding_site
        self.image_head = nn.Sequential(
            nn.AdaptiveMaxPool2d((1, 1)),
            nn.LeakyReLU(),
            nn.Flatten())
        self.head = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(nc, num_classes)
        )

    def forward(self, image, sex, age, site):
        """Compute class outputs for a batch.

        Args:
            image: Image batch fed to the encoder.
            sex: Integer index tensor with values in ``[0, 3)``.
            age: Integer index tensor with values in ``[0, 9)``.
            site: Integer index tensor with values in ``[0, 6)``.

        The index tensors must be 1-D (one index per sample) so that their
        embeddings are 2-D and can be concatenated with the flattened image
        features along ``dim=1``.

        Returns:
            Output of the final linear layer, one row per sample.
        """
        sex_embedding = self.embedding_sex(sex)
        # Age must use its own table; looking it up in the sex table (3 rows)
        # both mixes up the weights and fails for age indices >= 3.
        age_embedding = self.embedding_age(age)
        site_embedding = self.embedding_site(site)
        image_features = self.enc(image)
        image_features = self.image_head(image_features)
        features = torch.cat([image_features, site_embedding, age_embedding, sex_embedding], dim=1)
        # The head consumes the fused features (the original passed an undefined name).
        logits = self.head(features)
        return logits

In [17]:
# TODO: encode the categorical data (sex, age, site); the dataset below does not return these fields yet
# coding: utf-8
from argparse import Namespace, ArgumentParser
from typing import Tuple
import pandas as pd
import skimage.io
import torch
from torch.utils.data import Dataset
import numpy as np


def onehot(size, target):
    """Build a float32 tensor of zeros of shape ``size`` with ``target`` set to 1.

    Args:
        size: Shape passed to ``torch.zeros`` (an int gives a 1-D vector).
        target: Index (or any valid tensor index) of the entry to set to one.

    Returns:
        The encoded ``torch.float32`` tensor.
    """
    encoded = torch.zeros(size, dtype=torch.float32)
    encoded[target] = 1.0
    return encoded


class MelanomaDataset(Dataset):
    """Image dataset backed by per-fold CSV metadata files.

    Rows are read from ``{config.data_path}/{mode}_{config.fold}.csv`` and,
    when ``use_external`` is set, extended with rows from
    ``{config.data_path}/external_{mode}_{config.fold}.csv``. Each row must
    provide ``image_name`` and ``target`` columns.

    Args:
        mode: Either ``'train'`` or ``'val'``.
        config: Namespace with ``image_folder``, ``fold`` and ``data_path``
            (plus ``external_image_folder`` when ``use_external`` is True).
        transform: Optional callable invoked as ``transform(image=image)`` whose
            result is indexed with ``['image']``.
        use_external: Whether to append the external CSV for this mode/fold.

    Raises:
        NotImplementedError: If ``mode`` is not ``'train'`` or ``'val'``.
    """

    def __init__(self, mode: str, config: Namespace, transform=None, use_external=False):
        super().__init__()
        self.mode = mode
        if mode not in ['train', 'val']:
            raise NotImplementedError("Not implemented dataset configuration")
        self.image_folder = config.image_folder
        self.fold = config.fold
        self.df = pd.read_csv(f"{config.data_path}/{mode}_{config.fold}.csv")
        # Tag every row with its origin so __getitem__ can pick the right folder.
        self.df.loc[:, 'data_t'] = 'competition'
        if use_external:
            print(f'Will use external data for {mode}')
            self.external_df = pd.read_csv(f"{config.data_path}/external_{mode}_{config.fold}.csv")
            self.external_df.loc[:, 'data_t'] = 'external'
            # ignore_index avoids duplicate index labels in the combined frame,
            # which would make any label-based (.loc) access ambiguous.
            self.df = pd.concat([self.df, self.external_df], ignore_index=True)
            self.external_image_folder = config.external_image_folder
        self.transform = transform
        self.targets = self.df.target.values
        # NOTE: value_counts() sorts by frequency, so these counts are ordered
        # most-common-first, not by class label.
        self.target_counts = self.df.target.value_counts().values

    def __len__(self) -> int:
        """Return the number of metadata rows (competition plus external)."""
        return self.df.shape[0]

    def __getitem__(self, index) -> dict:
        """Load one sample by positional index.

        Returns:
            A dict with ``'features'`` (the image as a tensor, axes reordered
            with ``transpose(2, 0, 1)``) and ``'target'`` (a float tensor of
            shape ``(1,)`` holding the row's ``target`` value).
        """
        row = self.df.iloc[index]
        img_id = row.image_name
        img_type = row.data_t
        if img_type == 'competition':
            img_path = f"{self.image_folder}/{img_id}.jpg"
        else:
            img_path = f"{self.external_image_folder}/{img_id}.jpg"
        image = skimage.io.imread(img_path)
        if self.transform is not None:
            image = self.transform(image=image)['image']
        # assumes the image is an HxWxC numpy array at this point — TODO confirm
        # for transforms that return something else
        image = image.transpose(2, 0, 1)
        image = torch.from_numpy(image)
        label = row.target
        target = torch.tensor(np.expand_dims(label, 0)).float()
        return {'features': image, 'target': target}