In [18]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

HOME_DIRECTORY = 'kaggle/input/isic-2024-challenge/'
train_metadata = pd.read_csv('/kaggle/input/isic-2024-challenge/train-metadata.csv',low_memory = True)
test_metadata = pd.read_csv('/kaggle/input/isic-2024-challenge/test-metadata.csv',low_memory =True)

from category_encoders.hashing import HashingEncoder
from sklearn.pipeline import make_pipeline 
from sklearn.impute import KNNImputer
from lightgbm import LGBMClassifier


lgb_params = {
    'objective': 'binary',
    "random_state": 42,
    "n_estimators": 500,
    'learning_rate': 0.01,
    'bagging_freq': 1,
    'pos_bagging_fraction': 0.75,
    'neg_bagging_fraction': 0.05,
    'feature_fraction': 0.8,
    'lambda_l1': 0.8,
    'lambda_l2': 0.8,
    "verbosity": -1,
    # "extra_trees": True
}

train_only_features = ['lesion_id',
                       'iddx_full',
                       'iddx_1',
                       'iddx_2',
                       'iddx_3',
                       'iddx_4',
                       'iddx_5',
                       'mel_mitotic_index',
                       'mel_thick_mm',
                       'tbp_lv_dnn_lesion_confidence']

train_compressed = train_metadata.drop(train_only_features, axis = 1)

pip = make_pipeline(HashingEncoder(),
                    KNNImputer(),
                    LGBMClassifier(**lgb_params)) 

In [5]:
%time
pip.fit(train_compressed.drop('target',
                               axis = 1), train_compressed['target'])


prob_encoder = pip.predict_proba(test_metadata)[:,1]

In [6]:
import pickle

with open('metadata-model.pkl','wb') as f:
    pickle.dump(pip,f)

In [8]:
import torchvision.models as models
import torch
import torch.nn as nn

model = models.efficientnet_b0(weights = None)

In [9]:
sum([p.numel() for p in model.features.parameters()]) + sum([p.numel() for p in model.classifier.parameters()])
  

5288548

In [10]:
model

EfficientNet(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): MBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (activation): SiLU(inplace=True)
            (scale_activation): Sigmoid()
          )
          (2): Conv2dNormActivat

In [17]:
for p in model.features.parameters():
    p.requires_grad = True

model.classifier.add_module('flatten', nn.Flatten())
model.classifier.add_module('fc', nn.Linear(1000,1))
model.classifier.add_module('sigmoid',nn.Sigmoid())

AttributeError: 'DataParallel' object has no attribute 'features'

In [14]:
# import pandas as pd 

# HOME_DIRECTORY = '/kaggle/input/isic-2024-challenge/'
# df = pd.read_csv(HOME_DIRECTORY + 'train-metadata.csv')

In [15]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [16]:
print("Available devices:")
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"CUDA:{i} - {torch.cuda.get_device_name(i)}")
    model = nn.DataParallel(model).to(device)
else:
    print("CPU")

# Get the current device
current_device = torch.cuda.current_device()
print(f"\nCurrent device: {current_device}")

# Get the default device
default_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Default device: {default_device}")


Available devices:
CUDA:0 - Tesla P100-PCIE-16GB

Current device: 0
Default device: cuda


In [8]:
import torch
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import io
import h5py
import cv2

class SkinDataset(Dataset):
    """Predict melanoma cancer"""

    def __init__(self, root_dir, transform=None, train_metadata = None, is_h5py = False, is_train= True):
        """
        Arguments:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.root_dir = root_dir
        self.train_metadata = train_metadata
        self.is_h5py = is_h5py
        self.is_train = is_train 
        
        if self.is_h5py:
            self.file = h5py.File(root_dir, 'r')
        
        self.transform = transform
        

    def __len__(self):
        return len(self.train_metadata)

    def __getitem__(self, idx):
        
        img_id = self.train_metadata.iloc[idx, 0]
        
        if not self.is_h5py:
            img_name = os.path.join(self.root_dir,
                                    img_id + '.jpg')
            image = Image.open(img_name).convert('RGB')
        else:
            image_data = self.file[img_id][()]
            image = Image.open(io.BytesIO(image_data)).convert('RGB')
        

        if self.transform:
            image = self.hair_remove(image)
            image = self.transform(image)
        
        if self.is_train:
            targets = self.train_metadata.iloc[idx, 1]
            targets = torch.Tensor([targets]).float()
            return image,targets
        else:
            return image
            
    def hair_remove(self,image):
        # Apply Black hat transformation to image. 
        grayScale = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        kernel = cv2.getStructuringElement(1,(17,17))
        blackhat = cv2.morphologyEx(grayScale, cv2.MORPH_BLACKHAT, kernel)
        _,threshold = cv2.threshold(blackhat,10,255,cv2.THRESH_BINARY)
        final_image = cv2.inpaint(image,threshold,1,cv2.INPAINT_TELEA)

    return final_image
    
    def __del__(self):
        if self.is_h5py:
            self.file.close()
    

SAMPLE_IMAGE_DIRECTORY = '/kaggle/input/isic-2024-challenge/train-image/image/'
HOME_DIRECTORY = '/kaggle/input/isic-2024-challenge/'
SAMPLE_TRAIN_IMAGE_DIRECTORY = '/kaggle/input/isic-2024-challenge/train-image.hdf5'
SAMPLE_TEST_IMAGE_DIRECTORY = '/kaggle/input/isic-2024-challenge/test-image.hdf5'

In [117]:
transform = transforms.Compose([
                        transforms.Resize(256),
                        transforms.CenterCrop(224),
                        transforms.ToTensor(),
                        transforms.Normalize(mean=[0.485, 0.456, 0.406], # Standards used by other datasets
                                             std=[0.229, 0.224, 0.225]) # Sta
                    ])

df = pd.read_csv(HOME_DIRECTORY + 'train-metadata.csv', 
                             usecols = ['isic_id','target'])

test_metadata = pd.read_csv(HOME_DIRECTORY + 'test-metadata.csv', 
                             usecols = ['isic_id'])

train_metadata, val_metadata = train_test_split(df)

train_dataset = SkinDataset(SAMPLE_TRAIN_IMAGE_DIRECTORY, 
                             transform , 
                            train_metadata, 
                            is_h5py = True,
                            is_train = True)

val_dataset = SkinDataset( SAMPLE_TRAIN_IMAGE_DIRECTORY, 
                              transform , 
                              val_metadata, 
                              is_h5py = True,
                              is_train = True)

In [118]:
test_dataset = SkinDataset( SAMPLE_TEST_IMAGE_DIRECTORY, 
                            transform , 
                            test_metadata,
                            is_h5py = True,
                            is_train = False)

In [119]:
def calc_sample_weights(df):
    class_counts = df['target'].value_counts().to_dict()
    class_weights = {cls: 1.0 / count for cls, count in class_counts.items()}

    sample_weights = [class_weights[target] for target in df['target']]

    class_weights
    return sample_weights, class_weights

In [120]:
from torch.utils.data import  WeightedRandomSampler

sample_weights,class_weights = calc_sample_weights(train_metadata)
sampler = WeightedRandomSampler(
                                weights=sample_weights, 
                                num_samples = len(sample_weights), 
                                replacement=True)

train_loader = DataLoader(train_dataset,batch_size = 32 , sampler = sampler)
val_loader = DataLoader(val_dataset,batch_size = 32 )
test_loader = DataLoader(test_dataset, batch_size = 32)

In [121]:
from sklearn.metrics import roc_curve, auc

def score(solution: np.array, submission: np.array, min_tpr: float=0.80) -> float:

    # rescale the target. set 0s to 1s and 1s to 0s (since sklearn only has max_fpr)
    v_gt = abs(solution-1)
    
    # flip the submissions to their compliments
    v_pred = -1.0 * submission

    max_fpr = abs(1-min_tpr)

    # using sklearn.metric functions: (1) roc_curve and (2) auc
    fpr, tpr, _ = roc_curve(v_gt, v_pred, sample_weight=None)
    if max_fpr is None or max_fpr == 1:
        return auc(fpr, tpr)
    if max_fpr <= 0 or max_fpr > 1:
        raise ValueError("Expected min_tpr in range [0, 1), got: %r" % min_tpr)
        
    # Add a single point at max_fpr by linear interpolation
    stop = np.searchsorted(fpr, max_fpr, "right")
    x_interp = [fpr[stop - 1], fpr[stop]]
    y_interp = [tpr[stop - 1], tpr[stop]]
    tpr = np.append(tpr[:stop], np.interp(max_fpr, x_interp, y_interp))
    fpr = np.append(fpr[:stop], max_fpr)
    partial_auc = auc(fpr, tpr)
    
    return partial_auc

In [122]:
# https://www.kaggle.com/code/edomingo/isic-24-scd-basic-eda-pytorch-simple-cnn?scriptVersionId=187691769&cellId=30
# https://www.kaggle.com/code/ayrgthonsoraca/skin-cancer-detection-cnn-img-csv

In [124]:
from torch.optim import Adam,SGD
from torch.nn import BCELoss,BCEWithLogitsLoss
import os
import torch.nn.functional as F
from tqdm import tqdm

optimizer = SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001, momentum = 0.9)
criterion = BCELoss()


debug = False

# Train the model
model.train()
for epoch in range(1):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(tqdm(train_loader)):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data[0].to(device), data[1].to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs).flatten()
        loss = criterion(outputs.unsqueeze(1), labels.float())
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        n = 1000
        if i % n == n-1:    # print every 2000 mini-batches
            print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss/n:.4f}")
            running_loss = 0.0
        
        if debug:
            if i > n: break
            
    correct = 0
    total = 0
    all_probs = []
    all_labels = []
    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():
        for i, data in enumerate(tqdm(val_loader)):
            images, labels = data[0].to(device), data[1].float().to(device)
            # calculate outputs by running images through the network
            probs = model(images).flatten()
            all_probs.extend(probs.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
            # the class with the highest energy is what we choose as prediction
            predicted = (probs >= 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
            if debug:
                if i > 3: break

    print(len(all_labels), len(all_probs))
    print(all_labels[:5], all_probs[:5])
                  
    print(f"Accuracy of the model: {100 * correct // total} %")
    print(f"AUC of the model: {score(np.array(all_labels), np.array(all_probs), min_tpr=0.0)}")
    print(f"pAUC-TPR(0.8) of the model: {score(np.array(all_labels), np.array(all_probs))}")

print('Finished Training')

  1%|          | 100/9400 [00:28<41:58,  3.69it/s]

[1,   100] loss: 0.6788


  2%|▏         | 200/9400 [00:55<41:40,  3.68it/s]

[1,   200] loss: 0.6367


  3%|▎         | 300/9400 [01:22<40:16,  3.77it/s]

[1,   300] loss: 0.5963


  4%|▍         | 400/9400 [01:48<38:43,  3.87it/s]

[1,   400] loss: 0.5804


  4%|▍         | 420/9400 [01:54<40:47,  3.67it/s]


KeyboardInterrupt: 

In [18]:
OUTPUT_PATH = '/kaggle/working/efficient_net.pt'
torch.save(model, OUTPUT_PATH)
#### Saving entire model for simplicity. 

model = torch.load(OUTPUT_PATH)

DataParallel(
  (module): EfficientNet(
    (features): Sequential(
      (0): Conv2dNormActivation(
        (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): SiLU(inplace=True)
      )
      (1): Sequential(
        (0): MBConv(
          (block): Sequential(
            (0): Conv2dNormActivation(
              (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
              (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              (2): SiLU(inplace=True)
            )
            (1): SqueezeExcitation(
              (avgpool): AdaptiveAvgPool2d(output_size=1)
              (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
              (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
              (activation): SiLU(inplace=True)
              (scale_a

In [96]:
model.eval()

preds = []
for i,data in enumerate(test_loader):
    preds.append(model(data).to('cpu').detach().numpy())

preds = np.concatenate(preds)
probs = pip.predict_proba(test_metadata)[:,1]

In [97]:
submission_metadata = pd.read_csv(HOME_DIRECTORY + 'test-metadata.csv', 
                             usecols = ['isic_id'])
submission_metadata['target'] = np.max([probs, preds], axis = 0) # Pick max probability of cancer. 
submission_metadata.to_csv('submission.csv', index = False)