In [1]:
from fastai.vision.all import *
from fastcore.parallel import *
import fastai
import pandas as pd
from pathlib import Path

import torch
from torch.utils.data import Dataset
 
 

In [2]:
import timm
#model = timm.create_model("efficientformerv2_s2", pretrained=True)
#torch.save(model.state_dict(), "efficientformerv2_s2_weights.pth")

In [3]:
metadata_path =Path("/kaggle/input/medium110k-to-10k/Medium 100kto10k.csv")
 
hdf5_file=Path('/kaggle/input/medium110k-to-10k/medium_hdf5')

In [4]:
df=pd.read_csv(metadata_path)

In [5]:
# Drop specified columns
columns_to_drop = ['copyright_license', 'attribution', 'image_type', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4',
                       'iddx_5', 'iddx_full', 'mel_mitotic_index', 'mel_thick_mm', 'tbp_tile_type', 
                       'tbp_lv_dnn_lesion_confidence', 'lesion_id']

# Define categorical and continuous columns
cat_names = [ 'sex', 'anatom_site_general', 'tbp_lv_location', 'tbp_lv_location_simple']
new_cat_columns = pd.read_csv('/kaggle/input/small-data/new_cat_columns.csv')['new_cat_columns'].tolist()
cont_names = [x for x in df.columns if x not in (cat_names + ['target', 'isic_id','patient_id']+columns_to_drop)]
y_col = 'target'
image_col = 'isic_id'

# Create a custom dataset that includes both image and tabular data:

In [6]:
import h5py
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image
import pandas as pd

class ImageTabDataset(Dataset):
    def __init__(self, df, hdf5_file, new_cat_columns, cont_names, y_col, transform=None):
        self.df = df
        self.hdf5_file = hdf5_file  # Path to the HDF5 file
        self.new_cat_columns = new_cat_columns
        self.cont_names = cont_names
        self.y_col = y_col
        
        # Define a default transform including resizing
        self.transform = transform or transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor()
        ])
        
        # Open the HDF5 file
        self.hdf = h5py.File(self.hdf5_file, 'r')
        
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Ensure self.df is a DataFrame
        if not isinstance(self.df, pd.DataFrame):
            raise TypeError(f"Expected self.df to be a DataFrame, but got {type(self.df).__name__}")

        row = self.df.iloc[idx]
        isic_id = row['isic_id']
        
        # Retrieve the image from the HDF5 file
        img = self.hdf['oversampled_images'][isic_id][:]
        img = Image.fromarray(img).convert('RGB')
        
        if self.transform:
            img = self.transform(img)

        # Convert categorical and continuous columns to tensors
        new_cat_columns = torch.tensor(row[self.new_cat_columns].values.astype(float)).float()
        cont = torch.tensor(row[self.cont_names].values.astype(float)).float()
        y = torch.tensor(row[self.y_col]).long()

        return img, new_cat_columns, cont, y
    
    def __del__(self):
        # Close the HDF5 file when the dataset object is destroyed
        self.hdf.close()

# Combine image and tabular data into a DataBlock:

In [7]:
import torch
import torch.nn as nn
import torchvision.models as models
from fastai.tabular.all import TabularPandas, get_emb_sz, TabularModel, Learner, CrossEntropyLossFlat, accuracy
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import torch.optim as optim

# Define the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Stratified split
def stratified_splitter(df, valid_pct=0.25, seed=42):
    train_indices, valid_indices = train_test_split(
        df.index,
        test_size=valid_pct,
        stratify=df[y_col],  # stratify by target column
        random_state=seed
    )
    return train_indices, valid_indices

# Perform the split
train_indices, valid_indices = stratified_splitter(df)

# Create train and validation dataframes
train_df = df.iloc[train_indices]
valid_df = df.iloc[valid_indices]

# Define the datasets
train_dataset = ImageTabDataset(train_df, hdf5_file, new_cat_columns, cont_names, y_col)
valid_dataset = ImageTabDataset(valid_df, hdf5_file, new_cat_columns, cont_names, y_col)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=4)
valid_loader = DataLoader(valid_dataset, batch_size=128, shuffle=False, num_workers=4)

# Create a DataLoaders object
dls = DataLoaders(train_loader, valid_loader)

# Manually get embedding sizes
def get_emb_szs(df, new_cat_columns):
    return [(df[col].nunique() + 1, min(50, (df[col].nunique() + 1) // 2)) for col in new_cat_columns]

# Calculate embedding sizes
emb_szs = get_emb_szs(train_df, new_cat_columns)
n_cont = len(cont_names)
out_sz = len(train_df[y_col].unique())

# Define custom model
class ImageTabularModel_1(nn.Module):
    def __init__(self, emb_szs, n_cont, out_sz, layers, ps=0.5):
        super().__init__()
        self.cnn = models.resnet50(weights=None)
        self.cnn.load_state_dict(torch.load('/kaggle/input/resnet50_untrained/pytorch/default/1/resnet50-11ad3fa6.pth'))
        for param in self.cnn.parameters():
            param.requires_grad = False
        num_ftrs = self.cnn.fc.in_features
        self.cnn.fc = nn.Linear(num_ftrs, out_sz)
        for param in self.cnn.fc.parameters():
            param.requires_grad = True
        self.tab_net = TabularModel(emb_szs, n_cont, out_sz, layers, ps)
        self.head = nn.Linear(out_sz * 2, out_sz)  # Adjust as needed

    def forward(self, x_img, x_cat, x_cont):
        if x_cat.dtype != torch.long:
            x_cat = x_cat.long()
        img_out = self.cnn(x_img)
        tab_out = self.tab_net(x_cat, x_cont)
        combined = torch.cat([img_out, tab_out], dim=1)
        return self.head(combined)
    
class ImageTabularModel_2(nn.Module):
    def __init__(self, emb_szs, n_cont, out_sz, layers, ps=0.5):
        super().__init__()
        self.cnn = timm.create_model("efficientformerv2_s2", pretrained=False)
        self.cnn.load_state_dict(torch.load('/kaggle/input/efficientformerv2/pytorch/default/1/efficientformerv2_s2_weights.pth'))
        self.fc = nn.Linear(self.cnn.num_features, out_sz)
        self.tab_net = TabularModel(emb_szs, n_cont, out_sz, layers, ps)
        self.head = nn.Linear(1002, out_sz)  # Adjust as needed

    def forward(self, x_img, x_cat, x_cont):
        if x_cat.dtype != torch.long:
            x_cat = x_cat.long()
        img_out = self.cnn(x_img)
        tab_out = self.tab_net(x_cat, x_cont)
        combined = torch.cat([img_out, tab_out], dim=1)
        return self.head(combined)

# Create the model
model_1 = ImageTabularModel_1(emb_szs, n_cont, out_sz, layers=[512, 256, 128], ps=0.5).to(device)
model_2 = ImageTabularModel_2(emb_szs, n_cont, out_sz, layers=[512, 256, 128], ps=0.5).to(device)

# Make the model parallel
model_1 = torch.nn.DataParallel(model_1)
model_2 = torch.nn.DataParallel(model_2)

# Add optimizer and loss function
loss_func = CrossEntropyLossFlat()

# Create Learner
learn_1 = Learner(dls, model_1, loss_func=loss_func, opt_func=partial(Adam, lr=0.001), metrics=accuracy, cbs=SaveModelCallback(monitor='valid_loss'),wd=1e-3)
learn_2 = Learner(dls, model_2, loss_func=loss_func, opt_func=partial(Adam, lr=0.001), metrics=accuracy, cbs=SaveModelCallback(monitor='valid_loss'), wd=1e-3)

In [8]:
#Find the learning rate
#learn_1.lr_find(suggest_funcs=(slide, valley))

In [9]:
#learn_2.lr_find(suggest_funcs=(slide, valley),start_lr=1e-4, end_lr=1e-1)

In [10]:
#Define the lr
custom_lr=0.005

# Train and fine-tune the model
#learn_1.fit_one_cycle(1, lr_max=custom_lr)
learn_1.fine_tune(5)

#learn_2.fit_one_cycle(3, lr_max=custom_lr)
learn_2.fine_tune(5)


 

epoch,train_loss,valid_loss,accuracy,time
0,0.031753,0.040363,0.994945,04:36


Better model found at epoch 0 with valid_loss value: 0.04036315530538559.


epoch,train_loss,valid_loss,accuracy,time
0,0.016618,0.041596,0.995091,04:17
1,0.017903,0.027368,0.995564,04:17
2,0.018596,0.028635,0.993382,04:18
3,0.014283,0.024755,0.995564,04:18
4,0.014458,0.031059,0.992691,04:18


Better model found at epoch 0 with valid_loss value: 0.04159565269947052.
Better model found at epoch 1 with valid_loss value: 0.027367789298295975.
Better model found at epoch 3 with valid_loss value: 0.024754874408245087.


epoch,train_loss,valid_loss,accuracy,time
0,0.039769,0.093914,0.994836,10:35


Better model found at epoch 0 with valid_loss value: 0.09391359984874725.


epoch,train_loss,valid_loss,accuracy,time
0,0.026687,0.066093,0.994473,10:33
1,0.019152,0.052576,0.994691,10:36
2,0.020656,0.039013,0.995527,10:36
3,0.008825,0.037884,0.995527,10:36
4,0.00334,0.025482,0.995855,10:36


Better model found at epoch 0 with valid_loss value: 0.06609296053647995.
Better model found at epoch 1 with valid_loss value: 0.05257594957947731.
Better model found at epoch 2 with valid_loss value: 0.03901294991374016.
Better model found at epoch 3 with valid_loss value: 0.03788415342569351.
Better model found at epoch 4 with valid_loss value: 0.025482233613729477.


#  Test part

In [11]:
def process_data(df, cat_names):
    
    
    # Drop columns 
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])
    
    # Add number of pictures for each patient
    df['numb_pic'] = df.groupby('patient_id')['patient_id'].transform('count')
    
    # Fill missing values with the mode
    if 'age_approx' in df.columns:
        mode_age = df['age_approx'].mode()[0]
        df['age_approx'] = df['age_approx'].fillna(mode_age)
    
    if 'sex' in df.columns:
        mode_sex = df['sex'].mode()[0]
        df['sex'] = df['sex'].fillna(mode_sex)
    
    # Convert categorical columns to dummies
    df = pd.get_dummies(df, columns=cat_names, prefix=cat_names)
    
    # Get new categorical column names
    new_cat_columns = [col for col in df.columns if any(col.startswith(name + '_') for name in cat_names)]
    
    # Ensure 'isic_id' in df has the correct file extension
    # if 'isic_id' in df.columns:
    #     df['isic_id'] = df['isic_id'].apply(lambda x: x.strip() + '.jpg')
    
    return df, new_cat_columns

In [12]:
ROOT_DIR = "/kaggle/input/isic-2024-challenge"
TEST_HDF = f'{ROOT_DIR}/test-image.hdf5'
TEST_CSV = f'{ROOT_DIR}/test-metadata.csv'

df_test = pd.read_csv(TEST_CSV)

#Apply to df
df_test, _= process_data(df_test,cat_names)  

# Ensure the test set has the same dummy variable columns as the training set
for col in new_cat_columns:
    if col not in df_test:
        df_test[col] = 0




# Check that the data fits the format of the training set 
train_columns = [col for col in df.columns if col != 'target']

# Reorder the test set columns to match the training set columns (excluding 'target')
df_test = df_test[train_columns]

In [13]:
import h5py
import numpy as np
from PIL import Image
from io import BytesIO
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T

from torch.utils.data import Dataset
from PIL import Image
import h5py
from io import BytesIO
import torch

class CombinedDataset_test(Dataset):
    def __init__(self, df, file_hdf, cat_names, train_columns, transforms=None, target_size=(137, 137)):
        self.df = df
        self.fp_hdf = h5py.File(file_hdf, mode="r")
        self.isic_ids = df['isic_id'].values
        self.transforms = transforms
        self.target_size = target_size
        
        self.cat_names = cat_names
        self.cont_names = cont_names
        
        # Convert categorical columns to numerical codes
        for cat in cat_names:
            if cat in self.df.columns:
                self.df[cat] = self.df[cat].astype('category').cat.codes
        
        # Convert all other columns to numeric
        for col in df.columns:
            if col not in cat_names  + ['isic_id']:
                self.df[col] = pd.to_numeric(self.df[col], errors='coerce')
        
      
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        # Get image
        isic_id = self.isic_ids[index]
        img = Image.open(BytesIO(self.fp_hdf[isic_id][()])).convert('RGB')
        img = img.resize(self.target_size, Image.LANCZOS)

        if self.transforms:
            img = self.transforms(img)

        # Get tabular data
        tabular_data = self.df.iloc[index]
        
        # Separate categorical and continuous data
        cat_data = tabular_data[self.cat_names] 
        cont_data = tabular_data[self.cont_names] 
        
        # Convert to tensors
        cat_tensor = torch.tensor(cat_data.values.astype(int), dtype=torch.long)
        cont_tensor = torch.tensor(cont_data.values.astype(float), dtype=torch.float32)
        y=torch.tensor(0.7, dtype=torch.long) # Dummy variable to avoid errors
        return img, cat_tensor, cont_tensor,y

    

In [14]:
from torchvision import transforms as T
from PIL import Image

class ResizeOrPad:
    def __init__(self, img_size):
        self.img_size = img_size
    
    def __call__(self, img):
        return self.resize_or_pad(img)
    
    def resize_or_pad(self, img):
        width, height = img.size
        target_width, target_height = self.img_size
        if width > target_width or height > target_height:
            img = T.Resize(self.img_size)(img)
        else:
            padding = self.calculate_padding(img)
            img = T.Pad(padding=padding, fill=0, padding_mode='constant')(img)
            img = T.Resize(self.img_size)(img)
        return img
    
    def calculate_padding(self, img):
        width, height = img.size
        new_width, new_height = self.img_size
        pad_left = (new_width - width) // 2
        pad_top = (new_height - height) // 2
        pad_right = new_width - width - pad_left
        pad_bottom = new_height - height - pad_top
        return (pad_left, pad_top, pad_right, pad_bottom)

In [15]:
 # Define the transformations
# Define your custom transform along with other transformations
data_transforms = T.Compose([
    ResizeOrPad((224, 224)),  # Custom transform to resize or pad the image
    T.ToTensor(),             # Convert the image to a tensor
])


# Configuration
CONFIG = {
    'valid_batch_size': 64,  # Batch size for validation
}

# Create the combined dataset
combined_dataset = CombinedDataset_test(df_test, TEST_HDF, new_cat_columns, cont_names, transforms=data_transforms)

# Define DataLoader
test_loader = DataLoader(combined_dataset, batch_size=CONFIG['valid_batch_size'], 
                         num_workers=4, shuffle=False, pin_memory=True)



# Set the model to evaluation mode
learn_1.model.eval()
learn_2.model.eval()


DataParallel(
  (module): ImageTabularModel_2(
    (cnn): EfficientFormerV2(
      (stem): Stem4(
        (conv1): ConvNormAct(
          (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (bn): BatchNormAct2d(
            16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
            (drop): Identity()
            (act): GELU()
          )
        )
        (conv2): ConvNormAct(
          (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (bn): BatchNormAct2d(
            32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
            (drop): Identity()
            (act): GELU()
          )
        )
      )
      (stages): Sequential(
        (0): EfficientFormerV2Stage(
          (downsample): Identity()
          (blocks): Sequential(
            (0): EfficientFormerV2Block(
              (mlp): ConvMlpWithNorm(
                (fc1): ConvNormAct(
                  (conv): Conv2d(32, 128

In [16]:
import torch.nn.functional as F

# Initialize lists to collect predictions and ids
# Initialize lists to collect predictions and ids
all_probs_class_1 = []
all_isic_ids = []

def process_probabilities(probabilities):
    # Replace NaN values with 0
    probabilities = np.nan_to_num(probabilities, nan=0.0)
    
    # Ensure probabilities are within the range [0, 1]
    probabilities = np.clip(probabilities, 0, 1)
    
    return probabilities

# Function to get predictions from a model
def get_model_predictions(learn, test_loader):
    with torch.no_grad():
        logits, _ = learn.get_preds(dl=test_loader)  # The second output is targets, which is ignored
        probabilities = F.softmax(logits, dim=1)
        return probabilities[:, 1].detach().cpu().numpy()

# Get predictions from both models
prob_class_1_model_1 = get_model_predictions(learn_1, test_loader)
prob_class_1_model_2 = get_model_predictions(learn_2, test_loader)

# Process probabilities to ensure they are between 0 and 1
prob_class_1_model_1 = process_probabilities(prob_class_1_model_1)
prob_class_1_model_2 = process_probabilities(prob_class_1_model_2)

# Average predictions from both models
mean_prob_class_1 = (prob_class_1_model_1 + prob_class_1_model_2) / 2

# Collect the probabilities and ids
all_probs_class_1.extend(mean_prob_class_1)
all_isic_ids.extend(combined_dataset.isic_ids[:len(mean_prob_class_1)])  # Use the dataset directly

# Convert lists to DataFrame
results = pd.DataFrame({
    'isic_id': all_isic_ids,
    'target': all_probs_class_1
})
 

In [17]:
results.to_csv("submission.csv", index=False)

In [18]:
results

Unnamed: 0,isic_id,target
0,ISIC_0015657,0.269712
1,ISIC_0015729,0.270131
2,ISIC_0015740,0.271533
