In [1]:
from fastai.vision.all import *
from fastcore.parallel import *
import fastai
import pandas as pd
from pathlib import Path

import torch
from torch.utils.data import Dataset

 

# Dataset locations: everything is resolved relative to the competition input directory
path = Path("/kaggle/input/isic-2024-challenge/")
train_metadata_path = path.joinpath('train-metadata.csv')  # tabular metadata
images_path = path.joinpath('train-image/image/')          # directory holding the training images

# Prepare the metadata file and merge it with the images

In [2]:
import pandas as pd
from pathlib import Path

#  Step 1: Resolve the path to the metadata CSV
#  Use the local copy when it exists; otherwise keep the `train_metadata_path`
#  configured in the first cell, so the notebook is not tied to a single machine.
local_metadata_path = Path("C:/Users/Yashwanth/isic/train-metadata.csv")
if local_metadata_path.exists():
    train_metadata_path = local_metadata_path

#  Step 2: Load the metadata CSV
df = pd.read_csv(train_metadata_path, low_memory=False)

#  Step 3: Drop unnecessary columns
columns_to_drop = [
    'copyright_license', 'attribution', 'image_type', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4',
    'iddx_5', 'iddx_full', 'mel_mitotic_index', 'mel_thick_mm', 'tbp_tile_type',
    'tbp_lv_dnn_lesion_confidence', 'lesion_id'
]

# Rebind instead of mutating in place; `errors='ignore'` avoids a crash if a column is missing
df = df.drop(columns=columns_to_drop, errors='ignore')

#  Step 4: Define target and image ID columns
y_col = 'target'
image_col = 'isic_id'

#  Step 5: Define categorical and continuous feature columns
cat_names = ['sex', 'anatom_site_general', 'tbp_lv_location', 'tbp_lv_location_simple']

# Everything that is not categorical, not the target, not an identifier and not
# a dropped column is treated as a continuous feature.
non_feature_cols = set(cat_names) | {y_col, image_col, 'patient_id'} | set(columns_to_drop)
cont_names = [col for col in df.columns if col not in non_feature_cols]

# Preview
print("Categorical columns:", cat_names)
print("Continuous columns:", cont_names)
print("Target column:", y_col)


Categorical columns: ['sex', 'anatom_site_general', 'tbp_lv_location', 'tbp_lv_location_simple']
Continuous columns: ['age_approx', 'clin_size_long_diam_mm', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext', 'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L', 'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio', 'tbp_lv_color_std_mean', 'tbp_lv_deltaA', 'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLB', 'tbp_lv_deltaLBnorm', 'tbp_lv_eccentricity', 'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence', 'tbp_lv_norm_border', 'tbp_lv_norm_color', 'tbp_lv_perimeterMM', 'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt', 'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z']
Target column: target


In [3]:
def process_data(df, cat_names, drop_cols=None):
    """Clean the metadata frame and one-hot encode its categorical columns.

    Args:
        df: Metadata DataFrame. Must contain a 'patient_id' column and every
            column listed in `cat_names`. The caller's frame is not modified.
        cat_names: Names of the categorical columns to one-hot encode.
        drop_cols: Columns to remove when present. When None, the
            notebook-level `columns_to_drop` list is used if it is defined
            (the previous behaviour); otherwise nothing is dropped.

    Returns:
        A tuple `(processed_df, new_cat_columns)`, where `new_cat_columns`
        lists the indicator columns created by the encoding, in frame order.
    """
    if drop_cols is None:
        # Backward compatible fallback to the list defined at notebook level
        drop_cols = globals().get('columns_to_drop', [])

    # `drop` returns a new frame, so the additions below never touch the input
    df = df.drop(columns=[col for col in drop_cols if col in df.columns])

    # Add number of pictures for each patient
    df['numb_pic'] = df.groupby('patient_id')['patient_id'].transform('count')

    # Fill missing values with the mode. An all-missing column has no mode
    # (`mode()` returns an empty Series), so it is left untouched instead of raising.
    for col in ('age_approx', 'sex'):
        if col in df.columns:
            modes = df[col].mode()
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])

    # Remember the pre-encoding columns so the new indicator columns can be
    # identified exactly. A prefix test would also pick up an unrelated column
    # whose name merely starts with a categorical name (e.g. 'sex_ratio').
    columns_before = set(df.columns)

    # Convert categorical columns to dummies
    df = pd.get_dummies(df, columns=cat_names, prefix=cat_names)

    # Get new categorical column names
    new_cat_columns = [col for col in df.columns if col not in columns_before]

    # 'isic_id' is left without a file extension; ImageTabDataset looks images up by file stem.
    return df, new_cat_columns

# Apply the preprocessing to df.
# NOTE(review): `df` is rebound to the processed frame, so re-running this cell
# without first re-running the loading cell passes the already-encoded frame
# back into process_data.
df, new_cat_columns= process_data(df,cat_names)  

# Collect the image files found under images_path; no DataFrame is built here.
# get_image_files is presumably the fastai helper brought in by the star import — confirm.
images = get_image_files(images_path)


# Create a custom dataset that includes both image and tabular data:

In [4]:
from PIL import Image
import torchvision.transforms as transforms



class ImageTabDataset(Dataset):
    """Dataset yielding an image together with its tabular features and label.

    Each item is a tuple `(img, cat, cont, y)`: the transformed image, the
    encoded categorical features and the continuous features as float32
    tensors, and the target as an int64 tensor.
    """

    def __init__(self, df, image_files, new_cat_columns, cont_names, y_col, img_size=(137, 137), transform=None):
        """Store the frame and index the image files by file stem.

        Args:
            df: DataFrame with an 'isic_id' column plus the feature and target columns.
            image_files: Iterable of image paths; each one is keyed by its file stem.
            new_cat_columns: Names of the encoded categorical columns.
            cont_names: Names of the continuous columns.
            y_col: Name of the target column.
            img_size: Size given to `transforms.Resize` in the default transform;
                unused when `transform` is supplied.
            transform: Callable applied to the RGB PIL image. Defaults to a
                resize to `img_size` followed by `ToTensor`.
        """
        self.df = df
        self.image_files = [Path(img) for img in image_files]
        self.new_cat_columns = new_cat_columns
        self.cont_names = cont_names
        self.y_col = y_col
        self.img_size = img_size
        self.transform = transform or transforms.Compose([
            transforms.Resize(self.img_size),
            transforms.ToTensor()
        ])
        # Lookup from file stem (name without extension) to full path
        self.image_dict = {img.stem: img for img in self.image_files}

    def __len__(self):
        """Return the number of rows in the metadata frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return `(img, cat, cont, y)` for the row at position `idx`.

        Raises:
            TypeError: If `self.df` is not a DataFrame.
            KeyError: If the row's 'isic_id' has no matching image file.
        """
        # Verify that self.df is a DataFrame
        if not isinstance(self.df, pd.DataFrame):
            raise TypeError(f"Expected self.df to be a DataFrame, but got {type(self.df).__name__}")

        row = self.df.iloc[idx]
        isic_id = row['isic_id']
        if isic_id not in self.image_dict:
            raise KeyError(f"Image ID {isic_id} not found in image_dict.")
        img_path = self.image_dict[isic_id]

        # Open inside a context manager so the underlying file is closed as
        # soon as the pixel data has been converted, rather than whenever the
        # image object happens to be garbage collected.
        with Image.open(img_path) as opened:
            img = opened.convert('RGB')
        if self.transform:
            img = self.transform(img)

        # Row values come back as objects for a mixed-dtype frame, hence the explicit float cast
        new_cat_columns = torch.tensor(row[self.new_cat_columns].values.astype(float)).float()
        cont = torch.tensor(row[self.cont_names].values.astype(float)).float()
        y = torch.tensor(row[self.y_col]).long()

        return img, new_cat_columns, cont, y