In [1]:
from fastai.vision.all import *
from fastcore.parallel import *
import fastai
import pandas as pd
from pathlib import Path

import torch
from torch.utils.data import Dataset

 

# Dataset locations: everything is resolved relative to the competition input folder
path = Path("/kaggle/input/isic-2024-challenge/")
train_metadata_path = path.joinpath('train-metadata.csv')  # tabular metadata (CSV)
images_path = path.joinpath('train-image/image/')          # directory searched for training images

# Prepare the metadata file and merge it with the images

In [2]:
import pandas as pd
from pathlib import Path

#  Step 1: Resolve the full path to the metadata CSV.
#  Prefer the Kaggle competition file (the location the image directory also lives under)
#  and fall back to the local copy only when the Kaggle file is not present, so this cell
#  runs unchanged in both environments instead of always forcing the local path.
KAGGLE_METADATA_PATH = Path("/kaggle/input/isic-2024-challenge/train-metadata.csv")
LOCAL_METADATA_PATH = Path("C:/Users/Yashwanth/isic/train-metadata.csv")
train_metadata_path = KAGGLE_METADATA_PATH if KAGGLE_METADATA_PATH.exists() else LOCAL_METADATA_PATH

#  Step 2: Load the metadata CSV
df = pd.read_csv(train_metadata_path, low_memory=False)

#  Step 3: Drop unnecessary columns
columns_to_drop = [
    'copyright_license', 'attribution', 'image_type', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4',
    'iddx_5', 'iddx_full', 'mel_mitotic_index', 'mel_thick_mm', 'tbp_tile_type',
    'tbp_lv_dnn_lesion_confidence', 'lesion_id'
]

# Reassign instead of `inplace=True`; `errors='ignore'` avoids a crash if a column is missing
df = df.drop(columns=columns_to_drop, errors='ignore')

#  Step 4: Define categorical and continuous feature columns
cat_names = ['sex', 'anatom_site_general', 'tbp_lv_location', 'tbp_lv_location_simple']
# Everything that is not categorical, not an identifier/target and not dropped is continuous
non_continuous = set(cat_names) | {'target', 'isic_id', 'patient_id'} | set(columns_to_drop)
cont_names = [col for col in df.columns if col not in non_continuous]

#  Step 5: Define target and image ID columns
y_col = 'target'
image_col = 'isic_id'

# Preview
print("Categorical columns:", cat_names)
print("Continuous columns:", cont_names)
print("Target column:", y_col)


Categorical columns: ['sex', 'anatom_site_general', 'tbp_lv_location', 'tbp_lv_location_simple']
Continuous columns: ['age_approx', 'clin_size_long_diam_mm', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext', 'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L', 'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio', 'tbp_lv_color_std_mean', 'tbp_lv_deltaA', 'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLB', 'tbp_lv_deltaLBnorm', 'tbp_lv_eccentricity', 'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence', 'tbp_lv_norm_border', 'tbp_lv_norm_color', 'tbp_lv_perimeterMM', 'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt', 'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z']
Target column: target


In [3]:
def process_data(df, cat_names, drop_cols=None):
    """Clean the metadata frame and one-hot encode its categorical columns.

    Steps, in order:
      1. drop the columns listed in ``drop_cols`` that are present,
      2. add ``numb_pic``: the number of rows sharing each ``patient_id``,
      3. fill missing ``age_approx`` / ``sex`` values with the column mode
         (a column with no non-missing value is left untouched instead of raising),
      4. replace every column in ``cat_names`` by its dummy (one-hot) columns.

    Args:
        df: Input DataFrame; must contain ``patient_id``. It is not modified.
        cat_names: Names of the categorical columns to one-hot encode.
        drop_cols: Columns to remove first. Defaults to the notebook-level
            ``columns_to_drop`` list when omitted.

    Returns:
        ``(df, new_cat_columns)``: the processed frame and the list of dummy
        columns created by the encoding, in frame order.
    """
    if drop_cols is None:
        drop_cols = columns_to_drop  # notebook-level default defined in the loading cell

    # `drop` returns a new frame, so the caller's frame stays intact
    df = df.drop(columns=[col for col in drop_cols if col in df.columns])

    # Add number of pictures for each patient
    df['numb_pic'] = df.groupby('patient_id')['patient_id'].transform('count')

    # Fill missing values with the mode; `mode()` is empty when the column is all-NaN
    for col in ('age_approx', 'sex'):
        if col in df.columns:
            modes = df[col].mode()
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])

    # Convert categorical columns to dummies and report exactly the columns that were
    # created, rather than guessing them back from a name prefix
    columns_before = set(df.columns)
    df = pd.get_dummies(df, columns=cat_names, prefix=cat_names)
    new_cat_columns = [col for col in df.columns if col not in columns_before]

    return df, new_cat_columns

# Run the preprocessing on the metadata frame.
# `df` is rebound to the processed frame, so this cell is meant to run once per kernel:
# the original categorical columns no longer exist afterwards.
df, new_cat_columns = process_data(df=df, cat_names=cat_names)

# Collect the image files found under the image directory
images = get_image_files(path=images_path)


# Create a custom dataset that includes both image and tabular data:

In [4]:
from PIL import Image
import torchvision.transforms as transforms



class ImageTabDataset(Dataset):
    """Dataset that pairs each metadata row with its image.

    Indexing returns ``(img, cat, cont, y)``:
      * ``img``  - the RGB image found through the row's ``isic_id``, after ``transform``,
      * ``cat``  - float tensor built from the ``new_cat_columns`` values of the row,
      * ``cont`` - float tensor built from the ``cont_names`` values of the row,
      * ``y``    - long tensor holding the row's ``y_col`` value.

    Args:
        df: Metadata frame; rows are addressed by position.
        image_files: Iterable of image paths; each is looked up by its file stem.
        new_cat_columns: Column names read into the categorical tensor.
        cont_names: Column names read into the continuous tensor.
        y_col: Name of the label column.
        img_size: Size passed to the default ``Resize`` transform.
        transform: Optional image transform; when falsy, a Resize + ToTensor
            pipeline is used instead.
    """

    def __init__(self, df, image_files, new_cat_columns, cont_names, y_col, img_size=(137, 137), transform=None):
        self.df = df
        self.image_files = list(map(Path, image_files))
        self.new_cat_columns = new_cat_columns
        self.cont_names = cont_names
        self.y_col = y_col
        self.img_size = img_size

        if transform:
            self.transform = transform
        else:
            # Default pipeline: resize to `img_size`, then convert to a tensor
            self.transform = transforms.Compose([
                transforms.Resize(self.img_size),
                transforms.ToTensor(),
            ])

        # Lookup table: file stem -> image path
        self.image_dict = {}
        for image_path in self.image_files:
            self.image_dict[image_path.stem] = image_path

    def __len__(self):
        """Number of items, i.e. the length of the metadata frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(img, cat, cont, y)`` for the row at position ``idx``.

        Raises:
            TypeError: if ``self.df`` is not a DataFrame.
            KeyError: if the row's ``isic_id`` has no matching image file.
        """
        frame = self.df
        if not isinstance(frame, pd.DataFrame):
            raise TypeError(f"Expected self.df to be a DataFrame, but got {type(self.df).__name__}")

        record = frame.iloc[idx]
        image_id = record['isic_id']
        image_path = self.image_dict.get(image_id)
        if image_path is None:
            raise KeyError(f"Image ID {image_id} not found in image_dict.")

        img = Image.open(image_path).convert('RGB')
        if self.transform:
            img = self.transform(img)

        # Row values are cast to float before building the tensors
        cat_values = record[self.new_cat_columns].values.astype(float)
        cont_values = record[self.cont_names].values.astype(float)

        cat_tensor = torch.tensor(cat_values).float()
        cont_tensor = torch.tensor(cont_values).float()
        target = torch.tensor(record[self.y_col]).long()

        return img, cat_tensor, cont_tensor, target

# Split the data, build the DataLoaders, and define the combined image + tabular model:

In [6]:
import torch
import torch.nn as nn
import torchvision.models as models
from fastai.tabular.all import TabularPandas, get_emb_sz, TabularModel, Learner, CrossEntropyLossFlat, accuracy
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

# Pick the compute device: the GPU when CUDA is available, the CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

 

# Stratified split
def stratified_splitter(df, valid_pct=0.25, seed=42, target_col=None):
    """Split the rows of ``df`` into stratified train / validation positions.

    The split is made over row *positions* (``0 .. len(df) - 1``), not over index
    labels, so the result can be used with ``df.iloc`` whatever index ``df`` carries.
    For a frame with the default ``RangeIndex`` positions and labels coincide.

    Args:
        df: DataFrame to split.
        valid_pct: Fraction of rows assigned to the validation set.
        seed: Random state passed to ``train_test_split``.
        target_col: Column to stratify on. Defaults to the notebook-level ``y_col``.

    Returns:
        ``(train_indices, valid_indices)``: two arrays of integer row positions.
    """
    import numpy as np  # local import keeps this cell independent of star-imports

    if target_col is None:
        target_col = y_col  # notebook-level target column name

    positions = np.arange(len(df))
    train_indices, valid_indices = train_test_split(
        positions,
        test_size=valid_pct,
        stratify=df[target_col].to_numpy(),  # stratify by target column
        random_state=seed,
    )
    return train_indices, valid_indices

# Stratified train / validation split over the processed metadata
train_indices, valid_indices = stratified_splitter(df)

# Materialise both subsets; `iloc` selects rows by position
train_df, valid_df = df.iloc[train_indices], df.iloc[valid_indices]

# One dataset per subset; both share the same list of image files
train_dataset = ImageTabDataset(
    df=train_df, image_files=images, new_cat_columns=new_cat_columns,
    cont_names=cont_names, y_col=y_col,
)
valid_dataset = ImageTabDataset(
    df=valid_df, image_files=images, new_cat_columns=new_cat_columns,
    cont_names=cont_names, y_col=y_col,
)

# PyTorch loaders: only the training loader shuffles
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True, num_workers=4)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=128, shuffle=False, num_workers=4)

# Bundle both loaders into a fastai DataLoaders object for the Learner
dls = DataLoaders(train_loader, valid_loader)

# Manually get embedding sizes
def get_emb_szs(df, new_cat_columns):
    """Build one ``(n_levels, emb_dim)`` pair per categorical column.

    ``n_levels`` is the number of distinct values in the column plus one, and
    ``emb_dim`` is half of ``n_levels`` (integer division), capped at 50.

    Args:
        df: DataFrame holding the categorical columns.
        new_cat_columns: Column names to size, in the order of the returned list.

    Returns:
        List of ``(n_levels, emb_dim)`` tuples.
    """
    emb_szs = []
    for column in new_cat_columns:
        n_levels = df[column].nunique() + 1
        emb_szs.append((n_levels, min(50, n_levels // 2)))
    return emb_szs

# Model dimensions derived from the training subset
emb_szs = get_emb_szs(df=train_df, new_cat_columns=new_cat_columns)  # one size pair per dummy column
n_cont = len(cont_names)                                             # number of continuous inputs
out_sz = train_df[y_col].unique().size                               # number of distinct target values

# Define custom model
class ImageTabularModel(nn.Module):
    """Two-branch model combining a ResNet-50 image network with a fastai TabularModel.

    Each branch produces ``out_sz`` outputs; both are concatenated and passed
    through a final linear layer that maps ``2 * out_sz`` values to ``out_sz``.

    Args:
        emb_szs: Embedding sizes forwarded to ``TabularModel``.
        n_cont: Number of continuous inputs forwarded to ``TabularModel``.
        out_sz: Number of outputs of each branch and of the whole model.
        layers: Hidden layer sizes forwarded to ``TabularModel``.
        ps: Dropout setting forwarded to ``TabularModel``.
        pretrained_path: Location of the ResNet-50 state dict loaded into the
            image branch. Defaults to the path previously hard-coded here; pass
            ``None`` to keep the randomly initialised weights.
    """

    def __init__(self, emb_szs, n_cont, out_sz, layers, ps=0.5,
                 pretrained_path='C:/Users/Yashwanth/ISIC24_Skin_Cancer_Detection/Fastai/resnet50-11ad3fa6.pth'):
        super().__init__()
        # Initialize ResNet50 without downloading pre-trained weights
        self.cnn = models.resnet50(weights=None)

        # Load the weights from a local file. `map_location='cpu'` makes the load
        # independent of the device the checkpoint was saved from; the values are
        # copied into the (CPU) parameters and the caller moves the model afterwards.
        if pretrained_path is not None:
            state_dict = torch.load(pretrained_path, map_location='cpu')
            self.cnn.load_state_dict(state_dict)

        # Replace the classification layer so the image branch emits `out_sz` values
        num_ftrs = self.cnn.fc.in_features
        self.cnn.fc = nn.Linear(num_ftrs, out_sz)

        # Tabular branch, also emitting `out_sz` values
        self.tab_net = TabularModel(emb_szs, n_cont, out_sz, layers, ps)

        # Head combining the two branch outputs
        self.head = nn.Linear(out_sz * 2, out_sz)

    def forward(self, x_img, x_cat, x_cont):
        """Run both branches and fuse their outputs.

        Args:
            x_img: Image batch fed to the CNN.
            x_cat: Categorical inputs; cast to ``long`` if they arrive in another dtype.
            x_cont: Continuous inputs.

        Returns:
            Output of the head applied to the concatenated branch outputs.
        """
        # The dataset yields float tensors; the tabular branch is given integer codes
        if x_cat.dtype != torch.long:
            x_cat = x_cat.long()

        img_out = self.cnn(x_img)
        tab_out = self.tab_net(x_cat, x_cont)
        combined = torch.cat([img_out, tab_out], dim=1)
        return self.head(combined)

# Instantiate the combined model and move it to the selected device
model = ImageTabularModel(
    emb_szs=emb_szs,
    n_cont=n_cont,
    out_sz=out_sz,
    layers=[512, 256, 128],
    ps=0.5,
).to(device)

# Wrap the model in DataParallel
model = torch.nn.DataParallel(model)

# fastai Learner: cross-entropy loss, accuracy reported as metric
learn = Learner(dls=dls, model=model, loss_func=CrossEntropyLossFlat(), metrics=accuracy)

In [7]:
# Checkpoint locations (both name the same model, `resnet50_full`)
# - output: where the trained model is written, given without file extension
# - input: a previously saved checkpoint file, if one is available
model_path_out = Path('/kaggle/working/models') / 'resnet50_full'
model_path_in = Path('/kaggle/input/resnet50/pytorch/resnet-3-epochs/1') / 'resnet50_full.pth'

In [None]:
#Find the learning rate
#learn.lr_find(suggest_funcs=(slide, valley))

In [None]:
# Train from scratch or continue from a saved checkpoint, depending on whether
# the checkpoint file exists.

# Number of epochs for the one-cycle run when no checkpoint is available
numb_epochs = 3

# Custom peak learning rate for the one-cycle schedule
# NOTE(review): 0.3 is a very large value for fine-tuning a pretrained CNN; confirm
# it with `learn.lr_find()` before relying on it.
custom_lr = 0.3

if not model_path_in.exists():
    # Model does not exist, so train and save the model.
    # The one-epoch warm-up used to run before this check; it is kept only here because
    # in the other branch the loaded checkpoint overwrites whatever it trained.
    learn.fine_tune(1)
    learn.fit_one_cycle(numb_epochs, lr_max=custom_lr)
    learn.save(model_path_out.stem)
else:
    # Model exists, so load, fine-tune, and save it.
    # `learn.load` resolves a bare name against `learn.path/models`, which is not the
    # file checked above; passing the absolute path (without the `.pth` suffix, which
    # fastai appends) loads the checkpoint whose existence was actually tested.
    learn.load(model_path_in.with_suffix(''))
    learn.fine_tune(1)
    learn.save(model_path_out.stem)

epoch,train_loss,valid_loss,accuracy,time
