In [1]:
from fastai.vision.all import *
from fastcore.parallel import *
import fastai
import pandas as pd
from pathlib import Path

import torch
from torch.utils.data import Dataset

 

# Set paths
path = Path("/home/webadmin/Desktop/isic/")
train_metadata_path = path / 'train-metadata.csv'
images_path = path / '/home/webadmin/Desktop/isic/image'

# Preparing metadata file and merge with images

In [2]:
# Load metadata append

df = pd.read_csv(train_metadata_path,low_memory=False)

# Drop specified columns
columns_to_drop = ['copyright_license', 'attribution', 'image_type', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4',
                       'iddx_5', 'iddx_full', 'mel_mitotic_index', 'mel_thick_mm', 'tbp_tile_type', 
                       'tbp_lv_dnn_lesion_confidence', 'lesion_id']

# Define categorical and continuous columns
cat_names = [ 'sex', 'anatom_site_general', 'tbp_lv_location', 'tbp_lv_location_simple']
cont_names = [x for x in df.columns if x not in cat_names + ['target', 'isic_id','patient_id']+columns_to_drop]
y_col = 'target'
image_col = 'isic_id'

In [3]:
def process_data(df, cat_names):
    
    
    # Drop columns 
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])
    
    # Add number of pictures for each patient
    df['numb_pic'] = df.groupby('patient_id')['patient_id'].transform('count')
    
    # Fill missing values with the mode
    if 'age_approx' in df.columns:
        mode_age = df['age_approx'].mode()[0]
        df['age_approx'] = df['age_approx'].fillna(mode_age)
    
    if 'sex' in df.columns:
        mode_sex = df['sex'].mode()[0]
        df['sex'] = df['sex'].fillna(mode_sex)
    
    # Convert categorical columns to dummies
    df = pd.get_dummies(df, columns=cat_names, prefix=cat_names)
    
    # Get new categorical column names
    new_cat_columns = [col for col in df.columns if any(col.startswith(name + '_') for name in cat_names)]
    
    # Ensure 'isic_id' in df has the correct file extension
    if 'isic_id' in df.columns:
         df['isic_id'] = df['isic_id'].apply(lambda x: x.strip() + '.jpg')
    
    return df, new_cat_columns

#Apply to df
df, new_cat_columns= process_data(df,cat_names)  

# Load images and create DataFrame
images = get_image_files(images_path)


# Create a custom dataset that includes both image and tabular data:

In [4]:
from PIL import Image
import torchvision.transforms as transforms
from pathlib import Path
import torch
import pandas as pd

class ImageTabDataset(Dataset):
    def __init__(self, df, image_files, new_cat_columns, cont_names, y_col, img_size=(137, 137), transform=None):
        self.df = df
        self.image_files = [Path(img) for img in image_files]
        self.new_cat_columns = new_cat_columns
        self.cont_names = cont_names
        self.y_col = y_col
        self.img_size = img_size
        self.transform = transform or transforms.Compose([
            transforms.Resize(self.img_size),
            transforms.ToTensor()
        ])
        # keep both with-extension and stem keys, lowercased for robust matching
        self.image_dict = {}
        for img in self.image_files:
            self.image_dict[img.name.lower()] = img
            self.image_dict[img.stem.lower()] = img

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Verify that self.df is a DataFrame
        if not isinstance(self.df, pd.DataFrame):
            raise TypeError(f"Expected self.df to be a DataFrame, but got {type(self.df).__name__}")

        row = self.df.iloc[idx]

        # --- robust ID lookup ---
        isic_id_raw = row['isic_id']
        key = str(isic_id_raw).strip().lower()

        # try direct
        if key not in self.image_dict:
            # add common extensions if missing
            if not (key.endswith('.jpg') or key.endswith('.jpeg') or key.endswith('.png')):
                if key + '.jpg' in self.image_dict:
                    key = key + '.jpg'
                elif key + '.jpeg' in self.image_dict:
                    key = key + '.jpeg'
                elif key + '.png' in self.image_dict:
                    key = key + '.png'

        if key not in self.image_dict:
            raise KeyError(f"Image ID {isic_id_raw} not found in image_dict.")

        img_path = self.image_dict[key]
        # ------------------------

        img = Image.open(img_path).convert('RGB')
        if self.transform:
            img = self.transform(img)

        new_cat_columns = torch.tensor(row[self.new_cat_columns].values.astype(float)).float()
        cont = torch.tensor(row[self.cont_names].values.astype(float)).float()
        y = torch.tensor(row[self.y_col]).long()

        return img, new_cat_columns, cont, y


# Combine image and tabular data into a DataBlock:

In [5]:
import torch
import torch.nn as nn
import torchvision.models as models
from fastai.tabular.all import TabularPandas, get_emb_sz, TabularModel, Learner, CrossEntropyLossFlat, accuracy
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

# Define the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

 

# Stratified split
def stratified_splitter(df, valid_pct=0.25, seed=42):
    train_indices, valid_indices = train_test_split(
        df.index,
        test_size=valid_pct,
        stratify=df[y_col],  # stratify by target column
        random_state=seed
    )
    return train_indices, valid_indices

# Perform the split
train_indices, valid_indices = stratified_splitter(df)

# Create train and validation dataframes
train_df = df.iloc[train_indices]
valid_df = df.iloc[valid_indices]

# Define the datasets
train_dataset = ImageTabDataset(train_df,images, new_cat_columns, cont_names, y_col)
valid_dataset = ImageTabDataset(valid_df,images, new_cat_columns, cont_names, y_col)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=4)
valid_loader = DataLoader(valid_dataset, batch_size=128, shuffle=False, num_workers=4)

# Create a DataLoaders object
dls = DataLoaders(train_loader, valid_loader)

# Manually get embedding sizes
def get_emb_szs(df, new_cat_columns):
    return [(df[col].nunique() + 1, min(50, (df[col].nunique() + 1) // 2)) for col in new_cat_columns]

# Calculate embedding sizes
emb_szs = get_emb_szs(train_df, new_cat_columns)
n_cont = len(cont_names)
out_sz = len(train_df[y_col].unique())

# Define custom model
class ImageTabularModel(nn.Module):
    def __init__(self, emb_szs, n_cont, out_sz, layers, ps=0.5):
        super().__init__()
        # Initialize ResNet50 without pre-trained weights
        self.cnn = models.resnet50(weights=None)  # Use weights=None
        
        # Load custom weights
        self.cnn.load_state_dict(torch.load('/home/webadmin/Desktop/ISIC24_Skin_Cancer_Detection/Fastai/resnet50-11ad3fa6.pth'))
        
        # Adjust the final layer if needed
        num_ftrs = self.cnn.fc.in_features
        self.cnn.fc = nn.Linear(num_ftrs, out_sz)  # Set the number of output features
        
        # Initialize the tabular model
        self.tab_net = TabularModel(emb_szs, n_cont, out_sz, layers, ps)
        
        # Define the head that combines image and tabular outputs
        self.head = nn.Linear(out_sz * 2, out_sz)  # Adjust as needed

    def forward(self, x_img, x_cat, x_cont):
        if x_cat.dtype != torch.long:
            x_cat = x_cat.long()
        
        img_out = self.cnn(x_img)
        tab_out = self.tab_net(x_cat, x_cont)
        combined = torch.cat([img_out, tab_out], dim=1)
        return self.head(combined)

# Create the model
model = ImageTabularModel(emb_szs, n_cont, out_sz, layers=[512, 256, 128], ps=0.5).to(device)

# Make the model parallel
model = torch.nn.DataParallel(model)

# Create Learner
learn = Learner(dls, model, loss_func=CrossEntropyLossFlat(), metrics=accuracy)

In [6]:
# Define the path to your model file
model_path_out = Path('/kaggle/working/models/resnet50_full')
model_path_in = Path('/kaggle/input/resnet50/pytorch/resnet-3-epochs/1/resnet50_full.pth')

In [7]:
#Find the learning rate
#learn.lr_find(suggest_funcs=(slide, valley))

In [8]:
# Check if the model file exists
#Define the number of epochs
numb_epochs=3

# Define custom learning rate
custom_lr =0.3
learn.fine_tune(1)
'''
if not model_path_in.exists():
    # Model does not exist, so train and save the model
    learn.fit_one_cycle(3, lr_max=custom_lr)
    learn.save(model_path_out.stem)
else:
    # Model exists, so load, fine-tune, and save it
    learn.load(model_path_in.stem)
    learn.fine_tune(1)
    learn.save(model_path_out.stem)'''

epoch,train_loss,valid_loss,accuracy,time
0,0.010049,0.0065,0.999023,25:50


epoch,train_loss,valid_loss,accuracy,time
0,0.006038,0.006171,0.999033,25:34


'\nif not model_path_in.exists():\n    # Model does not exist, so train and save the model\n    learn.fit_one_cycle(3, lr_max=custom_lr)\n    learn.save(model_path_out.stem)\nelse:\n    # Model exists, so load, fine-tune, and save it\n    learn.load(model_path_in.stem)\n    learn.fine_tune(1)\n    learn.save(model_path_out.stem)'

# Test part


In [18]:
ROOT_DIR = "/home/webadmin/Desktop/isic/"
TEST_HDF = f'{ROOT_DIR}/test-image.hdf5'
TEST_CSV = f'{ROOT_DIR}/test-metadata.csv'

df_test = pd.read_csv(TEST_CSV)

#Apply to df
df_test, _= process_data(df_test,cat_names)  

# Ensure the test set has the same dummy variable columns as the training set
for col in new_cat_columns:
    if col not in df_test:
        df_test[col] = 0

# Create a list of columns from the training set excluding 'target'
train_columns = [col for col in df.columns if col != 'target']

# Reorder the test set columns to match the training set columns (excluding 'target')
df_test = df_test[train_columns]





In [19]:
import h5py, numpy as np
from PIL import Image
from io import BytesIO
import torch
from torch.utils.data import Dataset

def _fetch_h5_image(fp, isic_id):
    """
    Try several key variants and groups:
      - 'ISIC_xxxxxx' (no extension), then 'ISIC_xxxxxx.jpg/.jpeg/.png'
      - groups: 'oversampled_images', 'images', 'image', 'train_images', 'train', 'test', root
    Returns a PIL.Image in RGB.
    """
    raw = str(isic_id).strip()
    stem = raw.rsplit('.', 1)[0]  # drop extension if present
    name_candidates = [stem, stem + '.jpg', stem + '.jpeg', stem + '.png']
    group_candidates = ['oversampled_images', 'images', 'image', 'train_images', 'train', 'test', None]

    for grp in group_candidates:
        if grp is not None and grp not in fp:
            continue
        node = fp[grp] if grp is not None else fp
        for name in name_candidates:
            if name in node:
                ds = node[name]
                try:
                    arr = ds[()]  # dataset -> numpy or bytes
                except Exception:
                    # if it’s a group with one child dataset
                    kids = list(ds.keys())
                    if not kids: 
                        continue
                    arr = ds[kids[0]][()]

                if isinstance(arr, (bytes, bytearray)):         # raw encoded image bytes
                    return Image.open(BytesIO(arr)).convert('RGB')
                else:                                           # numpy array (H,W,3) or (3,H,W)
                    if arr.ndim == 3 and arr.shape[0] in (1,3) and arr.shape[0] != arr.shape[-1]:
                        arr = np.moveaxis(arr, 0, -1)          # CHW -> HWC
                    return Image.fromarray(arr).convert('RGB')
    # Nothing matched -> good error message
    tops = list(fp.keys())
    raise KeyError(f"{raw} not found in HDF5. Top-level keys: {tops[:8]}{'...' if len(tops)>8 else ''}")


class CombinedDataset_test(Dataset):
    # keep your call-site the same: CombinedDataset_test(df_test, TEST_HDF, new_cat_columns, cont_names, ...)
    def __init__(self, df, file_hdf, cat_names, cont_names, transforms=None, target_size=(137, 137)):
        self.df = df
        self.fp_hdf = h5py.File(file_hdf, mode="r")
        self.isic_ids = df['isic_id'].values
        self.transforms = transforms
        self.target_size = target_size

        self.cat_names = cat_names
        self.cont_names = cont_names

        # Convert categorical columns to codes if present
        for cat in cat_names:
            if cat in self.df.columns:
                self.df[cat] = self.df[cat].astype('category').cat.codes

        # Convert all other columns to numeric (except isic_id)
        for col in df.columns:
            if col not in cat_names + ['isic_id']:
                self.df[col] = pd.to_numeric(self.df[col], errors='coerce')

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        isic_id = self.isic_ids[index]

        # --- robust HDF5 lookup (handles missing .jpg and groups) ---
        img = _fetch_h5_image(self.fp_hdf, isic_id)
        img = img.resize(self.target_size, Image.LANCZOS)

        if self.transforms:
            img = self.transforms(img)

        # tabular split
        row = self.df.iloc[index]
        cat_data  = row[self.cat_names]      # already integer-coded above
        cont_data = row[self.cont_names]

        cat_tensor  = torch.tensor(cat_data.values.astype(int),    dtype=torch.long)
        cont_tensor = torch.tensor(cont_data.values.astype(float), dtype=torch.float32)
        y = torch.tensor(0, dtype=torch.long)  # dummy target for API compatibility

        return img, cat_tensor, cont_tensor, y

    def __del__(self):
        try:
            self.fp_hdf.close()
        except Exception:
            pass


In [20]:
 # Define the transformations
data_transforms = T.Compose([
    T.Resize((137, 137)),
    T.ToTensor(),
])

# Configuration
CONFIG = {
    'valid_batch_size': 64,  # Batch size for validation
}

# Create the combined dataset
combined_dataset = CombinedDataset_test(df_test, TEST_HDF, new_cat_columns, cont_names, transforms=data_transforms)

# Define DataLoader
test_loader = DataLoader(combined_dataset, batch_size=CONFIG['valid_batch_size'], 
                         num_workers=4, shuffle=False, pin_memory=True)



# Set the model to evaluation mode
learn.model.eval()

# Run predictions on the test DataLoader
preds, targs = learn.get_preds(dl=test_loader)

In [21]:
import torch.nn.functional as F

# Initialize lists to collect predictions and ids
all_probs_class_1 = []
all_isic_ids = []

# Example function to handle probabilities
def process_probabilities(probabilities):
    # Replace NaN values with 0
    probabilities = np.nan_to_num(probabilities, nan=0.0)
    
    # Ensure probabilities are within the range [0, 1]
    probabilities = np.clip(probabilities, 0, 1)
    
    return probabilities

# Get predictions from the DataLoader
with torch.no_grad():
    # Obtain predictions for the entire test set
    logits, _ = learn.get_preds(dl=test_loader)  # The second output is targets, which is ignored

    # Apply softmax to get probabilities
    probabilities = F.softmax(logits, dim=1)

    # Extract the probabilities for class ID 1
    prob_class_1 = probabilities[:, 1].detach().cpu().numpy()
    
    #process probabilities to ensure they are between 0 and 1
    prob_class_1 = process_probabilities(prob_class_1)
    
    # Collect the probabilities and ids
    all_probs_class_1.extend(prob_class_1)
    all_isic_ids.extend(combined_dataset.isic_ids[:len(prob_class_1)])  # Use the dataset directly

# Convert lists to arrays or DataFrame if needed
import pandas as pd
results = pd.DataFrame({
    'isic_id': all_isic_ids,
    'target': all_probs_class_1
})


In [22]:
results.to_csv("submission.csv", index=False)

In [23]:
results

Unnamed: 0,isic_id,target
0,ISIC_0015657.jpg,0.269085
1,ISIC_0015729.jpg,0.269083
2,ISIC_0015740.jpg,0.269101
