In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Input, Concatenate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import kagglehub
from tensorflow.keras.optimizers import Adam  # <-- import here
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


2025-08-13 12:24:40.776949: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755087880.986212      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755087881.047768      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:

# Download dataset
path = kagglehub.dataset_download("nroman/melanoma-external-malignant-256")

# Paths
train_dir = os.path.join(path, 'train/train')
test_dir = os.path.join(path, 'test/test')
csv_path = os.path.join(path, 'train_concat.csv')

# Load CSV
df = pd.read_csv(csv_path)
df['image_name'] = df['image_name'].apply(lambda x: x + '.jpg' if not x.endswith('.jpg') else x)
df['target'] = df['target'].astype(int)  # keep as int for binary

# Features
tab_features = ['anatom_site_general_challenge', 'sex', 'age_approx']

# Train/val split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['target'])


In [3]:
# Image generators (images only)
train_datagen = ImageDataGenerator(
    rescale=1./255, rotation_range=20, width_shift_range=0.2,
    height_shift_range=0.2, horizontal_flip=True, fill_mode='nearest'
)
val_datagen = ImageDataGenerator(rescale=1./255)

# Image generators (images only) -- keep shuffle=False for alignment
train_img_flow = train_datagen.flow_from_dataframe(
    dataframe=train_df.sort_values('image_name'),  # sort to match tabular order
    directory=train_dir,
    x_col='image_name',
    y_col=None,
    target_size=(256, 256),
    class_mode=None,
    batch_size=32,
    shuffle=False
)
val_img_flow = val_datagen.flow_from_dataframe(
    dataframe=val_df.sort_values('image_name'),
    directory=train_dir,
    x_col='image_name',
    y_col=None,
    target_size=(256, 256),
    class_mode=None,
    batch_size=32,
    shuffle=False
)


# Combined generator fix
def combined_gen(img_gen, tab_data, labels):
    tab_data = np.array(tab_data, dtype=np.float32)
    labels = np.array(labels, dtype=np.float32)
    while True:
        for i in range(len(img_gen)):
            img_batch = img_gen[i]
            start = i * img_gen.batch_size
            end = start + img_gen.batch_size
            yield (img_batch.astype(np.float32), tab_data[start:end]), labels[start:end]



Found 30118 validated image filenames.
Found 7530 validated image filenames.


In [4]:

# ----- Process Tabular Data -----
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ]), ['anatom_site_general_challenge', 'sex']),
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), ['age_approx'])
    ]
)
# Also sort tabular data in same order
train_df_sorted = train_df.sort_values('image_name')
val_df_sorted = val_df.sort_values('image_name')
X_train_tab = preprocessor.fit_transform(train_df_sorted[tab_features])
X_val_tab = preprocessor.transform(val_df_sorted[tab_features])
y_train = train_df_sorted['target'].values
y_val = val_df_sorted['target'].values

y_train = train_df['target'].values
y_val = val_df['target'].values


# print("NaNs in X_train_tab:", np.isnan(X_train_tab).any())
# print("Infs in X_train_tab:", np.isinf(X_train_tab).any())
# print("NaNs in y_train:", np.isnan(y_train).any())
# print("Infs in y_train:", np.isinf(y_train).any())
# print("NaNs in X_val_tab:", np.isnan(X_val_tab).any())
# print("Infs in X_val_tab:", np.isinf(X_val_tab).any())


In [5]:

# ----- Model -----
# CNN for images
image_input = Input(shape=(256, 256, 3))
x = Conv2D(32, (3, 3), activation='relu')(image_input)
x = MaxPooling2D(2, 2)(x)
x = Conv2D(64, (3, 3), activation='relu')(x)
x = MaxPooling2D(2, 2)(x)
x = Conv2D(128, (3, 3), activation='relu')(x)
x = MaxPooling2D(2, 2)(x)
x = Flatten()(x)

# Dense for tabular
tab_input = Input(shape=(X_train_tab.shape[1],))
t = Dense(32, activation='relu')(tab_input)

# Merge
merged = Concatenate()([x, t])
merged = Dense(512, activation='relu')(merged)
merged = Dropout(0.5)(merged)
output = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[image_input, tab_input], outputs=output)

model.compile(
    optimizer=Adam(learning_rate=0.001),  # lower LR from 0.001 to 0.0001
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC()]
)


I0000 00:00:1755087976.996810      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1755087976.997460      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [6]:

train_dataset = tf.data.Dataset.from_generator(
    lambda: combined_gen(train_img_flow, X_train_tab, y_train),
    output_signature=(
        (
            tf.TensorSpec(shape=(None, 256, 256, 3), dtype=tf.float32),
            tf.TensorSpec(shape=(None, X_train_tab.shape[1]), dtype=tf.float32)
        ),
        tf.TensorSpec(shape=(None,), dtype=tf.float32)
    )
)

val_dataset = tf.data.Dataset.from_generator(
    lambda: combined_gen(val_img_flow, X_val_tab, y_val),
    output_signature=(
        (
            tf.TensorSpec(shape=(None, 256, 256, 3), dtype=tf.float32),
            tf.TensorSpec(shape=(None, X_val_tab.shape[1]), dtype=tf.float32)
        ),
        tf.TensorSpec(shape=(None,), dtype=tf.float32)
    )
)

history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=5,
    steps_per_epoch=len(train_img_flow),
    validation_steps=len(val_img_flow)
)
val_steps = len(val_img_flow)

# Evaluate
val_loss, val_accuracy, val_auc = model.evaluate(
    combined_gen(val_img_flow, X_val_tab, y_val),
    steps=val_steps
)
print(f"Validation Loss: {val_loss:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Validation AUC: {val_auc:.4f}")

# Save
model.save('melanoma_model_multi_input.h5')


Epoch 1/5


I0000 00:00:1755087982.140889      62 service.cc:148] XLA service 0x79e3f8004000 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1755087982.141673      62 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1755087982.141696      62 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1755087982.592516      62 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m  2/942[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:04[0m 69ms/step - accuracy: 0.8047 - auc: 0.4051 - loss: 2.5132  

I0000 00:00:1755087988.773179      62 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m942/942[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m626s[0m 654ms/step - accuracy: 0.8654 - auc: 0.5025 - loss: 0.5017 - val_accuracy: 0.8644 - val_auc: 0.4978 - val_loss: 0.4015
Epoch 2/5
[1m942/942[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m450s[0m 478ms/step - accuracy: 0.8665 - auc: 0.5051 - loss: 0.4000 - val_accuracy: 0.8644 - val_auc: 0.4955 - val_loss: 0.3993
Epoch 3/5
[1m942/942[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m456s[0m 485ms/step - accuracy: 0.8665 - auc: 0.5082 - loss: 0.3986 - val_accuracy: 0.8644 - val_auc: 0.5017 - val_loss: 0.4019
Epoch 4/5
[1m942/942[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m467s[0m 496ms/step - accuracy: 0.8665 - auc: 0.5026 - loss: 0.3978 - val_accuracy: 0.8644 - val_auc: 0.5001 - val_loss: 0.3998
Epoch 5/5
[1m942/942[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m457s[0m 486ms/step - accuracy: 0.8665 - auc: 0.5138 - loss: 0.3959 - val_accuracy: 0.8644 - val_auc: 0.4988 - val_loss: 0.3996
[1m236/236[0m [32m━━━━

# Validation Accuracy : 86.44 %