In [1]:
# 1) Install GCS connector
!pip install -q gcsfs

# 2) Auth to your Google account
from google.colab import auth
auth.authenticate_user()

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import os, random
import numpy as np
import tensorflow as tf

# Single seed shared by the data splits, the shuffling and the RNG setup below.
random_seed = 999

# 1) Load metadata (works for gs:// via fsspec if gcsfs is installed)
META = "gs://ucm_tfm_datos_carlotatrigo/metadata_clean.csv"
df_meta = pd.read_csv(META)

# Metadata columns that the training pipeline does not use.
unused_columns = [
    'origin_dataset', 'diagnosis', 'body_region',
    'age', 'gender', 'width', 'height', 'brightness', 'blur_var',
    'hue_entropy', 'hair_ratio', 'r_mean', 'g_mean', 'b_mean',
    'image_path', 'image_relpath',
]
# drop() returns a new frame, so df_meta itself stays untouched.
df = df_meta.drop(columns=unused_columns)

# Ensure required columns exist
required_columns = {'image_id', 'diagnosis_grouped', 'lesion_type'}
assert required_columns.issubset(df.columns)

In [3]:
# Sanity check: list the columns that remain after dropping the unused metadata.
df.columns

Index(['image_id', 'lesion_type', 'diagnosis_grouped'], dtype='object')

In [4]:
# 2) (Optional) Create/verify image_uri
BASE = "gs://ucm_tfm_datos_carlotatrigo/images/images"
if 'image_uri' not in df.columns:
    # image_id is appended verbatim, so it must already carry the file extension.
    # If it does not, add the right extension upstream, or fall back to trying
    # .jpg/.png when the Dataset reads the file.
    df['image_uri'] = df['image_id'].map(lambda image_id: f"{BASE}/{image_id}")

# 3) Define label vocabularies
# Indices follow the order of first appearance in the metadata (Series.unique()).
diag_classes = df['diagnosis_grouped'].unique()
diag_to_idx = dict(zip(diag_classes, range(len(diag_classes))))

coarse_classes = df['lesion_type'].unique()
coarse_to_idx = {label: idx for idx, label in enumerate(coarse_classes)}


coarse_to_idx

{'benign': 0, 'malignant': 1, 'no_lesion': 2}

In [5]:
# 4) Map labels to indices
# .copy() makes the filtered frame independent of the original, so the column
# assignments below write to a real DataFrame instead of a slice (this avoids
# pandas' SettingWithCopyWarning and any ambiguity about what gets modified).
df = df[df['diagnosis_grouped'].isin(diag_classes)].copy()  # keep only known labels incl. 'unknown'
df['y_diag'] = df['diagnosis_grouped'].map(diag_to_idx)
df['y_coarse'] = df['lesion_type'].map(coarse_to_idx)

# 5) Stratified split by diagnosis
# First carve out 15% for test, then split the remaining 85% into train/val.
train_df, test_df = train_test_split(df, test_size=0.15, random_state=random_seed, stratify=df['y_diag'])
train_df, val_df  = train_test_split(train_df, test_size=0.1765, random_state=random_seed, stratify=train_df['y_diag'])
# 0.1765 of 85% ≈ 15% so final ≈ 70/15/15

# 6) Class weights (effective number) for both heads
def class_balanced_weights(y, num_classes, beta=0.999):
    """Return per-class weights of the form (1 - beta) / (1 - beta ** n_c).

    Args:
        y: 1-D array of non-negative integer class indices.
        num_classes: Number of classes; classes absent from ``y`` still get an entry.
        beta: Base of the power term; values closer to 1 give rarer classes
            relatively larger weights.

    Returns:
        A float array with one weight per class, rescaled so the weights
        average to 1.
    """
    per_class = np.bincount(y, minlength=num_classes)
    # Treat empty classes as having a single sample so the denominator is never zero.
    safe_counts = np.maximum(1, per_class)
    denom = 1 - np.power(beta, safe_counts)
    raw = (1 - beta) / denom
    # Rescale to mean 1 so the overall loss magnitude stays comparable.
    scale = num_classes / raw.sum()
    return raw * scale

# Per-class loss weights for each head, computed from the training split only.
n_diag, n_coarse = len(diag_classes), len(coarse_classes)
w_diag = class_balanced_weights(train_df['y_diag'].to_numpy(), num_classes=n_diag)
w_coarse = class_balanced_weights(train_df['y_coarse'].to_numpy(), num_classes=n_coarse)
print("diag weights:", w_diag)
print("coarse weights:", w_coarse)

diag weights: [0.4107018  0.41625343 0.43460411 0.41070156 1.7183363  0.46429583
 0.41180739 1.73494116 3.41673395 0.58162446]
coarse weights: [0.87817462 0.8781782  1.24364718]


In [6]:
# Class-weight lookup tables as tensors, so the tf.data pipeline (make_ds) can
# turn a label index into its sample weight with tf.gather.
# NOTE(review): built from NumPy float arrays without an explicit dtype, so these
# are presumably float64 tensors — confirm that is what the loss expects.
W_COARSE_T = tf.constant(w_coarse)
W_DIAG_T = tf.constant(w_diag)
print("Class weights — coarse:", w_coarse)
print("Class weights — diag :", w_diag)

Class weights — coarse: [0.87817462 0.8781782  1.24364718]
Class weights — diag : [0.4107018  0.41625343 0.43460411 0.41070156 1.7183363  0.46429583
 0.41180739 1.73494116 3.41673395 0.58162446]


In [7]:
# Inspect the training split: URIs plus the integer labels for both heads.
train_df

Unnamed: 0,image_id,lesion_type,diagnosis_grouped,image_uri,y_diag,y_coarse
49446,ISIC_5268041.jpg,benign,unknown,gs://ucm_tfm_datos_carlotatrigo/images/images/...,3,0
53938,ISIC_6299415.jpg,benign,unknown,gs://ucm_tfm_datos_carlotatrigo/images/images/...,3,0
41304,ISIC_3397174.jpg,benign,unknown,gs://ucm_tfm_datos_carlotatrigo/images/images/...,3,0
18592,ISIC_0060140.jpg,malignant,mel,gs://ucm_tfm_datos_carlotatrigo/images/images/...,1,1
35549,ISIC_2074396.jpg,benign,unknown,gs://ucm_tfm_datos_carlotatrigo/images/images/...,3,0
...,...,...,...,...,...,...
4294,ISIC_0025697.jpg,benign,nv,gs://ucm_tfm_datos_carlotatrigo/images/images/...,0,0
53540,ISIC_6208118.jpg,benign,nv,gs://ucm_tfm_datos_carlotatrigo/images/images/...,0,0
46862,ISIC_4671410.jpg,malignant,bcc,gs://ucm_tfm_datos_carlotatrigo/images/images/...,6,1
67554,ISIC_9443293.jpg,malignant,bcc,gs://ucm_tfm_datos_carlotatrigo/images/images/...,6,1


FIRST MODEL

In [8]:
# Seed Python, NumPy and TensorFlow (in that order) with the shared seed.
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
  _seed_fn(random_seed)

In [9]:
# Hyper-parameters and pipeline settings for the first model.
IMG = 384 # side length (pixels) images are resized/padded to; 512 if your GPU allows
BATCH = 16 # examples per batch produced by make_ds
EPOCHS = 3 # start small for sanity check
LR = 3e-4 # Adam learning rate for the frozen-backbone stage
SEED = 42 # NOTE(review): not referenced in the visible cells, which use random_seed instead — confirm which one is intended
AUTO_SPLIT = False # NOTE(review): not referenced in the visible cells — confirm it is still needed
AUTOTUNE = tf.data.AUTOTUNE # let tf.data choose map parallelism and prefetch depth

In [10]:
def make_ds(frame: pd.DataFrame, training: bool):
  """Build a batched tf.data pipeline yielding (image, labels, sample_weights).

  Args:
    frame: DataFrame with 'image_uri', 'y_coarse' and 'y_diag' columns.
    training: If True, examples are reshuffled on every epoch. Validation and
      test pipelines (training=False) keep the row order of ``frame`` so that
      predictions can be aligned with the DataFrame.

  Returns:
    A tf.data.Dataset of ``(img, labels, sample_w)`` batches of up to BATCH
    examples. ``img`` is float32 with shape (batch, IMG, IMG, 3) and values in
    0..255; ``labels`` and ``sample_w`` are dicts keyed by the model output
    names "coarse" and "diag".
  """
  uris = frame['image_uri'].to_numpy()
  yc = frame['y_coarse'].to_numpy()
  yd = frame['y_diag'].to_numpy()

  ds = tf.data.Dataset.from_tensor_slices((uris, yc, yd))

  if training:
    # Shuffle the lightweight (uri, label) records *before* decoding. Shuffling
    # after the map would keep thousands of decoded float32 images in the
    # shuffle buffer (4096 images at IMG=384 is roughly 7 GB of RAM). A buffer
    # as large as the dataset gives a full uniform shuffle at negligible cost.
    ds = ds.shuffle(max(len(frame), 1), seed=random_seed, reshuffle_each_iteration=True)

  def _load(uri, y_coarse, y_diag):
    """Read and decode one image and attach its labels and per-head sample weights."""
    img_bytes = tf.io.read_file(uri)
    # expand_animations=False guarantees a 3-D tensor even for GIF inputs.
    img = tf.io.decode_image(img_bytes, channels=3, expand_animations=False)

    # Letterbox to a square so the aspect ratio is preserved.
    img = tf.image.resize_with_pad(img, IMG, IMG, method=tf.image.ResizeMethod.BILINEAR)
    img = tf.cast(img, tf.float32) # 0..255 float

    # sample weights per-output via class-weight lookup
    sw_coarse = tf.gather(W_COARSE_T, y_coarse)
    sw_diag = tf.gather(W_DIAG_T, y_diag)
    sample_w = {"coarse": sw_coarse, "diag": sw_diag}
    labels = {"coarse": y_coarse, "diag": y_diag}
    return img, labels, sample_w

  ds = ds.map(_load, num_parallel_calls=AUTOTUNE)
  ds = ds.batch(BATCH).prefetch(AUTOTUNE)
  return ds


# Build the three input pipelines; only the training split is flagged as training.
train_ds, val_ds, test_ds = (
    make_ds(split_df, training=is_train)
    for split_df, is_train in ((train_df, True), (val_df, False), (test_df, False))
)

In [11]:
# Two-head classifier: a shared EfficientNetB0 backbone feeding one Dense head
# for the coarse lesion type and one for the grouped diagnosis.
inputs = tf.keras.Input(shape=(IMG, IMG, 3), name="input")

# Light augmentation built into the model graph.
x = tf.keras.layers.RandomFlip("horizontal")(inputs)
x = tf.keras.layers.RandomRotation(0.04)(x)
x = tf.keras.layers.RandomZoom(0.05)(x)

x = tf.keras.applications.efficientnet.preprocess_input(x)

base = tf.keras.applications.EfficientNetB0(include_top=False, weights="imagenet", pooling="avg")
base.trainable = False # freeze for a quick baseline
# training=False keeps the backbone (notably its BatchNormalization layers) in
# inference mode, so the pretrained moving statistics are not overwritten when
# the backbone is unfrozen later for fine-tuning.
feat = base(x, training=False)


# Both heads output raw logits; the losses are configured with from_logits=True.
coarse_logits = tf.keras.layers.Dense(len(coarse_classes), name="coarse")(feat)
diag_logits = tf.keras.layers.Dense(len(diag_classes), name="diag")(feat)


# Output keys must match the label / sample-weight dict keys produced by make_ds.
model = tf.keras.Model(inputs, {"coarse": coarse_logits, "diag": diag_logits})
model.summary()

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step


In [12]:
# Both heads are trained the same way: sparse cross-entropy on raw logits,
# tracked with sparse categorical accuracy reported as "<head>_acc".
HEAD_NAMES = ("coarse", "diag")

losses = {
    head: tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    for head in HEAD_NAMES
}
metrics = {
    head: [tf.keras.metrics.SparseCategoricalAccuracy(name="acc")]
    for head in HEAD_NAMES
}
opt = tf.keras.optimizers.Adam(learning_rate=LR)

model.compile(loss=losses, metrics=metrics, optimizer=opt)


In [13]:
# Where the trained models are written.
# NOTE(review): this is a Google Drive path, but no drive.mount(...) call is
# visible in this notebook — confirm Drive is mounted before training, otherwise
# the files would presumably end up on the Colab VM's local disk.
SAVE_PATH = "/content/drive/MyDrive/0. TFM/master_thesis_ucm/models"
# Best checkpoint (written by ModelCheckpoint) and the model after the last epoch.
best_model_path = os.path.join(SAVE_PATH, "keras_two_head_best.keras")
final_model_path = os.path.join(SAVE_PATH, "keras_two_head_final.keras")

In [None]:
# Load the best model before continuing training
# model = tf.keras.models.load_model(best_model_path)

# Keep only the checkpoint with the highest validation accuracy on the "diag" head.
ckpt = tf.keras.callbacks.ModelCheckpoint(
    best_model_path,
    save_best_only=True,
    monitor="val_diag_acc",
    mode="max",
    verbose=1,
)

# Stage 1: train the two heads on top of the frozen backbone.
history = model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=val_ds,
    verbose=1,
    callbacks=[ckpt],
)

# Also persist the weights as they are after the last epoch, best or not.
print(f"Saving final model to: {final_model_path}")
model.save(final_model_path)

Epoch 1/3
[1m 262/3138[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m3:02:31[0m 4s/step - coarse_acc: 0.7411 - coarse_loss: 0.5711 - diag_acc: 0.4395 - diag_loss: 0.7742 - loss: 1.3454

In [None]:
# model = tf.keras.models.load_model(best_model_path)
# model.evaluate(test_ds, verbose=1)

In [None]:
# Stage 2: fine-tune the top of the backbone with a lower learning rate.
base.trainable = True

# Freeze everything except the last 20 backbone layers.
for layer in base.layers[:-20]:
  layer.trainable = False

# Keep BatchNormalization layers frozen inside the unfrozen tail as well, so
# their pretrained statistics are not disturbed by the small batches used here.
for layer in base.layers[-20:]:
  if isinstance(layer, tf.keras.layers.BatchNormalization):
    layer.trainable = False

# Compile and fit exactly once, after all trainable flags are set. (These calls
# must not sit inside the loops above, or a full epoch would run once per
# frozen layer.) Re-compiling is required for the trainable changes to apply.
model.compile(optimizer=tf.keras.optimizers.Adam(1e-4), loss=losses, metrics=metrics)
model.fit(train_ds, validation_data=val_ds, epochs=1, verbose=1)

In [None]:
print("Test metrics:")
# Restore the best checkpoint from the path ModelCheckpoint actually wrote to
# (best_model_path); the bare name "keras_two_head_best" is never written by
# this notebook.
model.load_weights(best_model_path)
model.evaluate(test_ds, verbose=1)